├── .gitignore ├── .travis.yml ├── README.md ├── autokeras_pretrained ├── __init__.py ├── base.py ├── bert │ ├── __init__.py │ ├── modeling.py │ ├── optimization.py │ ├── tokenization.py │ └── utils.py ├── constant.py ├── face_detector.py ├── object_detector.py ├── text_classifier.py ├── utils.py ├── voice_generator │ ├── __init__.py │ ├── deepvoice3_pytorch │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── conv.py │ │ ├── deepvoice3.py │ │ ├── frontend.py │ │ ├── model.py │ │ ├── modules.py │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── cleaners.py │ │ │ ├── cmudict.py │ │ │ ├── numbers.py │ │ │ ├── symbols.py │ │ │ └── text.py │ │ └── version.py │ └── voice_generator.py └── voice_recognizer.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── common.py ├── pretrained ├── __init__.py ├── test_face_detector.py ├── test_object_detection.py ├── test_sentiment_analysis.py ├── test_topic_classifier.py ├── test_voice_generator.py └── test_voice_recognizer.py └── resources ├── images_test ├── Black_white_images │ ├── Aaron_Eckhart_0001.jpg │ ├── Aaron_Peirsol_0001.jpg │ ├── Aaron_Peirsol_0002.jpg │ ├── Aaron_Peirsol_0003.jpg │ ├── Aaron_Peirsol_0004.jpg │ ├── Aaron_Sorkin_0001.jpg │ ├── Aaron_Sorkin_0002.jpg │ ├── Abdel_Nasser_Assidi_0001.jpg │ ├── Abdel_Nasser_Assidi_0002.jpg │ ├── Abel_Pacheco_0001.jpg │ ├── Abel_Pacheco_0002.jpg │ ├── Abel_Pacheco_0003.jpg │ ├── Abel_Pacheco_0004.jpg │ ├── Abel_Pacheco_0005.jpg │ └── Abel_Pacheco_0006.jpg ├── Color_images │ ├── Aaron_Eckhart_0001.jpg │ ├── Aaron_Peirsol_0001.jpg │ ├── Aaron_Peirsol_0002.jpg │ ├── Aaron_Peirsol_0003.jpg │ ├── Aaron_Peirsol_0004.jpg │ ├── Aaron_Sorkin_0001.jpg │ ├── Aaron_Sorkin_0002.jpg │ ├── Abdel_Nasser_Assidi_0001.jpg │ ├── Abdel_Nasser_Assidi_0002.jpg │ ├── Abel_Pacheco_0001.jpg │ ├── Abel_Pacheco_0002.jpg │ ├── Abel_Pacheco_0003.jpg │ ├── Abel_Pacheco_0004.jpg │ ├── Abel_Pacheco_0005.jpg │ └── Abel_Pacheco_0006.jpg ├── face_detector.jpg ├── images_name.csv └── od.JPG └── temp └── .gitkeep /.gitignore: -------------------------------------------------------------------------------- 1 | # vim swp files 2 | *.swp 3 | # caffe/pytorch model files 4 | *.pth 5 | 6 | # Mkdocs 7 | /docs/ 8 | /mkdocs/docs/temp 9 | 10 | .DS_Store 11 | .idea 12 | .pytest_cache 13 | /experiments 14 | 15 | # resource temp folder 16 | tests/resources/temp/* 17 | !tests/resources/temp/.gitkeep 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | .static_storage/ 74 | .media/ 75 | local_settings.py 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | 124 | examples/text_cnn/glove_embedding/ 125 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | install: 5 | - pip install -r requirements.txt --quiet 6 | - pip install pytest 7 | - pip install pytest-cov 8 | - pip install coverage 9 | - pip install codacy-coverage 10 | script: 11 | - pytest tests --cov=autokeras_pretrained --cov-report xml:coverage.xml 12 | after_script: 13 | - python-codacy-coverage -r coverage.xml 14 | deploy: 15 | - provider: pypi 16 | user: jhfjhfj1 17 | password: 18 | secure: B36Cg4YbMhIrHyi1LAxilS2fek905PAyU+0anw8BTWgjOF0oM7hLRb64LcgpHrp6E0VawIscE7KGfCaiJR+14S3gqO2blaiJ0Os+ovpbzSu8sAnBVVKfT8Lv24o7prxeq2G8UROXPDST2ZEhy4rugCFjNeMv65WSezpG08TJasmvUAySidQU7rvzoIgoEiKkc7329bzaHpWVQDyuXs/slQSufTK23WBA+OM3cclLrQE02wVH0q/+BtjvkN+44XOVM7Q/gmIrVndj8KGVTIFPYjmKuIiyxjcSGv67oQhXNRH/SSOJyMb93elu/o4y7zexG8eZ2BVr0+Q2pkyBo8JAmfuuWLnz0gONwfU4xwMlLy2m/muGDclMr9eC1i0KWOG0E7afbktFXMVG9EBWr19OPApqGvtfU4997sPUcx7hDFLAwLyznR1hOFIKbeSdtFSUUfGoCnzrCcobsJkD1QKGLoPwr0/gp5o+HJC0bLyMSo45ETsH1m5UGFgFO9Xk4KiGQndL0SpSq7VfZxnMVJttRObqq/8/4wwgFBp8p/bXPpkoS5NK5QajdQgTZ2u8O5SABDBStMq2rdsLdqX4/1tyBcG+u8cKiXjJOfn3chVAwJUNTgddlOa6aGZFM1h3qB8WHgIYJjWxygAKfnH14XRDPAbeEvGdXfwZhJqLfFo6rQs= 19 | on: 20 | tags: true 21 | repo: jhfjhfj1/autokeras-pretrained 22 | env: 23 | global: 24 | secure: 0ZmkjLmGZnZSjZplHjm/1x2izv+OC+/S+/jUWaSNGZvHBtGRwsf1EHi65RzkCD9V8YCN42HE3SFh+mEN5nVaYPzxMvhnzrXVzce8oAU8o0qmCTh1K3d74KTzHtJdQGxCZi6KjAhkIZdQclR8FkTnU629BbDGMJ+MCm1imcawyI4ooXFpkCPsa0l+U9A8gZ47/FdKK7lP3Idw/dvIAcU6Cx1+hhkpmyWLqPiig2vB6vYFPYbrRvw2YCkvhhQoADT9pREK9vfYBf/1F9LOt00MAPXcHT/mjok7ziEiqQIrns2sa8CumEX9nUhaTq4TliMxPFDCvKxtMq4+cO5/slr6xD/Nh5hURUjaKytRJG44FmonqBOqKB7Zo3pgRF5/gY6YGUapuh+C2suTQmsJxwXCsG64sFk9SGmpqOImbfvivxxEhcXl7TGQNn7UNH1fxwrklkSj1B84BBnTIDwgAAFQS+4JlHSGo5KAZZwXCzPh0j+6KR2TdPLsrhqm1JIYRDqI8Jq4EkZICSGIjKO52XYq1Hrl+9uudoy6+rvQXlEQz/Pj/jj9DwATt1keqjk8p9Xinnw7X+h0nD+Q2Wtfn6LCzH+E3rDrlpdBRtWhnI37aZHiLwovmqDiMwxY76SsEm5XDsFKABYXxvoiXT5IVHqVd7tEveZR1l+N8mN53B5Q8Z0= 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autokeras-pretrained 2 | 3 | [![Build 
Status](https://travis-ci.org/jhfjhfj1/autokeras-pretrained.svg?branch=master)](https://travis-ci.org/jhfjhfj1/autokeras-pretrained) 4 | [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/f6e2bbcea21a486fb3b5d9af80368e58)](https://www.codacy.com/app/jhfjhfj1/autokeras-pretrained?utm_source=github.com&utm_medium=referral&utm_content=datamllab/autokeras-pretrained&utm_campaign=Badge_Coverage) 5 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/f6e2bbcea21a486fb3b5d9af80368e58)](https://www.codacy.com/app/jhfjhfj1/autokeras-pretrained?utm_source=github.com&utm_medium=referral&utm_content=datamllab/autokeras-pretrained&utm_campaign=Badge_Grade) 6 | 7 | Pretrained models in Auto-Keras. 8 | No custom training data needed. 9 | 10 | -------------------------------------------------------------------------------- /autokeras_pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.object_detector import ObjectDetector 2 | from autokeras_pretrained.face_detector import FaceDetector 3 | from autokeras_pretrained.voice_generator.voice_generator import VoiceGenerator 4 | from autokeras_pretrained.voice_recognizer import VoiceRecognizer -------------------------------------------------------------------------------- /autokeras_pretrained/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from autokeras_pretrained.utils import temp_path_generator, ensure_dir, download_file_from_google_drive, get_device 6 | 7 | 8 | class Pretrained(ABC): 9 | """The base class for all pretrained task.""" 10 | 11 | def __init__(self, verbose=True, model_path=None): 12 | """Initialize the instance.""" 13 | self.verbose = verbose 14 | self.model = None 15 | self.device = get_device() 16 | self.model_path = model_path if model_path is not None else temp_path_generator() 17 | ensure_dir(self.model_path) 18 | self.local_paths = [os.path.join(self.model_path, x.local_name) for x in self._google_drive_files] 19 | for path, x in zip(self.local_paths, self._google_drive_files): 20 | if not os.path.exists(path): 21 | download_file_from_google_drive(file_id=x.google_drive_id, 22 | dest_path=path, 23 | verbose=True) 24 | 25 | @property 26 | @abstractmethod 27 | def _google_drive_files(self): 28 | pass 29 | 30 | @abstractmethod 31 | def predict(self, input_data, **kwargs): 32 | """Return predict results for the given image 33 | Returns: 34 | A numpy.ndarray containing the results. 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/bert/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/bert/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import copy 24 | import json 25 | import math 26 | import logging 27 | import tarfile 28 | import tempfile 29 | import shutil 30 | 31 | import torch 32 | from torch import nn 33 | from torch.nn import CrossEntropyLoss 34 | 35 | from autokeras_pretrained.constant import Constant 36 | from autokeras_pretrained.bert.utils import cached_path 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | PRETRAINED_MODEL_ARCHIVE_MAP = { 41 | 'bert-base-uncased': Constant.PRETRAINED_MODEL_BERT_BASE_UNCASED, 42 | 'bert-base-cased': Constant.PRETRAINED_MODEL_BERT_BASE_CASED 43 | } 44 | 45 | CONFIG_NAME = 'bert_config.json' 46 | WEIGHTS_NAME = 'pytorch_model.bin' 47 | 48 | 49 | def gelu(x): 50 | """Implementation of the gelu activation function. 51 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 52 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 53 | """ 54 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 55 | 56 | 57 | def swish(x): 58 | return x * torch.sigmoid(x) 59 | 60 | 61 | ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} 62 | 63 | 64 | class BertConfig(object): 65 | """Configuration class to store the configuration of a `BertModel`. 66 | """ 67 | def __init__(self, 68 | vocab_size_or_config_json_file, 69 | hidden_size=768, 70 | num_hidden_layers=12, 71 | num_attention_heads=12, 72 | intermediate_size=3072, 73 | hidden_act="gelu", 74 | hidden_dropout_prob=0.1, 75 | attention_probs_dropout_prob=0.1, 76 | max_position_embeddings=512, 77 | type_vocab_size=2, 78 | initializer_range=0.02): 79 | """Constructs BertConfig. 80 | 81 | Args: 82 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 83 | hidden_size: Size of the encoder layers and the pooler layer. 84 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 85 | num_attention_heads: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 88 | layer in the Transformer encoder. 89 | hidden_act: The non-linear activation function (function or string) in the 90 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 91 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 92 | layers in the embeddings, encoder, and pooler. 93 | attention_probs_dropout_prob: The dropout ratio for the attention 94 | probabilities. 95 | max_position_embeddings: The maximum sequence length that this model might 96 | ever be used with. Typically set this to something large just in case 97 | (e.g., 512 or 1024 or 2048). 98 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 99 | `BertModel`. 100 | initializer_range: The sttdev of the truncated_normal_initializer for 101 | initializing all weight matrices. 
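Example usage (an illustrative sketch in the style of the examples elsewhere in this file; 30522 is the vocabulary size of the standard uncased BERT checkpoints and is shown only as a plausible value):

```python
config = BertConfig(vocab_size_or_config_json_file=30522,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)
# the same settings can also be loaded from a JSON config file
config = BertConfig.from_json_file("bert_config.json")
```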
102 | """ 103 | if isinstance(vocab_size_or_config_json_file, str): 104 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 105 | json_config = json.loads(reader.read()) 106 | for key, value in json_config.items(): 107 | self.__dict__[key] = value 108 | elif isinstance(vocab_size_or_config_json_file, int): 109 | self.vocab_size = vocab_size_or_config_json_file 110 | self.hidden_size = hidden_size 111 | self.num_hidden_layers = num_hidden_layers 112 | self.num_attention_heads = num_attention_heads 113 | self.hidden_act = hidden_act 114 | self.intermediate_size = intermediate_size 115 | self.hidden_dropout_prob = hidden_dropout_prob 116 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 117 | self.max_position_embeddings = max_position_embeddings 118 | self.type_vocab_size = type_vocab_size 119 | self.initializer_range = initializer_range 120 | else: 121 | raise ValueError("First argument must be either a vocabulary size (int)" 122 | "or the path to a pretrained model config file (str)") 123 | 124 | @classmethod 125 | def from_dict(cls, json_object): 126 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 127 | config = BertConfig(vocab_size_or_config_json_file=-1) 128 | for key, value in json_object.items(): 129 | config.__dict__[key] = value 130 | return config 131 | 132 | @classmethod 133 | def from_json_file(cls, json_file): 134 | """Constructs a `BertConfig` from a json file of parameters.""" 135 | with open(json_file, "r", encoding='utf-8') as reader: 136 | text = reader.read() 137 | return cls.from_dict(json.loads(text)) 138 | 139 | def __repr__(self): 140 | return str(self.to_json_string()) 141 | 142 | def to_dict(self): 143 | """Serializes this instance to a Python dictionary.""" 144 | output = copy.deepcopy(self.__dict__) 145 | return output 146 | 147 | def to_json_string(self): 148 | """Serializes this instance to a JSON string.""" 149 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 150 | 151 | 152 | try: 153 | from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm 154 | except ImportError: 155 | print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") 156 | 157 | class BertLayerNorm(nn.Module): 158 | def __init__(self, hidden_size, eps=1e-12): 159 | """Construct a layernorm module in the TF style (epsilon inside the square root). 160 | """ 161 | super(BertLayerNorm, self).__init__() 162 | self.weight = nn.Parameter(torch.ones(hidden_size)) 163 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 164 | self.variance_epsilon = eps 165 | 166 | def forward(self, x): 167 | u = x.mean(-1, keepdim=True) 168 | s = (x - u).pow(2).mean(-1, keepdim=True) 169 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 170 | return self.weight * x + self.bias 171 | 172 | 173 | class BertEmbeddings(nn.Module): 174 | """Construct the embeddings from word, position and token_type embeddings. 
175 | """ 176 | def __init__(self, config): 177 | super(BertEmbeddings, self).__init__() 178 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) 179 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) 180 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 181 | 182 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 183 | # any TensorFlow checkpoint file 184 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 185 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 186 | 187 | def forward(self, input_ids, token_type_ids=None): 188 | seq_length = input_ids.size(1) 189 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) 190 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 191 | if token_type_ids is None: 192 | token_type_ids = torch.zeros_like(input_ids) 193 | 194 | words_embeddings = self.word_embeddings(input_ids) 195 | position_embeddings = self.position_embeddings(position_ids) 196 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 197 | 198 | embeddings = words_embeddings + position_embeddings + token_type_embeddings 199 | embeddings = self.LayerNorm(embeddings) 200 | embeddings = self.dropout(embeddings) 201 | return embeddings 202 | 203 | 204 | class BertSelfAttention(nn.Module): 205 | def __init__(self, config): 206 | super(BertSelfAttention, self).__init__() 207 | if config.hidden_size % config.num_attention_heads != 0: 208 | raise ValueError( 209 | "The hidden size (%d) is not a multiple of the number of attention " 210 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)) 211 | self.num_attention_heads = config.num_attention_heads 212 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 213 | self.all_head_size = self.num_attention_heads * self.attention_head_size 214 | 215 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 216 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 217 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 218 | 219 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 220 | 221 | def transpose_for_scores(self, x): 222 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 223 | x = x.view(*new_x_shape) 224 | return x.permute(0, 2, 1, 3) 225 | 226 | def forward(self, hidden_states, attention_mask): 227 | mixed_query_layer = self.query(hidden_states) 228 | mixed_key_layer = self.key(hidden_states) 229 | mixed_value_layer = self.value(hidden_states) 230 | 231 | query_layer = self.transpose_for_scores(mixed_query_layer) 232 | key_layer = self.transpose_for_scores(mixed_key_layer) 233 | value_layer = self.transpose_for_scores(mixed_value_layer) 234 | 235 | # Take the dot product between "query" and "key" to get the raw attention scores. 236 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 237 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 238 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 239 | attention_scores = attention_scores + attention_mask 240 | 241 | # Normalize the attention scores to probabilities. 
242 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 243 | 244 | # This is actually dropping out entire tokens to attend to, which might 245 | # seem a bit unusual, but is taken from the original Transformer paper. 246 | attention_probs = self.dropout(attention_probs) 247 | 248 | context_layer = torch.matmul(attention_probs, value_layer) 249 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 250 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 251 | context_layer = context_layer.view(*new_context_layer_shape) 252 | return context_layer 253 | 254 | 255 | class BertSelfOutput(nn.Module): 256 | def __init__(self, config): 257 | super(BertSelfOutput, self).__init__() 258 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 259 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 260 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 261 | 262 | def forward(self, hidden_states, input_tensor): 263 | hidden_states = self.dense(hidden_states) 264 | hidden_states = self.dropout(hidden_states) 265 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 266 | return hidden_states 267 | 268 | 269 | class BertAttention(nn.Module): 270 | def __init__(self, config): 271 | super(BertAttention, self).__init__() 272 | self.self = BertSelfAttention(config) 273 | self.output = BertSelfOutput(config) 274 | 275 | def forward(self, input_tensor, attention_mask): 276 | self_output = self.self(input_tensor, attention_mask) 277 | attention_output = self.output(self_output, input_tensor) 278 | return attention_output 279 | 280 | 281 | class BertIntermediate(nn.Module): 282 | def __init__(self, config): 283 | super(BertIntermediate, self).__init__() 284 | self.dense = nn.Linear(config.hidden_size, config.intermediate_size) 285 | self.intermediate_act_fn = ACT2FN[config.hidden_act] \ 286 | if isinstance(config.hidden_act, str) else config.hidden_act 287 | 288 | def forward(self, hidden_states): 289 | hidden_states = self.dense(hidden_states) 290 | hidden_states = self.intermediate_act_fn(hidden_states) 291 | return hidden_states 292 | 293 | 294 | class BertOutput(nn.Module): 295 | def __init__(self, config): 296 | super(BertOutput, self).__init__() 297 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 298 | self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) 299 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 300 | 301 | def forward(self, hidden_states, input_tensor): 302 | hidden_states = self.dense(hidden_states) 303 | hidden_states = self.dropout(hidden_states) 304 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 305 | return hidden_states 306 | 307 | 308 | class BertLayer(nn.Module): 309 | def __init__(self, config): 310 | super(BertLayer, self).__init__() 311 | self.attention = BertAttention(config) 312 | self.intermediate = BertIntermediate(config) 313 | self.output = BertOutput(config) 314 | 315 | def forward(self, hidden_states, attention_mask): 316 | attention_output = self.attention(hidden_states, attention_mask) 317 | intermediate_output = self.intermediate(attention_output) 318 | layer_output = self.output(intermediate_output, attention_output) 319 | return layer_output 320 | 321 | 322 | class BertEncoder(nn.Module): 323 | def __init__(self, config): 324 | super(BertEncoder, self).__init__() 325 | layer = BertLayer(config) 326 | self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) 327 | 328 | def forward(self, hidden_states, 
attention_mask, output_all_encoded_layers=True): 329 | all_encoder_layers = [] 330 | for layer_module in self.layer: 331 | hidden_states = layer_module(hidden_states, attention_mask) 332 | if output_all_encoded_layers: 333 | all_encoder_layers.append(hidden_states) 334 | if not output_all_encoded_layers: 335 | all_encoder_layers.append(hidden_states) 336 | return all_encoder_layers 337 | 338 | 339 | class BertPooler(nn.Module): 340 | def __init__(self, config): 341 | super(BertPooler, self).__init__() 342 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 343 | self.activation = nn.Tanh() 344 | 345 | def forward(self, hidden_states): 346 | # We "pool" the model by simply taking the hidden state corresponding 347 | # to the first token. 348 | first_token_tensor = hidden_states[:, 0] 349 | pooled_output = self.dense(first_token_tensor) 350 | pooled_output = self.activation(pooled_output) 351 | return pooled_output 352 | 353 | 354 | class PreTrainedBertModel(nn.Module): 355 | """ An abstract class to handle weights initialization and 356 | a simple interface for dowloading and loading pretrained models. 357 | """ 358 | def __init__(self, config, *inputs, **kwargs): 359 | super(PreTrainedBertModel, self).__init__() 360 | if not isinstance(config, BertConfig): 361 | raise ValueError( 362 | "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " 363 | "To create a model from a Google pretrained model use " 364 | "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( 365 | self.__class__.__name__, self.__class__.__name__ 366 | )) 367 | self.config = config 368 | 369 | def init_bert_weights(self, module): 370 | """ Initialize the weights. 371 | """ 372 | if isinstance(module, (nn.Linear, nn.Embedding)): 373 | # Slightly different from the TF version which uses truncated_normal for initialization 374 | # cf https://github.com/pytorch/pytorch/pull/5617 375 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 376 | elif isinstance(module, BertLayerNorm): 377 | module.bias.data.normal_(mean=0.0, std=self.config.initializer_range) 378 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 379 | if isinstance(module, nn.Linear) and module.bias is not None: 380 | module.bias.data.zero_() 381 | 382 | @classmethod 383 | def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): 384 | """ 385 | Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. 386 | Download and cache the pre-trained model file if needed. 387 | 388 | Params: 389 | pretrained_model_name: either: 390 | - a str with the name of a pre-trained model to load selected in the list of: 391 | . `bert-base-uncased` 392 | . `bert-large-uncased` 393 | . `bert-base-cased` 394 | . `bert-base-multilingual` 395 | . `bert-base-chinese` 396 | - a path or url to a pretrained model archive containing: 397 | . `bert_config.json` a configuration file for the model 398 | . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance 399 | cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
400 | state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models 401 | *inputs, **kwargs: additional input for the specific Bert class 402 | (ex: num_labels for BertForSequenceClassification) 403 | """ 404 | try: 405 | if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: 406 | archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] 407 | else: 408 | raise KeyError 409 | except KeyError: 410 | logger.error(str(pretrained_model_name) + " model is not available/supported.") 411 | 412 | # redirect to the cache, if necessary 413 | try: 414 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) 415 | except FileNotFoundError: 416 | logger.error( 417 | "Model name '{}' was not found in model name list ({}). " 418 | "We assumed '{}' was a path or url but couldn't find any file " 419 | "associated to this path or url.".format( 420 | pretrained_model_name, 421 | ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), 422 | archive_file)) 423 | return None 424 | if resolved_archive_file == archive_file: 425 | logger.info("loading archive file {}".format(archive_file)) 426 | else: 427 | logger.info("loading archive file {} from cache at {}".format( 428 | archive_file, resolved_archive_file)) 429 | tempdir = None 430 | if os.path.isdir(resolved_archive_file): 431 | serialization_dir = resolved_archive_file 432 | else: 433 | # Extract archive to temp dir 434 | tempdir = tempfile.mkdtemp() 435 | logger.info("extracting archive file {} to temp dir {}".format( 436 | resolved_archive_file, tempdir)) 437 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 438 | archive.extractall(tempdir) 439 | serialization_dir = tempdir 440 | # Load config 441 | config_file = os.path.join(serialization_dir, CONFIG_NAME) 442 | config = BertConfig.from_json_file(config_file) 443 | logger.info("Model config {}".format(config)) 444 | # Instantiate model. 
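# (The loading code below also renames legacy LayerNorm state-dict keys
# 'gamma'/'beta' to 'weight'/'bias' before copying the weights into the model.)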
445 | model = cls(config, *inputs, **kwargs) 446 | if state_dict is None: 447 | weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) 448 | state_dict = torch.load(weights_path) 449 | 450 | old_keys = [] 451 | new_keys = [] 452 | for key in state_dict.keys(): 453 | new_key = None 454 | if 'gamma' in key: 455 | new_key = key.replace('gamma', 'weight') 456 | if 'beta' in key: 457 | new_key = key.replace('beta', 'bias') 458 | if new_key: 459 | old_keys.append(key) 460 | new_keys.append(new_key) 461 | for old_key, new_key in zip(old_keys, new_keys): 462 | state_dict[new_key] = state_dict.pop(old_key) 463 | 464 | missing_keys = [] 465 | unexpected_keys = [] 466 | error_msgs = [] 467 | # copy state_dict so _load_from_state_dict can modify it 468 | metadata = getattr(state_dict, '_metadata', None) 469 | state_dict = state_dict.copy() 470 | if metadata is not None: 471 | state_dict._metadata = metadata 472 | 473 | def load(module, prefix=''): 474 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 475 | module._load_from_state_dict( 476 | state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) 477 | for name, child in module._modules.items(): 478 | if child is not None: 479 | load(child, prefix + name + '.') 480 | load(model, prefix='' if hasattr(model, 'bert') else 'bert.') 481 | if len(missing_keys) > 0: 482 | logger.info("Weights of {} not initialized from pretrained model: {}".format( 483 | model.__class__.__name__, missing_keys)) 484 | if len(unexpected_keys) > 0: 485 | logger.info("Weights from pretrained model not used in {}: {}".format( 486 | model.__class__.__name__, unexpected_keys)) 487 | if tempdir: 488 | # Clean up temp dir 489 | shutil.rmtree(tempdir) 490 | return model 491 | 492 | 493 | class BertModel(PreTrainedBertModel): 494 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 495 | 496 | Params: 497 | config: a BertConfig class instance with the configuration to build a new model 498 | 499 | Inputs: 500 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 501 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 502 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 503 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 504 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 505 | a `sentence B` token (see BERT paper for more details). 506 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 507 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 508 | input sequence length in the current batch. It's the mask that we typically use for attention when 509 | a batch has varying length sentences. 510 | `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. 511 | 512 | Outputs: Tuple of (encoded_layers, pooled_output) 513 | `encoded_layers`: controled by `output_all_encoded_layers` argument: 514 | - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end 515 | of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each 516 | encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], 517 | - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding 518 | to the last attention block of shape [batch_size, sequence_length, hidden_size], 519 | `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a 520 | classifier pretrained on top of the hidden state associated to the first character of the 521 | input (`CLF`) to train on the Next-Sentence task (see BERT's paper). 522 | 523 | Example usage: 524 | ```python 525 | # Already been converted into WordPiece token ids 526 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 527 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 528 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 529 | 530 | config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 531 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 532 | 533 | model = modeling.BertModel(config=config) 534 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 535 | ``` 536 | """ 537 | def __init__(self, config): 538 | super(BertModel, self).__init__(config) 539 | self.embeddings = BertEmbeddings(config) 540 | self.encoder = BertEncoder(config) 541 | self.pooler = BertPooler(config) 542 | self.apply(self.init_bert_weights) 543 | 544 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): 545 | if attention_mask is None: 546 | attention_mask = torch.ones_like(input_ids) 547 | if token_type_ids is None: 548 | token_type_ids = torch.zeros_like(input_ids) 549 | 550 | # We create a 3D attention mask from a 2D tensor mask. 551 | # Sizes are [batch_size, 1, 1, to_seq_length] 552 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 553 | # this attention mask is more simple than the triangular masking of causal attention 554 | # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 555 | extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 556 | 557 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 558 | # masked positions, this operation will create a tensor which is 0.0 for 559 | # positions we want to attend and -10000.0 for masked positions. 560 | # Since we are adding it to the raw scores before the softmax, this is 561 | # effectively the same as removing these entirely. 562 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 563 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 564 | 565 | embedding_output = self.embeddings(input_ids, token_type_ids) 566 | encoded_layers = self.encoder(embedding_output, 567 | extended_attention_mask, 568 | output_all_encoded_layers=output_all_encoded_layers) 569 | sequence_output = encoded_layers[-1] 570 | pooled_output = self.pooler(sequence_output) 571 | if not output_all_encoded_layers: 572 | encoded_layers = encoded_layers[-1] 573 | return encoded_layers, pooled_output 574 | 575 | 576 | class BertForSequenceClassification(PreTrainedBertModel): 577 | """BERT model for classification. 578 | This module is composed of the BERT model with a linear layer on top of 579 | the pooled output. 580 | 581 | Params: 582 | `config`: a BertConfig class instance with the configuration to build a new model. 
583 | `num_labels`: the number of classes for the classifier. Default = 2. 584 | 585 | Inputs: 586 | `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] 587 | with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts 588 | `extract_features.py`, `run_classifier.py` and `run_squad.py`) 589 | `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token 590 | types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to 591 | a `sentence B` token (see BERT paper for more details). 592 | `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices 593 | selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max 594 | input sequence length in the current batch. It's the mask that we typically use for attention when 595 | a batch has varying length sentences. 596 | `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] 597 | with indices selected in [0, ..., num_labels]. 598 | 599 | Outputs: 600 | if `labels` is not `None`: 601 | Outputs the CrossEntropy classification loss of the output with the labels. 602 | if `labels` is `None`: 603 | Outputs the classification logits of shape [batch_size, num_labels]. 604 | 605 | Example usage: 606 | ```python 607 | # Already been converted into WordPiece token ids 608 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 609 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 610 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) 611 | 612 | config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, 613 | num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) 614 | 615 | num_labels = 2 616 | 617 | model = BertForSequenceClassification(config, num_labels) 618 | logits = model(input_ids, token_type_ids, input_mask) 619 | ``` 620 | """ 621 | def __init__(self, config, num_labels=2): 622 | super(BertForSequenceClassification, self).__init__(config) 623 | self.num_labels = num_labels 624 | self.bert = BertModel(config) 625 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 626 | self.classifier = nn.Linear(config.hidden_size, num_labels) 627 | self.apply(self.init_bert_weights) 628 | 629 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): 630 | _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) 631 | pooled_output = self.dropout(pooled_output) 632 | logits = self.classifier(pooled_output) 633 | 634 | if labels is not None: 635 | loss_fct = CrossEntropyLoss() 636 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 637 | return loss 638 | else: 639 | return logits 640 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import torch 18 | from torch.optim import Optimizer 19 | from torch.optim.optimizer import required 20 | from torch.nn.utils import clip_grad_norm_ 21 | 22 | 23 | def warmup_linear(x, warmup=0.002): 24 | if x < warmup: 25 | return x/warmup 26 | return 1.0 - x 27 | 28 | 29 | def get_lr_scheduled(group, state): 30 | if group['t_total'] != -1: 31 | schedule_fct = SCHEDULES[group['schedule']] 32 | lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup']) 33 | else: 34 | lr_scheduled = group['lr'] 35 | return lr_scheduled 36 | 37 | 38 | SCHEDULES = { 39 | 'warmup_linear':warmup_linear, 40 | } 41 | 42 | 43 | class BertAdam(Optimizer): 44 | """Implements BERT version of Adam algorithm with weight decay fix. 45 | Params: 46 | lr: learning rate 47 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 48 | t_total: total number of training steps for the learning 49 | rate schedule, -1 means constant learning rate. Default: -1 50 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 51 | b1: Adams b1. Default: 0.9 52 | b2: Adams b2. Default: 0.999 53 | e: Adams epsilon. Default: 1e-6 54 | weight_decay: Weight decay. Default: 0.01 55 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 56 | """ 57 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 58 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 59 | max_grad_norm=1.0): 60 | if lr is not required and lr < 0.0: 61 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 62 | if schedule not in SCHEDULES: 63 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 64 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 65 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 66 | if not 0.0 <= b1 < 1.0: 67 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 68 | if not 0.0 <= b2 < 1.0: 69 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 70 | if not e >= 0.0: 71 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 72 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 73 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 74 | max_grad_norm=max_grad_norm) 75 | super(BertAdam, self).__init__(params, defaults) 76 | 77 | def step(self, closure=None): 78 | """Performs a single optimization step. 79 | Arguments: 80 | closure (callable, optional): A closure that reevaluates the model 81 | and returns the loss. 
82 | """ 83 | loss = None 84 | if closure is not None: 85 | loss = closure() 86 | 87 | for group in self.param_groups: 88 | for p in group['params']: 89 | if p.grad is None: 90 | continue 91 | grad = p.grad.data 92 | if grad.is_sparse: 93 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 94 | 95 | state = self.state[p] 96 | 97 | # State initialization 98 | if len(state) == 0: 99 | state['step'] = 0 100 | # Exponential moving average of gradient values 101 | state['next_m'] = torch.zeros_like(p.data) 102 | # Exponential moving average of squared gradient values 103 | state['next_v'] = torch.zeros_like(p.data) 104 | 105 | next_m, next_v = state['next_m'], state['next_v'] 106 | beta1, beta2 = group['b1'], group['b2'] 107 | 108 | # Add grad clipping 109 | if group['max_grad_norm'] > 0: 110 | clip_grad_norm_(p, group['max_grad_norm']) 111 | 112 | # Decay the first and second moment running average coefficient 113 | # In-place operations to update the averages at the same time 114 | next_m.mul_(beta1).add_(1 - beta1, grad) 115 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 116 | update = next_m / (next_v.sqrt() + group['e']) 117 | 118 | # Just adding the square of the weights to the loss function is *not* 119 | # the correct way of using L2 regularization/weight decay with Adam, 120 | # since that will interact with the m and v parameters in strange ways. 121 | # 122 | # Instead we want to decay the weights in a manner that doesn't interact 123 | # with the m/v parameters. This is equivalent to adding the square 124 | # of the weights to the loss with plain (non-momentum) SGD. 125 | if group['weight_decay'] > 0.0: 126 | update += group['weight_decay'] * p.data 127 | 128 | update_with_lr = get_lr_scheduled(group, state) * update 129 | p.data.add_(-update_with_lr) 130 | 131 | state['step'] += 1 132 | 133 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 134 | # No bias correction 135 | # bias_correction1 = 1 - beta1 ** state['step'] 136 | # bias_correction2 = 1 - beta2 ** state['step'] 137 | 138 | return loss 139 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import os 24 | import logging 25 | 26 | from autokeras_pretrained.constant import Constant 27 | from autokeras_pretrained.bert.utils import cached_path 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | PRETRAINED_VOCAB_ARCHIVE_MAP = { 32 | 'bert-base-uncased': Constant.PRETRAINED_VOCAB_BERT_BASE_UNCASED, 33 | 'bert-base-cased': Constant.PRETRAINED_VOCAB_BERT_BASE_UNCASED 34 | } 35 | 36 | VOCAB_NAME = 'vocab.txt' 37 | 38 | 39 | def load_vocab(vocab_file): 40 | """Loads a vocabulary file into a dictionary.""" 41 | vocab = collections.OrderedDict() 42 | index = 0 43 | with open(vocab_file, "r", encoding="utf-8") as reader: 44 | while True: 45 | token = reader.readline() 46 | if not token: 47 | break 48 | token = token.strip() 49 | vocab[token] = index 50 | index += 1 51 | return vocab 52 | 53 | 54 | def whitespace_tokenize(text): 55 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 56 | text = text.strip() 57 | if not text: 58 | return [] 59 | tokens = text.split() 60 | return tokens 61 | 62 | 63 | class BertTokenizer(object): 64 | """Runs end-to-end tokenization: punctuation splitting + wordpiece""" 65 | 66 | def __init__(self, vocab_file, do_lower_case=True): 67 | if not os.path.isfile(vocab_file): 68 | raise ValueError( 69 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 70 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 71 | self.vocab = load_vocab(vocab_file) 72 | self.ids_to_tokens = collections.OrderedDict( 73 | [(ids, tok) for tok, ids in self.vocab.items()]) 74 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 75 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 76 | 77 | def tokenize(self, text): 78 | split_tokens = [] 79 | for token in self.basic_tokenizer.tokenize(text): 80 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 81 | split_tokens.append(sub_token) 82 | return split_tokens 83 | 84 | def convert_tokens_to_ids(self, tokens): 85 | """Converts a sequence of tokens into ids using the vocab.""" 86 | ids = [] 87 | for token in tokens: 88 | ids.append(self.vocab[token]) 89 | return ids 90 | 91 | @classmethod 92 | def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): 93 | """ 94 | Instantiate a PreTrainedBertModel from a pre-trained model file. 95 | Download and cache the pre-trained model file if needed. 96 | """ 97 | try: 98 | if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: 99 | vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] 100 | else: 101 | raise KeyError 102 | except KeyError: 103 | logger.error(str(pretrained_model_name) + " tokenizer is not available/supported.") 104 | 105 | # redirect to the cache, if necessary 106 | try: 107 | resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) 108 | except FileNotFoundError: 109 | logger.error( 110 | "Model name '{}' was not found in model name list ({}). 
" 111 | "We assumed '{}' was a path or url but couldn't find any file " 112 | "associated to this path or url.".format( 113 | pretrained_model_name, 114 | ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), 115 | vocab_file)) 116 | return None 117 | if resolved_vocab_file == vocab_file: 118 | logger.info("loading vocabulary file {}".format(vocab_file)) 119 | else: 120 | logger.info("loading vocabulary file {} from cache at {}".format( 121 | vocab_file, resolved_vocab_file)) 122 | # Instantiate tokenizer. 123 | tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) 124 | return tokenizer 125 | 126 | 127 | class BasicTokenizer(object): 128 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 129 | 130 | def __init__(self, do_lower_case=True): 131 | """Constructs a BasicTokenizer. 132 | 133 | Args: 134 | do_lower_case: Whether to lower case the input. 135 | """ 136 | self.do_lower_case = do_lower_case 137 | 138 | def tokenize(self, text): 139 | """Tokenizes a piece of text.""" 140 | text = self._clean_text(text) 141 | # This was added on November 1st, 2018 for the multilingual and Chinese 142 | # models. This is also applied to the English models now, but it doesn't 143 | # matter since the English models were not trained on any Chinese data 144 | # and generally don't have any Chinese data in them (there are Chinese 145 | # characters in the vocabulary because Wikipedia does have some Chinese 146 | # words in the English Wikipedia.). 147 | text = self._tokenize_chinese_chars(text) 148 | orig_tokens = whitespace_tokenize(text) 149 | split_tokens = [] 150 | for token in orig_tokens: 151 | if self.do_lower_case: 152 | token = token.lower() 153 | token = self._run_strip_accents(token) 154 | split_tokens.extend(self._run_split_on_punc(token)) 155 | 156 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 157 | return output_tokens 158 | 159 | @staticmethod 160 | def _run_strip_accents(text): 161 | """Strips accents from a piece of text.""" 162 | text = unicodedata.normalize("NFD", text) 163 | output = [] 164 | for char in text: 165 | cat = unicodedata.category(char) 166 | if cat == "Mn": 167 | continue 168 | output.append(char) 169 | return "".join(output) 170 | 171 | @staticmethod 172 | def _run_split_on_punc(text): 173 | """Splits punctuation on a piece of text.""" 174 | chars = list(text) 175 | i = 0 176 | start_new_word = True 177 | output = [] 178 | while i < len(chars): 179 | char = chars[i] 180 | if _is_punctuation(char): 181 | output.append([char]) 182 | start_new_word = True 183 | else: 184 | if start_new_word: 185 | output.append([]) 186 | start_new_word = False 187 | output[-1].append(char) 188 | i += 1 189 | return ["".join(x) for x in output] 190 | 191 | @classmethod 192 | def _tokenize_chinese_chars(cls, text): 193 | """Adds whitespace around any CJK character.""" 194 | output = [] 195 | for char in text: 196 | cp = ord(char) 197 | if cls._is_chinese_char(cp): 198 | output.append(" ") 199 | output.append(char) 200 | output.append(" ") 201 | else: 202 | output.append(char) 203 | return "".join(output) 204 | 205 | @staticmethod 206 | def _is_chinese_char(cp): 207 | """Checks whether CP is the codepoint of a CJK character.""" 208 | # This defines a "chinese character" as anything in the CJK Unicode block: 209 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 210 | # 211 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 212 | # despite its name. 
The modern Korean Hangul alphabet is a different block, 213 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 214 | # space-separated words, so they are not treated specially and handled 215 | # like the all of the other languages. 216 | chinese_character_ranges = [ 217 | (0x4E00, 0x9FFF), 218 | (0x3400, 0x4DBF), 219 | (0xF900, 0xFAFF), 220 | (0x20000, 0x2A6DF), 221 | (0x2A700, 0x2B73F), 222 | (0x2B740, 0x2B81F), 223 | (0x2B820, 0x2CEAF), 224 | (0x2F800, 0x2FA1F)] 225 | for start, end in chinese_character_ranges: 226 | if start <= cp <= end: 227 | return True 228 | return False 229 | 230 | @staticmethod 231 | def _clean_text(text): 232 | """Performs invalid character removal and whitespace cleanup on text.""" 233 | output = [] 234 | for char in text: 235 | cp = ord(char) 236 | if cp == 0 or cp == 0xfffd or _is_control(char): 237 | continue 238 | if _is_whitespace(char): 239 | output.append(" ") 240 | else: 241 | output.append(char) 242 | return "".join(output) 243 | 244 | 245 | class WordpieceTokenizer(object): 246 | """Runs WordPiece tokenization.""" 247 | 248 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 249 | self.vocab = vocab 250 | self.unk_token = unk_token 251 | self.max_input_chars_per_word = max_input_chars_per_word 252 | 253 | def tokenize(self, text): 254 | """Tokenizes a piece of text into its word pieces. 255 | 256 | This uses a greedy longest-match-first algorithm to perform tokenization 257 | using the given vocabulary. 258 | 259 | For example: 260 | input = "unaffable" 261 | output = ["un", "##aff", "##able"] 262 | 263 | Args: 264 | text: A single token or whitespace separated tokens. This should have 265 | already been passed through `BasicTokenizer. 266 | 267 | Returns: 268 | A list of wordpiece tokens. 269 | """ 270 | 271 | output_tokens = [] 272 | for token in whitespace_tokenize(text): 273 | chars = list(token) 274 | if len(chars) > self.max_input_chars_per_word: 275 | output_tokens.append(self.unk_token) 276 | continue 277 | 278 | is_bad = False 279 | start = 0 280 | sub_tokens = [] 281 | while start < len(chars): 282 | end = len(chars) 283 | cur_substr = None 284 | while start < end: 285 | substr = "".join(chars[start:end]) 286 | if start > 0: 287 | substr = "##" + substr 288 | if substr in self.vocab: 289 | cur_substr = substr 290 | break 291 | end -= 1 292 | if cur_substr is None: 293 | is_bad = True 294 | break 295 | sub_tokens.append(cur_substr) 296 | start = end 297 | 298 | if is_bad: 299 | output_tokens.append(self.unk_token) 300 | else: 301 | output_tokens.extend(sub_tokens) 302 | return output_tokens 303 | 304 | 305 | def _is_whitespace(char): 306 | """Checks whether `chars` is a whitespace character.""" 307 | # \t, \n, and \r are technically contorl characters but we treat them 308 | # as whitespace since they are generally considered as such. 309 | if char == " " or char == "\t" or char == "\n" or char == "\r": 310 | return True 311 | cat = unicodedata.category(char) 312 | if cat == "Zs": 313 | return True 314 | return False 315 | 316 | 317 | def _is_control(char): 318 | """Checks whether `chars` is a control character.""" 319 | # These are technically control characters but we count them as whitespace 320 | # characters. 
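# (\t, \n and \r are already reported as whitespace by _is_whitespace above,
# so they are explicitly excluded here.)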
321 | if char == "\t" or char == "\n" or char == "\r": 322 | return False 323 | cat = unicodedata.category(char) 324 | if cat.startswith("C"): 325 | return True 326 | return False 327 | 328 | 329 | def _is_punctuation(char): 330 | """Checks whether `chars` is a punctuation character.""" 331 | cp = ord(char) 332 | # We treat all non-letter/number ASCII as punctuation. 333 | # Characters such as "^", "$", and "`" are not in the Unicode 334 | # Punctuation class but we treat them as punctuation anyways, for 335 | # consistency. 336 | punctuation_ranges = [(33, 47), (58, 64), (91, 96), (123, 126)] 337 | for start, end in punctuation_ranges: 338 | if start <= cp <= end: 339 | return True 340 | cat = unicodedata.category(char) 341 | if cat.startswith("P"): 342 | return True 343 | return False 344 | -------------------------------------------------------------------------------- /autokeras_pretrained/bert/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Original work Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Modified work Copyright 2019 The AutoKeras team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | import torch 20 | 21 | from pathlib import Path 22 | 23 | from autokeras_pretrained.utils import download_file_from_google_drive 24 | 25 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 26 | Path.home() / '.pytorch_pretrained_bert')) 27 | 28 | 29 | class InputFeatures(object): 30 | """A single set of features of data.""" 31 | 32 | def __init__(self, input_ids, input_mask, segment_ids): 33 | self.input_ids = input_ids 34 | self.input_mask = input_mask 35 | self.segment_ids = segment_ids 36 | 37 | 38 | def convert_examples_to_features(examples, tokenizer, max_seq_length): 39 | """ Convert text examples to BERT specific input format. 40 | 41 | Tokenize the input text and convert into features. 42 | 43 | Args: 44 | examples: Text data. 45 | tokenizer: Tokenizer to process the text into tokens. 46 | max_seq_length: The maximum length of the text sequence supported. 47 | 48 | Returns: 49 | all_input_ids: ndarray containing the ids for each token. 50 | all_input_masks: ndarray containing 1's or 0's based on if the tokens are real or padded. 51 | all_segment_ids: ndarray containing all 0's since it is a classification task. 
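Example (an illustrative sketch, assuming a loaded BertTokenizer whose
vocabulary contains both words, and max_seq_length=8): the single example
"good movie" is tokenized, framed as ["[CLS]", "good", "movie", "[SEP]"],
converted to ids and zero-padded to length 8, so the function returns three
torch.LongTensor objects of shape [1, 8] (ids, mask and segment ids respectively).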
52 | """ 53 | features = [] 54 | for (_, example) in enumerate(examples): 55 | tokens_a = tokenizer.tokenize(example) 56 | 57 | if len(tokens_a) > max_seq_length - 2: 58 | tokens_a = tokens_a[:(max_seq_length - 2)] 59 | 60 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 61 | segment_ids = [0] * len(tokens) 62 | 63 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 64 | 65 | input_mask = [1] * len(input_ids) 66 | 67 | padding = [0] * (max_seq_length - len(input_ids)) 68 | input_ids += padding 69 | input_mask += padding 70 | segment_ids += padding 71 | 72 | if len(input_ids) != max_seq_length or \ 73 | len(input_mask) != max_seq_length or \ 74 | len(segment_ids) != max_seq_length: 75 | raise AssertionError() 76 | 77 | features.append(InputFeatures(input_ids=input_ids, 78 | input_mask=input_mask, 79 | segment_ids=segment_ids)) 80 | 81 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 82 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 83 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) 84 | 85 | return all_input_ids, all_input_mask, all_segment_ids 86 | 87 | 88 | def cached_path(file_info, cache_dir=None): 89 | if cache_dir is None: 90 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 91 | 92 | os.makedirs(cache_dir, exist_ok=True) 93 | file_path = os.path.join(cache_dir, file_info.local_name) 94 | 95 | if not os.path.exists(file_path): 96 | download_file_from_google_drive(file_id=file_info.google_drive_id, 97 | dest_path=file_path, 98 | verbose=True) 99 | return file_path 100 | -------------------------------------------------------------------------------- /autokeras_pretrained/constant.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | GoogleDriveFile = namedtuple('GoogleDriveFile', ['google_drive_id', 'local_name']) 4 | 5 | 6 | class Constant: 7 | 8 | # Text Classifier 9 | 10 | BERT_TRAINER_EPOCHS = 4 11 | BERT_TRAINER_BATCH_SIZE = 32 12 | 13 | # text preprocessor 14 | 15 | EMBEDDING_DIM = 100 16 | MAX_SEQUENCE_LENGTH = 400 17 | MAX_NB_WORDS = 5000 18 | EXTRACT_PATH = "glove/" 19 | STORE_PATH = '' 20 | 21 | # Download file name 22 | 23 | FILE_PATH = "glove.zip" 24 | PRE_TRAIN_FILE_LINK = "http://nlp.stanford.edu/data/glove.6B.zip" 25 | PRE_TRAIN_FILE_NAME = "glove.6B.100d.txt" 26 | 27 | PRE_TRAIN_DETECTION_FILE_LINK = "https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth" 28 | 29 | VOICE_GENERATOR_MODELS = [ 30 | GoogleDriveFile(google_drive_id='1E-B92LZz4dgg8DU81D6pyhOzM9yvvBTj', local_name='vg.pth')] 31 | VOICE_RECONGINIZER_MODELS = [ 32 | GoogleDriveFile(google_drive_id='1RQQB-Yd-aqb6scWtnu1K4nlSTxTyaKjI', local_name='vr.pth')] 33 | FACE_DETECTOR_MODELS = [ 34 | GoogleDriveFile(google_drive_id='1QJWKpAHRrAjrYPl6hQNDaoyBjoa_LRgz', local_name='pnet.pt'), 35 | GoogleDriveFile(google_drive_id='10aCiR393E6TLkp9KPPl4JhZamYqUVBO1', local_name='rnet.pt'), 36 | GoogleDriveFile(google_drive_id='1RRBtPlzw46peS-A8pyYGsPRHHFIUrSVV', local_name='onet.pt')] 37 | OBJECT_DETECTOR_MODELS = [ 38 | GoogleDriveFile(google_drive_id='1QGG1trfj-z5_2OGNoSarUB4wx81cG-sa', local_name='oo.pth')] 39 | SENTIMENT_ANALYSIS_MODELS = [ 40 | GoogleDriveFile(google_drive_id='1flRlQjfIa2toQ6HNmInhqrh4NuxGh8pT', local_name='sa.pth')] 41 | TOPIC_CLASSIFIER_MODELS = [ 42 | GoogleDriveFile(google_drive_id='1U7C3xPid1ZvBKpkfW9KikrmNui0yJqnk', local_name='tc.pth')] 43 | PRETRAINED_VOCAB_BERT_BASE_UNCASED = \ 44 | 
GoogleDriveFile(google_drive_id='1hlPkUSPeT5ZQBYZ1Z734BbnHIvpx2ZLj', local_name='vbbu.txt') 45 | PRETRAINED_VOCAB_BERT_BASE_CASED = \ 46 | GoogleDriveFile(google_drive_id='1FLytUhOIF0mTfA4A9MtE3aQ1kJr96oTR', local_name='vbbc.txt') 47 | PRETRAINED_MODEL_BERT_BASE_UNCASED = \ 48 | GoogleDriveFile(google_drive_id='1rp1rVBoQwqgvg-JE8JwLL-adgLE07oTG', local_name='mbbu.pth') 49 | PRETRAINED_MODEL_BERT_BASE_CASED = \ 50 | GoogleDriveFile(google_drive_id='1YKoGj-e4zoyTabt5dYpgEPe-PAmjOTDV', local_name='mbbc.pth') 51 | 52 | VOICE_RECONGINIZER_LABELS = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 53 | VOICE_RECONGINIZER_AUDIO_CONF = {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 54 | 'window': 'hamming', 'noise_dir': None, 'noise_prob': 0.4, 55 | 'noise_levels': (0.0, 0.5)} 56 | 57 | # Image Resize 58 | 59 | MAX_IMAGE_SIZE = 128 * 128 60 | 61 | # SYS Constant 62 | 63 | SYS_LINUX = 'linux' 64 | SYS_WINDOWS = 'windows' 65 | SYS_GOOGLE_COLAB = 'goog_colab' 66 | 67 | # Google drive downloader 68 | CHUNK_SIZE = 32768 69 | DOWNLOAD_URL = "https://docs.google.com/uc?export=download" 70 | -------------------------------------------------------------------------------- /autokeras_pretrained/face_detector.py: -------------------------------------------------------------------------------- 1 | # This is DFace's implementation of MTCNN modified for AutoKeras 2 | # Link to DFace: https://github.com/kuaikuaikim/DFace 3 | import os 4 | 5 | import cv2 6 | import matplotlib.patches as patches 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torchvision.transforms as transforms 12 | from torch.autograd.variable import Variable 13 | 14 | from autokeras_pretrained.constant import Constant 15 | from autokeras_pretrained.base import Pretrained 16 | 17 | 18 | def weights_init(m): 19 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): 20 | nn.init.xavier_uniform_(m.weight.data) 21 | nn.init.constant_(m.bias, 0.1) 22 | 23 | 24 | class PNet(nn.Module): 25 | 26 | def __init__(self): 27 | super(PNet, self).__init__() 28 | 29 | self.pre_layer = nn.Sequential( 30 | nn.Conv2d(3, 10, kernel_size=3, stride=1), 31 | nn.PReLU(), 32 | nn.MaxPool2d(kernel_size=2, stride=2), 33 | nn.Conv2d(10, 16, kernel_size=3, stride=1), 34 | nn.PReLU(), 35 | nn.Conv2d(16, 32, kernel_size=3, stride=1), 36 | nn.PReLU() 37 | ) 38 | self.conv4_1 = nn.Conv2d(32, 1, kernel_size=1, stride=1) 39 | self.conv4_2 = nn.Conv2d(32, 4, kernel_size=1, stride=1) 40 | self.conv4_3 = nn.Conv2d(32, 10, kernel_size=1, stride=1) 41 | 42 | self.apply(weights_init) 43 | 44 | def forward(self, x): 45 | x = self.pre_layer(x) 46 | label = torch.sigmoid(self.conv4_1(x)) 47 | offset = self.conv4_2(x) 48 | return label, offset 49 | 50 | 51 | class RNet(nn.Module): 52 | 53 | def __init__(self): 54 | super(RNet, self).__init__() 55 | 56 | self.pre_layer = nn.Sequential( 57 | nn.Conv2d(3, 28, kernel_size=3, stride=1), 58 | nn.PReLU(), 59 | nn.MaxPool2d(kernel_size=3, stride=2), 60 | nn.Conv2d(28, 48, kernel_size=3, stride=1), 61 | nn.PReLU(), 62 | nn.MaxPool2d(kernel_size=3, stride=2), 63 | nn.Conv2d(48, 64, kernel_size=2, stride=1), 64 | nn.PReLU() 65 | 66 | ) 67 | self.conv4 = nn.Linear(64 * 2 * 2, 128) 68 | self.prelu4 = nn.PReLU() 69 | self.conv5_1 = nn.Linear(128, 1) 70 | self.conv5_2 = nn.Linear(128, 4) 71 | self.conv5_3 = nn.Linear(128, 10) 72 | self.apply(weights_init) 73 | 74 | def forward(self, x): 75 | x = self.pre_layer(x) 76 | x = x.view(x.size(0), -1) 77 | x = self.conv4(x) 78 | x = 
self.prelu4(x) 79 | det = torch.sigmoid(self.conv5_1(x)) 80 | box = self.conv5_2(x) 81 | return det, box 82 | 83 | 84 | class ONet(nn.Module): 85 | 86 | def __init__(self): 87 | super(ONet, self).__init__() 88 | 89 | self.pre_layer = nn.Sequential( 90 | nn.Conv2d(3, 32, kernel_size=3, stride=1), 91 | nn.PReLU(), 92 | nn.MaxPool2d(kernel_size=3, stride=2), 93 | nn.Conv2d(32, 64, kernel_size=3, stride=1), 94 | nn.PReLU(), 95 | nn.MaxPool2d(kernel_size=3, stride=2), 96 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 97 | nn.PReLU(), 98 | nn.MaxPool2d(kernel_size=2, stride=2), 99 | nn.Conv2d(64, 128, kernel_size=2, stride=1), 100 | nn.PReLU() 101 | ) 102 | self.conv5 = nn.Linear(128 * 2 * 2, 256) 103 | self.prelu5 = nn.PReLU() 104 | self.conv6_1 = nn.Linear(256, 1) 105 | self.conv6_2 = nn.Linear(256, 4) 106 | self.conv6_3 = nn.Linear(256, 10) 107 | self.apply(weights_init) 108 | 109 | def forward(self, x): 110 | x = self.pre_layer(x) 111 | x = x.view(x.size(0), -1) 112 | x = self.conv5(x) 113 | x = self.prelu5(x) 114 | det = torch.sigmoid(self.conv6_1(x)) 115 | box = self.conv6_2(x) 116 | landmark = self.conv6_3(x) 117 | return det, box, landmark 118 | 119 | 120 | def get_square_bbox(bbox): 121 | square_bbox = bbox.copy() 122 | 123 | h = bbox[:, 3] - bbox[:, 1] + 1 124 | w = bbox[:, 2] - bbox[:, 0] + 1 125 | l = np.maximum(h, w) 126 | square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - l * 0.5 127 | square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - l * 0.5 128 | 129 | square_bbox[:, 2] = square_bbox[:, 0] + l - 1 130 | square_bbox[:, 3] = square_bbox[:, 1] + l - 1 131 | return square_bbox 132 | 133 | 134 | def generate_bounding_box(map_, reg, scale, threshold): 135 | stride = 2 136 | cellsize = 12 137 | 138 | t_index = np.where(map_ > threshold) 139 | 140 | if t_index[0].size == 0: 141 | return np.array([]) 142 | 143 | dx1, dy1, dx2, dy2 = [reg[0, t_index[0], t_index[1], i] for i in range(4)] 144 | reg = np.array([dx1, dy1, dx2, dy2]) 145 | 146 | score = map_[t_index[0], t_index[1], 0] 147 | boundingbox = np.vstack([np.round((stride * t_index[1]) / scale), 148 | np.round((stride * t_index[0]) / scale), 149 | np.round((stride * t_index[1] + cellsize) / scale), 150 | np.round((stride * t_index[0] + cellsize) / scale), 151 | score, 152 | reg 153 | ]) 154 | 155 | return boundingbox.T 156 | 157 | 158 | def resize_image(img, scale): 159 | height, width, _ = img.shape 160 | new_height = int(height * scale) 161 | new_width = int(width * scale) 162 | new_dim = (new_width, new_height) 163 | img_resized = cv2.resize(img, new_dim, interpolation=cv2.INTER_LINEAR) 164 | return img_resized 165 | 166 | 167 | def pad(bboxes, w, h): 168 | tmpw = (bboxes[:, 2] - bboxes[:, 0] + 1).astype(np.int32) 169 | tmph = (bboxes[:, 3] - bboxes[:, 1] + 1).astype(np.int32) 170 | numbox = bboxes.shape[0] 171 | 172 | dx = np.zeros((numbox,)) 173 | dy = np.zeros((numbox,)) 174 | edx, edy = tmpw.copy() - 1, tmph.copy() - 1 175 | 176 | x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] 177 | 178 | tmp_index = np.where(ex > w - 1) 179 | edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index] 180 | ex[tmp_index] = w - 1 181 | 182 | tmp_index = np.where(ey > h - 1) 183 | edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index] 184 | ey[tmp_index] = h - 1 185 | 186 | tmp_index = np.where(x < 0) 187 | dx[tmp_index] = 0 - x[tmp_index] 188 | x[tmp_index] = 0 189 | 190 | tmp_index = np.where(y < 0) 191 | dy[tmp_index] = 0 - y[tmp_index] 192 | y[tmp_index] = 0 193 | 194 | return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] 195 | 
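# Note: dy/edy and dx/edx index into the zero-padded crop while y/ey and x/ex
# index into the original image; both pairs are clipped above so that boxes
# partially outside the frame can still be copied without an index error.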
return_list = [item.astype(np.int32) for item in return_list] 196 | 197 | return return_list 198 | 199 | 200 | def nms(dets, thresh, mode="Union"): 201 | x1 = dets[:, 0] 202 | y1 = dets[:, 1] 203 | x2 = dets[:, 2] 204 | y2 = dets[:, 3] 205 | scores = dets[:, 4] 206 | 207 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 208 | order = scores.argsort()[::-1] 209 | 210 | keep = [] 211 | while order.size > 0: 212 | i = order[0] 213 | keep.append(i) 214 | xx1 = np.maximum(x1[i], x1[order[1:]]) 215 | yy1 = np.maximum(y1[i], y1[order[1:]]) 216 | xx2 = np.minimum(x2[i], x2[order[1:]]) 217 | yy2 = np.minimum(y2[i], y2[order[1:]]) 218 | 219 | w = np.maximum(0.0, xx2 - xx1 + 1) 220 | h = np.maximum(0.0, yy2 - yy1 + 1) 221 | inter = w * h 222 | if mode == "Union": 223 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 224 | elif mode == "Minimum": 225 | ovr = inter / np.minimum(areas[i], areas[order[1:]]) 226 | 227 | inds = np.where(ovr <= thresh)[0] 228 | order = order[inds + 1] 229 | 230 | return keep 231 | 232 | 233 | def convert_image_to_tensor(image): 234 | transform = transforms.ToTensor() 235 | return transform(image) 236 | 237 | 238 | def convert_chw_tensor_to_hwc_numpy(tensor): 239 | if isinstance(tensor, Variable): 240 | return np.transpose(tensor.data.numpy(), (0, 2, 3, 1)) 241 | elif isinstance(tensor, torch.FloatTensor): 242 | return np.transpose(tensor.numpy(), (0, 2, 3, 1)) 243 | else: 244 | raise Exception("covert b*c*h*w tensor to b*h*w*c numpy error.This tensor must have 4 dimension.") 245 | 246 | 247 | def vis_face(im_array, dets, output_file_path, landmarks=None): 248 | fig, ax = plt.subplots(1) 249 | ax.imshow(im_array) 250 | 251 | for i in range(dets.shape[0]): 252 | bbox = dets[i, :4] 253 | 254 | rect = plt.Rectangle((bbox[0], bbox[1]), 255 | bbox[2] - bbox[0], 256 | bbox[3] - bbox[1], fill=False, 257 | edgecolor='yellow', linewidth=0.9) 258 | ax.add_patch(rect) 259 | 260 | if landmarks is not None: 261 | for i in range(landmarks.shape[0]): 262 | landmarks_one = landmarks[i, :] 263 | landmarks_one = landmarks_one.reshape((5, 2)) 264 | for j in range(5): 265 | cir1 = patches.Circle(xy=(landmarks_one[j, 0], landmarks_one[j, 1]), radius=2, alpha=0.4, color="red") 266 | ax.add_patch(cir1) 267 | plt.axis('off') 268 | fig.savefig(output_file_path, bbox_inches='tight', pad_inches=0) 269 | 270 | 271 | class FaceDetector(Pretrained): 272 | """A class to predict faces using the MTCNN pre-trained model. 273 | """ 274 | 275 | def __init__(self, **kwargs): 276 | super().__init__(**kwargs) 277 | pnet, rnet, onet = (torch.load(path, map_location=lambda storage, loc: storage) for path in self.local_paths) 278 | 279 | self.pnet_detector = PNet() 280 | self.pnet_detector.load_state_dict(pnet) 281 | self.pnet_detector = self.pnet_detector.to(self.device) 282 | self.pnet_detector.eval() 283 | 284 | self.rnet_detector = RNet() 285 | self.rnet_detector.load_state_dict(rnet) 286 | self.rnet_detector = self.rnet_detector.to(self.device) 287 | self.rnet_detector.eval() 288 | 289 | self.onet_detector = ONet() 290 | self.onet_detector.load_state_dict(onet) 291 | self.onet_detector = self.onet_detector.to(self.device) 292 | self.onet_detector.eval() 293 | 294 | self.min_face_size = 24 295 | self.stride = 2 296 | self.threshold = [0.6, 0.7, 0.7] 297 | self.scale_factor = 0.709 298 | 299 | @property 300 | def _google_drive_files(self): 301 | return Constant.FACE_DETECTOR_MODELS 302 | 303 | def predict(self, img_path, output_file_path=None): 304 | """Predicts faces in an image. 
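        A minimal usage sketch (the file names are placeholders; assumes the default
        constructor, which fetches the pre-trained PNet/RNet/ONet weights on first use):

            detector = FaceDetector()
            bboxes, landmarks = detector.predict('photo.jpg', output_file_path='out.jpg')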
305 | 306 | Args: 307 | img_path: A string. The path to the image on which the prediction is to be done. 308 | output_file_path: A string. The path where the output image is to be saved after the prediction. `None` by default. 309 | 310 | Returns: 311 | A tuple containing numpy arrays of bounding boxes and landmarks. Bounding boxes are of shape `(n, 5)` and 312 | landmarks are of shape `(n, 10)` where `n` is the number of faces predicted. Each bounding box is of length 313 | 5 and the corresponding rectangle is defined by the first four values. Each bounding box has five landmarks 314 | represented by 10 coordinates. 315 | """ 316 | if not os.path.exists(img_path): 317 | raise ValueError('Image does not exist') 318 | img = cv2.imread(img_path) 319 | img_bg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 320 | bounding_boxes, landmarks = self.detect_face(img) 321 | if output_file_path is not None: 322 | vis_face(img_bg, bounding_boxes, output_file_path, landmarks) 323 | return bounding_boxes, landmarks 324 | 325 | def detect_pnet(self, im): 326 | h, w, c = im.shape 327 | net_size = 12 328 | 329 | current_scale = float(net_size) / self.min_face_size 330 | im_resized = resize_image(im, current_scale) 331 | current_height, current_width, _ = im_resized.shape 332 | 333 | all_boxes = list() 334 | while min(current_height, current_width) > net_size: 335 | feed_imgs = [] 336 | image_tensor = convert_image_to_tensor(im_resized) 337 | feed_imgs.append(image_tensor) 338 | feed_imgs = torch.stack(feed_imgs) 339 | feed_imgs = Variable(feed_imgs) 340 | 341 | feed_imgs = feed_imgs.to(self.device) 342 | 343 | cls_map, reg = self.pnet_detector(feed_imgs) 344 | 345 | cls_map_np = convert_chw_tensor_to_hwc_numpy(cls_map.cpu()) 346 | reg_np = convert_chw_tensor_to_hwc_numpy(reg.cpu()) 347 | 348 | boxes = generate_bounding_box(cls_map_np[0, :, :], reg_np, current_scale, self.threshold[0]) 349 | 350 | current_scale *= self.scale_factor 351 | im_resized = resize_image(im, current_scale) 352 | current_height, current_width, _ = im_resized.shape 353 | 354 | if boxes.size == 0: 355 | continue 356 | keep = nms(boxes[:, :5], 0.5, 'Union') 357 | boxes = boxes[keep] 358 | all_boxes.append(boxes) 359 | 360 | if len(all_boxes) == 0: 361 | return None, None 362 | 363 | all_boxes = np.vstack(all_boxes) 364 | 365 | keep = nms(all_boxes[:, 0:5], 0.7, 'Union') 366 | all_boxes = all_boxes[keep] 367 | 368 | bw = all_boxes[:, 2] - all_boxes[:, 0] + 1 369 | bh = all_boxes[:, 3] - all_boxes[:, 1] + 1 370 | 371 | boxes = np.vstack([all_boxes[:, 0], 372 | all_boxes[:, 1], 373 | all_boxes[:, 2], 374 | all_boxes[:, 3], 375 | all_boxes[:, 4] 376 | ]) 377 | 378 | boxes = boxes.T 379 | 380 | align_topx = all_boxes[:, 0] + all_boxes[:, 5] * bw 381 | align_topy = all_boxes[:, 1] + all_boxes[:, 6] * bh 382 | align_bottomx = all_boxes[:, 2] + all_boxes[:, 7] * bw 383 | align_bottomy = all_boxes[:, 3] + all_boxes[:, 8] * bh 384 | 385 | boxes_align = np.vstack([align_topx, 386 | align_topy, 387 | align_bottomx, 388 | align_bottomy, 389 | all_boxes[:, 4] 390 | ]) 391 | boxes_align = boxes_align.T 392 | 393 | return boxes, boxes_align 394 | 395 | def detect_rnet(self, im, dets): 396 | h, w, c = im.shape 397 | 398 | if dets is None: 399 | return None, None 400 | 401 | dets = get_square_bbox(dets) 402 | dets[:, 0:4] = np.round(dets[:, 0:4]) 403 | 404 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(dets, w, h) 405 | num_boxes = dets.shape[0] 406 | 407 | cropped_ims_tensors = [] 408 | for i in range(num_boxes): 409 | tmp = np.zeros((tmph[i], tmpw[i], 
3), dtype=np.uint8) 410 | tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :] 411 | crop_im = cv2.resize(tmp, (24, 24)) 412 | crop_im_tensor = convert_image_to_tensor(crop_im) 413 | cropped_ims_tensors.append(crop_im_tensor) 414 | feed_imgs = Variable(torch.stack(cropped_ims_tensors)) 415 | 416 | feed_imgs = feed_imgs.to(self.device) 417 | 418 | cls_map, reg = self.rnet_detector(feed_imgs) 419 | 420 | cls_map = cls_map.cpu().data.numpy() 421 | reg = reg.cpu().data.numpy() 422 | 423 | keep_inds = np.where(cls_map > self.threshold[1])[0] 424 | 425 | if len(keep_inds) > 0: 426 | boxes = dets[keep_inds] 427 | cls = cls_map[keep_inds] 428 | reg = reg[keep_inds] 429 | else: 430 | return None, None 431 | 432 | keep = nms(boxes, 0.7) 433 | 434 | if len(keep) == 0: 435 | return None, None 436 | 437 | keep_cls = cls[keep] 438 | keep_boxes = boxes[keep] 439 | keep_reg = reg[keep] 440 | 441 | bw = keep_boxes[:, 2] - keep_boxes[:, 0] + 1 442 | bh = keep_boxes[:, 3] - keep_boxes[:, 1] + 1 443 | 444 | boxes = np.vstack([keep_boxes[:, 0], 445 | keep_boxes[:, 1], 446 | keep_boxes[:, 2], 447 | keep_boxes[:, 3], 448 | keep_cls[:, 0] 449 | ]) 450 | 451 | align_topx = keep_boxes[:, 0] + keep_reg[:, 0] * bw 452 | align_topy = keep_boxes[:, 1] + keep_reg[:, 1] * bh 453 | align_bottomx = keep_boxes[:, 2] + keep_reg[:, 2] * bw 454 | align_bottomy = keep_boxes[:, 3] + keep_reg[:, 3] * bh 455 | 456 | boxes_align = np.vstack([align_topx, 457 | align_topy, 458 | align_bottomx, 459 | align_bottomy, 460 | keep_cls[:, 0] 461 | ]) 462 | 463 | boxes = boxes.T 464 | boxes_align = boxes_align.T 465 | 466 | return boxes, boxes_align 467 | 468 | def detect_onet(self, im, dets): 469 | h, w, _ = im.shape 470 | 471 | if dets is None: 472 | return None, None 473 | 474 | dets = get_square_bbox(dets) 475 | dets[:, 0:4] = np.round(dets[:, 0:4]) 476 | 477 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(dets, w, h) 478 | num_boxes = dets.shape[0] 479 | 480 | cropped_ims_tensors = [] 481 | for i in range(num_boxes): 482 | tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) 483 | tmp[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :] 484 | crop_im = cv2.resize(tmp, (48, 48)) 485 | crop_im_tensor = convert_image_to_tensor(crop_im) 486 | cropped_ims_tensors.append(crop_im_tensor) 487 | feed_imgs = Variable(torch.stack(cropped_ims_tensors)) 488 | 489 | feed_imgs = feed_imgs.to(self.device) 490 | 491 | cls_map, reg, landmark = self.onet_detector(feed_imgs) 492 | 493 | cls_map = cls_map.cpu().data.numpy() 494 | reg = reg.cpu().data.numpy() 495 | landmark = landmark.cpu().data.numpy() 496 | 497 | keep_inds = np.where(cls_map > self.threshold[2])[0] 498 | 499 | if len(keep_inds) > 0: 500 | boxes = dets[keep_inds] 501 | cls = cls_map[keep_inds] 502 | reg = reg[keep_inds] 503 | landmark = landmark[keep_inds] 504 | else: 505 | return None, None 506 | 507 | keep = nms(boxes, 0.7, mode="Minimum") 508 | 509 | if len(keep) == 0: 510 | return None, None 511 | 512 | keep_cls = cls[keep] 513 | keep_boxes = boxes[keep] 514 | keep_reg = reg[keep] 515 | keep_landmark = landmark[keep] 516 | 517 | bw = keep_boxes[:, 2] - keep_boxes[:, 0] + 1 518 | bh = keep_boxes[:, 3] - keep_boxes[:, 1] + 1 519 | 520 | align_topx = keep_boxes[:, 0] + keep_reg[:, 0] * bw 521 | align_topy = keep_boxes[:, 1] + keep_reg[:, 1] * bh 522 | align_bottomx = keep_boxes[:, 2] + keep_reg[:, 2] * bw 523 | align_bottomy = keep_boxes[:, 3] + keep_reg[:, 3] * bh 524 | 525 | align_landmark_topx = keep_boxes[:, 0] 526 | 
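# Note: the ten landmark outputs are offsets relative to the box's top-left
# corner, expressed as fractions of the box width/height, so they are rescaled
# with bw/bh below before being returned.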
align_landmark_topy = keep_boxes[:, 1] 527 | 528 | boxes_align = np.vstack([align_topx, 529 | align_topy, 530 | align_bottomx, 531 | align_bottomy, 532 | keep_cls[:, 0] 533 | ]) 534 | 535 | boxes_align = boxes_align.T 536 | 537 | landmark = np.vstack([ 538 | align_landmark_topx + keep_landmark[:, 0] * bw, 539 | align_landmark_topy + keep_landmark[:, 1] * bh, 540 | align_landmark_topx + keep_landmark[:, 2] * bw, 541 | align_landmark_topy + keep_landmark[:, 3] * bh, 542 | align_landmark_topx + keep_landmark[:, 4] * bw, 543 | align_landmark_topy + keep_landmark[:, 5] * bh, 544 | align_landmark_topx + keep_landmark[:, 6] * bw, 545 | align_landmark_topy + keep_landmark[:, 7] * bh, 546 | align_landmark_topx + keep_landmark[:, 8] * bw, 547 | align_landmark_topy + keep_landmark[:, 9] * bh, 548 | ]) 549 | 550 | landmark_align = landmark.T 551 | 552 | return boxes_align, landmark_align 553 | 554 | def detect_face(self, img): 555 | boxes_align = np.array([]) 556 | landmark_align = np.array([]) 557 | 558 | if self.pnet_detector: 559 | _, boxes_align = self.detect_pnet(img) 560 | if boxes_align is None: 561 | return np.array([]), np.array([]) 562 | 563 | if self.rnet_detector: 564 | boxes, boxes_align = self.detect_rnet(img, boxes_align) 565 | if boxes_align is None: 566 | return np.array([]), np.array([]) 567 | 568 | if self.onet_detector: 569 | boxes_align, landmark_align = self.detect_onet(img, boxes_align) 570 | if boxes_align is None: 571 | return np.array([]), np.array([]) 572 | 573 | return boxes_align, landmark_align 574 | -------------------------------------------------------------------------------- /autokeras_pretrained/object_detector.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------- 2 | # The idea of the classes and functions in this file is largely borrowed from 3 | # https://github.com/amdegroot/ssd.pytorch 4 | # A huge thank you to the authors: Max deGroot and Ellis Brown 5 | # modified by: Wuyang Chen, Haifeng Jin 6 | # ---------------------------------- 7 | 8 | import os 9 | from itertools import product as product 10 | from math import sqrt as sqrt 11 | 12 | import cv2 13 | import numpy as np 14 | import torch 15 | from matplotlib import pyplot as plt 16 | from torch import nn as nn 17 | from torch.autograd import Variable, Function 18 | from torch.nn import functional 19 | from torch.nn import init as init 20 | 21 | from autokeras_pretrained.utils import get_device 22 | from autokeras_pretrained.constant import Constant 23 | from autokeras_pretrained.base import Pretrained 24 | 25 | """VOC Dataset Classes 26 | 27 | Original author: Francisco Massa 28 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 29 | 30 | Updated by: Ellis Brown, Max deGroot 31 | """ 32 | 33 | # gets home dir cross platform 34 | HOME = os.path.expanduser("~") 35 | 36 | # for making bounding boxes pretty 37 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 38 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 39 | 40 | MEANS = (104, 117, 123) 41 | 42 | # SSD300 CONFIGS 43 | VOC = { 44 | 'num_classes': 21, 45 | 'lr_steps': (80000, 100000, 120000), 46 | 'max_iter': 120000, 47 | 'feature_maps': [38, 19, 10, 5, 3, 1], 48 | 'min_dim': 300, 49 | 'steps': [8, 16, 32, 64, 100, 300], 50 | 'min_sizes': [30, 60, 111, 162, 213, 264], 51 | 'max_sizes': [60, 111, 162, 213, 264, 315], 52 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 53 | 'variance': [0.1, 0.2], 54 | 'clip': True, 55 
| 'name': 'VOC', 56 | } 57 | 58 | COCO = { 59 | 'num_classes': 91, 60 | 'lr_steps': (280000, 360000, 400000), 61 | 'max_iter': 400000, 62 | 'feature_maps': [38, 19, 10, 5, 3, 1], 63 | 'min_dim': 300, 64 | 'steps': [8, 16, 32, 64, 100, 300], 65 | 'min_sizes': [21, 45, 99, 153, 207, 261], 66 | 'max_sizes': [45, 99, 153, 207, 261, 315], 67 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 68 | 'variance': [0.1, 0.2], 69 | 'clip': True, 70 | 'name': 'COCO', 71 | } 72 | 73 | VOC_CLASSES = ( # always index 0 74 | 'aeroplane', 'bicycle', 'bird', 'boat', 75 | 'bottle', 'bus', 'car', 'cat', 'chair', 76 | 'cow', 'diningtable', 'dog', 'horse', 77 | 'motorbike', 'person', 'pottedplant', 78 | 'sheep', 'sofa', 'train', 'tvmonitor') 79 | 80 | # note: if you used our download scripts, this should be right 81 | VOC_ROOT = os.path.join(HOME, "object_detection/data/VOCdevkit/") 82 | 83 | 84 | class SSD(nn.Module): 85 | """Single Shot Multibox Architecture 86 | The network is composed of a base VGG network followed by the 87 | added multibox conv layers. Each multibox layer branches into 88 | 1) conv2d for class conf scores 89 | 2) conv2d for localization predictions 90 | 3) associated priorbox layer to produce default bounding 91 | boxes specific to the layer's feature map size. 92 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 93 | 94 | Args: 95 | phase: (string) Can be "test" or "train" 96 | size: input image size 97 | base: VGG16 layers for input, size of either 300 or 500 98 | extras: extra layers that feed to multibox loc and conf layers 99 | head: "multibox head" consists of loc and conf conv layers 100 | """ 101 | 102 | def __init__(self, phase, size, base, extras, head, num_classes, device): 103 | super(SSD, self).__init__() 104 | self.phase = phase 105 | self.num_classes = num_classes 106 | self.cfg = (COCO, VOC)[num_classes == 21] 107 | self.priorbox = PriorBox(self.cfg) 108 | self.device = device 109 | self.priors = Variable(self.priorbox.forward(), volatile=True).to(self.device) 110 | self.size = size 111 | 112 | # SSD network 113 | self.vgg = nn.ModuleList(base) 114 | # Layer learns to scale the l2 normalized features from conv4_3 115 | self.L2Norm = L2Norm(512, 20) 116 | self.extras = nn.ModuleList(extras) 117 | 118 | self.loc = nn.ModuleList(head[0]) 119 | self.conf = nn.ModuleList(head[1]) 120 | 121 | if phase == 'test': 122 | self.softmax = nn.Softmax(dim=-1) 123 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 124 | 125 | def forward(self, x): 126 | """Applies network layers and ops on input image(s) x. 127 | 128 | Args: 129 | x: input image or batch of images. Shape: [batch,3,300,300]. 130 | 131 | Return: 132 | Depending on phase: 133 | test: 134 | Variable(tensor) of output class label predictions, 135 | confidence score, and corresponding location predictions for 136 | each object detected. 
Shape: [batch,topk,7] 137 | 138 | train: 139 | list of concat outputs from: 140 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 141 | 2: localization layers, Shape: [batch,num_priors*4] 142 | 3: priorbox layers, Shape: [2,num_priors*4] 143 | """ 144 | sources = list() 145 | loc = list() 146 | conf = list() 147 | 148 | # apply vgg up to conv4_3 relu 149 | for k in range(23): 150 | x = self.vgg[k](x) 151 | 152 | s = self.L2Norm(x) 153 | sources.append(s) 154 | 155 | # apply vgg up to fc7 156 | for k in range(23, len(self.vgg)): 157 | x = self.vgg[k](x) 158 | sources.append(x) 159 | 160 | # apply extra layers and cache source layer outputs 161 | for k, v in enumerate(self.extras): 162 | x = functional.relu(v(x), inplace=True) 163 | if k % 2 == 1: 164 | sources.append(x) 165 | 166 | # apply multibox head to source layers 167 | for (x, l, c) in zip(sources, self.loc, self.conf): 168 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 169 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 170 | 171 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 172 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 173 | if self.phase == "test": 174 | output = self.detect( 175 | loc.view(loc.size(0), -1, 4).to(self.device), # loc preds 176 | self.softmax(conf.view(conf.size(0), -1, 177 | self.num_classes)).to(self.device), # conf preds 178 | self.priors.type(type(x.data)).to(self.device) # default boxes 179 | ) 180 | else: 181 | output = ( 182 | loc.view(loc.size(0), -1, 4), 183 | conf.view(conf.size(0), -1, self.num_classes), 184 | self.priors 185 | ) 186 | return output 187 | 188 | def load_weights(self, base_file): 189 | _, ext = os.path.splitext(base_file) 190 | if ext == '.pkl' or '.pth': 191 | print('Loading weights into state dict...') 192 | self.load_state_dict(torch.load(base_file, 193 | map_location=lambda storage, loc: storage)) 194 | print('Finished!') 195 | else: 196 | print('Sorry only .pth and .pkl files supported.') 197 | 198 | 199 | # This function is derived from torchvision VGG make_layers() 200 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 201 | def vgg(cfg, i, batch_norm=False): 202 | layers = [] 203 | in_channels = i 204 | for v in cfg: 205 | if v == 'M': 206 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 207 | elif v == 'C': 208 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 209 | else: 210 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 211 | if batch_norm: 212 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 213 | else: 214 | layers += [conv2d, nn.ReLU(inplace=True)] 215 | in_channels = v 216 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 217 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 218 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 219 | layers += [pool5, conv6, 220 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 221 | return layers 222 | 223 | 224 | def add_extras(cfg, i): 225 | # Extra layers added to VGG for feature scaling 226 | layers = [] 227 | in_channels = i 228 | flag = False 229 | for k, v in enumerate(cfg): 230 | if in_channels != 'S': 231 | if v == 'S': 232 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 233 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 234 | else: 235 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 236 | flag = not flag 237 | in_channels = v 238 | return layers 239 | 240 | 241 | def multi_box(vgg_result, extra_layers, cfg, num_classes): 242 | loc_layers = [] 243 | 
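# Note: for each selected source feature map the multibox head adds one conv
# predicting 4 box offsets per prior (loc) and one conv predicting num_classes
# scores per prior (conf); cfg is the prior count per location, e.g.
# [4, 6, 6, 6, 4, 4] for SSD300.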
conf_layers = [] 244 | vgg_source = [21, -2] 245 | for k, v in enumerate(vgg_source): 246 | loc_layers += [nn.Conv2d(vgg_result[v].out_channels, 247 | cfg[k] * 4, kernel_size=3, padding=1)] 248 | conf_layers += [nn.Conv2d(vgg_result[v].out_channels, 249 | cfg[k] * num_classes, kernel_size=3, padding=1)] 250 | for k, v in enumerate(extra_layers[1::2], 2): 251 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 252 | * 4, kernel_size=3, padding=1)] 253 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 254 | * num_classes, kernel_size=3, padding=1)] 255 | return vgg_result, extra_layers, (loc_layers, conf_layers) 256 | 257 | 258 | class L2Norm(nn.Module): 259 | def __init__(self, n_channels, scale): 260 | super(L2Norm, self).__init__() 261 | self.n_channels = n_channels 262 | self.gamma = scale or None 263 | self.eps = 1e-10 264 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 265 | self.reset_parameters() 266 | 267 | def reset_parameters(self): 268 | init.constant(self.weight, self.gamma) 269 | 270 | def forward(self, x): 271 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 272 | # x /= norm 273 | x = torch.div(x, norm) 274 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 275 | return out 276 | 277 | 278 | class PriorBox(object): 279 | """Compute priorbox coordinates in center-offset form for each source 280 | feature map. 281 | """ 282 | 283 | def __init__(self, cfg): 284 | super(PriorBox, self).__init__() 285 | self.image_size = cfg['min_dim'] 286 | # number of priors for feature map location (either 4 or 6) 287 | self.num_priors = len(cfg['aspect_ratios']) 288 | self.variance = cfg['variance'] or [0.1] 289 | self.feature_maps = cfg['feature_maps'] 290 | self.min_sizes = cfg['min_sizes'] 291 | self.max_sizes = cfg['max_sizes'] 292 | self.steps = cfg['steps'] 293 | self.aspect_ratios = cfg['aspect_ratios'] 294 | self.clip = cfg['clip'] 295 | self.version = cfg['name'] 296 | for v in self.variance: 297 | if v <= 0: 298 | raise ValueError('Variances must be greater than 0') 299 | 300 | def forward(self): 301 | mean = [] 302 | for k, f in enumerate(self.feature_maps): 303 | for i, j in product(range(f), repeat=2): 304 | f_k = self.image_size / self.steps[k] 305 | # unit center x,y 306 | cx = (j + 0.5) / f_k 307 | cy = (i + 0.5) / f_k 308 | 309 | # aspect_ratio: 1 310 | # rel size: min_size 311 | s_k = self.min_sizes[k] / self.image_size 312 | mean += [cx, cy, s_k, s_k] 313 | 314 | # aspect_ratio: 1 315 | # rel size: sqrt(s_k * s_(k+1)) 316 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 317 | mean += [cx, cy, s_k_prime, s_k_prime] 318 | 319 | # rest of aspect ratios 320 | for ar in self.aspect_ratios[k]: 321 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 322 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 323 | # back to torch land 324 | output = torch.Tensor(mean).view(-1, 4) 325 | if self.clip: 326 | output.clamp_(max=1, min=0) 327 | return output 328 | 329 | 330 | class Detect(Function): 331 | """At test time, Detect is the final layer of SSD. Decode location preds, 332 | apply non-maximum suppression to location predictions based on conf 333 | scores and threshold to a top_k number of output predictions for both 334 | confidence score and locations. 
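    As an illustrative sketch of the shapes involved: for one SSD300 image this
    receives loc preds of shape (1, 8732, 4), softmaxed class scores of shape
    (1, 8732, 21) and priors of shape (8732, 4), and returns a tensor of shape
    (1, num_classes, top_k, 5) whose last dimension is
    (score, xmin, ymin, xmax, ymax) in relative image coordinates
    (8732 being the usual SSD300 prior count).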
335 | """ 336 | 337 | @staticmethod 338 | def backward(ctx, *grad_outputs): 339 | pass 340 | 341 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 342 | self.num_classes = num_classes 343 | self.background_label = bkg_label 344 | self.top_k = top_k 345 | # Parameters used in nms. 346 | self.nms_thresh = nms_thresh 347 | if nms_thresh <= 0: 348 | raise ValueError('nms_threshold must be non negative.') 349 | self.conf_thresh = conf_thresh 350 | self.variance = VOC['variance'] 351 | 352 | def forward(self, loc_data, conf_data, prior_data): 353 | """ 354 | Args: 355 | loc_data: (tensor) Loc preds from loc layers 356 | Shape: [batch,num_priors*4] 357 | conf_data: (tensor) Shape: Conf preds from conf layers 358 | Shape: [batch*num_priors,num_classes] 359 | prior_data: (tensor) Prior boxes and variances from priorbox layers 360 | Shape: [1,num_priors,4] 361 | """ 362 | num = loc_data.size(0) # batch size 363 | num_priors = prior_data.size(0) 364 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 365 | conf_preds = conf_data.view(num, num_priors, 366 | self.num_classes).transpose(2, 1) 367 | 368 | # Decode predictions into bboxes. 369 | for i in range(num): 370 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 371 | # For each class, perform nms 372 | conf_scores = conf_preds[i].clone() 373 | 374 | for cl in range(1, self.num_classes): 375 | c_mask = conf_scores[cl].gt(self.conf_thresh) 376 | scores = conf_scores[cl][c_mask] 377 | # if scores.dim() == 0: 378 | if scores.size(0) == 0: 379 | continue 380 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 381 | boxes = decoded_boxes[l_mask].view(-1, 4) 382 | # idx of highest scoring and non-overlapping boxes per class 383 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 384 | output[i, cl, :count] = \ 385 | torch.cat((scores[ids[:count]].unsqueeze(1), 386 | boxes[ids[:count]]), 1) 387 | flt = output.contiguous().view(num, -1, 5) 388 | _, idx = flt[:, :, 0].sort(1, descending=True) 389 | _, rank = idx.sort(1) 390 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 391 | return output 392 | 393 | 394 | # Adapted from https://github.com/Hakuyume/chainer-ssd 395 | def decode(loc, priors, variances): 396 | """Decode locations from predictions using priors to undo 397 | the encoding we did for offset regression at train time. 398 | Args: 399 | loc (tensor): location predictions for loc layers, 400 | Shape: [num_priors,4] 401 | priors (tensor): Prior boxes in center-offset form. 402 | Shape: [num_priors,4]. 403 | variances: (list[float]) Variances of priorboxes 404 | Return: 405 | decoded bounding box predictions 406 | """ 407 | 408 | boxes = torch.cat(( 409 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 410 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 411 | boxes[:, :2] -= boxes[:, 2:] / 2 412 | boxes[:, 2:] += boxes[:, :2] 413 | return boxes 414 | 415 | 416 | # Original author: Francisco Massa: 417 | # https://github.com/fmassa/object-detection.torch 418 | # Ported to PyTorch by Max deGroot (02/01/2017) 419 | def nms(boxes, scores, overlap=0.5, top_k=200): 420 | """Apply non-maximum suppression at test time to avoid detecting too many 421 | overlapping bounding boxes for a given object. 422 | Args: 423 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 424 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 425 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 
426 | top_k: (int) The Maximum number of box preds to consider. 427 | Return: 428 | The indices of the kept boxes with respect to num_priors. 429 | """ 430 | 431 | keep = scores.new(scores.size(0)).zero_().long() 432 | if boxes.numel() == 0: 433 | return keep 434 | x1 = boxes[:, 0] 435 | y1 = boxes[:, 1] 436 | x2 = boxes[:, 2] 437 | y2 = boxes[:, 3] 438 | area = torch.mul(x2 - x1, y2 - y1) 439 | _, idx = scores.sort(0) # sort in ascending order 440 | # I = I[v >= 0.01] 441 | idx = idx[-top_k:] # indices of the top-k largest vals 442 | xx1 = boxes.new() 443 | yy1 = boxes.new() 444 | xx2 = boxes.new() 445 | yy2 = boxes.new() 446 | w = boxes.new() 447 | h = boxes.new() 448 | 449 | # keep = torch.Tensor() 450 | count = 0 451 | while idx.numel() > 0: 452 | i = idx[-1] # index of current largest val 453 | # keep.append(i) 454 | keep[count] = i 455 | count += 1 456 | if idx.size(0) == 1: 457 | break 458 | idx = idx[:-1] # remove kept element from view 459 | # load bboxes of next highest vals 460 | torch.index_select(x1, 0, idx, out=xx1) 461 | torch.index_select(y1, 0, idx, out=yy1) 462 | torch.index_select(x2, 0, idx, out=xx2) 463 | torch.index_select(y2, 0, idx, out=yy2) 464 | # store element-wise max with next highest score 465 | xx1 = torch.clamp(xx1, min=x1[i]) 466 | yy1 = torch.clamp(yy1, min=y1[i]) 467 | xx2 = torch.clamp(xx2, max=x2[i]) 468 | yy2 = torch.clamp(yy2, max=y2[i]) 469 | w.resize_as_(xx2) 470 | h.resize_as_(yy2) 471 | w = xx2 - xx1 472 | h = yy2 - yy1 473 | # check sizes of xx1 and xx2.. after each iteration 474 | w = torch.clamp(w, min=0.0) 475 | h = torch.clamp(h, min=0.0) 476 | inter = w * h 477 | # iou = i / (area(a) + area(b) - i) 478 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 479 | union = (rem_areas - inter) + area[i] 480 | iou = inter / union # store result in iou 481 | # keep only elements with an iou <= overlap 482 | idx = idx[iou.le(overlap)] 483 | return keep, count 484 | 485 | 486 | class ObjectDetector(Pretrained): 487 | 488 | def __init__(self): 489 | super(ObjectDetector, self).__init__() 490 | self.model = None 491 | self.device = get_device() 492 | # load net 493 | num_classes = len(VOC_CLASSES) + 1 # +1 for background 494 | self.model = self._build_ssd('test', 300, num_classes) # initialize SSD 495 | if self.device.startswith("cuda"): 496 | self.model.load_state_dict(torch.load(self.local_paths[0])) 497 | else: 498 | self.model.load_state_dict(torch.load(self.local_paths[0], map_location=lambda storage, loc: storage)) 499 | self.model.eval() 500 | print('Finished loading model!') 501 | 502 | self.model = self.model.to(self.device) 503 | 504 | @property 505 | def _google_drive_files(self): 506 | return Constant.OBJECT_DETECTOR_MODELS 507 | 508 | def _build_ssd(self, phase, size=300, num_classes=21): 509 | if phase != "test" and phase != "train": 510 | print("ERROR: Phase: " + phase + " not recognized") 511 | return 512 | if size != 300: 513 | print("ERROR: You specified size " + repr(size) + ". 
However, " + 514 | "currently only SSD300 (size=300) is supported!") 515 | return 516 | base = { 517 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 518 | 512, 512, 512], 519 | '512': [], 520 | } 521 | extras = { 522 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 523 | '512': [], 524 | } 525 | mbox = { 526 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 527 | '512': [], 528 | } 529 | 530 | base_, extras_, head_ = multi_box(vgg(base[str(size)], 3), 531 | add_extras(extras[str(size)], 1024), 532 | mbox[str(size)], num_classes) 533 | return SSD(phase, size, base_, extras_, head_, num_classes, self.device) 534 | 535 | def predict(self, input_data, output_file_path=None): 536 | """ 537 | 538 | Returns: 539 | List of dictionaries. Each dictionary is like 540 | {"left": int, "top": int, "width": int, "height": int: "category": str, "confidence": float} 541 | """ 542 | from matplotlib.ticker import NullLocator 543 | 544 | dataset_mean = (104, 117, 123) 545 | 546 | image = cv2.imread(input_data, cv2.IMREAD_COLOR) 547 | height, width, _ = image.shape 548 | rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 549 | x = base_transform(rgb_image, 300, dataset_mean) 550 | x = x.astype(np.float32) 551 | x = torch.from_numpy(x).permute(2, 0, 1) 552 | xx = Variable(x.unsqueeze(0)) # wrap tensor in Variable 553 | # if self.device.startswith("cuda"): 554 | xx = xx.to(self.device) 555 | y = self.model(xx) 556 | 557 | # (batch, num_classes, top_k, 5), 5 means (confidence, ) 558 | detections = y.data 559 | results = [] 560 | # scale each detection back up to the image 561 | scale = torch.Tensor(rgb_image.shape[1::-1]).repeat(2) 562 | for i in range(detections.size(1)): 563 | j = 0 564 | while detections[0, i, j, 0] >= 0.6: 565 | score = detections[0, i, j, 0].item() 566 | label_name = VOC_CLASSES[i - 1] 567 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 568 | # result = ((pt[0], pt[1]), (pt[2] - pt[0] + 1, pt[3] - pt[1] + 1), label_name, score) 569 | result = { 570 | "left": max(int(np.round(pt[0])), 0), 571 | "top": max(int(np.round(pt[1])), 0), 572 | "width": min(int(np.round(pt[2] - pt[0] + 1)), width), 573 | "height": min(int(np.round(pt[3] - pt[1] + 1)), height), 574 | "category": label_name, 575 | "confidence": score 576 | } 577 | results.append(result) 578 | j += 1 579 | 580 | if output_file_path is not None: 581 | # plt.figure(figsize=(10,10)) 582 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 583 | plt.imshow(rgb_image) # plot the image for matplotlib 584 | current_axis = plt.gca() 585 | current_axis.set_axis_off() 586 | current_axis.xaxis.set_major_locator(NullLocator()) 587 | current_axis.yaxis.set_major_locator(NullLocator()) 588 | 589 | # scale each detection back up to the image 590 | for i in range(detections.size(1)): 591 | j = 0 592 | while detections[0, i, j, 0] >= 0.6: 593 | score = detections[0, i, j, 0] 594 | label_name = VOC_CLASSES[i - 1] 595 | display_txt = '%s: %.2f' % (label_name, score) 596 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 597 | coords = (pt[0], pt[1]), pt[2] - pt[0] + 1, pt[3] - pt[1] + 1 598 | color = colors[i] 599 | current_axis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) 600 | current_axis.text(pt[0], pt[1], display_txt, bbox={'facecolor': color, 'alpha': 0.5}) 601 | j += 1 602 | plt.axis('off') 603 | plt.tight_layout() 604 | save_name = input_data.split('/')[-1] 605 | save_name = save_name.split('.') 606 | save_name = '.'.join(save_name[:-1]) + 
"_prediction." + save_name[-1] 607 | plt.savefig(os.path.join(output_file_path, save_name), bbox_inches='tight', pad_inches=0) 608 | plt.clf() 609 | 610 | return results 611 | 612 | 613 | def base_transform(image, size, mean): 614 | x = cv2.resize(image, (size, size)).astype(np.float32) 615 | x -= mean 616 | x = x.astype(np.float32) 617 | return x 618 | -------------------------------------------------------------------------------- /autokeras_pretrained/text_classifier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Original work Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Modified work Copyright 2019 The AutoKeras team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import numpy as np 19 | import torch 20 | from abc import ABC 21 | 22 | from autokeras_pretrained.constant import Constant 23 | from autokeras_pretrained.base import Pretrained 24 | from autokeras_pretrained.bert.modeling import BertForSequenceClassification 25 | from autokeras_pretrained.bert.utils import convert_examples_to_features 26 | from autokeras_pretrained.bert.tokenization import BertTokenizer 27 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler 28 | 29 | 30 | class TextClassifier(Pretrained, ABC): 31 | """A pre-trained TextClassifier class based on Google AI's BERT model. 32 | 33 | Attributes: 34 | model: Type of BERT model to be used for the classification task. E.g:- Uncased, Cased, etc. 35 | The current pre-trained models are using 'bert-base-uncased'. 36 | tokenizer: Tokenizer used with BERT model. 37 | """ 38 | 39 | def __init__(self, num_classes=None, **kwargs): 40 | super().__init__(**kwargs) 41 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 42 | 43 | model_state_dict = torch.load(self.local_paths[0], map_location=lambda storage, loc: storage) 44 | self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 45 | state_dict=model_state_dict, 46 | num_labels=num_classes) 47 | self.model.to(self.device) 48 | 49 | def y_predict(self, x_predict): 50 | """ Predict the labels for the provided input data. 51 | 52 | Args: 53 | x_predict: ndarray containing the data inputs. 54 | 55 | Returns: 56 | ndarray containing the predicted labels/outputs for x_predict. 
57 | """ 58 | all_input_ids, all_input_mask, all_segment_ids = convert_examples_to_features([x_predict], 59 | self.tokenizer, 60 | max_seq_length=128) 61 | 62 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) 63 | 64 | eval_sampler = SequentialSampler(eval_data) 65 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1) 66 | 67 | self.model.eval() 68 | for input_ids, input_mask, segment_ids in eval_dataloader: 69 | input_ids = input_ids.to(self.device) 70 | input_mask = input_mask.to(self.device) 71 | segment_ids = segment_ids.to(self.device) 72 | 73 | with torch.no_grad(): 74 | logits = self.model(input_ids, segment_ids, input_mask) 75 | 76 | logits = logits.detach().cpu().numpy() 77 | 78 | for logit in logits: 79 | exp = np.exp(logit) 80 | exp = exp / np.sum(exp) 81 | y_pred = exp 82 | 83 | return y_pred 84 | 85 | 86 | class SentimentAnalysis(TextClassifier): 87 | """A SentimentAnalysis class inherited from TextClassifier. 88 | 89 | The model is trained on the IMDb dataset. The link for the dataset is given below. 90 | http://ai.stanford.edu/~amaas/data/sentiment/ 91 | """ 92 | 93 | def __init__(self, **kwargs): 94 | super().__init__(num_classes=2, **kwargs) 95 | 96 | @property 97 | def _google_drive_files(self): 98 | return Constant.SENTIMENT_ANALYSIS_MODELS 99 | 100 | def predict(self, x_predict, **kwargs): 101 | y_pred = self.y_predict(x_predict) 102 | return round(y_pred[1], 2) 103 | 104 | 105 | class TopicClassifier(TextClassifier): 106 | """A pre-trained TopicClassifier class inherited from TextClassifier. 107 | 108 | The model is trained on the AG News dataset. The link for the dataset is given below. 109 | https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html 110 | """ 111 | 112 | def __init__(self, **kwargs): 113 | super().__init__(num_classes=4, **kwargs) 114 | 115 | @property 116 | def _google_drive_files(self): 117 | return Constant.TOPIC_CLASSIFIER_MODELS 118 | 119 | def predict(self, x_predict, **kwargs): 120 | y_pred = self.y_predict(x_predict) 121 | class_id = np.argmax(y_pred) 122 | if class_id == 0: 123 | return "Business" 124 | elif class_id == 1: 125 | return "Sci/Tech" 126 | elif class_id == 2: 127 | return "World" 128 | else: 129 | return "Sports" 130 | -------------------------------------------------------------------------------- /autokeras_pretrained/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import logging 4 | import os 5 | import pickle 6 | import random 7 | import string 8 | import sys 9 | import tempfile 10 | import zipfile 11 | from os import makedirs 12 | from os.path import dirname 13 | from os.path import exists 14 | from sys import stdout 15 | 16 | import imageio 17 | import numpy as np 18 | import requests 19 | import torch 20 | from scipy.ndimage import zoom 21 | 22 | from autokeras_pretrained.constant import Constant 23 | 24 | 25 | class NoImprovementError(Exception): 26 | def __init__(self, message): 27 | self.message = message 28 | 29 | 30 | def ensure_dir(directory): 31 | """Create directory if it does not exist.""" 32 | if not os.path.exists(directory): 33 | os.makedirs(directory) 34 | 35 | 36 | def ensure_file_dir(path): 37 | """Create path if it does not exist.""" 38 | ensure_dir(os.path.dirname(path)) 39 | 40 | 41 | def has_file(path): 42 | """Check if the given path exists.""" 43 | return os.path.exists(path) 44 | 45 | 46 | def pickle_from_file(path): 47 | """Load the pickle file from the provided 
path and returns the object.""" 48 | return pickle.load(open(path, 'rb')) 49 | 50 | 51 | def pickle_to_file(obj, path): 52 | """Save the pickle file to the specified path.""" 53 | pickle.dump(obj, open(path, 'wb')) 54 | 55 | 56 | def temp_path_generator(): 57 | sys_temp = tempfile.gettempdir() 58 | path = os.path.join(sys_temp, 'autokeras') 59 | return path 60 | 61 | 62 | def rand_temp_folder_generator(): 63 | """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" 64 | chars = string.ascii_uppercase + string.digits 65 | size = 6 66 | random_suffix = ''.join(random.choice(chars) for _ in range(size)) 67 | sys_temp = temp_path_generator() 68 | path = sys_temp + '_' + random_suffix 69 | ensure_dir(path) 70 | return path 71 | 72 | 73 | def download_file(file_link, file_path): 74 | """Download the file specified in `file_link` and saves it in `file_path`.""" 75 | if not os.path.exists(file_path): 76 | with open(file_path, "wb") as f: 77 | print("\nDownloading %s" % file_path) 78 | response = requests.get(file_link, stream=True) 79 | total_length = response.headers.get('content-length') 80 | 81 | if total_length is None: # no content length header 82 | f.write(response.content) 83 | else: 84 | dl = 0 85 | total_length = int(total_length) 86 | for data in response.iter_content(chunk_size=4096): 87 | dl += len(data) 88 | f.write(data) 89 | done = int(50 * dl / total_length) 90 | sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) 91 | sys.stdout.flush() 92 | 93 | 94 | def download_file_with_extract(file_link, file_path, extract_path): 95 | """Download the file specified in `file_link`, save to `file_path` and extract to the directory `extract_path`.""" 96 | if not os.path.exists(extract_path): 97 | download_file(file_link, file_path) 98 | zip_ref = zipfile.ZipFile(file_path, 'r') 99 | print("extracting downloaded file...") 100 | zip_ref.extractall(extract_path) 101 | os.remove(file_path) 102 | print("extracted and removed downloaded zip file") 103 | print("file already extracted in the path %s" % extract_path) 104 | 105 | 106 | def assert_search_space(search_space): 107 | grid = search_space 108 | value_list = [] 109 | if Constant.LENGTH_DIM not in list(grid.keys()): 110 | print('No length dimension found in search Space. Using default values') 111 | grid[Constant.LENGTH_DIM] = Constant.DEFAULT_LENGTH_SEARCH 112 | elif not isinstance(grid[Constant.LENGTH_DIM][0], int): 113 | print('Converting String to integers. Next time please make sure to enter integer values for Length Dimension') 114 | grid[Constant.LENGTH_DIM] = list(map(int, grid[Constant.LENGTH_DIM])) 115 | 116 | if Constant.WIDTH_DIM not in list(grid.keys()): 117 | print('No width dimension found in search Space. Using default values') 118 | grid[Constant.WIDTH_DIM] = Constant.DEFAULT_WIDTH_SEARCH 119 | elif not isinstance(grid[Constant.WIDTH_DIM][0], int): 120 | print('Converting String to integers. 
Next time please make sure to enter integer values for Width Dimension') 121 | grid[Constant.WIDTH_DIM] = list(map(int, grid[Constant.WIDTH_DIM])) 122 | 123 | grid_key_list = list(grid.keys()) 124 | grid_key_list.sort() 125 | for key in grid_key_list: 126 | value_list.append(grid[key]) 127 | 128 | dimension = list(itertools.product(*value_list)) 129 | # print(dimension) 130 | return grid, dimension 131 | 132 | 133 | def verbose_print(new_father_id, new_graph, new_model_id): 134 | """Print information about the operation performed on father model to obtain current model and father's id.""" 135 | cell_size = [24, 49] 136 | logging.info('New Model Id - ' + str(new_model_id)) 137 | header = ['Father Model ID', 'Added Operation'] 138 | line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(header)) 139 | logging.info('\n' + '+' + '-' * len(line) + '+') 140 | logging.info('|' + line + '|') 141 | logging.info('+' + '-' * len(line) + '+') 142 | for i in range(len(new_graph.operation_history)): 143 | if i == len(new_graph.operation_history) // 2: 144 | r = [str(new_father_id), ' '.join(str(item) for item in new_graph.operation_history[i])] 145 | else: 146 | r = [' ', ' '.join(str(item) for item in new_graph.operation_history[i])] 147 | line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(r)) 148 | logging.info('|' + line + '|') 149 | logging.info('+' + '-' * len(line) + '+') 150 | 151 | 152 | def validate_xy(x_train, y_train): 153 | """Validate `x_train`'s type and the shape of `x_train`, `y_train`.""" 154 | try: 155 | x_train = x_train.astype('float64') 156 | except ValueError: 157 | raise ValueError('x_train should only contain numerical data.') 158 | 159 | if len(x_train.shape) < 2: 160 | raise ValueError('x_train should at least has 2 dimensions.') 161 | 162 | if x_train.shape[0] != y_train.shape[0]: 163 | raise ValueError('x_train and y_train should have the same number of instances.') 164 | 165 | 166 | def read_csv_file(csv_file_path): 167 | """Read the csv file and returns two separate list containing file names and their labels. 168 | 169 | Args: 170 | csv_file_path: Path to the CSV file. 171 | 172 | Returns: 173 | file_names: List containing files names. 174 | file_label: List containing their respective labels. 175 | """ 176 | file_names = [] 177 | file_labels = [] 178 | with open(csv_file_path, 'r') as files_path: 179 | path_list = csv.DictReader(files_path) 180 | fieldnames = path_list.fieldnames 181 | for path in path_list: 182 | file_names.append(path[fieldnames[0]]) 183 | file_labels.append(path[fieldnames[1]]) 184 | return file_names, file_labels 185 | 186 | 187 | def read_tsv_file(input_file, quotechar=None): 188 | """Reads a tab separated value (tsv) file and return two lists containing file names and labels.""" 189 | with open(input_file, "r", encoding='utf-8') as f: 190 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 191 | x, y = [], [] 192 | for line in reader: 193 | x.append(line[0]) 194 | y.append(int(line[1])) 195 | return x, y 196 | 197 | 198 | def read_image(img_path): 199 | """Read the image contained in the provided path `image_path`.""" 200 | img = imageio.imread(uri=img_path) 201 | return img 202 | 203 | 204 | def compute_image_resize_params(data): 205 | """Compute median dimension of all images in data. 206 | 207 | It used to resize the images later. Number of channels do not change from the original data. 208 | 209 | Args: 210 | data: 1-D, 2-D or 3-D images. The Images are expected to have channel last configuration. 
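            As a worked example, three images of shapes (100, 200, 3), (120, 180, 3)
            and (300, 400, 3) have a median shape of (120, 200, 3); because 120 * 200
            exceeds Constant.MAX_IMAGE_SIZE (128 * 128 = 16384), both spatial
            dimensions are scaled by sqrt(16384 / 24000), giving roughly (99, 165, 3).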
211 | 212 | Returns: 213 | median shape. 214 | """ 215 | if data is None or len(data.shape) == 0: 216 | return [] 217 | 218 | if len(data.shape) == len(data[0].shape) + 1 and np.prod(data[0].shape[:-1]) <= Constant.MAX_IMAGE_SIZE: 219 | return data[0].shape 220 | 221 | data_shapes = [] 222 | for x in data: 223 | data_shapes.append(x.shape) 224 | 225 | median_shape = np.median(np.array(data_shapes), axis=0) 226 | median_size = np.prod(median_shape[:-1]) 227 | 228 | if median_size > Constant.MAX_IMAGE_SIZE: 229 | reduction_factor = np.power(Constant.MAX_IMAGE_SIZE / median_size, 1 / (len(median_shape) - 1)) 230 | median_shape[:-1] = median_shape[:-1] * reduction_factor 231 | 232 | return median_shape.astype(int) 233 | 234 | 235 | def resize_image_data(data, resize_shape): 236 | """Resize images to given dimension. 237 | 238 | Args: 239 | data: 1-D, 2-D or 3-D images. The Images are expected to have channel last configuration. 240 | resize_shape: Image resize dimension. 241 | 242 | Returns: 243 | data: Reshaped data. 244 | """ 245 | if data is None or len(resize_shape) == 0: 246 | return data 247 | 248 | if len(data.shape) > 1 and np.array_equal(data[0].shape, resize_shape): 249 | return data 250 | 251 | output_data = [] 252 | for im in data: 253 | output_data.append(zoom(input=im, zoom=np.divide(resize_shape, im.shape))) 254 | 255 | return np.array(output_data) 256 | 257 | 258 | def get_system(): 259 | """Get the current system environment. If the current system is not supported, raise an exception. 260 | 261 | Returns: 262 | A string to represent the current OS name. 263 | "posix" stands for Linux, Mac or Solaris architecture. 264 | "nt" stands for Windows system. 265 | """ 266 | if 'google.colab' in sys.modules: 267 | return Constant.SYS_GOOGLE_COLAB 268 | if os.name == 'posix': 269 | return Constant.SYS_LINUX 270 | if os.name == 'nt': 271 | return Constant.SYS_WINDOWS 272 | 273 | raise EnvironmentError('Unsupported environment') 274 | 275 | 276 | def download_file_from_google_drive(file_id, dest_path, verbose=False): 277 | """ 278 | Downloads a shared file from google drive into a given folder. 279 | Optionally unzips it. 280 | 281 | Refact from: 282 | https://github.com/ndrplz/google-drive-downloader/blob/master/google_drive_downloader/google_drive_downloader.py 283 | 284 | Args: 285 | verbose: 286 | file_id: str 287 | the file identifier. 288 | You can obtain it from the sharable link. 289 | dest_path: str 290 | the destination where to save the downloaded file. 291 | Must be a path (for example: './downloaded_file.txt') 292 | """ 293 | 294 | destination_directory = dirname(dest_path) 295 | if len(destination_directory) > 0 and not exists(destination_directory): 296 | makedirs(destination_directory) 297 | 298 | session = requests.Session() 299 | 300 | if verbose: 301 | print('Downloading file with Google ID {} into {}... 
'.format(file_id, dest_path), end='') 302 | stdout.flush() 303 | 304 | response = session.get(Constant.DOWNLOAD_URL, params={'id': file_id}, stream=True) 305 | 306 | token = get_confirm_token(response) 307 | if token: 308 | params = {'id': file_id, 'confirm': token} 309 | response = session.get(Constant.DOWNLOAD_URL, params=params, stream=True) 310 | 311 | save_response_content(response, dest_path) 312 | if verbose: 313 | print('Download completed.') 314 | 315 | 316 | def get_confirm_token(response): 317 | for key, value in response.cookies.items(): 318 | if key.startswith('download_warning'): 319 | return value 320 | return None 321 | 322 | 323 | def save_response_content(response, destination): 324 | with open(destination, "wb") as f: 325 | for chunk in response.iter_content(Constant.CHUNK_SIZE): 326 | if chunk: # filter out keep-alive new chunks 327 | f.write(chunk) 328 | 329 | 330 | def get_device(): 331 | """ If CUDA is available, use CUDA device, else use CPU device. 332 | Returns: string device name 333 | """ 334 | return 'cuda' if torch.cuda.is_available() else 'cpu' 335 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/deepvoice3_pytorch/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/builder.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.model import MultiSpeakerTTSModel, AttentionSeq2Seq 2 | 3 | 4 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, n_speakers=1, speaker_embed_dim=16, 5 | padding_idx=0, dropout=(1 - 0.95), kernel_size=5, encoder_channels=128, decoder_channels=256, 6 | converter_channels=256, query_position_rate=1.0, key_position_rate=1.29, use_memory_mask=False, 7 | trainable_positional_encodings=False, force_monotonic_attention=True, 8 | use_decoder_state_for_postnet_input=True, max_positions=512, embedding_weight_std=0.1, 9 | freeze_embedding=False, window_ahead=3, window_backward=1): 10 | """Build deepvoice3 11 | """ 12 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 13 | 14 | # Seq2seq 15 | h = encoder_channels # hidden dim (channels) 16 | k = kernel_size # kernel size 17 | encoder = Encoder(n_vocab, embed_dim, n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 18 | padding_idx=padding_idx, embedding_weight_std=embedding_weight_std, 19 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 20 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 21 | (h, k, 1), (h, k, 3)], dropout=dropout) 22 | 23 | h = decoder_channels 24 | decoder = Decoder(embed_dim, n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, in_dim=mel_dim, r=r, 25 | max_positions=max_positions, preattention=[(h, k, 
1), (h, k, 3)], 26 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 27 | (h, k, 1)], attention=[True, False, False, False, True], dropout=dropout, 28 | use_memory_mask=use_memory_mask, force_monotonic_attention=force_monotonic_attention, 29 | query_position_rate=query_position_rate, key_position_rate=key_position_rate, 30 | window_ahead=window_ahead, window_backward=window_backward) 31 | 32 | seq2seq = AttentionSeq2Seq(encoder, decoder) 33 | 34 | # Post net 35 | in_dim = h // r 36 | 37 | h = converter_channels 38 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, in_dim=in_dim, out_dim=linear_dim, 39 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], dropout=dropout) 40 | 41 | # Seq2seq + post net 42 | model = MultiSpeakerTTSModel(seq2seq, converter, mel_dim=mel_dim, linear_dim=linear_dim, n_speakers=n_speakers, 43 | speaker_embed_dim=speaker_embed_dim, 44 | trainable_positional_encodings=trainable_positional_encodings, 45 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 46 | freeze_embedding=freeze_embedding) 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class Conv1d(nn.Conv1d): 7 | """Extended nn.Conv1d for incremental dilated convolutions 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.clear_buffer() 13 | self._linearized_weight = None 14 | 15 | def incremental_forward(self, input_data): 16 | 17 | # reshape weight 18 | weight = self._get_linearized_weight() 19 | kw = self.kernel_size[0] 20 | dilation = self.dilation[0] 21 | 22 | bsz = input_data.size(0) # conv_input: bsz x len x dim 23 | if kw > 1: 24 | input_data = input_data.data 25 | if self.input_buffer is None: 26 | self.input_buffer = input_data.new(bsz, kw + (kw - 1) * (dilation - 1), input_data.size(2)) 27 | self.input_buffer.zero_() 28 | else: 29 | # shift buffer 30 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 31 | # append next input 32 | self.input_buffer[:, -1, :] = input_data[:, -1, :] 33 | input_data = self.input_buffer 34 | if dilation > 1: 35 | input_data = input_data[:, 0::dilation, :].contiguous() 36 | input_data = F.linear(input_data.view(bsz, -1), weight, self.bias) 37 | return input_data.view(bsz, 1, -1) 38 | 39 | def clear_buffer(self): 40 | self.input_buffer = None 41 | 42 | def _get_linearized_weight(self): 43 | if self._linearized_weight is None: 44 | kw = self.kernel_size[0] 45 | # nn.Conv1d 46 | weight = self.weight.transpose(1, 2).contiguous() 47 | 48 | if weight.size() != (self.out_channels, kw, self.in_channels): 49 | raise AssertionError() 50 | self._linearized_weight = weight.view(self.out_channels, -1) 51 | return self._linearized_weight 52 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import math 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .modules import conv1d, conv_transpose1d, embedding, linear 9 | from .modules import SinusoidalEncoding, Conv1dGLU 10 | 11 | 12 | 
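# For illustration: the `convolutions` and `preattention` arguments used by the
# modules below are lists of (out_channels, kernel_size, dilation) tuples, as
# assembled in builder.deepvoice3(). A minimal standalone Encoder, using
# placeholder sizes rather than the real hyperparameters, might look like:
#
#     encoder = Encoder(n_vocab=149, embed_dim=128, n_speakers=1,
#                       speaker_embed_dim=16,
#                       convolutions=[(256, 3, 1), (256, 3, 3)])
#     token_ids = torch.zeros(2, 20).long()   # dummy batch of token ids
#     keys, values = encoder(token_ids)       # each of shape (2, 20, 128)
#
# The numbers above are only examples; the real vocabulary size comes from
# frontend.n_vocab and the real convolution stacks from builder.deepvoice3().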
class Encoder(nn.Module): 13 | def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim, padding_idx=None, embedding_weight_std=0.1, 14 | convolutions=((64, 5, .1),) * 7, dropout=0.1, apply_grad_scaling=False): 15 | super(Encoder, self).__init__() 16 | self.dropout = dropout 17 | self.num_attention_layers = None 18 | self.apply_grad_scaling = apply_grad_scaling 19 | 20 | # Text input embeddings 21 | self.embed_tokens = embedding( 22 | n_vocab, embed_dim, padding_idx, embedding_weight_std) 23 | 24 | self.n_speakers = n_speakers 25 | 26 | # Non causual convolution blocks 27 | in_channels = embed_dim 28 | self.convolutions = nn.ModuleList() 29 | std_mul = 1.0 30 | for (out_channels, kernel_size, dilation) in convolutions: 31 | if in_channels != out_channels: 32 | # Conv1d + ReLU 33 | self.convolutions.append( 34 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 35 | dilation=1, std_mul=std_mul)) 36 | self.convolutions.append(nn.ReLU(inplace=True)) 37 | in_channels = out_channels 38 | std_mul = 2.0 39 | self.convolutions.append( 40 | Conv1dGLU(n_speakers, speaker_embed_dim, 41 | in_channels, out_channels, kernel_size, causal=False, 42 | dilation=dilation, dropout=dropout, std_mul=std_mul, 43 | residual=True)) 44 | in_channels = out_channels 45 | std_mul = 4.0 46 | # Last 1x1 convolution 47 | self.convolutions.append(conv1d(in_channels, embed_dim, kernel_size=1, 48 | padding=0, dilation=1, std_mul=std_mul, 49 | dropout=dropout)) 50 | 51 | def forward(self, text_sequences, text_positions=None, lengths=None, 52 | speaker_embed=None): 53 | if self.n_speakers != 1 and speaker_embed is None: 54 | raise AssertionError("Expected \033[1;31m\033[m to be 1 or \033[1:31m[m to be not None, but was not") 55 | # embed text_sequences 56 | x = self.embed_tokens(text_sequences.long()) 57 | x = F.dropout(x, p=self.dropout, training=self.training) 58 | 59 | # expand speaker embedding for all time steps 60 | speaker_embed_btc = None 61 | 62 | input_embedding = x 63 | 64 | # B x T x C -> B x C x T 65 | x = x.transpose(1, 2) 66 | 67 | # 1D conv blocks 68 | for f in self.convolutions: 69 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 70 | 71 | # Back to B x T x C 72 | keys = x.transpose(1, 2) 73 | 74 | # scale gradients (this only affects backward, not forward) 75 | # add output to input embedding for attention 76 | values = (keys + input_embedding) * math.sqrt(0.5) 77 | 78 | return keys, values 79 | 80 | 81 | class AttentionLayer(nn.Module): 82 | def __init__(self, conv_channels, embed_dim, dropout=0.1, window_ahead=3, window_backward=1): 83 | super(AttentionLayer, self).__init__() 84 | self.query_projection = linear(conv_channels, embed_dim) 85 | self.key_projection = None 86 | self.value_projection = None 87 | self.out_projection = linear(embed_dim, conv_channels) 88 | self.dropout = dropout 89 | self.window_ahead = window_ahead 90 | self.window_backward = window_backward 91 | 92 | def forward(self, query, encoder_out, mask=None, last_attended=None): 93 | keys, values = encoder_out 94 | residual = query 95 | 96 | # attention 97 | x = self.query_projection(query) 98 | x = torch.bmm(x, keys) 99 | 100 | mask_value = -float("inf") 101 | 102 | if last_attended is not None: 103 | backward = last_attended - self.window_backward 104 | if backward > 0: 105 | x[:, :, :backward] = mask_value 106 | ahead = last_attended + self.window_ahead 107 | if ahead < x.size(-1): 108 | x[:, :, ahead:] = mask_value 109 | 110 | # softmax over last dim 111 | # (B, tgt_len, src_len) 112 | sz = 
x.size() 113 | x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1) 114 | x = x.view(sz) 115 | attn_scores = x 116 | 117 | x = F.dropout(x, p=self.dropout, training=self.training) 118 | 119 | x = torch.bmm(x, values) 120 | 121 | # scale attention output 122 | s = values.size(1) 123 | x = x * (s * math.sqrt(1.0 / s)) 124 | 125 | # project back 126 | x = self.out_projection(x) 127 | x = (x + residual) * math.sqrt(0.5) 128 | return x, attn_scores 129 | 130 | 131 | class Decoder(nn.Module): 132 | def __init__(self, embed_dim, n_speakers, speaker_embed_dim, in_dim=80, r=5, max_positions=512, 133 | preattention=((128, 5, 1),) * 4, convolutions=((128, 5, 1),) * 4, attention=True, dropout=0.1, 134 | use_memory_mask=False, force_monotonic_attention=False, query_position_rate=1.0, 135 | key_position_rate=1.29, window_ahead=3, window_backward=1): 136 | super(Decoder, self).__init__() 137 | self.dropout = dropout 138 | self.in_dim = in_dim 139 | self.r = r 140 | self.query_position_rate = query_position_rate 141 | self.key_position_rate = key_position_rate 142 | 143 | # Position encodings for query (decoder states) and keys (encoder states) 144 | self.embed_query_positions = SinusoidalEncoding( 145 | max_positions, convolutions[0][0]) 146 | self.embed_keys_positions = SinusoidalEncoding( 147 | max_positions, embed_dim) 148 | # Used for compute multiplier for positional encodings 149 | self.speaker_proj1, self.speaker_proj2 = None, None 150 | 151 | # Prenet: causal convolution blocks 152 | self.preattention = nn.ModuleList() 153 | in_channels = in_dim * r 154 | std_mul = 1.0 155 | for out_channels, kernel_size, dilation in preattention: 156 | if in_channels != out_channels: 157 | # Conv1d + ReLU 158 | self.preattention.append( 159 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 160 | dilation=1, std_mul=std_mul)) 161 | self.preattention.append(nn.ReLU(inplace=True)) 162 | in_channels = out_channels 163 | std_mul = 2.0 164 | self.preattention.append( 165 | Conv1dGLU(n_speakers, speaker_embed_dim, 166 | in_channels, out_channels, kernel_size, causal=True, 167 | dilation=dilation, dropout=dropout, std_mul=std_mul, 168 | residual=True)) 169 | in_channels = out_channels 170 | std_mul = 4.0 171 | 172 | # Causal convolution blocks + attention layers 173 | self.convolutions = nn.ModuleList() 174 | self.attention = nn.ModuleList() 175 | 176 | for i, (out_channels, kernel_size, dilation) in enumerate(convolutions): 177 | if in_channels != out_channels: 178 | raise AssertionError("Expected \033[1;31m\033[m to be equal to \033[1:31m[m, but was not") 179 | self.convolutions.append( 180 | Conv1dGLU(n_speakers, speaker_embed_dim, 181 | in_channels, out_channels, kernel_size, causal=True, 182 | dilation=dilation, dropout=dropout, std_mul=std_mul, 183 | residual=False)) 184 | self.attention.append( 185 | AttentionLayer(out_channels, embed_dim, dropout=dropout, window_ahead=window_ahead, 186 | window_backward=window_backward) 187 | if attention[i] else None) 188 | in_channels = out_channels 189 | std_mul = 4.0 190 | # Last 1x1 convolution 191 | self.last_conv = conv1d(in_channels, in_dim * r, kernel_size=1, 192 | padding=0, dilation=1, std_mul=std_mul, 193 | dropout=dropout) 194 | 195 | # Mel-spectrogram (before sigmoid) -> Done binary flag 196 | self.fc = linear(in_dim * r, 1) 197 | 198 | self.max_decoder_steps = 200 199 | self.min_decoder_steps = 10 200 | self.use_memory_mask = use_memory_mask 201 | self.force_monotonic_attention = [force_monotonic_attention] * len(convolutions) 202 | 203 | def 
forward(self, encoder_out, inputs=None,
204 |                 text_positions=None, frame_positions=None,
205 |                 speaker_embed=None, lengths=None):
206 |         if inputs is None:
207 |             if text_positions is None:
208 |                 raise AssertionError("Expected text_positions to be not None, but it was None")
209 |             self.start_fresh_sequence()
210 |             outputs = self.incremental_forward(encoder_out, text_positions)
211 |             return outputs
212 | 
213 |         # Grouping multiple frames if necessary
214 | 
215 |     def incremental_forward(self, encoder_out, text_positions, initial_input=None, test_inputs=None):
216 |         keys, values = encoder_out
217 |         b = keys.size(0)
218 | 
219 |         # position encodings
220 |         w = self.key_position_rate
221 |         text_pos_embed = self.embed_keys_positions(text_positions, w)
222 |         keys = keys + text_pos_embed
223 | 
224 |         # transpose only once to speed up attention layers
225 |         keys = keys.transpose(1, 2).contiguous()
226 | 
227 |         decoder_states = []
228 |         outputs = []
229 |         alignments = []
230 |         dones = []
231 |         # initially set to zeros
232 |         last_attended = [None] * len(self.attention)
233 |         for idx, v in enumerate(self.force_monotonic_attention):
234 |             last_attended[idx] = 0 if v else None
235 | 
236 |         num_attention_layers = sum([layer is not None for layer in self.attention])
237 |         t = 0
238 |         if initial_input is None:
239 |             initial_input = keys.data.new(b, 1, self.in_dim * self.r).zero_()
240 |         current_input = initial_input
241 |         while True:
242 |             # frame pos start with 1.
243 |             frame_pos = keys.data.new(b, 1).fill_(t + 1).long()
244 |             w = self.query_position_rate
245 |             frame_pos_embed = self.embed_query_positions(frame_pos, w)
246 | 
247 |             if t > 0:
248 |                 current_input = outputs[-1]
249 |             output_tensor = current_input
250 |             output_tensor = F.dropout(output_tensor, p=self.dropout, training=self.training)
251 | 
252 |             # Prenet
253 |             for f in self.preattention:
254 |                 if isinstance(f, Conv1dGLU):
255 |                     output_tensor = f.incremental_forward(output_tensor)
256 |                 else:
257 |                     try:
258 |                         output_tensor = f.incremental_forward(output_tensor)
259 |                     except AttributeError:
260 |                         output_tensor = f(output_tensor)
261 | 
262 |             # Causal convolutions + Multi-hop attentions
263 |             ave_alignment = None
264 |             for idx, (f, attention) in enumerate(zip(self.convolutions,
265 |                                                      self.attention)):
266 |                 residual = output_tensor
267 |                 if isinstance(f, Conv1dGLU):
268 |                     output_tensor = f.incremental_forward(output_tensor)
269 | 
270 |                 if attention is not None:
271 | 
272 |                     if isinstance(f, Conv1dGLU) is False:
273 |                         raise AssertionError()
274 |                     output_tensor = output_tensor + frame_pos_embed
275 |                     output_tensor, alignment = attention(output_tensor, (keys, values),
276 |                                                          last_attended=last_attended[idx])
277 |                     if self.force_monotonic_attention[idx]:
278 |                         last_attended[idx] = alignment.max(-1)[1].view(-1).data[0]
279 |                     if ave_alignment is None:
280 |                         ave_alignment = alignment
281 |                     else:
282 |                         ave_alignment = ave_alignment + alignment  # accumulate per-layer alignments; averaged below
283 | 
284 |                 # residual
285 |                 if isinstance(f, Conv1dGLU):
286 |                     output_tensor = (output_tensor + residual) * math.sqrt(0.5)
287 | 
288 |             decoder_state = output_tensor
289 |             output_tensor = self.last_conv.incremental_forward(output_tensor)
290 |             ave_alignment = ave_alignment.div_(num_attention_layers)
291 | 
292 |             # Output & done flag predictions
293 |             output = F.sigmoid(output_tensor)
294 |             done = F.sigmoid(self.fc(output_tensor))
295 | 
296 |             decoder_states += [decoder_state]
297 |             outputs += [output]
298 |             alignments += [ave_alignment]
299 |             dones += [done]
300 | 
301 |             t += 1
302 |             if test_inputs is None:
303 |                 if (done > 
0.5).all() and t > self.min_decoder_steps: 304 | break 305 | 306 | # Remove 1-element time axis 307 | alignments = list(map(lambda x: x.squeeze(1), alignments)) 308 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states)) 309 | outputs = list(map(lambda x: x.squeeze(1), outputs)) 310 | 311 | # Combine outputs for all time steps 312 | alignments = torch.stack(alignments).transpose(0, 1) 313 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous() 314 | outputs = torch.stack(outputs).transpose(0, 1).contiguous() 315 | 316 | return outputs, alignments, dones, decoder_states 317 | 318 | def start_fresh_sequence(self): 319 | _clear_modules(self.preattention) 320 | _clear_modules(self.convolutions) 321 | self.last_conv.clear_buffer() 322 | 323 | 324 | def _clear_modules(modules): 325 | for m in modules: 326 | try: 327 | m.clear_buffer() 328 | except AttributeError: 329 | pass 330 | 331 | 332 | class Converter(nn.Module): 333 | def __init__(self, n_speakers, speaker_embed_dim, in_dim, out_dim, convolutions=((256, 5, 1),) * 4, dropout=0.1): 334 | super(Converter, self).__init__() 335 | self.dropout = dropout 336 | self.in_dim = in_dim 337 | self.out_dim = out_dim 338 | self.n_speakers = n_speakers 339 | 340 | # Non causual convolution blocks 341 | in_channels = convolutions[0][0] 342 | # Idea from nyanko 343 | self.convolutions = nn.ModuleList([ 344 | conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1, 345 | std_mul=1.0), 346 | conv_transpose1d(in_channels, in_channels, kernel_size=2, 347 | padding=0, stride=2, std_mul=1.0), 348 | Conv1dGLU(n_speakers, speaker_embed_dim, 349 | in_channels, in_channels, kernel_size=3, causal=False, 350 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 351 | Conv1dGLU(n_speakers, speaker_embed_dim, 352 | in_channels, in_channels, kernel_size=3, causal=False, 353 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 354 | conv_transpose1d(in_channels, in_channels, kernel_size=2, 355 | padding=0, stride=2, std_mul=4.0), 356 | Conv1dGLU(n_speakers, speaker_embed_dim, 357 | in_channels, in_channels, kernel_size=3, causal=False, 358 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 359 | Conv1dGLU(n_speakers, speaker_embed_dim, 360 | in_channels, in_channels, kernel_size=3, causal=False, 361 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 362 | ]) 363 | 364 | std_mul = 4.0 365 | for (out_channels, kernel_size, dilation) in convolutions: 366 | if in_channels != out_channels: 367 | self.convolutions.append( 368 | conv1d(in_channels, out_channels, kernel_size=1, padding=0, 369 | dilation=1, std_mul=std_mul)) 370 | self.convolutions.append(nn.ReLU(inplace=True)) 371 | in_channels = out_channels 372 | std_mul = 2.0 373 | self.convolutions.append( 374 | Conv1dGLU(n_speakers, speaker_embed_dim, 375 | in_channels, out_channels, kernel_size, causal=False, 376 | dilation=dilation, dropout=dropout, std_mul=std_mul, 377 | residual=True)) 378 | in_channels = out_channels 379 | std_mul = 4.0 380 | # Last 1x1 convolution 381 | self.convolutions.append(conv1d(in_channels, out_dim, kernel_size=1, 382 | padding=0, dilation=1, std_mul=std_mul, 383 | dropout=dropout)) 384 | 385 | def forward(self, x, speaker_embed=None): 386 | if self.n_speakers != 1 and speaker_embed == None: 387 | raise AssertionError("Expected \033[1;31m\033[m to be 1 or \033[1:31m[m to be not None, but was not") 388 | speaker_embed_btc = None 389 | # Generic case: B x T x C -> B x C x T 390 | x = x.transpose(1, 2) 391 | for f in 
self.convolutions: 392 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 393 | # Back to B x T x C 394 | x = x.transpose(1, 2) 395 | 396 | return F.sigmoid(x) 397 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/frontend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import nltk 4 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.symbols import symbols 5 | 6 | 7 | N_VOCAB = len(symbols) 8 | n_vocab = N_VOCAB 9 | 10 | try: 11 | _ARPHABET = nltk.corpus.cmudict.dict() 12 | except BaseException: 13 | nltk.download("cmudict") 14 | _ARPHABET = nltk.corpus.cmudict.dict() 15 | 16 | 17 | def _maybe_get_arpabet(word, pro): 18 | try: 19 | phonemes = _ARPHABET[word][0] 20 | phonemes = " ".join(phonemes) 21 | except KeyError: 22 | return word 23 | 24 | return '{%s}' % phonemes if ord(os.urandom(1)) < pro else word 25 | 26 | 27 | def mix_pronunciation(text, pro): 28 | text = ' '.join(_maybe_get_arpabet(word, pro) for word in text.split(' ')) 29 | return text 30 | 31 | 32 | def text_to_sequence(text, p=0.0): 33 | pro = p 34 | if pro >= 0: 35 | text = mix_pronunciation(text, pro) 36 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.text import text_to_sequence 37 | text = text_to_sequence(text, ["english_cleaners"]) 38 | return text 39 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from torch import nn 4 | 5 | 6 | class MultiSpeakerTTSModel(nn.Module): 7 | """Attention seq2seq model + post processing network 8 | """ 9 | 10 | def __init__(self, seq2seq, postnet, mel_dim=80, linear_dim=513, n_speakers=1, speaker_embed_dim=16, 11 | trainable_positional_encodings=False, use_decoder_state_for_postnet_input=False, 12 | freeze_embedding=False): 13 | super(MultiSpeakerTTSModel, self).__init__() 14 | self.seq2seq = seq2seq 15 | self.postnet = postnet # referred as "Converter" in DeepVoice3 16 | self.mel_dim = mel_dim 17 | self.linear_dim = linear_dim 18 | self.trainable_positional_encodings = trainable_positional_encodings 19 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input 20 | self.freeze_embedding = freeze_embedding 21 | 22 | self.n_speakers = n_speakers 23 | self.speaker_embed_dim = speaker_embed_dim 24 | 25 | def make_generation_fast_(self): 26 | 27 | def remove_weight_norm(m): 28 | try: 29 | nn.utils.remove_weight_norm(m) 30 | except ValueError: # this module didn't have weight norm 31 | return 32 | self.apply(remove_weight_norm) 33 | 34 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None, 35 | text_positions=None, frame_positions=None, input_lengths=None): 36 | b = text_sequences.size(0) 37 | 38 | speaker_embed = None 39 | 40 | # Apply seq2seq 41 | # (B, T//r, mel_dim*r) 42 | mel_outputs, alignments, done, decoder_states = self.seq2seq( 43 | text_sequences, mel_targets, speaker_embed, 44 | text_positions, frame_positions, input_lengths) 45 | 46 | # Reshape 47 | # (B, T, mel_dim) 48 | mel_outputs = mel_outputs.view(b, -1, self.mel_dim) 49 | 50 | # Prepare postnet inputs 51 | postnet_inputs = decoder_states.view(b, mel_outputs.size(1), -1) 52 | 53 | # (B, T, linear_dim) 54 | # Convert coarse mel-spectrogram (or decoder hidden states) 
to 55 | # high resolution spectrogram 56 | linear_outputs = self.postnet(postnet_inputs, speaker_embed) 57 | 58 | if linear_outputs.size(-1) != self.linear_dim: 59 | raise AssertionError() 60 | return mel_outputs, linear_outputs, alignments, done 61 | 62 | 63 | class AttentionSeq2Seq(nn.Module): 64 | """Encoder + Decoder with attention 65 | """ 66 | 67 | def __init__(self, encoder, decoder): 68 | super(AttentionSeq2Seq, self).__init__() 69 | self.encoder = encoder 70 | self.decoder = decoder 71 | if isinstance(self.decoder.attention, nn.ModuleList): 72 | self.encoder.num_attention_layers = sum( 73 | [layer is not None for layer in decoder.attention]) 74 | 75 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None, 76 | text_positions=None, frame_positions=None, input_lengths=None): 77 | # (B, T, text_embed_dim) 78 | encoder_outputs = self.encoder( 79 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed) 80 | 81 | # Mel: (B, T//r, mel_dim*r) 82 | # Alignments: (N, B, T_target, T_input) 83 | # Done: (B, T//r, 1) 84 | mel_outputs, alignments, done, decoder_states = self.decoder( 85 | encoder_outputs, mel_targets, 86 | text_positions=text_positions, frame_positions=frame_positions, 87 | speaker_embed=speaker_embed, lengths=input_lengths) 88 | 89 | return mel_outputs, alignments, done, decoder_states 90 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import math 6 | import numpy as np 7 | from torch.nn import functional as F 8 | 9 | 10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0): 11 | """Init the sinusoid position encoding table """ 12 | 13 | # keep dim 0 for padding token position encoding zero vector 14 | position_enc = np.array([ 15 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)] 16 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 17 | 18 | position_enc = torch.from_numpy(position_enc).float() 19 | 20 | return position_enc 21 | 22 | 23 | def sinusoidal_encode(x, w): 24 | y = w * x 25 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone()) 26 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone()) 27 | return y 28 | 29 | 30 | class SinusoidalEncoding(nn.Embedding): 31 | 32 | def __init__(self, num_embeddings, embedding_dim, 33 | *args, **kwargs): 34 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim, 35 | padding_idx=0, 36 | *args, **kwargs) 37 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim, position_rate=1.0) 38 | 39 | def forward(self, x, w=1.0): 40 | isscaler = np.isscalar(w) 41 | if self.padding_idx is None: 42 | raise AssertionError() 43 | 44 | if isscaler or w.size(0) == 1: 45 | weight = sinusoidal_encode(self.weight, w) 46 | return F.embedding( 47 | x, weight, self.padding_idx, self.max_norm, 48 | self.norm_type, self.scale_grad_by_freq, self.sparse) 49 | 50 | 51 | def linear(in_features, out_features, dropout=0): 52 | """Weight-normalized Linear layer (input: N x T x C)""" 53 | m = nn.Linear(in_features, out_features) 54 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) 55 | m.bias.data.zero_() 56 | return nn.utils.weight_norm(m) 57 | 58 | 59 | def embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 60 | m = nn.Embedding(num_embeddings, 
embedding_dim, padding_idx=padding_idx) 61 | m.weight.data.normal_(0, std) 62 | return m 63 | 64 | def m_modification(m, in_channels, dropout, std_mul): 65 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 66 | m.weight.data.normal_(mean=0, std=std) 67 | m.bias.data.zero_() 68 | return m 69 | 70 | def conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 71 | from .conv import Conv1d 72 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs) 73 | m = m_modification(m,in_channels,dropout,std_mul) 74 | return nn.utils.weight_norm(m) 75 | 76 | 77 | def conv_transpose1d(in_channels, out_channels, kernel_size, dropout=0, 78 | std_mul=1.0, **kwargs): 79 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs) 80 | m = m_modification(m,in_channels,dropout,std_mul) 81 | return nn.utils.weight_norm(m) 82 | 83 | 84 | class Conv1dGLU(nn.Module): 85 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding 86 | """ 87 | 88 | def __init__(self, n_speakers, speaker_embed_dim, 89 | in_channels, out_channels, kernel_size, 90 | dropout, padding=None, dilation=1, causal=False, residual=False, 91 | *args, **kwargs): 92 | super(Conv1dGLU, self).__init__() 93 | self.dropout = dropout 94 | self.residual = residual 95 | if padding is None: 96 | # no future time stamps available 97 | if causal: 98 | padding = (kernel_size - 1) * dilation 99 | else: 100 | padding = (kernel_size - 1) // 2 * dilation 101 | self.causal = causal 102 | 103 | self.conv = conv1d(in_channels, 2 * out_channels, kernel_size, 104 | dropout=dropout, padding=padding, dilation=dilation, 105 | *args, **kwargs) 106 | if n_speakers > 1: 107 | self.speaker_proj = linear(speaker_embed_dim, out_channels) 108 | else: 109 | self.speaker_proj = None 110 | 111 | def forward(self, x, speaker_embed=None): 112 | return self._forward(x, False) 113 | 114 | def incremental_forward(self, x): 115 | return self._forward(x, True) 116 | 117 | def _forward(self, x, is_incremental): 118 | residual = x 119 | x = F.dropout(x, p=self.dropout, training=self.training) 120 | if is_incremental: 121 | splitdim = -1 122 | x = self.conv.incremental_forward(x, ) 123 | else: 124 | splitdim = 1 125 | x = self.conv(x) 126 | # remove future time steps 127 | x = x[:, :, :residual.size(-1)] if self.causal else x 128 | 129 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 130 | x = a * F.sigmoid(b) 131 | return (x + residual) * math.sqrt(0.5) if self.residual else x 132 | 133 | def clear_buffer(self): 134 | self.conv.clear_buffer() 135 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/__init__.py -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. 
"english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | 15 | from unidecode import unidecode 16 | 17 | from .numbers import normalize_numbers 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def add_punctuation(text): 68 | if text[-1] not in '!,.:;?': 69 | text = text + '.' # without this decoder is confused when to output EOS 70 | return text 71 | 72 | 73 | def english_cleaners(text): 74 | """Pipeline for English text, including number and abbreviation expansion.""" 75 | text = convert_to_ascii(text) 76 | text = add_punctuation(text) 77 | text = lowercase(text) 78 | text = expand_numbers(text) 79 | text = expand_abbreviations(text) 80 | text = collapse_whitespace(text) 81 | return text 82 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/cmudict.py: -------------------------------------------------------------------------------- 1 | valid_symbols = [ 2 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 3 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 4 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 5 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 6 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 7 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 8 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 9 | ] 10 | 11 | _valid_symbol_set = set(valid_symbols) 12 | 13 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | _inflect = inflect.engine() 7 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 8 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 9 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 10 | 
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 13 | 14 | 15 | def _remove_commas(m): 16 | return m.group(1).replace(',', '') 17 | 18 | 19 | def _expand_decimal_point(m): 20 | return m.group(1).replace('.', ' point ') 21 | 22 | 23 | def _expand_dollars(m): 24 | match = m.group(1) 25 | parts = match.split('.') 26 | if len(parts) > 2: 27 | return match + ' dollars' # Unexpected format 28 | dollars = int(parts[0]) if parts[0] else 0 29 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 30 | if dollars and cents: 31 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 32 | cent_unit = 'cent' if cents == 1 else 'cents' 33 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 34 | elif dollars: 35 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 36 | return '%s %s' % (dollars, dollar_unit) 37 | elif cents: 38 | cent_unit = 'cent' if cents == 1 else 'cents' 39 | return '%s %s' % (cents, cent_unit) 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if 10000 > num > 1000: 49 | if num % 100 == 0: 50 | return _inflect.number_to_words(num // 100) + ' hundred' 51 | else: 52 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 53 | else: 54 | return _inflect.number_to_words(num, andword='') 55 | 56 | 57 | def normalize_numbers(text): 58 | text = re.sub(_comma_number_re, _remove_commas, text) 59 | text = re.sub(_pounds_re, r'\1 pounds', text) 60 | text = re.sub(_dollars_re, _expand_dollars, text) 61 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 62 | text = re.sub(_ordinal_re, _expand_ordinal, text) 63 | text = re.sub(_number_re, _expand_number, text) 64 | return text 65 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/text/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text import cleaners 4 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch.text.symbols import symbols 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | """ 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def _clean_text(text, cleaner_names): 45 | for name in cleaner_names: 46 | cleaner = getattr(cleaners, name) 47 | text = cleaner(text) 48 | return text 49 | 50 | 51 | def _symbols_to_sequence(symbols): 52 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 53 | 54 | 55 | def _arpabet_to_sequence(text): 56 | return _symbols_to_sequence(['@' + s for s in text.split()]) 57 | 58 | 59 | def _should_keep_symbol(s): 60 | return s in _symbol_to_id and s is not '_' and s is not '~' 61 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/deepvoice3_pytorch/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.6+7a10ac6' 2 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_generator/voice_generator.py: -------------------------------------------------------------------------------- 1 | import lws 2 | 3 | 4 | import librosa 5 | import numpy as np 6 | import torch 7 | from scipy import signal 8 | 9 | from autokeras_pretrained.constant import Constant 10 | from autokeras_pretrained.base import Pretrained 11 | from autokeras_pretrained.voice_generator.deepvoice3_pytorch import frontend, builder 12 | 13 | 14 | # NOTE: If you want full control for model architecture. please take a look 15 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 
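# For example, end-to-end synthesis with the pretrained checkpoint looks roughly
# like this (a sketch, assuming the default checkpoint download succeeds and an
# output path is supplied):
#
#     from autokeras_pretrained.voice_generator.voice_generator import VoiceGenerator
#
#     generator = VoiceGenerator()   # downloads and loads the checkpoint
#     generator.predict('The quick brown fox jumps over the lazy dog.',
#                       path='generated.wav')   # writes a 22050 Hz mono wav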
16 | 17 | # Default hyperparameters: 18 | 19 | class Hparams: 20 | name = "deepvoice3" 21 | 22 | # Text: 23 | # [en jp] 24 | frontend = 'en' 25 | 26 | # Replace words to its pronunciation with fixed probability. 27 | # e.g. 'hello' to 'HH AH0 L OW1' 28 | # [en jp] 29 | # en: Word -> pronunciation using CMUDict 30 | # jp: Word -> pronounciation usnig MeCab 31 | # [0 ~ 1.0]: 0 means no replacement happens. 32 | replace_pronunciation_prob = 0.5 33 | 34 | # Convenient model builder 35 | # Definitions can be found at deepvoice3_pytorch/builder.py 36 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654 37 | builder = "deepvoice3" 38 | 39 | # Must be configured depends on the dataset and model you use 40 | n_speakers = 1 41 | speaker_embed_dim = 16 42 | 43 | # Audio: 44 | num_mels = 80 45 | fmin = 125 46 | fmax = 7600 47 | fft_size = 1024 48 | hop_size = 256 49 | sample_rate = 22050 50 | preemphasis = 0.97 51 | min_level_db = -100 52 | ref_level_db = 20 53 | # whether to rescale waveform or not. 54 | # Let x is an input waveform rescaled waveform y is given by: 55 | # y = x / np.abs(x).max() * rescaling_max 56 | rescaling = False 57 | rescaling_max = 0.999 58 | # mel-spectrogram is normalized to [0 1] for each utterance and clipping may 59 | # happen depends on min_level_db and ref_level_db causing clipping noise. 60 | # If False assertion is added to ensure no clipping happens. 61 | allow_clipping_in_normalization = True 62 | 63 | # Model: 64 | downsample_step = 4 # must be 4 when builder="nyanko" 65 | outputs_per_step = 1 # must be 1 when builder="nyanko" 66 | embedding_weight_std = 0.1 67 | speaker_embedding_weight_std = 0.01 68 | padding_idx = 0 69 | # Maximum number of input text length 70 | # try setting larger value if you want to give very long text input 71 | max_positions = 512 72 | dropout = 1 - 0.95 73 | kernel_size = 3 74 | text_embed_dim = 128 75 | encoder_channels = 256 76 | decoder_channels = 256 77 | # Note: large converter channels requires significant computational cost 78 | converter_channels = 256 79 | query_position_rate = 1.0 80 | # can be computed by `compute_timestamp_ratio.py`. 81 | key_position_rate = 1.385 # 2.37 for jsut 82 | key_projection = False 83 | value_projection = False 84 | use_memory_mask = True 85 | trainable_positional_encodings = False 86 | freeze_embedding = False 87 | # If True use decoder's internal representation for postnet inputs 88 | # otherwise use mel-spectrogram. 89 | use_decoder_state_for_postnet_input = True 90 | 91 | # Data loader 92 | pin_memory = True 93 | num_workers = 2 # Set it to 1 when in Windows (MemoryError THAllocator.c 0x5) 94 | 95 | # Loss 96 | masked_loss_weight = 0.5 # (1-w)*loss + w * masked_loss 97 | priority_freq = 3000 # heuristic: priotrize [0 ~ priotiry_freq] for linear loss 98 | priority_freq_weight = 0.0 # (1-w)*linear_loss + w*priority_linear_loss 99 | # https://arxiv.org/pdf/1710.08969.pdf 100 | # Adding the divergence to the loss stabilizes training expecially for 101 | # very deep (> 10 layers) networks. 102 | # Binary div loss seems has approx 10x scale compared to L1 loss so I choose 0.1. 
103 | binary_divergence_weight = 0.1 # set 0 to disable 104 | use_guided_attention = True 105 | guided_attention_sigma = 0.2 106 | 107 | # Training: 108 | batch_size = 16 109 | adam_beta1 = 0.5 110 | adam_beta2 = 0.9 111 | adam_eps = 1e-6 112 | amsgrad = False 113 | initial_learning_rate = 5e-4 # 0.001 114 | lr_schedule = "noam_learning_rate_decay" 115 | lr_schedule_kwargs = {} 116 | nepochs = 2000 117 | weight_decay = 0.0 118 | clip_thresh = 0.1 119 | 120 | # Save 121 | checkpoint_interval = 10000 122 | eval_interval = 10000 123 | save_optimizer_state = True 124 | 125 | # Eval: 126 | # this can be list for multple layers of attention 127 | # e.g. [True False False False True] 128 | force_monotonic_attention = True 129 | # Attention constraint for incremental decoding 130 | window_ahead = 3 131 | # 0 tends to prevent word repretetion but sometime causes skip words 132 | window_backward = 1 133 | power = 1.4 # Power to raise magnitudes to prior to phase retrieval 134 | 135 | # GC: 136 | # Forced garbage collection probability 137 | # Use only when MemoryError continues in Windows (Disabled by default) 138 | # gc_probability = 0.001 139 | 140 | # json_meta mode only 141 | # 0: "use all" 142 | # 1: "ignore only unmatched_alignment" 143 | # 2: "fully ignore recognition" 144 | ignore_recognition_level = 2 145 | # when dealing with non-dedicated speech dataset(e.g. movie excerpts) setting min_text above 15 is desirable. 146 | # Can be adjusted by dataset. 147 | min_text = 20 148 | # if true data without phoneme alignment file(.lab) will be ignored 149 | process_only_htk_aligned = False 150 | 151 | 152 | fs = Hparams.sample_rate 153 | global_step = 0 154 | global_epoch = 0 155 | 156 | 157 | def build_model(): 158 | model = getattr(builder, Hparams.builder)( 159 | n_speakers=Hparams.n_speakers, 160 | speaker_embed_dim=Hparams.speaker_embed_dim, 161 | n_vocab=frontend.n_vocab, 162 | embed_dim=Hparams.text_embed_dim, 163 | mel_dim=Hparams.num_mels, 164 | linear_dim=Hparams.fft_size // 2 + 1, 165 | r=Hparams.outputs_per_step, 166 | padding_idx=Hparams.padding_idx, 167 | dropout=Hparams.dropout, 168 | kernel_size=Hparams.kernel_size, 169 | encoder_channels=Hparams.encoder_channels, 170 | decoder_channels=Hparams.decoder_channels, 171 | converter_channels=Hparams.converter_channels, 172 | use_memory_mask=Hparams.use_memory_mask, 173 | trainable_positional_encodings=Hparams.trainable_positional_encodings, 174 | force_monotonic_attention=Hparams.force_monotonic_attention, 175 | use_decoder_state_for_postnet_input=Hparams.use_decoder_state_for_postnet_input, 176 | max_positions=Hparams.max_positions, 177 | freeze_embedding=Hparams.freeze_embedding, 178 | window_ahead=Hparams.window_ahead, 179 | window_backward=Hparams.window_backward 180 | ) 181 | return model 182 | 183 | 184 | def inv_preemphasis(x, coef=Hparams.preemphasis): 185 | """Inverse operation of pre-emphasis 186 | 187 | Args: 188 | x (1d-array): Input signal. 189 | coef (float): Pre-emphasis coefficient. 190 | 191 | Returns: 192 | array: Output filtered signal. 
193 | 
194 |     See also:
195 |         :func:`preemphasis`
196 |     """
197 |     b = np.array([1.], x.dtype)
198 |     a = np.array([1., -coef], x.dtype)
199 |     return signal.lfilter(b, a, x)
200 | 
201 | 
202 | def inv_spectrogram(spectrogram):
203 |     """Convert a normalized linear spectrogram back to a waveform using LWS phase reconstruction."""
204 |     S = _db_to_amp(_denormalize(spectrogram) + Hparams.ref_level_db)  # Convert back to linear
205 |     processor = _lws_processor()
206 |     D = processor.run_lws(S.astype(np.float64).T ** Hparams.power)
207 |     y = processor.istft(D).astype(np.float32)
208 |     return inv_preemphasis(y)
209 | 
210 | 
211 | def _lws_processor():
212 |     return lws.lws(Hparams.fft_size, Hparams.hop_size, mode="speech")
213 | 
214 | 
215 | _mel_basis = None
216 | 
217 | 
218 | def _db_to_amp(x):
219 |     return np.power(10.0, x * 0.05)
220 | 
221 | 
222 | def _denormalize(S):
223 |     return (np.clip(S, 0, 1) * -Hparams.min_level_db) + Hparams.min_level_db
224 | 
225 | 
226 | class VoiceGenerator(Pretrained):
227 |     def __init__(self, **kwargs):
228 |         super().__init__(**kwargs)
229 |         self.sample_rate = 0
230 |         self.hop_length = 0
231 |         self.sample_rate = Hparams.sample_rate
232 |         self.hop_length = Hparams.hop_size
233 | 
234 |         self.model = self.load_checkpoint()
235 |         self.model.to(self.device)
236 | 
237 |     @property
238 |     def _google_drive_files(self):
239 |         return Constant.VOICE_GENERATOR_MODELS
240 | 
241 |     def load_checkpoint(self):
242 |         global global_step
243 |         global global_epoch
244 | 
245 |         model = build_model()
246 |         print("Load checkpoint from: {}".format(self.local_paths[0]))
247 |         if self.device.startswith("cuda"):
248 |             checkpoint = torch.load(self.local_paths[0])
249 |         else:
250 |             checkpoint = torch.load(self.local_paths[0], map_location=lambda storage, loc: storage)
251 |         model.load_state_dict(checkpoint["state_dict"])
252 |         global_step = checkpoint["global_step"]
253 |         global_epoch = checkpoint["global_epoch"]
254 | 
255 |         return model
256 | 
257 |     def predict(self, text, path=None):
258 |         waveform, alignment, spectrogram, _ = self.tts(text)
259 |         if path is None:
260 |             raise AssertionError('Please provide the output file path.')
261 |         librosa.output.write_wav(path, waveform, self.sample_rate)
262 | 
263 |     def tts(self, text, p=0, speaker_id=None, fast=True):
264 |         """Convert text to speech waveform given a deepvoice3 model.
265 | 
266 |         Args:
267 |             speaker_id (int, optional): Speaker index for multi-speaker models; None selects the default single speaker.
268 |             fast (bool): If True, call make_generation_fast_() (removes weight normalization) before decoding.
269 |             text (str) : Input text to be synthesized
270 |             p (float) : Replace a word with its pronunciation if p > 0. Default is 0. 
271 | """ 272 | self.model.eval() 273 | if fast: 274 | self.model.make_generation_fast_() 275 | 276 | sequence = np.array(frontend.text_to_sequence(text, p=p)) 277 | sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(self.device) 278 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(self.device) 279 | speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(self.device) 280 | 281 | # Greedy decoding 282 | with torch.no_grad(): 283 | mel_outputs, linear_outputs, alignments, _ = self.model( 284 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 285 | 286 | linear_output = linear_outputs[0].cpu().data.numpy() 287 | spectrogram = _denormalize(linear_output) 288 | alignment = alignments[0].cpu().data.numpy() 289 | mel = mel_outputs[0].cpu().data.numpy() 290 | mel = _denormalize(mel) 291 | 292 | # Predicted audio signal 293 | waveform = inv_spectrogram(linear_output.T) 294 | 295 | return waveform, alignment, spectrogram, mel 296 | -------------------------------------------------------------------------------- /autokeras_pretrained/voice_recognizer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | from autokeras_pretrained.base import Pretrained 9 | from autokeras_pretrained.constant import Constant 10 | 11 | supported_rnns = { 12 | 'lstm': nn.LSTM, 13 | 'rnn': nn.RNN, 14 | 'gru': nn.GRU 15 | } 16 | 17 | 18 | class Decoder(object): 19 | """ 20 | Basic decoder class from which all other decoders inherit. Implements several 21 | helper functions. Subclasses should implement the decode() method. 22 | 23 | Arguments: 24 | labels (string): mapping from integers to characters. 25 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 26 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 27 | """ 28 | 29 | def __init__(self, labels, blank_index=0): 30 | # e.g. 
labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 31 | self.labels = labels 32 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) 33 | self.blank_index = blank_index 34 | space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space 35 | if ' ' in labels: 36 | space_index = labels.index(' ') 37 | self.space_index = space_index 38 | 39 | def decode(self, probs): 40 | """ 41 | Given a matrix of character probabilities, returns the decoder's 42 | best guess of the transcription 43 | 44 | Arguments: 45 | probs: Tensor of character probabilities, where probs[c,t] 46 | is the probability of character c at time t 47 | sizes(optional): Size of each sequence in the mini-batch 48 | Returns: 49 | string: sequence of the model's best guess for the transcription 50 | """ 51 | raise NotImplementedError 52 | 53 | 54 | class GreedyDecoder(Decoder): 55 | def __init__(self, labels, blank_index=0): 56 | super(GreedyDecoder, self).__init__(labels, blank_index) 57 | 58 | def convert_to_strings(self, sequences, return_offsets=True): 59 | """Given a list of numeric sequences, returns the corresponding strings""" 60 | strings = [] 61 | offsets = [] 62 | for sequence in sequences: 63 | seq_len = len(sequence) 64 | string, string_offsets = self.process_string(sequence, seq_len) 65 | strings.append([string]) # We only return one path 66 | if return_offsets: 67 | offsets.append([string_offsets]) 68 | return strings, offsets 69 | 70 | def process_string(self, sequence, size): 71 | string = '' 72 | offsets = [] 73 | for i in range(size): 74 | char = self.int_to_char[sequence[i].item()] 75 | if char == self.int_to_char[self.blank_index]: 76 | continue 77 | # if this char is a repetition and remove_repetitions=true, then skip 78 | if i != 0 and char == self.int_to_char[sequence[i - 1].item()]: 79 | continue 80 | if char == self.labels[self.space_index]: 81 | string += ' ' 82 | offsets.append(i) 83 | else: 84 | string = string + char 85 | offsets.append(i) 86 | return string, torch.IntTensor(offsets) 87 | 88 | def decode(self, probs): 89 | """ 90 | Returns the argmax decoding given the probability matrix. Removes 91 | repeated elements in the sequence, as well as blanks. 92 | 93 | Arguments: 94 | probs: Tensor of character probabilities from the network. Expected shape of seq_length x batch x output_dim 95 | sizes(optional): Size of each sequence in the mini-batch 96 | Returns: 97 | strings: sequences of the model's best guess for the transcription on inputs 98 | offsets: time step per character predicted 99 | """ 100 | _, max_probs = torch.max(probs.transpose(0, 1), 2) 101 | strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), 102 | return_offsets=True) 103 | return strings, offsets 104 | 105 | 106 | class SequenceWise(nn.Module): 107 | def __init__(self, module): 108 | """ 109 | Collapses input of dim T*N*H to (T*N)*H, and applies to a module. 110 | Allows handling of variable sequence lengths and minibatch sizes. 111 | :param module: Module to apply input to. 
112 | """ 113 | super(SequenceWise, self).__init__() 114 | self.module = module 115 | 116 | def forward(self, x): 117 | t, n = x.size(0), x.size(1) 118 | x = x.view(t * n, -1) 119 | x = self.module(x) 120 | x = x.view(t, n, -1) 121 | return x 122 | 123 | 124 | class InferenceBatchSoftmax(nn.Module): 125 | def __init__(self): 126 | super(InferenceBatchSoftmax, self).__init__() 127 | 128 | @staticmethod 129 | def forward(input_): 130 | return F.softmax(input_, dim=-1) 131 | 132 | 133 | class BatchRNN(nn.Module): 134 | def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True): 135 | super(BatchRNN, self).__init__() 136 | self.input_size = input_size 137 | self.hidden_size = hidden_size 138 | self.bidirectional = bidirectional 139 | self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None 140 | self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, 141 | bidirectional=bidirectional, bias=False) 142 | self.num_directions = 2 if bidirectional else 1 143 | 144 | def flatten_parameters(self): 145 | self.rnn.flatten_parameters() 146 | 147 | def forward(self, x): 148 | if self.batch_norm is not None: 149 | x = self.batch_norm(x) 150 | x, _ = self.rnn(x) 151 | if self.bidirectional: 152 | x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum 153 | return x 154 | 155 | 156 | class DeepSpeech(nn.Module): 157 | def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, 158 | bidirectional=True): 159 | super(DeepSpeech, self).__init__() 160 | 161 | # model metadata needed for serialization/deserialization 162 | self._version = '0.0.1' 163 | self._hidden_size = rnn_hidden_size 164 | self._hidden_layers = nb_layers 165 | self._rnn_type = rnn_type 166 | self._labels = labels 167 | self._bidirectional = bidirectional 168 | 169 | num_classes = len(self._labels) 170 | 171 | self.conv = nn.Sequential( 172 | nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(0, 10)), 173 | nn.BatchNorm2d(32), 174 | nn.Hardtanh(0, 20, inplace=True), 175 | nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(0, 10)), 176 | nn.BatchNorm2d(32), 177 | nn.Hardtanh(0, 20, inplace=True) 178 | ) 179 | # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 180 | # rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 181 | # rnn_input_size = int(math.floor(rnn_input_size - 41) / 2 + 1) 182 | # rnn_input_size = int(math.floor(rnn_input_size - 21) / 2 + 1) 183 | # rnn_input_size *= 32 184 | rnn_input_size = 672 185 | 186 | rnns = [] 187 | rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 188 | bidirectional=bidirectional, batch_norm=False) 189 | rnns.append(('0', rnn)) 190 | for x in range(nb_layers - 1): 191 | rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type, 192 | bidirectional=bidirectional) 193 | rnns.append(('%d' % (x + 1), rnn)) 194 | self.rnns = nn.Sequential(OrderedDict(rnns)) 195 | fully_connected = nn.Sequential( 196 | nn.BatchNorm1d(rnn_hidden_size), 197 | nn.Linear(rnn_hidden_size, num_classes, bias=False) 198 | ) 199 | self.fc = nn.Sequential( 200 | SequenceWise(fully_connected), 201 | ) 202 | self.inference_softmax = InferenceBatchSoftmax() 203 | 204 | def forward(self, x): 205 | x = self.conv(x) 206 | 207 | sizes = x.size() 208 | x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension 209 | x 
= x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH 210 | 211 | x = self.rnns(x) 212 | 213 | x = self.fc(x) 214 | x = x.transpose(0, 1) 215 | # identity in training mode, softmax in eval mode 216 | x = self.inference_softmax(x) 217 | return x 218 | 219 | @classmethod 220 | def load_model(cls, path, cuda=False): 221 | package = torch.load(path, map_location=lambda storage, loc: storage) 222 | model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], 223 | labels=package['labels'], rnn_type=supported_rnns[package['rnn_type']], 224 | bidirectional=package.get('bidirectional', True)) 225 | # the blacklist parameters are params that were previous erroneously saved by the model 226 | # care should be taken in future versions that if batch_norm on the first rnn is required 227 | # that it be named something else 228 | blacklist = ['rnns.0.batch_norm.module.weight', 'rnns.0.batch_norm.module.bias', 229 | 'rnns.0.batch_norm.module.running_mean', 'rnns.0.batch_norm.module.running_var'] 230 | model.load_state_dict(package['state_dict']) 231 | for x in model.rnns: 232 | x.flatten_parameters() 233 | if cuda: 234 | model = torch.nn.DataParallel(model).cuda() 235 | return model 236 | 237 | 238 | class VoiceRecognizer(Pretrained): 239 | def __init__(self, **kwargs): 240 | super().__init__(**kwargs) 241 | 242 | self.model = self.load_checkpoint() 243 | labels = Constant.VOICE_RECONGINIZER_LABELS 244 | self.decoder = GreedyDecoder(labels, blank_index=labels.index('_')) 245 | 246 | @property 247 | def _google_drive_files(self): 248 | return Constant.VOICE_RECONGINIZER_MODELS 249 | 250 | def load_checkpoint(self): 251 | model = DeepSpeech.load_model(self.local_paths[0], cuda=(self.device == 'cuda')) 252 | model.eval() 253 | return model 254 | 255 | def predict(self, audio_data, audio_path=None): 256 | if audio_data is None: 257 | raise TypeError("audio_data cannot be None") 258 | audio_data = audio_data.view(1, 1, audio_data.size(0), audio_data.size(1)) 259 | with torch.no_grad(): 260 | out = self.model(Variable(audio_data)) 261 | out = out.transpose(0, 1) # TxNxH 262 | decoded_output, _ = self.decoder.decode(out.data) 263 | return decoded_output[0][0] 264 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 
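# '.' tells pip to install this package itself from the repository root; the pinned third-party dependencies are declared in install_requires in setup.py below.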
2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='autokeras-pretrained', 6 | packages=find_packages(exclude=('tests',)), 7 | install_requires=['scipy==1.2.0', 8 | 'torch==1.0.1.post2', 9 | 'torchvision==0.2.1', 10 | 'numpy==1.16.1', 11 | 'scikit-image==0.14.2', 12 | 'imageio==2.5.0', 13 | 'requests==2.21.0', 14 | 'librosa==0.6.2', 15 | 'numba', 16 | 'inflect', 17 | 'unidecode', 18 | 'nltk==3.3', 19 | 'lws==1.2', 20 | 'opencv-python==4.0.0.21', 21 | 'boto3'], 22 | version='0.0.3', 23 | description='Pretrained models for Auto-Keras', 24 | author='DATA Lab at Texas A&M University', 25 | author_email='jhfjhfj1@gmail.com', 26 | url='http://autokeras.com', 27 | download_url='https://github.com/jhfjhfj1/autokeras-pretrained/archive/0.0.3.tar.gz', 28 | keywords=['autokeras', 'keras'], 29 | classifiers=[] 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | TEST_TEMP_AUTO_KERAS_DIR = 'tests/resources/temp/autokeras' 5 | TEST_TEMP_DIR = 'tests/resources/temp' 6 | 7 | 8 | def clean_dir(path): 9 | for f in os.listdir(path): 10 | full_path = os.path.join(path, f) 11 | if f != '.gitkeep': 12 | if os.path.isfile(full_path): 13 | os.remove(full_path) 14 | else: 15 | os.rmdir(full_path) 16 | # def mock_nvidia_smi_output(*arg, **kwargs): 17 | # return \ 18 | # ' Free : 1 MiB \n' \ 19 | # ' Free : 11176 MiB \n' \ 20 | # ' Free : 1 MiB \n' \ 21 | # ' Free : 1 MiB' 22 | -------------------------------------------------------------------------------- /tests/pretrained/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/pretrained/__init__.py -------------------------------------------------------------------------------- /tests/pretrained/test_face_detector.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.face_detector import FaceDetector 2 | import os 3 | 4 | from tests.common import TEST_TEMP_DIR, clean_dir 5 | 6 | 7 | def test_face_detector(): 8 | img_file, out_file = 'tests/resources/images_test/face_detector.jpg', os.path.join(TEST_TEMP_DIR, 'output.jpg') 9 | if os.path.exists(out_file): 10 | os.remove(out_file) 11 | face_detection = FaceDetector() 12 | bboxs1, landmarks1 = face_detection.predict(img_file, out_file) 13 | assert os.path.exists(out_file) 14 | bboxs2, landmarks2 = face_detection.predict(img_file) 15 | assert bboxs1.shape == bboxs2.shape == (11, 5) and landmarks1.shape == landmarks2.shape == (11, 10) 16 | clean_dir(TEST_TEMP_DIR) 17 | -------------------------------------------------------------------------------- /tests/pretrained/test_object_detection.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.object_detector import ObjectDetector 2 | from tests.common import TEST_TEMP_DIR, clean_dir 3 | 4 | 5 | def test_object_detection(): 6 | 
detector = ObjectDetector() 7 | img_path = 'tests/resources/images_test/od.JPG' 8 | result = detector.predict(img_path, TEST_TEMP_DIR) 9 | assert isinstance(result, list) 10 | clean_dir(TEST_TEMP_DIR) 11 | -------------------------------------------------------------------------------- /tests/pretrained/test_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.text_classifier import SentimentAnalysis 2 | 3 | 4 | def test_sentiment_analysis(): 5 | sentiment_analyzer = SentimentAnalysis() 6 | 7 | positive_polarity = sentiment_analyzer.predict("The model is working really well.") 8 | if positive_polarity <= 0.5: 9 | raise AssertionError() 10 | 11 | negative_polarity = sentiment_analyzer.predict("The university intake has reduced drastically this year.") 12 | if negative_polarity >= 0.5: 13 | raise AssertionError() 14 | -------------------------------------------------------------------------------- /tests/pretrained/test_topic_classifier.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained.text_classifier import TopicClassifier 2 | 3 | 4 | def test_topic_classifier(): 5 | topic_classifier = TopicClassifier() 6 | 7 | topic_name = topic_classifier.predict( 8 | "Risk mitigation is the pursuit of opportunities where the potential upside is far greater than the potential " 9 | "downside", ) 10 | 11 | if topic_name != "Business": 12 | raise AssertionError() 13 | 14 | topic_name = topic_classifier.predict( 15 | "With a tap on the screen the app will recognise your face and bring up the filter menu", ) 16 | 17 | if topic_name != "Sci/Tech": 18 | raise AssertionError() 19 | 20 | topic_name = topic_classifier.predict( 21 | "Anthony received a loud ovation when he was shown on the overhead videoboard in the first quarter", ) 22 | 23 | if topic_name != "Sports": 24 | raise AssertionError() 25 | 26 | topic_name = topic_classifier.predict("The soviet union was created about five years after Russian Revolution.", ) 27 | 28 | if topic_name != "World": 29 | raise AssertionError() 30 | -------------------------------------------------------------------------------- /tests/pretrained/test_voice_generator.py: -------------------------------------------------------------------------------- 1 | from autokeras_pretrained import VoiceGenerator 2 | from tests.common import TEST_TEMP_DIR, clean_dir 3 | import os 4 | 5 | 6 | def test_voice_generator(): 7 | voice_generator = VoiceGenerator() 8 | clean_dir(TEST_TEMP_DIR) 9 | texts = [ 10 | "Generative adversarial network or variational auto-encoder.", 11 | "The tuition of the coming semster is 6300 dollars.", 12 | "The tuition of the coming semster is 6350 dollars.", 13 | "Turn left on {HH AW1 S S T AH0 N} Street.", 14 | "This is expensive, it costs me $300.2", 15 | "This is expensive, it costs me $300", 16 | "This is cheap, it only costs me $.2", 17 | "Today he won the 1st prize of the competition", 18 | "The approximation of pi is 3.14", 19 | ] 20 | 21 | for idx, text in enumerate(texts): 22 | save_name = "test_" + str(idx) + ".wav" 23 | save_name = os.path.join(TEST_TEMP_DIR, save_name) 24 | voice_generator.predict(text, path=save_name) 25 | clean_dir(TEST_TEMP_DIR) 26 | -------------------------------------------------------------------------------- /tests/pretrained/test_voice_recognizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from autokeras_pretrained 
import VoiceRecognizer 4 | 5 | 6 | def test_voice_generator(): 7 | spect2 = torch.rand(161, 131) 8 | voice_recognizer = VoiceRecognizer() 9 | print(voice_recognizer.predict(audio_data=spect2)) 10 | 11 | 12 | def test_voice_generator_none_type_error(): 13 | voice_recognizer = VoiceRecognizer() 14 | try: 15 | print(voice_recognizer.predict(audio_data=None)) 16 | except TypeError: 17 | pass 18 | -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Eckhart_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Eckhart_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Peirsol_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Peirsol_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Sorkin_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Sorkin_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Aaron_Sorkin_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Aaron_Sorkin_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0001.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abdel_Nasser_Assidi_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0005.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Black_white_images/Abel_Pacheco_0006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Black_white_images/Abel_Pacheco_0006.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Eckhart_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Eckhart_0001.jpg -------------------------------------------------------------------------------- 
/tests/resources/images_test/Color_images/Aaron_Peirsol_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Peirsol_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Peirsol_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Sorkin_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Sorkin_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Aaron_Sorkin_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Aaron_Sorkin_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0001.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abdel_Nasser_Assidi_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0001.jpg 
-------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0002.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0003.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0004.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0005.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/Color_images/Abel_Pacheco_0006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/Color_images/Abel_Pacheco_0006.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/face_detector.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/face_detector.jpg -------------------------------------------------------------------------------- /tests/resources/images_test/images_name.csv: -------------------------------------------------------------------------------- 1 | File Name,Label 2 | Aaron_Peirsol_0001.jpg,0 3 | Aaron_Peirsol_0002.jpg,0 4 | Aaron_Peirsol_0003.jpg,0 5 | Aaron_Peirsol_0004.jpg,0 6 | Aaron_Sorkin_0001.jpg,1 7 | -------------------------------------------------------------------------------- /tests/resources/images_test/od.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/images_test/od.JPG -------------------------------------------------------------------------------- /tests/resources/temp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/autokeras-pretrained/2529714e697f5f1333c16809add621a42691b2dd/tests/resources/temp/.gitkeep --------------------------------------------------------------------------------
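The test modules above are the quickest reference for how each pretrained wrapper is meant to be called. As a closing illustration, the following sketch (not part of the repository; it only assumes the public calls exercised in tests/pretrained/test_voice_recognizer.py and the GreedyDecoder class defined in autokeras_pretrained/voice_recognizer.py) shows how greedy CTC decoding collapses repeated characters and drops the '_' blank, and how a spectrogram tensor is fed to VoiceRecognizer. The four-character label string and the hand-built argmax path are invented for the example.

import torch

from autokeras_pretrained import VoiceRecognizer
from autokeras_pretrained.voice_recognizer import GreedyDecoder

# 1) Greedy CTC decoding in isolation: '_' is the blank, ' ' the word separator.
labels = "_AB "
decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

# Build a (seq_length, batch, num_labels) probability tensor whose argmax path
# is A A _ B B _ _ <space> A; collapsing repeats and removing blanks gives "AB A".
path = [1, 1, 0, 2, 2, 0, 0, 3, 1]
probs = torch.zeros(len(path), 1, len(labels))
for t, c in enumerate(path):
    probs[t, 0, c] = 1.0

strings, offsets = decoder.decode(probs)
print(strings[0][0])   # -> "AB A"; offsets[0][0] holds the frame index of each emitted character

# 2) End-to-end recognition, mirroring tests/pretrained/test_voice_recognizer.py.
#    Constructing the wrapper loads the pretrained DeepSpeech checkpoint through
#    the Pretrained base class (downloading it if it is not already cached).
recognizer = VoiceRecognizer()
spectrogram = torch.rand(161, 131)   # (frequency bins, time frames), as in the bundled test
print(recognizer.predict(audio_data=spectrogram))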