├── variable_length_pooling.py ├── Readme.md ├── my_resnet.py ├── my_resnet1d.py └── main.py /variable_length_pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class VariableLengthPooling(nn.Module): 5 | def forward(self, x, **kwargs): 6 | bounds = kwargs.get("bounds") 7 | # print("--------x--------", x.size(), x) 8 | # print("--------bounds--------", bounds.size(), bounds) 9 | cnt = torch.sum(bounds, dim=1) 10 | # print("--------cnt--------", cnt.size(), cnt) 11 | # print("--------bmm--------", torch.bmm(x, bounds).size(), torch.bmm(x, bounds)) 12 | out = torch.bmm(x, bounds) / cnt 13 | # print("--------out--------", out.size(), out) 14 | return out 15 | 16 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Resnet 1D and Variable Length Pooling for time series data 2 | 3 | ## Pooling from variable length of activations 4 | This is useful for dealing features of various length in time dimension. 5 | E.g., phonemes in speech data. 6 | 7 | ``` 8 | Softmax o1 o2 o3 9 | | | | 10 | 11 | VarLenAvgPooling p1 p2 p3 12 | / \ / \ / \ 13 | | | | | | | 14 | 15 | Activation a1 a2 a3 a4 a5 a6 a7 16 | | | | | | | | 17 | 18 | Conv1D o o o o o o o 19 | /|\ /|\ /|\ /|\ /|\ /|\ /|\ 20 | 21 | Activation a1 a2 a3 a4 a5 a6 a7 22 | | | | | | | | 23 | 24 | Conv1D o o o o o o o 25 | /|\ /|\ /|\ /|\ /|\ /|\ /|\ 26 | 27 | Time ---------------------------> 28 | 29 | ``` 30 | 31 | ## ResNet for time series data 32 | 33 | Vanilla ResNet uses Conv2D for image data. However this architecture may be useful for deep Conv1D networks as well. 34 | 35 | I tried two approaches in my code: 36 | 37 | - use rectangular filters (different H, W) directly in ResNet2D 38 | - shift to Conv1D entirely 39 | 40 | It depends on your specific problem to answer which approach is better. 41 | 42 | # Credits 43 | 44 | Some code are copied from pytorch official examples and CMU classes. Credits to the original authors. 45 | 46 | - https://github.com/pytorch/examples/tree/master/mnist 47 | - ... 48 | -------------------------------------------------------------------------------- /my_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | from variable_length_pooling import VariableLengthPooling 6 | 7 | def conv3x3(in_planes, out_planes, stride=1): 8 | """3x3 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=(5, 3), stride=(stride, 1), 10 | padding=(2, 1), bias=True) 11 | 12 | 13 | class BasicBlock(nn.Module): 14 | expansion = 1 15 | 16 | def __init__(self, inplanes, planes, stride=1, downsample=None): 17 | super(BasicBlock, self).__init__() 18 | self.conv1 = conv3x3(inplanes, planes, stride) 19 | self.bn1 = nn.BatchNorm2d(planes) 20 | self.relu = nn.LeakyReLU(inplace=True) 21 | self.conv2 = conv3x3(planes, planes) 22 | self.bn2 = nn.BatchNorm2d(planes) 23 | self.downsample = downsample 24 | self.stride = stride 25 | 26 | def forward(self, x): 27 | residual = x 28 | 29 | out = self.conv1(x) 30 | out = self.bn1(out) 31 | out = self.relu(out) 32 | 33 | out = self.conv2(out) 34 | out = self.bn2(out) 35 | 36 | if self.downsample is not None: 37 | residual = self.downsample(x) 38 | 39 | out += residual 40 | out = self.relu(out) 41 | 42 | return out 43 | 44 | class Bottleneck(nn.Module): 45 | expansion = 4 46 | 47 | def __init__(self, inplanes, planes, stride=1, downsample=None): 48 | super(Bottleneck, self).__init__() 49 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True) 50 | self.bn1 = nn.BatchNorm2d(planes) 51 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=(5, 3), stride=(stride, 1), 52 | padding=(2, 1), bias=True) 53 | self.bn2 = nn.BatchNorm2d(planes) 54 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True) 55 | self.bn3 = nn.BatchNorm2d(planes * 4) 56 | self.relu = nn.LeakyReLU(inplace=True) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | residual = x 62 | 63 | out = self.conv1(x) 64 | out = self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | out = self.relu(out) 70 | 71 | out = self.conv3(out) 72 | out = self.bn3(out) 73 | 74 | if self.downsample is not None: 75 | residual = self.downsample(x) 76 | 77 | out += residual 78 | out = self.relu(out) 79 | 80 | return out 81 | 82 | class ResNet(nn.Module): 83 | 84 | def __init__(self, block, layers, num_classes=46): 85 | self.inplanes = 16 86 | super(ResNet, self).__init__() 87 | self.conv1 = nn.Conv2d(1, 16, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1), 88 | bias=True) 89 | self.bn1 = nn.BatchNorm2d(16) 90 | self.relu = nn.LeakyReLU(inplace=True) 91 | # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 92 | self.layer1 = self._make_layer(block, 16, layers[0]) 93 | self.layer2 = self._make_layer(block, 32, layers[1], stride=2) # out: 10 94 | self.layer3 = self._make_layer(block, 128, layers[2], stride=2) # out: 5 95 | self.layer4 = self._make_layer(block, 256, layers[3], stride=2) # out: 3 96 | 97 | self.conv_merge = nn.Conv2d(256 * block.expansion, num_classes, 98 | kernel_size=(3, 3), stride=1, padding=(0, 1), 99 | bias=True) 100 | self.vlp = VariableLengthPooling() 101 | # self.avgpool = nn.AvgPool2d((5, 1), stride=1) 102 | # self.fc = nn.Linear(256 * block.expansion, num_classes) 103 | 104 | for m in self.modules(): 105 | if isinstance(m, nn.Conv2d): 106 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 107 | m.weight.data.normal_(0, math.sqrt(2. / n)) 108 | elif isinstance(m, nn.BatchNorm2d): 109 | m.weight.data.fill_(1) 110 | m.bias.data.zero_() 111 | 112 | def _make_layer(self, block, planes, blocks, stride=1): 113 | downsample = None 114 | if stride != 1 or self.inplanes != planes * block.expansion: 115 | downsample = nn.Sequential( 116 | nn.Conv2d(self.inplanes, planes * block.expansion, 117 | kernel_size=1, stride=(stride, 1), bias=False), 118 | nn.BatchNorm2d(planes * block.expansion), 119 | ) 120 | 121 | layers = [] 122 | layers.append(block(self.inplanes, planes, stride, downsample)) 123 | self.inplanes = planes * block.expansion 124 | for i in range(1, blocks): 125 | layers.append(block(self.inplanes, planes)) 126 | 127 | return nn.Sequential(*layers) 128 | 129 | def forward(self, x, bounds=None): 130 | x = self.conv1(x) 131 | x = self.bn1(x) 132 | x = self.relu(x) 133 | # x = self.maxpool(x) 134 | 135 | x = self.layer1(x) 136 | x = self.layer2(x) 137 | x = self.layer3(x) 138 | x = self.layer4(x) 139 | 140 | # x = self.avgpool(x) 141 | # x = x.view(x.size(0), -1) 142 | # x = self.fc(x) 143 | 144 | x = self.conv_merge(x) 145 | x = torch.squeeze(x, dim=2) 146 | x = self.vlp(x, bounds=bounds) 147 | 148 | return x 149 | 150 | -------------------------------------------------------------------------------- /my_resnet1d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from variable_length_pooling import VariableLengthPooling 5 | 6 | def conv3x3(in_planes, out_planes, kernel_size=3, stride=1): 7 | """3x3 convolution with padding""" 8 | return nn.Conv1d(in_planes, out_planes, kernel_size=kernel_size, stride=1, 9 | padding=kernel_size//2, bias=True) 10 | 11 | 12 | class BasicBlock(nn.Module): 13 | expansion = 1 14 | 15 | def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None): 16 | super(BasicBlock, self).__init__() 17 | self.conv1 = conv3x3(inplanes, planes, kernel_size=kernel_size, stride=stride) 18 | self.bn1 = nn.BatchNorm1d(planes) 19 | self.relu = nn.LeakyReLU(inplace=True) 20 | self.conv2 = conv3x3(planes, planes, kernel_size=kernel_size, stride=stride) 21 | self.bn2 = nn.BatchNorm1d(planes) 22 | self.downsample = downsample 23 | self.stride = stride 24 | 25 | def forward(self, x): 26 | residual = x 27 | 28 | out = self.conv1(x) 29 | out = self.bn1(out) 30 | out = self.relu(out) 31 | 32 | out = self.conv2(out) 33 | out = self.bn2(out) 34 | 35 | if self.downsample is not None: 36 | residual = self.downsample(x) 37 | 38 | out += residual 39 | out = self.relu(out) 40 | 41 | return out 42 | 43 | class Bottleneck(nn.Module): 44 | expansion = 4 45 | 46 | def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None): 47 | super(Bottleneck, self).__init__() 48 | self.conv1 = nn.Conv1d(inplanes, planes, kernel_size=1, bias=True) 49 | self.bn1 = nn.BatchNorm1d(planes) 50 | self.conv2 = nn.Conv1d(planes, planes, kernel_size=kernel_size, stride=1, 51 | padding=kernel_size//2, bias=True) 52 | self.bn2 = nn.BatchNorm1d(planes) 53 | self.conv3 = nn.Conv1d(planes, planes * 4, kernel_size=1, bias=True) 54 | self.bn3 = nn.BatchNorm1d(planes * 4) 55 | self.relu = nn.LeakyReLU(inplace=True) 56 | self.downsample = downsample 57 | self.stride = stride 58 | 59 | def forward(self, x): 60 | residual = x 61 | 62 | out = self.conv1(x) 63 | out = self.bn1(out) 64 | out = self.relu(out) 65 | 66 | out = self.conv2(out) 67 | out = self.bn2(out) 68 | out = self.relu(out) 69 | 70 | out = self.conv3(out) 71 | out = self.bn3(out) 72 | 73 | if self.downsample is not None: 74 | residual = self.downsample(x) 75 | 76 | out += residual 77 | out = self.relu(out) 78 | 79 | return out 80 | 81 | class ResNet(nn.Module): 82 | 83 | def __init__(self, block, layers, num_classes=46): 84 | self.inplanes = 192 85 | super(ResNet, self).__init__() 86 | self.conv1 = nn.Conv1d(40, 192, kernel_size=3, stride=1, padding=1, bias=True) 87 | self.bn1 = nn.BatchNorm1d(192) 88 | self.relu = nn.LeakyReLU(inplace=True) 89 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 90 | self.layer0 = self._make_layer(block, 256, layers[0]) 91 | self.layer1 = self._make_layer(block, 256, layers[0], kernel_size=1, stride=1) 92 | self.layer2 = self._make_layer(block, 256, layers[1], kernel_size=5, stride=1) 93 | self.layer3 = self._make_layer(block, 256, layers[2], kernel_size=5, stride=1) 94 | self.layer4 = self._make_layer(block, 512, layers[3], kernel_size=1, stride=1) 95 | self.layer5 = self._make_layer(block, 512, layers[3], stride=1) 96 | 97 | self.conv_merge = nn.Conv1d(512 * block.expansion, num_classes, 98 | kernel_size=3, stride=1, padding=1, 99 | bias=True) 100 | self.vlp = VariableLengthPooling() 101 | # self.avgpool = nn.AvgPool2d((5, 1), stride=1) 102 | # self.fc = nn.Linear(256 * block.expansion, num_classes) 103 | 104 | for m in self.modules(): 105 | if isinstance(m, nn.Conv1d): 106 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 107 | # m.weight.data.normal_(0, math.sqrt(2. / n)) 108 | torch.nn.init.xavier_normal(m.weight.data) 109 | elif isinstance(m, nn.BatchNorm1d): 110 | m.weight.data.fill_(1) 111 | m.bias.data.zero_() 112 | 113 | def _make_layer(self, block, planes, blocks, kernel_size=3, stride=1): 114 | downsample = None 115 | if stride != 1 or self.inplanes != planes * block.expansion: 116 | downsample = nn.Sequential( 117 | nn.Conv1d(self.inplanes, planes * block.expansion, 118 | kernel_size=1, stride=1, bias=False), 119 | nn.BatchNorm1d(planes * block.expansion), 120 | ) 121 | 122 | layers = [] 123 | layers.append(block(self.inplanes, planes, kernel_size=kernel_size, 124 | stride=stride, downsample=downsample)) 125 | self.inplanes = planes * block.expansion 126 | for i in range(1, blocks): 127 | layers.append(block(self.inplanes, planes, kernel_size=kernel_size)) 128 | 129 | return nn.Sequential(*layers) 130 | 131 | def forward(self, x, bounds=None): 132 | x = self.conv1(x) 133 | x = self.bn1(x) 134 | x = self.relu(x) 135 | # x = self.maxpool(x) 136 | 137 | x = self.layer0(x) 138 | x = self.layer1(x) 139 | x = self.layer2(x) 140 | x = self.layer3(x) 141 | x = self.layer4(x) 142 | x = self.layer5(x) 143 | 144 | # x = self.avgpool(x) 145 | # x = x.view(x.size(0), -1) 146 | # x = self.fc(x) 147 | 148 | x = self.conv_merge(x) 149 | x = torch.squeeze(x, dim=2) 150 | x = self.vlp(x, bounds=bounds) 151 | 152 | return x 153 | 154 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | import csv 5 | from datetime import datetime 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torchvision import datasets, transforms 13 | from torch.autograd import Variable 14 | from torchvision.models import resnet 15 | from torch.optim.lr_scheduler import ReduceLROnPlateau 16 | 17 | from variable_length_pooling import VariableLengthPooling 18 | 19 | def to_float_tensor(numpy_array): 20 | # Numpy array -> Tensor 21 | return torch.from_numpy(numpy_array).float() 22 | 23 | def to_long_tensor(numpy_array): 24 | # Numpy array -> Tensor 25 | return torch.from_numpy(numpy_array).long() 26 | 27 | def to_tensor(numpy_array): 28 | # Numpy array -> Tensor 29 | return torch.from_numpy(numpy_array) 30 | 31 | 32 | def to_variable(tensor): 33 | # Tensor -> Variable (on GPU if possible) 34 | if torch.cuda.is_available(): 35 | # Tensor -> GPU Tensor 36 | tensor = tensor.cuda() 37 | return torch.autograd.Variable(tensor) 38 | 39 | def get_onehot(b, n_phones, n_frames): 40 | #b = np.array([0, 2, 5]) 41 | b2 = np.concatenate((b[1:], [n_frames])) 42 | o = np.zeros((n_frames, n_phones)) 43 | p = np.zeros(n_frames, dtype=int) 44 | for idx, (s, e) in enumerate(zip(b, b2)): 45 | p[s:e] = idx 46 | o[range(n_frames), p] = 1 47 | # print(o) 48 | return o 49 | 50 | class MyDataset(torch.utils.data.Dataset): 51 | def __init__(self, x, y, for_conv2d=False): 52 | self.x = x 53 | self.y = y 54 | self.for_conv2d = for_conv2d 55 | self.total_phonemes = sum([len(xi[1]) for xi in x]) 56 | print("n_utters", self.x.shape[0], "total_phonemes", self.total_phonemes) 57 | 58 | def __getitem__(self, idx): 59 | """ 60 | return: frames, bounds(onehot), labels 61 | """ 62 | frames = self.x[idx][0] 63 | bounds = self.x[idx][1] 64 | n_phones = len(bounds) 65 | n_frames = len(frames) 66 | bounds_onehot = get_onehot(bounds, n_phones, n_frames) 67 | frames = frames.transpose() 68 | if self.for_conv2d: 69 | frames = np.expand_dims(frames, axis=0) 70 | return to_float_tensor(frames), \ 71 | to_float_tensor(bounds_onehot), \ 72 | to_long_tensor(self.y[idx] if self.y is not None else np.array([-1])) 73 | 74 | def __len__(self): 75 | return self.x.shape[0] 76 | 77 | 78 | def get_data_loaders(args, for_conv2d=False): 79 | print("loading data") 80 | 81 | # args.batch_size = 1 82 | 83 | # xtrain = np.load(args.data_dir + '/dev-features.npy') 84 | # ytrain = np.load(args.data_dir + '/dev-labels.npy') 85 | xtrain = np.load(args.data_dir + '/train-features.npy') 86 | ytrain = np.load(args.data_dir + '/train-labels.npy') 87 | xdev = np.load(args.data_dir + '/dev-features.npy') 88 | ydev = np.load(args.data_dir + '/dev-labels.npy') 89 | 90 | print("load complete") 91 | kwargs = {'num_workers': 3, 'pin_memory': True} if args.cuda else {} 92 | train_loader = torch.utils.data.DataLoader( 93 | MyDataset(xtrain, ytrain, for_conv2d=for_conv2d), 94 | batch_size=args.batch_size, shuffle=True, **kwargs) 95 | dev_loader = torch.utils.data.DataLoader( 96 | MyDataset(xdev, ydev, for_conv2d=for_conv2d), 97 | batch_size=args.batch_size, shuffle=True, **kwargs) 98 | 99 | return train_loader, dev_loader 100 | 101 | def weights_init(m): 102 | if isinstance(m, nn.Conv1d): 103 | torch.nn.init.xavier_normal(m.weight.data) 104 | # m.bias.data.zero_() 105 | 106 | 107 | class MyModel(nn.Module): 108 | def __init__(self): 109 | super(MyModel, self).__init__() 110 | self.firstrun = True 111 | self.layers = nn.ModuleList([ 112 | nn.Conv1d(40, 192, 3, padding=1), 113 | nn.BatchNorm1d(192), 114 | nn.LeakyReLU(inplace=True), 115 | 116 | # A 117 | nn.Conv1d(192, 192, 3, padding=1), 118 | nn.BatchNorm1d(192), 119 | nn.LeakyReLU(inplace=True), 120 | nn.Conv1d(192, 192, 3, padding=1), 121 | nn.BatchNorm1d(192), 122 | nn.LeakyReLU(inplace=True), 123 | nn.Conv1d(192, 192, 3, padding=1), 124 | nn.BatchNorm1d(192), 125 | nn.LeakyReLU(inplace=True), 126 | 127 | nn.Conv1d(192, 192, 1, padding=0), 128 | nn.BatchNorm1d(192), 129 | nn.LeakyReLU(inplace=True), 130 | 131 | # B 132 | nn.Conv1d(192, 192, 3, padding=1), 133 | nn.BatchNorm1d(192), 134 | nn.LeakyReLU(inplace=True), 135 | nn.Conv1d(192, 192, 3, padding=1), 136 | nn.BatchNorm1d(192), 137 | nn.LeakyReLU(inplace=True), 138 | nn.Conv1d(192, 256, 3, padding=1), 139 | nn.BatchNorm1d(256), 140 | nn.LeakyReLU(inplace=True), 141 | 142 | nn.Conv1d(256, 256, 1, padding=0), 143 | nn.BatchNorm1d(256), 144 | nn.LeakyReLU(inplace=True), 145 | 146 | # C 147 | # nn.Conv1d(256, 256, 3, padding=1), 148 | # nn.BatchNorm1d(256), 149 | # nn.LeakyReLU(inplace=True), 150 | # nn.Conv1d(256, 256, 3, padding=1), 151 | # nn.BatchNorm1d(256), 152 | # nn.LeakyReLU(inplace=True), 153 | # nn.Conv1d(256, 256, 3, padding=1), 154 | # nn.BatchNorm1d(256), 155 | # nn.LeakyReLU(inplace=True), 156 | # 157 | # nn.Conv1d(256, 256, 1, padding=0), 158 | # nn.BatchNorm1d(256), 159 | # nn.LeakyReLU(inplace=True), 160 | 161 | # D 162 | nn.Conv1d(256, 512, 3, padding=1), 163 | nn.BatchNorm1d(512), 164 | nn.LeakyReLU(inplace=True), 165 | nn.Conv1d(512, 512, 3, padding=1), 166 | nn.BatchNorm1d(512), 167 | nn.LeakyReLU(inplace=True), 168 | nn.Conv1d(512, 512, 3, padding=1), 169 | nn.BatchNorm1d(512), 170 | nn.LeakyReLU(inplace=True), 171 | 172 | nn.Conv1d(512, 512, 1, padding=0), 173 | nn.BatchNorm1d(512), 174 | nn.LeakyReLU(inplace=True), 175 | 176 | # E 177 | nn.Conv1d(512, 512, 5, padding=2), 178 | nn.BatchNorm1d(512), 179 | nn.LeakyReLU(inplace=True), 180 | nn.Conv1d(512, 512, 7, padding=3), 181 | nn.BatchNorm1d(512), 182 | nn.LeakyReLU(inplace=True), 183 | nn.Conv1d(512, 512, 9, padding=4), 184 | nn.BatchNorm1d(512), 185 | nn.LeakyReLU(inplace=True), 186 | nn.Conv1d(512, 1024, 11, padding=5), 187 | nn.BatchNorm1d(1024), 188 | nn.LeakyReLU(inplace=True), 189 | 190 | # nn.Conv1d(256, 128, 1, padding=0), 191 | # nn.BatchNorm1d(128), 192 | # nn.LeakyReLU(inplace=True), 193 | # nn.Conv1d(128, 128, 1, padding=0), 194 | # nn.BatchNorm1d(128), 195 | # nn.LeakyReLU(inplace=True), 196 | # nn.Conv1d(128, 46, 1, padding=0), 197 | # nn.BatchNorm1d(46), 198 | # nn.LeakyReLU(inplace=True), 199 | 200 | nn.Conv1d(1024, 1024, 3, padding=1), 201 | nn.BatchNorm1d(1024), 202 | nn.LeakyReLU(inplace=True), 203 | nn.Conv1d(1024, 1024, 3, padding=1), 204 | nn.BatchNorm1d(1024), 205 | nn.LeakyReLU(inplace=True), 206 | nn.Conv1d(1024, 46, 3, padding=1), 207 | nn.BatchNorm1d(46), 208 | nn.LeakyReLU(inplace=True), 209 | 210 | VariableLengthPooling() 211 | ]) 212 | 213 | def forward(self, input, bounds=None, print_firstrun=False): 214 | h = input 215 | if self.firstrun: 216 | print("****************************************") 217 | print("input: {}".format(h.size())) 218 | for i, layer in enumerate(self.layers): 219 | if i == len(self.layers) - 1 and isinstance(layer, VariableLengthPooling): 220 | h = layer(h, bounds=bounds) 221 | else: 222 | h = layer(h) 223 | if print_firstrun and self.firstrun: 224 | print("{}: {}".format(layer, h.size())) 225 | if self.firstrun: 226 | print("****************************************") 227 | self.firstrun = False 228 | return h 229 | 230 | 231 | 232 | def MyModelResNet2D(): 233 | """ 234 | Conv2D Resnet 235 | :return: 236 | """ 237 | from my_resnet import ResNet, BasicBlock 238 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=46) 239 | 240 | def MyModelResNet1D(): 241 | """ 242 | Conv1D Resnet 243 | :return: 244 | """ 245 | from my_resnet1d import ResNet, BasicBlock, Bottleneck 246 | return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=46) 247 | 248 | def train(epoch, model, optimizer, train_loader, args): 249 | model.train() 250 | 251 | t0 = time.time() 252 | for batch_idx, (frames, bounds, labels) in enumerate(train_loader): 253 | if args.cuda: 254 | frames, bounds, labels = map(lambda x: x.cuda(), [frames, bounds, labels]) 255 | # data, target = Variable(data), Variable(target) 256 | frames, bounds, labels = map(lambda x: Variable(x), [frames, bounds, labels]) 257 | optimizer.zero_grad() 258 | 259 | data = frames 260 | output = model(data, bounds=bounds) 261 | 262 | n_phones = len(labels.squeeze()) 263 | # print("n_phones", n_phones) 264 | loss = F.cross_entropy(output.squeeze().transpose(0, 1), labels.squeeze(), size_average=False) 265 | # Weighted loss. Typical utterance has 72 phonemes 266 | weighted_loss = loss * n_phones / 72.0 267 | 268 | # l2 reg 269 | # if args.cuda: 270 | # l2_reg = Variable(torch.cuda.FloatTensor(1), requires_grad=True) 271 | # else: 272 | # l2_reg = Variable(torch.FloatTensor(1), requires_grad=True) 273 | # for W in model.parameters(): 274 | # l2_reg = l2_reg + W.norm(2) 275 | 276 | # loss += args.l2_reg * l2_reg 277 | 278 | weighted_loss.backward() 279 | optimizer.step() 280 | # average loss per phoneme 281 | avg_loss = loss / n_phones 282 | 283 | 284 | if batch_idx % args.log_interval == 0: 285 | 286 | # if avg_loss.data[0] > 3.0: 287 | # pred = output.squeeze().transpose(0, 1).data.max(1, keepdim=True)[1] 288 | # gt = labels.squeeze().data.view_as(pred) 289 | # print(n_phones, loss.data[0], weighted_loss.data[0]) 290 | # print("gt ", gt.view(1, -1), "\npred", pred.view(1, -1)) 291 | 292 | print('Train Epoch: {} Batch: {} [{}/{} ({:.2f}%, time:{:.2f}s)]\tLoss: {:.6f}'.format( 293 | epoch, batch_idx, batch_idx * len(data), len(train_loader.dataset), 294 | 100. * batch_idx / len(train_loader), time.time() - t0, 295 | avg_loss.data[0])) 296 | t0 = time.time() 297 | 298 | 299 | def test(model, test_loader, args): 300 | model.eval() 301 | test_loss = 0 302 | correct = 0 303 | for frames, bounds, labels in test_loader: 304 | if args.cuda: 305 | frames, bounds, labels = map(lambda x: x.cuda(), [frames, bounds, labels]) 306 | frames, bounds, labels = Variable(frames, volatile=True), Variable(bounds), Variable(labels) 307 | 308 | data = frames 309 | 310 | output = model(data, bounds=bounds) 311 | output = output.squeeze().transpose(0, 1) 312 | labels = labels.squeeze() 313 | test_loss += F.cross_entropy(output, labels, size_average=False).data[0] # sum up batch loss 314 | pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability 315 | correct += pred.eq(labels.data.view_as(pred)).cpu().sum() 316 | 317 | test_loss /= test_loader.dataset.total_phonemes 318 | accuracy = correct / test_loader.dataset.total_phonemes 319 | print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format( 320 | test_loss, correct, test_loader.dataset.total_phonemes, 321 | 100 * accuracy)) 322 | return "{:.4f}%".format(100. * correct / test_loader.dataset.total_phonemes), accuracy 323 | 324 | 325 | def main(args): 326 | print(args) 327 | 328 | torch.manual_seed(args.seed) 329 | if args.cuda: 330 | torch.cuda.manual_seed(args.seed) 331 | 332 | train_loader, test_loader = get_data_loaders(args, for_conv2d=False) 333 | 334 | model = MyModelResNet1D() 335 | 336 | 337 | # model = MyModel() 338 | # model.apply(weights_init) 339 | 340 | if args.cuda: 341 | model.cuda() 342 | 343 | # optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, ) 344 | optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=5e-5) #1e-4 345 | scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=2, verbose=True, 346 | threshold_mode='abs', threshold=0.01, min_lr=1e-6) 347 | for epoch in range(1, args.epochs + 1): 348 | print(datetime.now()) 349 | train(epoch, model, optimizer, train_loader, args) 350 | acc_str, acc = test(model, test_loader, args) 351 | scheduler.step(acc) 352 | if not os.path.exists(args.weights_dir): 353 | os.makedirs(args.weights_dir) 354 | torch.save(model.state_dict(), "{}/{:03d}_{}.w".format(args.weights_dir, epoch, acc_str)) 355 | 356 | 357 | def predict_batch(model, x, bounds, args): 358 | if args.cuda: 359 | model.cuda() 360 | x = x.cuda() 361 | bounds = bounds.cuda() 362 | model.eval() 363 | output = model(Variable(x, volatile=True), bounds=Variable(bounds)) 364 | output = output.squeeze().transpose(0, 1) 365 | return output.data.max(1, keepdim=True)[1] 366 | 367 | 368 | def get_test_data_loaders(args): 369 | print("loading data") 370 | # args.batch_size = 1 371 | xtest = np.load(args.data_dir + '/test-features.npy') 372 | 373 | print("load complete") 374 | # 'num_workers': 8, 375 | kwargs = {'pin_memory': True} if args.cuda else {} 376 | test_loader = torch.utils.data.DataLoader( 377 | MyDataset(xtest, None), 378 | batch_size=args.batch_size, shuffle=False, **kwargs) 379 | return test_loader 380 | 381 | 382 | def predict(args, csv_fpath, weights_fpath): 383 | model = MyModelResNet2D() 384 | model.load_state_dict(torch.load(weights_fpath)) 385 | test_loader = get_test_data_loaders(args) 386 | with open(csv_fpath, 'w') as csvfile: 387 | writer = csv.DictWriter(csvfile, fieldnames=['Id', 'Label']) 388 | writer.writeheader() 389 | cnt = 0 390 | for batch, (frames, bounds, _, _) in enumerate(test_loader): 391 | if batch % args.log_interval == 0: 392 | print("batch", batch) 393 | yhat = predict_batch(model, frames, bounds, args) 394 | for i, y in enumerate(yhat[:]): 395 | writer.writerow({"Id": cnt + i, "Label": y.cpu()[0]}) 396 | cnt += len(yhat) 397 | print("done") 398 | 399 | 400 | parser = argparse.ArgumentParser(description='PyTorch MNIST Example') 401 | parser.add_argument('--batch-size', type=int, default=1, metavar='N', 402 | help='input batch size for training (default: 64)') 403 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 404 | help='input batch size for testing (default: 1000)') 405 | parser.add_argument('--epochs', type=int, default=100, metavar='N', 406 | help='number of epochs to train (default: 10)') 407 | parser.add_argument('--lr', type=float, default=0.001, metavar='LR', 408 | help='learning rate (default: 0.001)') 409 | parser.add_argument('--momentum', type=float, default=0.9, metavar='M', 410 | help='SGD momentum (default: 0.9)') 411 | parser.add_argument('--l2-reg', type=float, default=0.001, 412 | help='l2 regularization') 413 | parser.add_argument('--no-cuda', action='store_true', default=False, 414 | help='disables CUDA training') 415 | parser.add_argument('--seed', type=int, default=1, metavar='S', 416 | help='random seed (default: 1)') 417 | parser.add_argument('--log-interval', type=int, default=300, metavar='N', 418 | help='how many batches to wait before logging training status') 419 | # parser.add_argument('--K', type=int, default=10, metavar='N', 420 | # help='window size') 421 | parser.add_argument('--data-dir', type=str, default='./data/', 422 | help='data directory') 423 | parser.add_argument('--weights-dir', type=str, default='./weights/', 424 | help='data directory') 425 | 426 | 427 | if __name__ == "__main__": 428 | print(torch.__version__) 429 | args = parser.parse_args() 430 | args.cuda = not args.no_cuda and torch.cuda.is_available() 431 | args.batch_size = 1 432 | main(args) 433 | #predict(args, './e1/submission.csv', './e1/weights/012_82.1546%.w') 434 | --------------------------------------------------------------------------------