├── variable_length_pooling.py
├── Readme.md
├── my_resnet.py
├── my_resnet1d.py
└── main.py


/variable_length_pooling.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class VariableLengthPooling(nn.Module):
 5 |     def forward(self, x, **kwargs):
 6 |         bounds = kwargs.get("bounds")
 7 |         # print("--------x--------", x.size(), x)
 8 |         # print("--------bounds--------", bounds.size(), bounds)
 9 |         cnt = torch.sum(bounds, dim=1)
10 |         # print("--------cnt--------", cnt.size(), cnt)
11 |         # print("--------bmm--------", torch.bmm(x, bounds).size(), torch.bmm(x, bounds))
12 |         out = torch.bmm(x, bounds) / cnt
13 |         # print("--------out--------", out.size(), out)
14 |         return out
15 | 
16 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # Resnet 1D and Variable Length Pooling for time series data
 2 | 
 3 | ## Pooling from variable length of activations
 4 | This is useful for dealing with features of varying length along the time dimension,
 5 | e.g., phonemes in speech data.
 6 | 
 7 | ```
 8 | Softmax             o1        o2        o3
 9 |                     |         |         |
10 | 
11 | VarLenAvgPooling    p1        p2        p3
12 |                    / \     /     \     / \
13 |                   |   |   |       |   |   |
14 | 
15 | Activation        a1  a2  a3  a4  a5  a6  a7
16 |                   |   |   |   |   |   |   |
17 | 
18 | Conv1D            o   o   o   o   o   o   o
19 |                  /|\ /|\ /|\ /|\ /|\ /|\ /|\
20 | 
21 | Activation        a1  a2  a3  a4  a5  a6  a7
22 |                   |   |   |   |   |   |   |
23 | 
24 | Conv1D            o   o   o   o   o   o   o
25 |                  /|\ /|\ /|\ /|\ /|\ /|\ /|\
26 | 
27 | Time             --------------------------->
28 | 
29 | ```
30 | 
31 | ## ResNet for time series data
32 | 
33 | Vanilla ResNet uses Conv2D for image data. However, this architecture can also be useful for deep Conv1D networks.
34 | 
35 | I tried two approaches in my code:
36 | 
37 | - use rectangular filters (different H, W) directly in ResNet2D
38 | - shift to Conv1D entirely
39 | 
40 | Which approach is better depends on your specific problem.
41 | 
42 | # Credits
43 | 
44 | Some of the code is copied from the official PyTorch examples and CMU classes. Credits to the original authors.
45 | 
46 | - https://github.com/pytorch/examples/tree/master/mnist
47 | - ...
48 | 


--------------------------------------------------------------------------------
/my_resnet.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import math
  4 | 
  5 | from variable_length_pooling import VariableLengthPooling
  6 | 
  7 | def conv3x3(in_planes, out_planes, stride=1):
  8 |     """3x3 convolution with padding"""
  9 |     return nn.Conv2d(in_planes, out_planes, kernel_size=(5, 3), stride=(stride, 1),
 10 |                      padding=(2, 1), bias=True)
 11 | 
 12 | 
 13 | class BasicBlock(nn.Module):
 14 |     expansion = 1
 15 | 
 16 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 17 |         super(BasicBlock, self).__init__()
 18 |         self.conv1 = conv3x3(inplanes, planes, stride)
 19 |         self.bn1 = nn.BatchNorm2d(planes)
 20 |         self.relu = nn.LeakyReLU(inplace=True)
 21 |         self.conv2 = conv3x3(planes, planes)
 22 |         self.bn2 = nn.BatchNorm2d(planes)
 23 |         self.downsample = downsample
 24 |         self.stride = stride
 25 | 
 26 |     def forward(self, x):
 27 |         residual = x
 28 | 
 29 |         out = self.conv1(x)
 30 |         out = self.bn1(out)
 31 |         out = self.relu(out)
 32 | 
 33 |         out = self.conv2(out)
 34 |         out = self.bn2(out)
 35 | 
 36 |         if self.downsample is not None:
 37 |             residual = self.downsample(x)
 38 | 
 39 |         out += residual
 40 |         out = self.relu(out)
 41 | 
 42 |         return out
 43 | 
 44 | class Bottleneck(nn.Module):
 45 |     expansion = 4
 46 | 
 47 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 48 |         super(Bottleneck, self).__init__()
 49 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
 50 |         self.bn1 = nn.BatchNorm2d(planes)
 51 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=(5, 3), stride=(stride, 1),
 52 |                                padding=(2, 1), bias=True)
 53 |         self.bn2 = nn.BatchNorm2d(planes)
 54 |         self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
 55 |         self.bn3 = nn.BatchNorm2d(planes * 4)
 56 |         self.relu = nn.LeakyReLU(inplace=True)
 57 |         self.downsample = downsample
 58 |         self.stride = stride
 59 | 
 60 |     def forward(self, x):
 61 |         residual = x
 62 | 
 63 |         out = self.conv1(x)
 64 |         out = self.bn1(out)
 65 |         out = self.relu(out)
 66 | 
 67 |         out = self.conv2(out)
 68 |         out = self.bn2(out)
 69 |         out = self.relu(out)
 70 | 
 71 |         out = self.conv3(out)
 72 |         out = self.bn3(out)
 73 | 
 74 |         if self.downsample is not None:
 75 |             residual = self.downsample(x)
 76 | 
 77 |         out += residual
 78 |         out = self.relu(out)
 79 | 
 80 |         return out
 81 | 
class ResNet(nn.Module):
    """2-D ResNet whose strides act only on the first spatial axis.

    Every strided convolution uses ``stride=(s, 1)``, so the second spatial
    axis keeps its length through the whole network. ``conv_merge`` then
    maps the features to ``num_classes`` channels with a 3-tall kernel and
    no padding on the first axis, and ``VariableLengthPooling`` averages
    along the remaining axis according to ``bounds``.

    NOTE(review): presumably the first spatial axis is the feature axis
    and the second is time - confirm against the data loader.
    """

    def __init__(self, block, layers, num_classes=46):
        """Build the network.

        Args:
            block: residual block class; must expose ``expansion`` and
                accept ``(inplanes, planes, stride, downsample)``.
            layers: four block counts, one per stage.
            num_classes: number of output channels of ``conv_merge``.
        """
        # Running channel count consumed and updated by _make_layer.
        self.inplanes = 16
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1),
                               bias=True)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.LeakyReLU(inplace=True)
        # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 16, layers[0])
        self.layer2 = self._make_layer(block, 32, layers[1], stride=2)  # out: 10
        self.layer3 = self._make_layer(block, 128, layers[2], stride=2) # out: 5
        self.layer4 = self._make_layer(block, 256, layers[3], stride=2) # out: 3

        # Kernel height 3 with zero padding on that axis: a height-3 input
        # (see the "out: 3" note above) collapses to height 1.
        self.conv_merge = nn.Conv2d(256 * block.expansion, num_classes,
                                    kernel_size=(3, 3), stride=1, padding=(0, 1),
                                    bias=True)
        self.vlp = VariableLengthPooling()
        # self.avgpool = nn.AvgPool2d((5, 1), stride=1)
        # self.fc = nn.Linear(256 * block.expansion, num_classes)

        # Re-initialise every convolution with N(0, sqrt(2 / n)), where
        # n = kernel area * out_channels, and reset batch norms to identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first one is strided.

        A 1x1 conv + batch norm shortcut is created when the stride or the
        channel count changes. Updates ``self.inplanes`` to the stage's
        output channel count.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=(stride, 1), bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x, bounds=None):
        """Run the network and pool the result with ``bounds``.

        Args:
            x: input with one channel (``conv1`` expects 1 input channel).
            bounds: forwarded to ``VariableLengthPooling`` as ``bounds``.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        # x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # x = self.avgpool(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)

        x = self.conv_merge(x)
        # Drop dim 2 when conv_merge reduced it to size 1, giving the 3-D
        # tensor that the batched matmul in the pooling layer needs.
        x = torch.squeeze(x, dim=2)
        x = self.vlp(x, bounds=bounds)

        return x
149 | 
150 | 


--------------------------------------------------------------------------------
/my_resnet1d.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import math
  4 | from variable_length_pooling import VariableLengthPooling
  5 | 
  6 | def conv3x3(in_planes, out_planes, kernel_size=3, stride=1):
  7 |     """3x3 convolution with padding"""
  8 |     return nn.Conv1d(in_planes, out_planes, kernel_size=kernel_size, stride=1,
  9 |                      padding=kernel_size//2, bias=True)
 10 | 
 11 | 
 12 | class BasicBlock(nn.Module):
 13 |     expansion = 1
 14 | 
 15 |     def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None):
 16 |         super(BasicBlock, self).__init__()
 17 |         self.conv1 = conv3x3(inplanes, planes, kernel_size=kernel_size, stride=stride)
 18 |         self.bn1 = nn.BatchNorm1d(planes)
 19 |         self.relu = nn.LeakyReLU(inplace=True)
 20 |         self.conv2 = conv3x3(planes, planes, kernel_size=kernel_size, stride=stride)
 21 |         self.bn2 = nn.BatchNorm1d(planes)
 22 |         self.downsample = downsample
 23 |         self.stride = stride
 24 | 
 25 |     def forward(self, x):
 26 |         residual = x
 27 | 
 28 |         out = self.conv1(x)
 29 |         out = self.bn1(out)
 30 |         out = self.relu(out)
 31 | 
 32 |         out = self.conv2(out)
 33 |         out = self.bn2(out)
 34 | 
 35 |         if self.downsample is not None:
 36 |             residual = self.downsample(x)
 37 | 
 38 |         out += residual
 39 |         out = self.relu(out)
 40 | 
 41 |         return out
 42 | 
 43 | class Bottleneck(nn.Module):
 44 |     expansion = 4
 45 | 
 46 |     def __init__(self, inplanes, planes, kernel_size=3, stride=1, downsample=None):
 47 |         super(Bottleneck, self).__init__()
 48 |         self.conv1 = nn.Conv1d(inplanes, planes, kernel_size=1, bias=True)
 49 |         self.bn1 = nn.BatchNorm1d(planes)
 50 |         self.conv2 = nn.Conv1d(planes, planes, kernel_size=kernel_size, stride=1,
 51 |                                padding=kernel_size//2, bias=True)
 52 |         self.bn2 = nn.BatchNorm1d(planes)
 53 |         self.conv3 = nn.Conv1d(planes, planes * 4, kernel_size=1, bias=True)
 54 |         self.bn3 = nn.BatchNorm1d(planes * 4)
 55 |         self.relu = nn.LeakyReLU(inplace=True)
 56 |         self.downsample = downsample
 57 |         self.stride = stride
 58 | 
 59 |     def forward(self, x):
 60 |         residual = x
 61 | 
 62 |         out = self.conv1(x)
 63 |         out = self.bn1(out)
 64 |         out = self.relu(out)
 65 | 
 66 |         out = self.conv2(out)
 67 |         out = self.bn2(out)
 68 |         out = self.relu(out)
 69 | 
 70 |         out = self.conv3(out)
 71 |         out = self.bn3(out)
 72 | 
 73 |         if self.downsample is not None:
 74 |             residual = self.downsample(x)
 75 | 
 76 |         out += residual
 77 |         out = self.relu(out)
 78 | 
 79 |         return out
 80 | 
 81 | class ResNet(nn.Module):
 82 | 
 83 |     def __init__(self, block, layers, num_classes=46):
 84 |         self.inplanes = 192
 85 |         super(ResNet, self).__init__()
 86 |         self.conv1 = nn.Conv1d(40, 192, kernel_size=3, stride=1, padding=1, bias=True)
 87 |         self.bn1 = nn.BatchNorm1d(192)
 88 |         self.relu = nn.LeakyReLU(inplace=True)
 89 |         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
 90 |         self.layer0 = self._make_layer(block, 256, layers[0])
 91 |         self.layer1 = self._make_layer(block, 256, layers[0], kernel_size=1, stride=1)
 92 |         self.layer2 = self._make_layer(block, 256, layers[1], kernel_size=5, stride=1)
 93 |         self.layer3 = self._make_layer(block, 256, layers[2], kernel_size=5, stride=1)
 94 |         self.layer4 = self._make_layer(block, 512, layers[3], kernel_size=1, stride=1)
 95 |         self.layer5 = self._make_layer(block, 512, layers[3], stride=1)
 96 | 
 97 |         self.conv_merge = nn.Conv1d(512 * block.expansion, num_classes,
 98 |                                     kernel_size=3, stride=1, padding=1,
 99 |                                     bias=True)
100 |         self.vlp = VariableLengthPooling()
101 |         # self.avgpool = nn.AvgPool2d((5, 1), stride=1)
102 |         # self.fc = nn.Linear(256 * block.expansion, num_classes)
103 | 
104 |         for m in self.modules():
105 |             if isinstance(m, nn.Conv1d):
106 |                 # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
107 |                 # m.weight.data.normal_(0, math.sqrt(2. / n))
108 |                 torch.nn.init.xavier_normal(m.weight.data)
109 |             elif isinstance(m, nn.BatchNorm1d):
110 |                 m.weight.data.fill_(1)
111 |                 m.bias.data.zero_()
112 | 
113 |     def _make_layer(self, block, planes, blocks, kernel_size=3, stride=1):
114 |         downsample = None
115 |         if stride != 1 or self.inplanes != planes * block.expansion:
116 |             downsample = nn.Sequential(
117 |                 nn.Conv1d(self.inplanes, planes * block.expansion,
118 |                           kernel_size=1, stride=1, bias=False),
119 |                 nn.BatchNorm1d(planes * block.expansion),
120 |             )
121 | 
122 |         layers = []
123 |         layers.append(block(self.inplanes, planes, kernel_size=kernel_size,
124 |                             stride=stride, downsample=downsample))
125 |         self.inplanes = planes * block.expansion
126 |         for i in range(1, blocks):
127 |             layers.append(block(self.inplanes, planes, kernel_size=kernel_size))
128 | 
129 |         return nn.Sequential(*layers)
130 | 
131 |     def forward(self, x, bounds=None):
132 |         x = self.conv1(x)
133 |         x = self.bn1(x)
134 |         x = self.relu(x)
135 |         # x = self.maxpool(x)
136 | 
137 |         x = self.layer0(x)
138 |         x = self.layer1(x)
139 |         x = self.layer2(x)
140 |         x = self.layer3(x)
141 |         x = self.layer4(x)
142 |         x = self.layer5(x)
143 | 
144 |         # x = self.avgpool(x)
145 |         # x = x.view(x.size(0), -1)
146 |         # x = self.fc(x)
147 | 
148 |         x = self.conv_merge(x)
149 |         x = torch.squeeze(x, dim=2)
150 |         x = self.vlp(x, bounds=bounds)
151 | 
152 |         return x
153 | 
154 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import time
  4 | import csv
  5 | from datetime import datetime
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.functional as F
 11 | import torch.optim as optim
 12 | from torchvision import datasets, transforms
 13 | from torch.autograd import Variable
 14 | from torchvision.models import resnet
 15 | from torch.optim.lr_scheduler import ReduceLROnPlateau
 16 | 
 17 | from variable_length_pooling import VariableLengthPooling
 18 | 
 19 | def to_float_tensor(numpy_array):
 20 |     # Numpy array -> Tensor
 21 |     return torch.from_numpy(numpy_array).float()
 22 | 
 23 | def to_long_tensor(numpy_array):
 24 |     # Numpy array -> Tensor
 25 |     return torch.from_numpy(numpy_array).long()
 26 | 
 27 | def to_tensor(numpy_array):
 28 |     # Numpy array -> Tensor
 29 |     return torch.from_numpy(numpy_array)
 30 | 
 31 | 
 32 | def to_variable(tensor):
 33 |     # Tensor -> Variable (on GPU if possible)
 34 |     if torch.cuda.is_available():
 35 |         # Tensor -> GPU Tensor
 36 |         tensor = tensor.cuda()
 37 |     return torch.autograd.Variable(tensor)
 38 | 
 39 | def get_onehot(b, n_phones, n_frames):
 40 |     #b = np.array([0, 2, 5])
 41 |     b2 = np.concatenate((b[1:], [n_frames]))
 42 |     o = np.zeros((n_frames, n_phones))
 43 |     p = np.zeros(n_frames, dtype=int)
 44 |     for idx, (s, e) in enumerate(zip(b, b2)):
 45 |         p[s:e] = idx
 46 |     o[range(n_frames), p] = 1
 47 |     # print(o)
 48 |     return o
 49 | 
 50 | class MyDataset(torch.utils.data.Dataset):
 51 |     def __init__(self, x, y, for_conv2d=False):
 52 |         self.x = x
 53 |         self.y = y
 54 |         self.for_conv2d = for_conv2d
 55 |         self.total_phonemes = sum([len(xi[1]) for xi in x])
 56 |         print("n_utters", self.x.shape[0], "total_phonemes", self.total_phonemes)
 57 | 
 58 |     def __getitem__(self, idx):
 59 |         """
 60 |         return: frames, bounds(onehot), labels
 61 |         """
 62 |         frames = self.x[idx][0]
 63 |         bounds = self.x[idx][1]
 64 |         n_phones = len(bounds)
 65 |         n_frames = len(frames)
 66 |         bounds_onehot = get_onehot(bounds, n_phones, n_frames)
 67 |         frames = frames.transpose()
 68 |         if self.for_conv2d:
 69 |             frames = np.expand_dims(frames, axis=0)
 70 |         return to_float_tensor(frames), \
 71 |             to_float_tensor(bounds_onehot), \
 72 |             to_long_tensor(self.y[idx] if self.y is not None else np.array([-1]))
 73 | 
 74 |     def __len__(self):
 75 |         return self.x.shape[0]
 76 | 
 77 | 
 78 | def get_data_loaders(args, for_conv2d=False):
 79 |     print("loading data")
 80 | 
 81 |     # args.batch_size = 1
 82 | 
 83 |     # xtrain = np.load(args.data_dir + '/dev-features.npy')
 84 |     # ytrain = np.load(args.data_dir + '/dev-labels.npy')
 85 |     xtrain = np.load(args.data_dir + '/train-features.npy')
 86 |     ytrain = np.load(args.data_dir + '/train-labels.npy')
 87 |     xdev = np.load(args.data_dir + '/dev-features.npy')
 88 |     ydev = np.load(args.data_dir + '/dev-labels.npy')
 89 | 
 90 |     print("load complete")
 91 |     kwargs = {'num_workers': 3, 'pin_memory': True} if args.cuda else {}
 92 |     train_loader = torch.utils.data.DataLoader(
 93 |         MyDataset(xtrain, ytrain, for_conv2d=for_conv2d),
 94 |         batch_size=args.batch_size, shuffle=True, **kwargs)
 95 |     dev_loader = torch.utils.data.DataLoader(
 96 |         MyDataset(xdev, ydev, for_conv2d=for_conv2d),
 97 |         batch_size=args.batch_size, shuffle=True, **kwargs)
 98 | 
 99 |     return train_loader, dev_loader
100 | 
def weights_init(m):
    """Xavier-normal initialise the weight of ``nn.Conv1d`` modules.

    Intended for ``model.apply``; every other module type is left alone,
    and biases are not touched.
    """
    if not isinstance(m, nn.Conv1d):
        return
    torch.nn.init.xavier_normal(m.weight.data)
105 | 
106 | 
class MyModel(nn.Module):
    """Plain stack of Conv1d + BatchNorm1d + LeakyReLU triples ending in pooling.

    Every convolution uses ``kernel_size // 2`` padding, so the last axis
    keeps its length until ``VariableLengthPooling`` averages it.
    """

    # (in_channels, out_channels, kernel_size) of each triple, in order.
    _CONV_SPECS = (
        (40, 192, 3),
        # A
        (192, 192, 3), (192, 192, 3), (192, 192, 3),
        (192, 192, 1),
        # B
        (192, 192, 3), (192, 192, 3), (192, 256, 3),
        (256, 256, 1),
        # D
        (256, 512, 3), (512, 512, 3), (512, 512, 3),
        (512, 512, 1),
        # E
        (512, 512, 5), (512, 512, 7), (512, 512, 9), (512, 1024, 11),
        # Output head; 46 channels at the end.
        (1024, 1024, 3), (1024, 1024, 3), (1024, 46, 3),
    )

    def __init__(self):
        super(MyModel, self).__init__()
        # True until the first forward() call has finished.
        self.firstrun = True

        stack = []
        for n_in, n_out, width in self._CONV_SPECS:
            stack.append(nn.Conv1d(n_in, n_out, width, padding=width // 2))
            stack.append(nn.BatchNorm1d(n_out))
            stack.append(nn.LeakyReLU(inplace=True))
        stack.append(VariableLengthPooling())
        self.layers = nn.ModuleList(stack)

    def forward(self, input, bounds=None, print_firstrun=False):
        """Apply all layers; ``bounds`` is passed to the final pooling layer.

        On the first call the input size is printed between two banner
        lines; with ``print_firstrun`` each layer's output size is printed
        as well.
        """
        banner = "****************************************"
        h = input
        if self.firstrun:
            print(banner)
            print("input: {}".format(h.size()))

        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            is_pooling = i == last and isinstance(layer, VariableLengthPooling)
            h = layer(h, bounds=bounds) if is_pooling else layer(h)
            if print_firstrun and self.firstrun:
                print("{}: {}".format(layer, h.size()))

        if self.firstrun:
            print(banner)
        self.firstrun = False
        return h
229 | 
230 | 
231 | 
def MyModelResNet2D():
    """Build the Conv2D ResNet: ``BasicBlock``, two blocks per stage, 46 classes."""
    from my_resnet import ResNet,  BasicBlock
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage, num_classes=46)
239 | 
def MyModelResNet1D():
    """Build the Conv1D ResNet: ``BasicBlock``, block counts [2, 2, 2, 2], 46 classes."""
    from my_resnet1d import ResNet, BasicBlock, Bottleneck
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage, num_classes=46)
247 | 
def train(epoch, model, optimizer, train_loader, args):
    """Run one training epoch, logging every ``args.log_interval`` batches.

    The loss is the summed cross entropy over the batch's segments. The
    gradient step uses that sum scaled by ``n_phones / 72``; the logged
    value is the sum divided by ``n_phones``.
    """
    model.train()

    tick = time.time()
    for batch_idx, batch in enumerate(train_loader):
        frames, bounds, labels = batch
        if args.cuda:
            frames, bounds, labels = frames.cuda(), bounds.cuda(), labels.cuda()
        frames, bounds, labels = Variable(frames), Variable(bounds), Variable(labels)
        optimizer.zero_grad()

        output = model(frames, bounds=bounds)

        targets = labels.squeeze()
        n_phones = len(targets)
        scores = output.squeeze().transpose(0, 1)
        loss = F.cross_entropy(scores, targets, size_average=False)
        # Weighted loss. Typical utterance has 72 phonemes
        weighted_loss = loss * n_phones / 72.0

        weighted_loss.backward()
        optimizer.step()
        # average loss per phoneme
        avg_loss = loss / n_phones

        if batch_idx % args.log_interval != 0:
            continue

        seen = batch_idx * len(frames)
        percent = 100. * batch_idx / len(train_loader)
        print('Train Epoch: {} Batch: {} [{}/{} ({:.2f}%, time:{:.2f}s)]\tLoss: {:.6f}'.format(
            epoch, batch_idx, seen, len(train_loader.dataset),
            percent, time.time() - tick,
            avg_loss.data[0]))
        tick = time.time()
297 | 
298 | 
def test(model, test_loader, args):
    """Evaluate ``model`` on ``test_loader``.

    Prints the average loss and accuracy, both normalised by the
    dataset's ``total_phonemes``.

    Returns:
        ``(accuracy_string, accuracy)`` where the string is the accuracy
        as a percentage with four decimals.
    """
    model.eval()
    loss_sum = 0
    correct = 0
    for frames, bounds, labels in test_loader:
        if args.cuda:
            frames, bounds, labels = frames.cuda(), bounds.cuda(), labels.cuda()
        frames = Variable(frames, volatile=True)
        bounds = Variable(bounds)
        labels = Variable(labels)

        scores = model(frames, bounds=bounds).squeeze().transpose(0, 1)
        targets = labels.squeeze()
        # sum up batch loss
        loss_sum += F.cross_entropy(scores, targets, size_average=False).data[0]
        # index of the highest score per segment
        pred = scores.data.max(1, keepdim=True)[1]
        correct += pred.eq(targets.data.view_as(pred)).cpu().sum()

    total = test_loader.dataset.total_phonemes
    loss_sum /= total
    accuracy = correct / total
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(
        loss_sum, correct, total,
        100 * accuracy))
    return "{:.4f}%".format(100. * correct / total), accuracy
323 | 
324 | 
def main(args):
    """Train the Conv1D ResNet and save its weights after every epoch.

    The learning rate is halved by ``ReduceLROnPlateau`` when the dev
    accuracy stops improving. Weight files are written to
    ``args.weights_dir`` and named after the epoch and accuracy.
    """
    print(args)

    seed = args.seed
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    train_loader, test_loader = get_data_loaders(args, for_conv2d=False)

    model = MyModelResNet1D()
    if args.cuda:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=5e-5)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,
                                  verbose=True, threshold_mode='abs',
                                  threshold=0.01, min_lr=1e-6)

    weights_dir = args.weights_dir
    for epoch in range(1, args.epochs + 1):
        print(datetime.now())
        train(epoch, model, optimizer, train_loader, args)
        acc_str, acc = test(model, test_loader, args)
        scheduler.step(acc)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        target = "{}/{:03d}_{}.w".format(weights_dir, epoch, acc_str)
        torch.save(model.state_dict(), target)
355 | 
356 | 
def predict_batch(model, x, bounds, args):
    """Return the index of the highest-scoring class for each segment.

    The model and inputs are moved to the GPU when ``args.cuda`` is set,
    and the model is switched to eval mode.

    Returns:
        A tensor with one column holding the argmax indices.
    """
    if args.cuda:
        model.cuda()
        x, bounds = x.cuda(), bounds.cuda()
    model.eval()
    scores = model(Variable(x, volatile=True), bounds=Variable(bounds))
    scores = scores.squeeze().transpose(0, 1)
    _, best = scores.data.max(1, keepdim=True)
    return best
366 | 
367 | 
def get_test_data_loaders(args):
    """Load the unlabeled test features and wrap them in an unshuffled loader."""
    print("loading data")
    xtest = np.load(args.data_dir + '/test-features.npy')
    print("load complete")

    # Pinned memory is only requested with CUDA.
    if args.cuda:
        kwargs = {'pin_memory': True}
    else:
        kwargs = {}

    test_set = MyDataset(xtest, None)
    return torch.utils.data.DataLoader(
        test_set, batch_size=args.batch_size, shuffle=False, **kwargs)
380 | 
381 | 
def predict(args, csv_fpath, weights_fpath):
    """Write per-segment predictions for the test set to a CSV file.

    Args:
        args: parsed command-line arguments (``cuda``, ``data_dir``,
            ``batch_size`` and ``log_interval`` are read).
        csv_fpath: path of the CSV file to write; columns ``Id``, ``Label``.
        weights_fpath: path of a state dict saved by ``main``.

    NOTE(review): this loads the Conv2D ResNet, while ``main`` trains the
    Conv1D one and ``get_test_data_loaders`` builds the dataset without
    ``for_conv2d`` - confirm which architecture the weights belong to.
    """
    model = MyModelResNet2D()
    model.load_state_dict(torch.load(weights_fpath))
    test_loader = get_test_data_loaders(args)
    with open(csv_fpath, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['Id', 'Label'])
        writer.writeheader()
        cnt = 0
        # MyDataset yields (frames, bounds, labels); the labels are the
        # [-1] placeholder for test data and are ignored here.
        for batch, (frames, bounds, _) in enumerate(test_loader):
            if batch % args.log_interval == 0:
                print("batch", batch)
            yhat = predict_batch(model, frames, bounds, args)
            for i, y in enumerate(yhat):
                # int() so a plain number is written whether indexing
                # yields a Python int or a one-element tensor.
                writer.writerow({"Id": cnt + i, "Label": int(y.cpu()[0])})
            cnt += len(yhat)
    print("done")
398 | 
399 | 
# Command-line interface. Help texts interpolate the real defaults via
# argparse's %(default)s so they cannot drift from the values below.
parser = argparse.ArgumentParser(
    description='Phoneme classification with ResNets and variable-length pooling')
parser.add_argument('--batch-size', type=int, default=1, metavar='N',
                    help='input batch size for training (default: %(default)s)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: %(default)s)')
parser.add_argument('--epochs', type=int, default=100, metavar='N',
                    help='number of epochs to train (default: %(default)s)')
parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                    help='learning rate (default: %(default)s)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: %(default)s)')
parser.add_argument('--l2-reg', type=float, default=0.001,
                    help='l2 regularization (default: %(default)s)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: %(default)s)')
parser.add_argument('--log-interval', type=int, default=300, metavar='N',
                    help='how many batches to wait before logging training status')
# parser.add_argument('--K', type=int, default=10, metavar='N',
#                     help='window size')
parser.add_argument('--data-dir', type=str, default='./data/',
                    help='data directory (default: %(default)s)')
parser.add_argument('--weights-dir', type=str, default='./weights/',
                    help='directory for saved model weights (default: %(default)s)')
425 | 
426 | 
if __name__ == "__main__":
    # Script entry point: parse the command line and start training.
    print(torch.__version__)
    args = parser.parse_args()
    # CUDA is used only when it is available and not disabled with --no-cuda.
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # Overrides any --batch-size given on the command line.
    # NOTE(review): presumably required because utterances differ in
    # length and train()/test() squeeze away the batch axis - confirm.
    args.batch_size = 1
    main(args)
    #predict(args, './e1/submission.csv', './e1/weights/012_82.1546%.w')
434 | 


--------------------------------------------------------------------------------