├── .gitignore
├── LICENSE
├── README.md
├── SI1657.WAV
├── __init__.py
├── check_layer.py
├── example.py
├── ops.py
├── pytorch_model.py
├── tf2pytorch.py
└── tf_model.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Jiguo Li
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SoundNet_Pytorch
2 | Converting the pretrained TensorFlow SoundNet model to PyTorch.
3 | 
4 | ![from soundnet](https://camo.githubusercontent.com/0b88af5c13ba987a17dcf90cd58816cf8ef04554/687474703a2f2f70726f6a656374732e637361696c2e6d69742e6564752f736f756e646e65742f736f756e646e65742e6a7067)
5 | 
6 | # Introduction
7 | This code converts the pretrained [tensorflow soundnet model](https://github.com/eborboihuc/SoundNet-tensorflow) to a PyTorch model, so it contains no training code for SoundNet. The pretrained PyTorch SoundNet model can be found [here](https://drive.google.com/file/d/1-PhHutIYV9Oi2DhDZL2h1Myu84oGLI81/view?usp=sharing).
8 | 
9 | # Prerequisites
10 | 1. tensorflow (cpu or gpu)
11 | 2. python 3.6 with numpy
12 | 3. pytorch 0.4+
13 | 4. weight file: Google Drive: https://drive.google.com/drive/folders/1zjNiuLgZ1cjCzF80P4mlYe4KSGGOFlta?usp=sharing; Baidu Netdisk: https://pan.baidu.com/s/1v_K2pJvo0KE38EZ__WZJWg (extraction code: iz4h)
14 | 
15 | 
16 | # How to use
17 | 1. prepare the code
18 | ```
19 | git clone https://github.com/smallflyingpig/SoundNet_Pytorch.git
20 | cd SoundNet_Pytorch
21 | ```
22 | 2. prepare the tensorflow soundnet model parameters: download [sound8.npy](https://drive.google.com/uc?export=download&id=0B9wE6h4m--wjR015M1RLZW45OEU), which is provided by [eborboihuc](https://github.com/eborboihuc/SoundNet-tensorflow), and save it in the current folder
23 | 3. install the prerequisites
24 | 4. run
25 | ```
26 | python tf2pytorch.py --tf_param_path ./sound8.npy --pytorch_param_path ./sound8.pth
27 | ```
28 | 5. test the result
29 | 
30 | Download the input demo data from [demo.npy](https://drive.google.com/uc?export=download&id=0B9wE6h4m--wjcEtqQ3VIM1pvZ3c) and save it to the current folder. We compare the average feature error at each convolution block (7 conv blocks in total) and at the object/scene classification predictions (2 layers), giving 9 error values in total.
31 | ```
32 | python check_layer.py --tf_param_path ./sound8.npy --pytorch_param_path ./sound8.pth --input_demo_data ./demo.npy
33 | ```
34 | The expected output:
35 | ```
36 | layer error:
37 | [-1.3113022e-06, 0.0, 0.0, 0.0, 1.4901161e-08, 0.0, -6.9849193e-10, 4.7683716e-07, 7.1525574e-07]
38 | ```
39 | Errors this small indicate that the model conversion succeeded.
40 | 
41 | 6. extract features
42 | Once the PyTorch model has been saved (as ./sound8.pth), run the following command to extract features:
43 | ```
44 | python example.py
45 | ```
46 | # Acknowledgments
47 | Code for the soundnet tensorflow model is ported from [soundnet_tensorflow](https://github.com/eborboihuc/SoundNet-tensorflow). Thanks for their work!
48 | 
49 | # FAQs
50 | Feel free to mail me (jiguo.li@vipl.ict.ac.cn or jgli@pku.edu.cn) if you have any questions about this project.
51 | # Reference
52 | 1. Yusuf Aytar, Carl Vondrick, and Antonio Torralba. "SoundNet: Learning sound representations from unlabeled video." Advances in Neural Information Processing Systems. 2016.
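53 | 
54 | # Appendix: minimal feature extraction sketch
55 | For quick reference, the snippet below sketches step 6 end to end. It assumes ./sound8.pth has already been produced by tf2pytorch.py (step 4); the random waveform is only a placeholder for real audio.
56 | ```
57 | import torch
58 | from pytorch_model import SoundNet8_pytorch
59 | 
60 | model = SoundNet8_pytorch()
61 | model.load_state_dict(torch.load("./sound8.pth"))
62 | model.eval()
63 | 
64 | # SoundNet expects mono waveforms shaped (batch, 1, n_samples, 1)
65 | wav = torch.randn(1, 1, 22050 * 20, 1)  # placeholder: 20 s of noise at 22050 Hz
66 | with torch.no_grad():
67 |     feats = model.extract_feat(wav)  # 9 outputs: conv1..conv7, conv8, conv8_2
68 | for idx, f in enumerate(feats):
69 |     print(f"output {idx}: {f.shape}")
70 | ```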
71 | 
72 | 
--------------------------------------------------------------------------------
/SI1657.WAV:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smallflyingpig/SoundNet_Pytorch/806af81cd7fbabde41b6f4991fccc7c61e396c84/SI1657.WAV
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smallflyingpig/SoundNet_Pytorch/806af81cd7fbabde41b6f4991fccc7c61e396c84/__init__.py
--------------------------------------------------------------------------------
/check_layer.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import os
4 | import tensorflow as tf
5 | 
6 | tf_feature_layers = [4,8,11,14,18,21,24,25,26]  # indices into SoundNet8_tf.layers: the conv1-conv7 block outputs plus conv8/conv8_2
7 | 
8 | local_config = {
9 |     'batch_size': 1,
10 |     'eps': 1e-5,
11 |     'sample_rate': 22050,
12 |     'load_size': 22050*20,
13 |     'name_scope': 'SoundNet',
14 |     'phase': 'extract',
15 | }
16 | 
17 | # # Init. Session
18 | # sess_config = tf.ConfigProto()
19 | # sess_config.allow_soft_placement=True
20 | # # sess_config.gpu_options.allow_growth = True
21 | # 
22 | # 
23 | # with tf.Session(config=sess_config) as session:
24 | #     # Build model
25 | #     model = Model(session, config=local_config, param_G=param_G)
26 | #     init = tf.global_variables_initializer()
27 | #     session.run(init)
28 | # 
29 | #     model.load()
30 | # 
31 | #     for idx, sound_sample in enumerate(sound_samples):
32 | #         output = extract_feat(model, sound_sample, args)
33 | 
34 | def extract_tf_feature(input_data:np.ndarray, tf_param_path:str)->list:
35 |     from tf_model import SoundNet8_tf
36 | 
37 |     os.environ["CUDA_VISIBLE_DEVICES"] = '0'
38 |     tf_param = np.load(tf_param_path, encoding='latin1', allow_pickle=True).item()
39 |     sess_config = tf.ConfigProto()
40 |     sess_config.allow_soft_placement=True
41 |     with tf.Session(config=sess_config) as session:
42 |         # Build model
43 |         model = SoundNet8_tf(session, config=local_config, param_G=tf_param)
44 |         init = tf.global_variables_initializer()
45 |         session.run(init)
46 |         model.load()
47 |         # Demo
48 |         sound_input = np.reshape(input_data, [1, -1, 1, 1])
49 |         feed_dict = {model.sound_input_placeholder: sound_input}
50 | 
51 |         feature_all = []
52 |         # Forward
53 |         for idx in tf_feature_layers:
54 |             feature = session.run(model.layers[idx], feed_dict=feed_dict)
55 |             feature_all.append(feature)
56 |     return feature_all
57 | 
58 | 
59 | def extract_pytorch_feature(input_data:np.ndarray, pytorch_param_path:str)->list:
60 |     import torch
61 |     from pytorch_model import SoundNet8_pytorch
62 |     # a "point invalid" error occurs if these imports are placed at the top of this file
63 |     model = SoundNet8_pytorch()
64 | 
65 |     model.load_state_dict(torch.load(pytorch_param_path))
66 | 
67 |     data = torch.from_numpy(input_data).view(1,1,-1,1)
68 |     model.eval()
69 |     with torch.no_grad():
70 |         feature_all = model.extract_feat(data)
71 |     return feature_all
72 | 
73 | 
74 | def get_parser():
75 |     parser = argparse.ArgumentParser("check")
76 |     parser.add_argument("--input_demo_data", type=str, default="./demo.npy")
77 |     parser.add_argument("--tf_param_path", type=str, default="./sound8.npy")
78 |     parser.add_argument("--pytorch_param_path", type=str, default="./sound8.pth")
79 | 
80 |     args, _ = parser.parse_known_args()
81 |     return args
82 | 
83 | 
84 | def main(args):
85 |     input_data = np.load(args.input_demo_data)
86 |     print("extract features using tensorflow model...")
87 |     feature_all_tf = extract_tf_feature(input_data, args.tf_param_path)
88 |     print("extract features using pytorch model...")
89 |     feature_all_pytorch = extract_pytorch_feature(input_data, args.pytorch_param_path)
90 |     # check param
91 |     layer_error_all = []
92 |     for idx, (feat_tf, feat_pytorch) in enumerate(zip(feature_all_tf, feature_all_pytorch)):
93 |         layer_error = feat_tf.mean()-feat_pytorch.mean()
94 |         layer_error_all.append(layer_error)
95 |     print("layer error:")
96 |     print(layer_error_all)
97 | 
98 | # test
99 | if __name__=="__main__":
100 |     args = get_parser()
101 |     main(args)
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from pytorch_model import SoundNet8_pytorch
2 | import torch, torchaudio
3 | torchaudio.set_audio_backend("soundfile")
4 | 
5 | audio_path = "./SI1657.WAV"
6 | 
7 | if __name__=="__main__":
8 |     model = SoundNet8_pytorch()
9 |     model.load_state_dict(torch.load("./sound8.pth"))
10 |     wav, sr = torchaudio.load(audio_path)
11 |     print(wav.shape)
12 | 
13 |     wav = wav.unsqueeze(1).unsqueeze(-1).repeat(1,1,8,1)  # tile the waveform 8x along time; the conv stack errors out when the wav is too short
14 |     feats = model.extract_feat(wav)
15 |     # 9 outputs: conv1-conv7 features plus the conv8/conv8_2 predictions
16 |     for idx, f in enumerate(feats):
17 |         print(f"feature shape for layer {idx}: {f.shape}")
--------------------------------------------------------------------------------
/ops.py:
--------------------------------------------------------------------------------
1 | # TensorFlow version of NIPS2016 soundnet
2 | # copy from: https://github.com/eborboihuc/SoundNet-tensorflow/blob/master/ops.py
3 | import tensorflow as tf
4 | 
5 | def conv2d(prev_layer, in_ch, out_ch, k_h=1, k_w=1, d_h=1, d_w=1, p_h=0, p_w=0, pad='VALID', name_scope='conv'):
6 |     with tf.variable_scope(name_scope) as scope:
7 |         # h x w x input_channel x output_channel
8 |         w_conv = tf.get_variable('weights', [k_h, k_w, in_ch, out_ch],
9 |                 initializer=tf.truncated_normal_initializer(0.0, stddev=0.01))
10 |         b_conv = tf.get_variable('biases', [out_ch],
11 |                 initializer=tf.constant_initializer(0.0))
12 | 
13 |         padded_input = tf.pad(prev_layer, [[0, 0], [p_h, p_h], [p_w, p_w], [0, 0]], "CONSTANT") if pad == 'VALID' \
14 |                 else prev_layer
15 | 
16 |         output = tf.nn.conv2d(padded_input, w_conv,
17 |                 [1, d_h, d_w, 1], padding=pad, name='z') + b_conv
18 | 
19 |     return output
20 | 
21 | 
22 | def batch_norm(prev_layer, out_ch, eps, name_scope='conv'):
23 |     with tf.variable_scope(name_scope) as scope:
24 |         #mu_conv, var_conv = tf.nn.moments(prev_layer, [0, 1, 2], keep_dims=False)
25 |         mu_conv = tf.get_variable('mean', [out_ch],
26 |                 initializer=tf.constant_initializer(0))
27 |         var_conv = tf.get_variable('var', [out_ch],
28 |                 initializer=tf.constant_initializer(1))
29 |         gamma_conv = tf.get_variable('gamma', [out_ch],
30 |                 initializer=tf.constant_initializer(1))
31 |         beta_conv = tf.get_variable('beta', [out_ch],
32 |                 initializer=tf.constant_initializer(0))
33 |         output = tf.nn.batch_normalization(prev_layer, mu_conv,
34 |                 var_conv, beta_conv, gamma_conv, eps, name='batch_norm')
35 | 
36 |     return output
37 | 
38 | 
39 | def relu(prev_layer, name_scope='conv'):
40 |     with tf.variable_scope(name_scope) as scope:
41 |         return tf.nn.relu(prev_layer, name='a')
42 | 
43 | 
44 | def maxpool(prev_layer, k_h=1, k_w=1, d_h=1, d_w=1, name_scope='conv'):
45 |     with tf.variable_scope(name_scope) as scope:
46 |         return tf.nn.max_pool(prev_layer,
47 |                 [1, k_h, k_w, 1], [1, d_h, d_w, 1], padding='VALID', name='maxpool')
--------------------------------------------------------------------------------
/pytorch_model.py:
--------------------------------------------------------------------------------
1 | 
2 | import torch
3 | import torch.nn as nn
4 | 
5 | class SoundNet8_pytorch(nn.Module):
6 |     def __init__(self):
7 |         super(SoundNet8_pytorch, self).__init__()
8 | 
9 |         self.define_module()
10 | 
11 |     def define_module(self):
12 |         self.conv1 = nn.Sequential(
13 |             nn.Conv2d(1,16, (64,1), (2,1), (32,0), bias=True),
14 |             nn.BatchNorm2d(16),
15 |             nn.ReLU(inplace=True),
16 |             nn.MaxPool2d((8,1), (8,1))
17 |         )
18 |         self.conv2 = nn.Sequential(
19 |             nn.Conv2d(16, 32, (32,1), (2,1), (16,0), bias=True),
20 |             nn.BatchNorm2d(32),
21 |             nn.ReLU(inplace=True),
22 |             nn.MaxPool2d((8,1),(8,1))
23 |         )
24 |         self.conv3 = nn.Sequential(
25 |             nn.Conv2d(32, 64, (16,1), (2,1), (8,0), bias=True),
26 |             nn.BatchNorm2d(64),
27 |             nn.ReLU(inplace=True)
28 |         )
29 |         self.conv4 = nn.Sequential(
30 |             nn.Conv2d(64, 128, (8,1), (2,1), (4,0), bias=True),
31 |             nn.BatchNorm2d(128),
32 |             nn.ReLU(inplace=True),
33 |         )
34 |         self.conv5 = nn.Sequential(
35 |             nn.Conv2d(128, 256, (4,1),(2,1),(2,0), bias=True),
36 |             nn.BatchNorm2d(256),
37 |             nn.ReLU(inplace=True),
38 |             nn.MaxPool2d((4,1),(4,1))
39 |         )  # slight numerical difference here (0.24751323 vs 0.2474); the padding error has been debugged
40 |         self.conv6 = nn.Sequential(
41 |             nn.Conv2d(256, 512, (4,1), (2,1), (2,0), bias=True),
42 |             nn.BatchNorm2d(512),
43 |             nn.ReLU(inplace=True)
44 |         )
45 |         self.conv7 = nn.Sequential(
46 |             nn.Conv2d(512, 1024, (4,1), (2,1), (2,0), bias=True),
47 |             nn.BatchNorm2d(1024),
48 |             nn.ReLU(inplace=True)
49 |         )
50 |         self.conv8 = nn.Sequential(
51 |             nn.Conv2d(1024, 1000, (8,1), (2,1), (0,0), bias=True),
52 |         )
53 |         self.conv8_2 = nn.Sequential(
54 |             nn.Conv2d(1024, 401, (8,1), (2,1), (0,0), bias=True)
55 |         )
56 | 
57 |     def forward(self, x):
58 |         for net in [self.conv1, self.conv2, self.conv3, self.conv4, self.conv5, self.conv6, self.conv7]:
59 |             x = net(x)
60 |         object_pred = self.conv8(x)
61 |         scene_pred = self.conv8_2(x)
62 |         return object_pred, scene_pred
63 | 
64 |     def extract_feat(self,x:torch.Tensor)->list:
65 |         output_list = []
66 |         for net in [self.conv1, self.conv2, self.conv3, self.conv4, self.conv5, self.conv6, self.conv7]:
67 |             x = net(x)
68 |             output_list.append(x.detach().cpu().numpy())
69 |         object_pred = self.conv8(x)
70 |         output_list.append(object_pred.detach().cpu().numpy())
71 |         scene_pred = self.conv8_2(x)
72 |         output_list.append(scene_pred.detach().cpu().numpy())
73 |         return output_list
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
--------------------------------------------------------------------------------
/tf2pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import torch
4 | from collections import OrderedDict
5 | # reference: https://github.com/smallflyingpig/SoundNet-tensorflow
6 | # torch model: OrderedDict; the keys below come from an unrelated classifier model and are kept only as a reference for the state_dict format:
7 | # odict_keys(['BN1.weight', 'BN1.bias', 'BN1.running_mean', 'BN1.running_var', 'BN1.num_batches_tracked', 'classifier.0.weight', 'classifier.1.weight', 'classifier.1.bias', 'classifier.1
8 | # .running_mean', 'classifier.1.running_var', 'classifier.1.num_batches_tracked', 'classifier.4.conv1.weight', 'classifier.4.bn1.weight', 'classifier.4.bn1.bias', 'classifier.4.bn1.running_mean'
9 | # , 'classifier.4.bn1.running_var', 'classifier.4.bn1.num_batches_tracked', 'classifier.4.conv2.weight', 'classifier.4.bn2.weight', 'classifier.4.bn2.bias', 'classifier.4.bn2.running_mean', 'cla
10 | # ssifier.4.bn2.running_var', 'classifier.4.bn2.num_batches_tracked', 'classifier.4.conv3.weight', 'classifier.4.bn3.weight', 'classifier.4.bn3.bias', 'classifier.4.bn3.running_mean', 'classifie
11 | # r.4.bn3.running_var', 'classifier.4.bn3.num_batches_tracked', 'classifier.4.downsample.0.weight', 'classifier.4.downsample.1.weight', 'classifier.4.downsample.1.bias', 'classifier.4.downsample.1.running_mean', 'classifier.4.downsample.1.running_var', 'classifier.4.downsample.1.num_batches_tracked', 'classifier.6.conv1.weight', 'classifier.6.bn1.weight', 'classifier.6.bn1.bias', 'classifier.6.bn1.running_mean', 'classifier.6.bn1.running_var', 'classifier.6.bn1.num_batches_tracked', 'classifier.6.conv2.weight', 'classifier.6.bn2.weight', 'classifier.6.bn2.bias', 'classifier.6.bn2.running_mean', 'classifier.6.bn2.running_var', 'classifier.6.bn2.num_batches_tracked', 'classifier.6.conv3.weight', 'classifier.6.bn3.weight', 'classifier.6.bn3.bias', 'classifier.6.bn3.running_mean', 'classifier.6.bn3.running_var', 'classifier.6.bn3.num_batches_tracked', 'classifier.6.downsample.0.weight', 'classifier.6.downsample.1.weight', 'classifier.6.downsample.1.bias', 'classifier.6.downsample.1.running_mean', 'classifier.6.downsample.1.running_var', 'classifier.6.downsample.1.num_batches_tracked', 'classifier.8.weight'])
12 | 
13 | # tensorflow model: numpy dict:
14 | # dict_keys(['conv3', 'conv2', 'conv1', 'conv7', 'conv6', 'conv5', 'conv4', 'conv8_2', 'conv8'])
15 | # In [24]: data_dict['conv1'].keys()
16 | # Out[24]: dict_keys(['beta', 'weights', 'biases', 'var', 'gamma', 'mean'])  # (weights, biases) for the conv layer, (mean, var, gamma, beta) for the BN layer
17 | 
18 | # (block_name, layer_idx, (tf_param_name, pytorch_param_name))
19 | _layer_param_dict = {
20 |     'conv_layer':[('weights', 'weight', (3,2,0,1)), ('biases', 'bias')],  # (H,W,in_channel,out_channel) --> (out_channel,in_channel,H,W)
21 |     'batch_norm_layer':[('gamma','weight'), ('beta','bias'), ('mean','running_mean'), ('var','running_var'), (100.0, 'num_batches_tracked')]
22 | }
23 | 
24 | g_param_dict = [
25 |     ('conv1', '0', _layer_param_dict['conv_layer']),
26 |     ('conv1', '1', _layer_param_dict['batch_norm_layer']),
27 |     ('conv2', '0', _layer_param_dict['conv_layer']),
28 |     ('conv2', '1', _layer_param_dict['batch_norm_layer']),
29 |     ('conv3', '0', _layer_param_dict['conv_layer']),
30 |     ('conv3', '1', _layer_param_dict['batch_norm_layer']),
31 |     ('conv4', '0', _layer_param_dict['conv_layer']),
32 |     ('conv4', '1', _layer_param_dict['batch_norm_layer']),
33 |     ('conv5', '0', _layer_param_dict['conv_layer']),
34 |     ('conv5', '1', _layer_param_dict['batch_norm_layer']),
35 |     ('conv6', '0', _layer_param_dict['conv_layer']),
36 |     ('conv6', '1', _layer_param_dict['batch_norm_layer']),
37 |     ('conv7', '0', _layer_param_dict['conv_layer']),
38 |     ('conv7', '1', _layer_param_dict['batch_norm_layer']),
39 |     ('conv8', '0', _layer_param_dict['conv_layer']),
40 |     ('conv8_2', '0', _layer_param_dict['conv_layer'])
41 | ]
42 | 
43 | def convert_tf2pytorch(tf_param_dict, param_dict):
44 |     torch_param_list = []
45 |     for param in param_dict:
46 |         block_name, layer_idx = param[0], param[1]
47 |         layer_param_list = param[2]
48 |         for layer_param in layer_param_list:
49 |             param_name_tf, param_name_torch = layer_param[0], layer_param[1]
50 | 
51 |             torch_param_name = '.'.join([block_name, layer_idx, param_name_torch])
52 | 
53 |             if isinstance(param_name_tf, str):
54 |                 torch_param_value = tf_param_dict[block_name][param_name_tf]
55 |             elif isinstance(param_name_tf, (float, int)):
56 |                 torch_param_value = param_name_tf
57 |             else:
58 |                 raise ValueError
59 |             torch_param_value = torch.tensor(torch_param_value, device='cpu')
60 |             if len(layer_param)>2:
61 |                 transpose_idx = layer_param[2]
62 |                 torch_param_value = torch_param_value.permute(transpose_idx)
63 |             torch_param_list.append((torch_param_name, torch_param_value))
64 | 
65 |     return OrderedDict(torch_param_list)
66 | 
67 | 
68 | def get_parser():
69 |     parser = argparse.ArgumentParser("convert")
70 |     parser.add_argument("--tf_param_path", type=str, default="./sound8.npy", help="")
71 |     parser.add_argument("--pytorch_param_path", type=str, default="./sound8.pth", help="")
72 | 
73 |     args, _ = parser.parse_known_args()
74 |     return args
75 | 
76 | def main(args):
77 |     tf_param_path = args.tf_param_path
78 |     pytorch_param_path = args.pytorch_param_path
79 |     tf_param = np.load(tf_param_path, encoding='latin1', allow_pickle=True).tolist()
80 |     print("load tf param:{}".format(tf_param_path))
81 |     pytorch_param = convert_tf2pytorch(tf_param, g_param_dict)
82 |     torch.save(pytorch_param, pytorch_param_path)
83 |     print("save pytorch model:{}".format(pytorch_param_path))
84 | 
85 | 
86 | if __name__=="__main__":
87 |     args = get_parser()
88 |     main(args)
--------------------------------------------------------------------------------
/tf_model.py:
--------------------------------------------------------------------------------
1 | # TensorFlow version of NIPS2016 soundnet
2 | # copy from: https://github.com/eborboihuc/SoundNet-tensorflow/blob/master/model.py
3 | import sys
4 | import numpy as np
5 | import tensorflow as tf
6 | from ops import batch_norm, conv2d, relu, maxpool
7 | 
8 | # Make xrange compatible in both Python 2, 3
9 | try:
10 |     xrange
11 | except NameError:
12 |     xrange = range
13 | 
14 | local_config = {
15 |     'batch_size': 1,
16 |     'eps': 1e-5,
17 |     'name_scope': 'SoundNet',
18 | }
19 | 
20 | class SoundNet8_tf():
21 |     def __init__(self, session, config=local_config, param_G=None):
22 |         # Print config
23 |         for key in config: print("{}:{}".format(key, config[key]))
24 | 
25 |         self.sess = session
26 |         self.config = config
27 |         self.param_G = param_G
28 | 
29 |         # Placeholder
30 |         self.add_placeholders()
31 | 
32 |         # Generator
33 |         self.add_generator(name_scope=self.config['name_scope'])
34 | 
35 | 
36 |     def add_placeholders(self):
37 |         self.sound_input_placeholder = tf.placeholder(tf.float32,
38 |                 shape=[self.config['batch_size'], None, 1, 1])  # batch x h x w x channel
39 | 
40 | 
41 |     def add_generator(self, name_scope='SoundNet'):
42 |         with tf.variable_scope(name_scope) as scope:
43 |             self.layers = {}
44 | 
45 |             # Stream one: conv1 ~ conv7
46 |             self.layers[1] = conv2d(self.sound_input_placeholder, 1, 16, k_h=64, d_h=2, p_h=32, name_scope='conv1')
47 |             self.layers[2] = batch_norm(self.layers[1], 16, self.config['eps'], name_scope='conv1')
48 |             self.layers[3] = relu(self.layers[2], name_scope='conv1')
49 |             self.layers[4] = maxpool(self.layers[3], k_h=8, d_h=8, name_scope='conv1')
50 | 
51 |             self.layers[5] = conv2d(self.layers[4], 16, 32, k_h=32, d_h=2, p_h=16, name_scope='conv2')
52 |             self.layers[6] = batch_norm(self.layers[5], 32, self.config['eps'], name_scope='conv2')
53 |             self.layers[7] = relu(self.layers[6], name_scope='conv2')
54 |             self.layers[8] = maxpool(self.layers[7], k_h=8, d_h=8, name_scope='conv2')
55 | 
56 |             self.layers[9] = conv2d(self.layers[8], 32, 64, k_h=16, d_h=2, p_h=8, name_scope='conv3')
57 |             self.layers[10] = batch_norm(self.layers[9], 64, self.config['eps'], name_scope='conv3')
58 |             self.layers[11] = relu(self.layers[10], name_scope='conv3')
59 | 
60 |             self.layers[12] = conv2d(self.layers[11], 64, 128, k_h=8, d_h=2, p_h=4, name_scope='conv4')
61 |             self.layers[13] = batch_norm(self.layers[12], 128, self.config['eps'], name_scope='conv4')
62 |             self.layers[14] = relu(self.layers[13], name_scope='conv4')
63 | 
64 |             self.layers[15] = conv2d(self.layers[14], 128, 256, k_h=4, d_h=2, p_h=2, name_scope='conv5')
65 |             self.layers[16] = batch_norm(self.layers[15], 256, self.config['eps'], name_scope='conv5')
66 |             self.layers[17] = relu(self.layers[16], name_scope='conv5')
67 |             self.layers[18] = maxpool(self.layers[17], k_h=4, d_h=4, name_scope='conv5')
68 | 
69 |             self.layers[19] = conv2d(self.layers[18], 256, 512, k_h=4, d_h=2, p_h=2, name_scope='conv6')
70 |             self.layers[20] = batch_norm(self.layers[19], 512, self.config['eps'], name_scope='conv6')
71 |             self.layers[21] = relu(self.layers[20], name_scope='conv6')
72 | 
73 |             self.layers[22] = conv2d(self.layers[21], 512, 1024, k_h=4, d_h=2, p_h=2, name_scope='conv7')
74 |             self.layers[23] = batch_norm(self.layers[22], 1024, self.config['eps'], name_scope='conv7')
75 |             self.layers[24] = relu(self.layers[23], name_scope='conv7')
76 | 
77 |             # Split one: conv8, conv8_2
78 |             self.layers[25] = conv2d(self.layers[24], 1024, 1000, k_h=8, d_h=2, name_scope='conv8')
79 |             self.layers[26] = conv2d(self.layers[24], 1024, 401, k_h=8, d_h=2, name_scope='conv8_2')
80 | 
81 | 
82 |     def load(self):
83 |         if self.param_G is None: return False
84 |         data_dict = self.param_G
85 |         for key in data_dict:
86 |             with tf.variable_scope(self.config['name_scope'] + '/' + key, reuse=True):
87 |                 for subkey in data_dict[key]:
88 |                     try:
89 |                         var = tf.get_variable(subkey)
90 |                         self.sess.run(var.assign(data_dict[key][subkey]))
91 |                         print('Assign pretrain model {} to {}'.format(subkey, key))
92 |                     except:
93 |                         print('Ignore {}'.format(key))
94 |         self.param_G.clear()
95 |         return True
96 | 
97 | 
98 | if __name__ == '__main__':
99 | 
100 |     layer_min = int(sys.argv[1])
101 |     layer_max = int(sys.argv[2]) if len(sys.argv) > 2 else layer_min + 1
102 | 
103 |     # Load pre-trained model
104 |     G_name = './models/sound8.npy'
105 |     param_G = np.load(G_name, encoding='latin1', allow_pickle=True).item()
106 |     dump_path = './output/'
107 | 
108 |     with tf.Session() as session:
109 |         # Build model
110 |         model = SoundNet8_tf(session, config=local_config, param_G=param_G)
111 |         init = tf.global_variables_initializer()
112 |         session.run(init)
113 | 
114 |         model.load()
115 | 
116 |         # Demo
117 |         sound_input = np.reshape(np.load('data/demo.npy', encoding='latin1'), [local_config['batch_size'], -1, 1, 1])
118 |         feed_dict = {model.sound_input_placeholder: sound_input}
119 | 
120 |         # Forward
121 |         for idx in xrange(layer_min, layer_max):
122 |             feature = session.run(model.layers[idx], feed_dict=feed_dict)
123 |             np.save(dump_path + 'tf_fea{}.npy'.format(str(idx).zfill(2)), np.squeeze(feature))
124 |             print("Save layer {} with shape {} as {}tf_fea{}.npy".format(idx, np.squeeze(feature).shape, dump_path, str(idx).zfill(2)))
125 | 
126 | 
--------------------------------------------------------------------------------
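The permute index (3,2,0,1) in tf2pytorch.py above is the crux of the conversion: TensorFlow stores conv kernels as (H, W, in_ch, out_ch), while PyTorch's nn.Conv2d expects (out_ch, in_ch, H, W). The standalone sketch below illustrates this on a random array; the array and its shape (matching conv1) are illustrative only, not real weights.
```
import numpy as np
import torch

# TF conv kernels are laid out HWIO: (H, W, in_ch, out_ch);
# PyTorch nn.Conv2d weights are OIHW: (out_ch, in_ch, H, W).
w_tf = np.random.randn(64, 1, 1, 16).astype(np.float32)  # e.g. conv1: k_h=64, 1 -> 16 channels
w_torch = torch.from_numpy(w_tf).permute(3, 2, 0, 1).contiguous()
assert tuple(w_torch.shape) == (16, 1, 64, 1)
```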