├── .gitignore
├── LICENSE
├── README.md
├── README_en.md
├── configs
    ├── augmentation.yml
    ├── cam++.yml
    ├── ecapa_tdnn.yml
    ├── eres2net.yml
    ├── panns.yml
    ├── res2net.yml
    ├── resnet_se.yml
    └── tdnn.yml
├── create_data.py
├── docs
    └── images
    │   ├── image1.png
    │   └── log.jpg
├── eval.py
├── extract_features.py
├── infer.py
├── infer_record.py
├── macls
    ├── __init__.py
    ├── data_utils
    │   ├── __init__.py
    │   ├── collate_fn.py
    │   ├── featurizer.py
    │   └── reader.py
    ├── metric
    │   ├── __init__.py
    │   └── metrics.py
    ├── models
    │   ├── __init__.py
    │   ├── campplus.py
    │   ├── ecapa_tdnn.py
    │   ├── eres2net.py
    │   ├── panns.py
    │   ├── pooling.py
    │   ├── res2net.py
    │   ├── resnet_se.py
    │   ├── tdnn.py
    │   └── utils.py
    ├── optimizer
    │   ├── __init__.py
    │   └── scheduler.py
    ├── predict.py
    ├── trainer.py
    └── utils
    │   ├── __init__.py
    │   ├── checkpoint.py
    │   ├── record.py
    │   └── utils.py
├── record_audio.py
├── requirements.txt
├── setup.py
├── tools
    └── download_language_data.sh
└── train.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | .idea/
 3 | build/
 4 | dist/
 5 | macls.egg-info/
 6 | dataset/
 7 | log/
 8 | output/
 9 | models/
10 | pretrained_models/
11 | feature_models/
12 | temp/
13 | test*.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 简体中文 | [English](./README_en.md)
  2 | 
  3 | # 基于Pytorch实现的声音分类系统
  4 | 
  5 | ![python version](https://img.shields.io/badge/python-3.8+-orange.svg)
  6 | ![GitHub forks](https://img.shields.io/github/forks/yeyupiaoling/AudioClassification-Pytorch)
  7 | ![GitHub Repo stars](https://img.shields.io/github/stars/yeyupiaoling/AudioClassification-Pytorch)
  8 | ![GitHub](https://img.shields.io/github/license/yeyupiaoling/AudioClassification-Pytorch)
  9 | ![支持系统](https://img.shields.io/badge/支持系统-Win/Linux/MAC-9cf)
 10 | 
 11 | # 前言
 12 | 
 13 | 本项目是基于Pytorch的声音分类项目，旨在实现对各种环境声音、动物叫声和语种的识别。项目提供了多种声音分类模型，如EcapaTdnn、PANNS、ResNetSE、CAMPPlus和ERes2Net，以支持不同的应用场景。此外，项目还提供了常用的Urbansound8K数据集测试报告和一些方言数据集的下载和使用例子。用户可以根据自己的需求选择适合的模型和数据集，以实现更准确的声音分类。项目的应用场景广泛，可以用于室外的环境监测、野生动物保护、语音识别等领域。同时，项目也鼓励用户探索更多的使用场景，以推动声音分类技术的发展和应用。
 14 | 
 15 | **欢迎大家扫码入知识星球或者QQ群讨论，知识星球里面提供项目的模型文件和博主其他相关项目的模型文件，也包括其他一些资源。**
 16 | 
 17 | <div align="center">
 18 |   <img src="https://yeyupiaoling.cn/zsxq.png" alt="知识星球" width="400">
 19 |   <img src="https://yeyupiaoling.cn/qq.png" alt="QQ群" width="400">
 20 | </div>
 21 | 
 22 | 
 23 | # 目录
 24 | 
 25 | - [前言](#前言)
 26 | - [项目特性](#项目特性)
 27 | - [模型测试表](#模型测试表)
 28 | - [安装环境](#安装环境)
 29 | - [创建数据](#创建数据)
 30 | - [修改预处理方法（可选）](#修改预处理方法可选)
 31 | - [提取特征（可选）](#提取特征可选)
 32 | - [训练模型](#训练模型)
 33 | - [评估模型](#评估模型)
 34 | - [预测](#预测)
 35 | - [其他功能](#其他功能)
 36 | 
 37 | 
 38 | # 使用准备
 39 | 
 40 |  - Anaconda 3
 41 |  - Python 3.11
 42 |  - Pytorch 2.0.1
 43 |  - Windows 11 or Ubuntu 22.04
 44 | 
 45 | # 项目特性
 46 | 
 47 | 1. 支持模型：EcapaTdnn、PANNS、TDNN、Res2Net、ResNetSE、CAMPPlus、ERes2Net
 48 | 2. 支持池化层：AttentiveStatsPool(ASP)、SelfAttentivePooling(SAP)、TemporalStatisticsPooling(TSP)、TemporalAveragePooling(TAP)
 49 | 4. 支持预处理方法：MelSpectrogram、Spectrogram、MFCC、Fbank、Wav2vec2.0、WavLM
 50 | 
 51 | **模型论文：**
 52 | 
 53 | - EcapaTdnn：[ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification](https://arxiv.org/abs/2005.07143v3)
 54 | - PANNS：[PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/abs/1912.10211v5)
 55 | - TDNN：[Prediction of speech intelligibility with DNN-based performance measures](https://arxiv.org/abs/2203.09148)
 56 | - Res2Net：[Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/abs/1904.01169)
 57 | - ResNetSE：[Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507)
 58 | - CAMPPlus：[CAM++: A Fast and Efficient Network for Speaker Verification Using Context-Aware Masking](https://arxiv.org/abs/2303.00332v3)
 59 | - ERes2Net：[An Enhanced Res2Net with Local and Global Feature Fusion for Speaker Verification](https://arxiv.org/abs/2305.12838v1)
 60 | 
 61 | # 模型测试表
 62 | 
 63 | |      模型      | Params(M) | 预处理方法 |     数据集      | 类别数量 |   准确率   |   获取模型   |
 64 | |:------------:|:---------:|:-----:|:------------:|:----:|:-------:|:--------:|
 65 | |   ResNetSE   |    7.8    | Flank | UrbanSound8K |  10  | 0.96233 | 加入知识星球获取 |
 66 | |  ERes2NetV2  |    5.4    | Flank | UrbanSound8K |  10  | 0.95662 | 加入知识星球获取 |
 67 | |   CAMPPlus   |    7.1    | Flank | UrbanSound8K |  10  | 0.95454 | 加入知识星球获取 |
 68 | |  EcapaTdnn   |    6.4    | Flank | UrbanSound8K |  10  | 0.95227 | 加入知识星球获取 |
 69 | |   ERes2Net   |    6.6    | Flank | UrbanSound8K |  10  | 0.94292 | 加入知识星球获取 |
 70 | |     TDNN     |    2.6    | Flank | UrbanSound8K |  10  | 0.93977 | 加入知识星球获取 |
 71 | | PANNS（CNN10） |    5.2    | Flank | UrbanSound8K |  10  | 0.92954 | 加入知识星球获取 |
 72 | |   Res2Net    |    5.0    | Flank | UrbanSound8K |  10  | 0.92580 | 加入知识星球获取 |
 73 | 
 74 | **说明：**
 75 | 
 76 | 1. 使用的测试集为从数据集中每10条音频取一条，共874条。
 77 | 
 78 | ## 安装环境
 79 | 
 80 |  - 首先安装的是Pytorch的GPU版本，如果已经安装过了，请跳过。
 81 | ```shell
 82 | conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1  pytorch-cuda=11.8 -c pytorch -c nvidia
 83 | ```
 84 | 
 85 |  - 安装macls库。
 86 |  
 87 | 使用pip安装，命令如下：
 88 | ```shell
 89 | python -m pip install macls -U -i https://pypi.tuna.tsinghua.edu.cn/simple
 90 | ```
 91 | 
 92 | **建议源码安装**，源码安装能保证使用最新代码。
 93 | ```shell
 94 | git clone https://github.com/yeyupiaoling/AudioClassification-Pytorch.git
 95 | cd AudioClassification-Pytorch/
 96 | pip install .
 97 | ```
 98 | 
 99 | ## 创建数据
100 | 
101 | 生成数据列表，用于下一步的读取需要，`audio_path`为音频文件路径，用户需要提前把音频数据集存放在`dataset/audio`目录下，每个文件夹存放一个类别的音频数据，每条音频数据长度在3秒以上，如 `dataset/audio/鸟叫声/······`。`audio`是数据列表存放的位置，生成的数据类别的格式为 `音频路径\t音频对应的类别标签`，音频路径和标签用制表符 `\t`分开。读者也可以根据自己存放数据的方式修改以下函数。
102 | 
103 | 以Urbansound8K为例，Urbansound8K是目前应用较为广泛的用于自动城市环境声分类研究的公共数据集，包含10个分类：空调声、汽车鸣笛声、儿童玩耍声、狗叫声、钻孔声、引擎空转声、枪声、手提钻、警笛声和街道音乐声。数据集下载地址：[UrbanSound8K.tar.gz](https://aistudio.baidu.com/aistudio/datasetdetail/36625)。以下是针对Urbansound8K生成数据列表的函数。如果读者想使用该数据集，请下载并解压到 `dataset`目录下，把生成数据列表代码改为以下代码。
104 | 
105 | 执行`create_data.py`即可生成数据列表，里面提供了生成多种数据集列表方式，具体看代码。
106 | ```shell
107 | python create_data.py
108 | ```
109 | 
110 | 生成的列表是长这样的，前面是音频的路径，后面是该音频对应的标签，从0开始，路径和标签之间用`\t`隔开。
111 | ```shell
112 | dataset/UrbanSound8K/audio/fold2/104817-4-0-2.wav	4
113 | dataset/UrbanSound8K/audio/fold9/105029-7-2-5.wav	7
114 | dataset/UrbanSound8K/audio/fold3/107228-5-0-0.wav	5
115 | dataset/UrbanSound8K/audio/fold4/109711-3-2-4.wav	3
116 | ```
117 | 
118 | # 修改预处理方法（可选）
119 | 
120 | 配置文件中默认使用的是Fbank预处理方法，如果要使用其他预处理方法，可以修改配置文件中的安装下面方式修改，具体的值可以根据自己情况修改。如果不清楚如何设置参数，可以直接删除该部分，直接使用默认值。
121 | 
122 | ```yaml
123 | # 数据预处理参数
124 | preprocess_conf:
125 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
126 |   use_hf_model: False
127 |   # 音频预处理方法，也可以叫特征提取方法
128 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
129 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
130 |   feature_method: 'Fbank'
131 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
132 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
133 |   method_args:
134 |     sample_frequency: 16000
135 |     num_mel_bins: 80
136 | ```
137 | 
138 | # 提取特征（可选）
139 | 
140 | 在训练过程中，首先是要读取音频数据，然后提取特征，最后再进行训练。其中读取音频数据、提取特征也是比较消耗时间的，所以我们可以选择提前提取好取特征，训练模型的是就可以直接加载提取好的特征，这样训练速度会更快。这个提取特征是可选择，如果没有提取好的特征，训练模型的时候就会从读取音频数据，然后提取特征开始。提取特征步骤如下：
141 | 
142 | 1. 执行`extract_features.py`，提取特征，特征会保存在`dataset/features`目录下，并生成新的数据列表`train_list_features.txt`和`test_list_features.txt`。
143 | 
144 | ```shell
145 | python extract_features.py --configs=configs/cam++.yml --save_dir=dataset/features
146 | ```
147 | 
148 | 2. 修改配置文件，将`dataset_conf.train_list`和`dataset_conf.test_list`修改为`train_list_features.txt`和`test_list_features.txt`。
149 | 
150 | 
151 | ## 训练模型
152 | 
153 | 接着就可以开始训练模型了，创建 `train.py`。配置文件里面的参数一般不需要修改，但是这几个是需要根据自己实际的数据集进行调整的，首先最重要的就是分类大小`dataset_conf.num_class`，这个每个数据集的分类大小可能不一样，根据自己的实际情况设定。然后是`dataset_conf.batch_size`，如果是显存不够的话，可以减小这个参数。
154 | 
155 | ```shell
156 | # 单卡训练
157 | CUDA_VISIBLE_DEVICES=0 python train.py
158 | # 多卡训练
159 | CUDA_VISIBLE_DEVICES=0,1 torchrun --standalone --nnodes=1 --nproc_per_node=2 train.py
160 | ```
161 | 
162 | 训练输出日志：
163 | ```
164 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:14 - ----------- 额外配置参数 -----------
165 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - configs: configs/ecapa_tdnn.yml
166 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - local_rank: 0
167 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - pretrained_model: None
168 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - resume_model: None
169 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - save_model_path: models/
170 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - use_gpu: True
171 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:17 - ------------------------------------------------
172 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:19 - ----------- 配置文件参数 -----------
173 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:22 - dataset_conf:
174 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	aug_conf:
175 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		noise_aug_prob: 0.2
176 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		noise_dir: dataset/noise
177 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		speed_perturb: True
178 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		volume_aug_prob: 0.2
179 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		volume_perturb: False
180 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	dataLoader:
181 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		batch_size: 64
182 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		num_workers: 4
183 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	do_vad: False
184 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	eval_conf:
185 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		batch_size: 1
186 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		max_duration: 20
187 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	label_list_path: dataset/label_list.txt
188 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	max_duration: 3
189 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	min_duration: 0.5
190 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	sample_rate: 16000
191 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	spec_aug_args:
192 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		freq_mask_width: [0, 8]
193 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		time_mask_width: [0, 10]
194 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	target_dB: -20
195 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	test_list: dataset/test_list.txt
196 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	train_list: dataset/train_list.txt
197 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	use_dB_normalization: True
198 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	use_spec_aug: True
199 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:22 - model_conf:
200 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	num_class: 10
201 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	pooling_type: ASP
202 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:22 - optimizer_conf:
203 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	learning_rate: 0.001
204 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	optimizer: Adam
205 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	scheduler: WarmupCosineSchedulerLR
206 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:25 - 	scheduler_args:
207 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		max_lr: 0.001
208 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		min_lr: 1e-05
209 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		warmup_epoch: 5
210 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	weight_decay: 1e-06
211 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:22 - preprocess_conf:
212 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	feature_method: Fbank
213 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:25 - 	method_args:
214 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:27 - 		num_mel_bins: 80
215 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:27 - 		sample_frequency: 16000
216 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:22 - train_conf:
217 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:29 - 	log_interval: 10
218 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:29 - 	max_epoch: 30
219 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:31 - use_model: EcapaTdnn
220 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:32 - ------------------------------------------------
221 | [2023-08-07 22:54:22.213166 WARNING] trainer:__init__:67 - Windows系统不支持多线程读取数据，已自动关闭！
222 | ==========================================================================================
223 | Layer (type:depth-idx)                   Output Shape              Param #
224 | ==========================================================================================
225 | EcapaTdnn                                [1, 10]                   --
226 | ├─Conv1dReluBn: 1-1                      [1, 512, 98]              --
227 | │    └─Conv1d: 2-1                       [1, 512, 98]              204,800
228 | │    └─BatchNorm1d: 2-2                  [1, 512, 98]              1,024
229 | ├─Sequential: 1-2                        [1, 512, 98]              --
230 | │    └─Conv1dReluBn: 2-3                 [1, 512, 98]              --
231 | │    │    └─Conv1d: 3-1                  [1, 512, 98]              262,144
232 | │    │    └─BatchNorm1d: 3-2             [1, 512, 98]              1,024
233 | │    └─Res2Conv1dReluBn: 2-4             [1, 512, 98]              --
234 | │    │    └─ModuleList: 3-15             --                        (recursive)
235 | │    │    └─ModuleList: 3-16             --                        (recursive)
236 | │    │    └─ModuleList: 3-15             --                        (recursive)
237 | │    │    └─ModuleList: 3-16             --                        (recursive)
238 | │    │    └─ModuleList: 3-15             --                        (recursive)
239 | │    │    └─ModuleList: 3-16             --                        (recursive)
240 | │    │    └─ModuleList: 3-15             --                        (recursive)
241 | │    │    └─ModuleList: 3-16             --                        (recursive)
242 | │    │    └─ModuleList: 3-15             --                        (recursive)
243 | │    │    └─ModuleList: 3-16             --                        (recursive)
244 | ···································
245 | │    │    └─ModuleList: 3-56             --                        (recursive)
246 | │    │    └─ModuleList: 3-55             --                        (recursive)
247 | │    │    └─ModuleList: 3-56             --                        (recursive)
248 | │    │    └─ModuleList: 3-55             --                        (recursive)
249 | │    │    └─ModuleList: 3-56             --                        (recursive)
250 | │    └─Conv1dReluBn: 2-13                [1, 512, 98]              --
251 | │    │    └─Conv1d: 3-57                 [1, 512, 98]              262,144
252 | │    │    └─BatchNorm1d: 3-58            [1, 512, 98]              1,024
253 | │    └─SE_Connect: 2-14                  [1, 512, 98]              --
254 | │    │    └─Linear: 3-59                 [1, 256]                  131,328
255 | │    │    └─Linear: 3-60                 [1, 512]                  131,584
256 | ├─Conv1d: 1-5                            [1, 1536, 98]             2,360,832
257 | ├─AttentiveStatsPool: 1-6                [1, 3072]                 --
258 | │    └─Conv1d: 2-15                      [1, 128, 98]              196,736
259 | │    └─Conv1d: 2-16                      [1, 1536, 98]             198,144
260 | ├─BatchNorm1d: 1-7                       [1, 3072]                 6,144
261 | ├─Linear: 1-8                            [1, 192]                  590,016
262 | ├─BatchNorm1d: 1-9                       [1, 192]                  384
263 | ├─Linear: 1-10                           [1, 10]                   1,930
264 | ==========================================================================================
265 | Total params: 6,188,490
266 | Trainable params: 6,188,490
267 | Non-trainable params: 0
268 | Total mult-adds (M): 470.96
269 | ==========================================================================================
270 | Input size (MB): 0.03
271 | Forward/backward pass size (MB): 10.28
272 | Params size (MB): 24.75
273 | Estimated Total Size (MB): 35.07
274 | ==========================================================================================
275 | [2023-08-07 22:54:26.726095 INFO   ] trainer:train:344 - 训练数据：8644
276 | [2023-08-07 22:54:30.092504 INFO   ] trainer:__train_epoch:296 - Train epoch: [1/30], batch: [0/4], loss: 2.57033, accuracy: 0.06250, learning rate: 0.00001000, speed: 19.02 data/sec, eta: 0:06:43
277 | ```
278 | 
279 | **训练可视化：**
280 | 
281 | 项目的根目录执行下面命令，并网页访问`http://localhost:8040/`，如果是服务器，需要修改`localhost`为服务器的IP地址。
282 | ```shell
283 | visualdl --logdir=log --host=0.0.0.0
284 | ```
285 | 
286 | 打开的网页如下：
287 | 
288 | <br/>
289 | <div align="center">
290 | <img src="docs/images/log.jpg" alt="混淆矩阵" width="600">
291 | </div>
292 | 
293 | 
294 | 
295 | # 评估模型
296 | 
297 | 执行下面命令执行评估。
298 | 
299 | ```shell
300 | python eval.py --configs=configs/bi_lstm.yml
301 | ```
302 | 
303 | 评估输出如下：
304 | ```shell
305 | [2024-02-03 15:13:25.469242 INFO   ] trainer:evaluate:461 - 成功加载模型：models/CAMPPlus_Fbank/best_model/model.pth
306 | 100%|██████████████████████████████| 150/150 [00:00<00:00, 1281.96it/s]
307 | 评估消耗时间：1s，loss：0.61840，accuracy：0.87333
308 | ```
309 | 
310 | 评估会出来输出准确率，还保存了混淆矩阵图片，保存路径`output/images/`，如下。
311 | 
312 | <br/>
313 | <div align="center">
314 | <img src="docs/images/image1.png" alt="混淆矩阵" width="600">
315 | </div>
316 | 
317 | 
318 | 注意：如果类别标签是中文的，需要设置安装字体才能正常显示，一般情况下Windows无需安装，Ubuntu需要安装。如果Windows确实是缺少字体，只需要[字体文件](https://github.com/tracyone/program_font)这里下载`.ttf`格式的文件，复制到`C:\Windows\Fonts`即可。Ubuntu系统操作如下。
319 | 
320 | 1. 安装字体
321 | ```shell
322 | git clone https://github.com/tracyone/program_font && cd program_font && ./install.sh
323 | ```
324 | 
325 | 2. 执行下面Python代码
326 | ```python
327 | import matplotlib
328 | import shutil
329 | import os
330 | 
331 | path = matplotlib.matplotlib_fname()
332 | path = path.replace('matplotlibrc', 'fonts/ttf/')
333 | print(path)
334 | shutil.copy('/usr/share/fonts/MyFonts/simhei.ttf', path)
335 | user_dir = os.path.expanduser('~')
336 | shutil.rmtree(f'{user_dir}/.cache/matplotlib', ignore_errors=True)
337 | ```
338 | 
339 | # 预测
340 | 
341 | 在训练结束之后，我们得到了一个模型参数文件，我们使用这个模型预测音频。
342 | 
343 | ```shell
344 | python infer.py --audio_path=dataset/UrbanSound8K/audio/fold5/156634-5-2-5.wav
345 | ```
346 | 
347 | # 其他功能
348 | 
349 |  - 为了方便读取录制数据和制作数据集，这里提供了录音程序`record_audio.py`，这个用于录制音频，录制的音频采样率为16000，单通道，16bit。
350 | 
351 | ```shell
352 | python record_audio.py
353 | ```
354 | 
355 |  - `infer_record.py`这个程序是用来不断进行录音识别，我们可以大致理解为这个程序在实时录音识别。通过这个应该我们可以做一些比较有趣的事情，比如把麦克风放在小鸟经常来的地方，通过实时录音识别，一旦识别到有鸟叫的声音，如果你的数据集足够强大，有每种鸟叫的声音数据集，这样你还能准确识别是那种鸟叫。如果识别到目标鸟类，就启动程序，例如拍照等等。
356 | 
357 | ```shell
358 | python infer_record.py --record_seconds=3
359 | ```
360 | 
361 | ## 打赏作者
362 | <br/>
363 | <div align="center">
364 | <p>打赏一块钱支持一下作者</p>
365 | <img src="https://yeyupiaoling.cn/reward.png" alt="打赏作者" width="400">
366 | </div>
367 | 
368 | # 参考资料
369 | 
370 | 1. https://github.com/PaddlePaddle/PaddleSpeech
371 | 2. https://github.com/yeyupiaoling/PaddlePaddle-MobileFaceNets
372 | 3. https://github.com/yeyupiaoling/PPASR
373 | 4. https://github.com/alibaba-damo-academy/3D-Speaker
374 | 


--------------------------------------------------------------------------------
/README_en.md:
--------------------------------------------------------------------------------
  1 | [简体中文](./README.md) | English
  2 | 
  3 | # Sound classification system implemented in Pytorch
  4 | 
  5 | ![python version](https://img.shields.io/badge/python-3.8+-orange.svg)
  6 | ![GitHub forks](https://img.shields.io/github/forks/yeyupiaoling/AudioClassification-Pytorch)
  7 | ![GitHub Repo stars](https://img.shields.io/github/stars/yeyupiaoling/AudioClassification-Pytorch)
  8 | ![GitHub](https://img.shields.io/github/license/yeyupiaoling/AudioClassification-Pytorch)
  9 | ![支持系统](https://img.shields.io/badge/支持系统-Win/Linux/MAC-9cf)
 10 | 
 11 | **Disclaimer, this document was obtained through machine translation, please check the original document [here](./README.md).**
 12 | 
 13 | 
 14 | # Introduction
 15 | 
 16 | This project is a sound classification project based on Pytorch, aiming to realize the recognition of various environmental sounds, animal calls and languages. Several sound classification models such as EcapaTdnn, PANNS, ResNetSE, CAMPPlus, and ERes2Net are provided to support different application scenarios. In addition, the project also provides the commonly used Urbansound8K dataset test report and some dialect datasets download and use examples. Users can choose suitable models and datasets according to their needs to achieve more accurate sound classification. The project has a wide range of application scenarios, and can be used in outdoor environmental monitoring, wildlife protection, speech recognition and other fields. At the same time, the project also encourages users to explore more usage scenarios to promote the development and application of sound classification technology.
 17 | 
 18 | 
 19 | # Environment
 20 | 
 21 |  - Anaconda 3
 22 |  - Python 3.11
 23 |  - Pytorch 2.0.1
 24 |  - Windows 11 or Ubuntu 22.04
 25 | 
 26 | # Project Features
 27 | 
 28 | 1. Supporting models: EcapaTdnn、PANNS、TDNN、Res2Net、ResNetSE、CAMPPlus、ERes2Net
 29 | 2. Supporting pooling: AttentiveStatsPool(ASP)、SelfAttentivePooling(SAP)、TemporalStatisticsPooling(TSP)、TemporalAveragePooling(TAP)
 30 | 3. Support preprocessing methods: MelSpectrogram、Spectrogram、MFCC、Fbank、Wav2vec2.0、WavLM
 31 | 
 32 | **Model Paper：**
 33 | 
 34 | - EcapaTdnn：[ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification](https://arxiv.org/abs/2005.07143v3)
 35 | - PANNS：[PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/abs/1912.10211v5)
 36 | - TDNN：[Prediction of speech intelligibility with DNN-based performance measures](https://arxiv.org/abs/2203.09148)
 37 | - Res2Net：[Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/abs/1904.01169)
 38 | - ResNetSE：[Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507)
 39 | - CAMPPlus：[CAM++: A Fast and Efficient Network for Speaker Verification Using Context-Aware Masking](https://arxiv.org/abs/2303.00332v3)
 40 | - ERes2Net：[An Enhanced Res2Net with Local and Global Feature Fusion for Speaker Verification](https://arxiv.org/abs/2305.12838v1)
 41 | 
 42 | # Model Test
 43 | 
 44 | |    Model     | Params(M) | Preprocessing method |   Dataset    | Number Class | Accuracy |
 45 | |:------------:|:---------:|:--------------------:|:------------:|:------------:|:--------:|
 46 | |   ResNetSE   |    7.8    |        Flank         | UrbanSound8K |      10      | 0.96233  |
 47 | |  ERes2NetV2  |    5.4    |        Flank         | UrbanSound8K |      10      | 0.95662  |
 48 | |   CAMPPlus   |    7.1    |        Flank         | UrbanSound8K |      10      | 0.95454  |
 49 | |  EcapaTdnn   |    6.4    |        Flank         | UrbanSound8K |      10      | 0.95227  |
 50 | |   ERes2Net   |    6.6    |        Flank         | UrbanSound8K |      10      | 0.94292  |
 51 | |     TDNN     |    2.6    |        Flank         | UrbanSound8K |      10      | 0.93977  |
 52 | | PANNS（CNN10） |    5.2    |        Flank         | UrbanSound8K |      10      | 0.92954  |
 53 | |   Res2Net    |    5.0    |        Flank         | UrbanSound8K |      10      | 0.92580  |
 54 | 
 55 | ## Installation Environment
 56 | 
 57 |  - The GPU version of Pytorch will be installed first, please skip it if you already have it installed.
 58 | ```shell
 59 | conda install pytorch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1  pytorch-cuda=11.8 -c pytorch -c nvidia
 60 | ```
 61 | 
 62 |  - Install macls.
 63 |  
 64 | Install it using pip with the following command:
 65 | ```shell
 66 | python -m pip install macls -U -i https://pypi.tuna.tsinghua.edu.cn/simple
 67 | ```
 68 | 
 69 | **Source installation is recommended**, which ensures that the latest code is used.
 70 | ```shell
 71 | git clone https://github.com/yeyupiaoling/AudioClassification_Pytorch.git
 72 | cd AudioClassification_Pytorch/
 73 | python setup.py install
 74 | ```
 75 | 
 76 | ## Preparing Data
 77 | 
 78 | The `audio_path` is the audio file path. The user needs to store the audio dataset in the `dataset/audio` directory in advance. Each folder stores a category of audio data, and the length of each audio data is more than 3 seconds. For example, `dataset/audio/ bird song /······`. `audio` is where the data list is stored, and the format of the generated data category is`audio_path\tcategory_label_audio`, and the audio path and label are separated by a TAB character `\t`. You can also modify the following functions depending on how you store your data:
 79 | 
 80 | Taking Urbansound8K as an example, it is a widely used public dataset for automatic urban environmental sound classification research. Urbansound8K contains 10 categories: air condition sound, car whistle sound, children playing sound, dog bark, drilling sound, engine idling sound, gun sound, jackdrill, siren sound, and street music sound. Data set download address: [UrbanSound8K](https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz). Here is the function to generate a list of data for Urbansound8K. If you want to use this dataset, please download and unzip it into the `dataset` directory and change the code to generate the list of data as follows.
 81 | 
 82 | `create_data.py` can be used to generate a list of data sets. There are many ways to generate a list of data sets.
 83 | ```shell
 84 | python create_data.py
 85 | ```
 86 | 
 87 | The resulting list looks like this, with the path to the audio followed by the tag for that audio, starting at 0, and separated by `\t`.
 88 | ```shell
 89 | dataset/UrbanSound8K/audio/fold2/104817-4-0-2.wav	4
 90 | dataset/UrbanSound8K/audio/fold9/105029-7-2-5.wav	7
 91 | dataset/UrbanSound8K/audio/fold3/107228-5-0-0.wav	5
 92 | dataset/UrbanSound8K/audio/fold4/109711-3-2-4.wav	3
 93 | ```
 94 | 
 95 | # Change preprocessing methods
 96 | 
 97 | By default, the Fbank preprocessing method is used in the configuration file. If you want to use other preprocessing methods, you can modify the following installation in the configuration file, and the specific value can be modified according to your own situation. If it's not clear how to set the parameters, you can remove that section and just use the default values.
 98 | 
 99 | ```yaml
100 | # 数据预处理参数
101 | preprocess_conf:
102 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
103 |   use_hf_model: False
104 |   # 音频预处理方法，也可以叫特征提取方法
105 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
106 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
107 |   feature_method: 'Fbank'
108 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
109 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
110 |   method_args:
111 |     sample_frequency: 16000
112 |     num_mel_bins: 80
113 | ```
114 | 
115 | ## 训练
116 | 
117 | Now we can train the model. We will create `train.py`. The parameters in the configuration file generally do not need to be modified, but these few need to be adjusted according to your actual dataset. The first and most important is the class size `dataset_conf.num_class`, which may be different for each dataset. Then there is` dataset_conf.batch_size `, which can be reduced if memory is insufficient.
118 | 
119 | ```shell
120 | # Single GPU training
121 | CUDA_VISIBLE_DEVICES=0 python train.py
122 | # Multi GPU training
123 | CUDA_VISIBLE_DEVICES=0,1 torchrun --standalone --nnodes=1 --nproc_per_node=2 train.py
124 | ```
125 | 
126 | Train log:
127 | ```
128 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:14 - ----------- 额外配置参数 -----------
129 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - configs: configs/ecapa_tdnn.yml
130 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - local_rank: 0
131 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - pretrained_model: None
132 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - resume_model: None
133 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - save_model_path: models/
134 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:16 - use_gpu: True
135 | [2023-08-07 22:54:22.148973 INFO   ] utils:print_arguments:17 - ------------------------------------------------
136 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:19 - ----------- 配置文件参数 -----------
137 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:22 - dataset_conf:
138 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	aug_conf:
139 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		noise_aug_prob: 0.2
140 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		noise_dir: dataset/noise
141 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		speed_perturb: True
142 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		volume_aug_prob: 0.2
143 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		volume_perturb: False
144 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	dataLoader:
145 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		batch_size: 64
146 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		num_workers: 4
147 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	do_vad: False
148 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	eval_conf:
149 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		batch_size: 1
150 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		max_duration: 20
151 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	label_list_path: dataset/label_list.txt
152 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	max_duration: 3
153 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	min_duration: 0.5
154 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:29 - 	sample_rate: 16000
155 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:25 - 	spec_aug_args:
156 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		freq_mask_width: [0, 8]
157 | [2023-08-07 22:54:22.202166 INFO   ] utils:print_arguments:27 - 		time_mask_width: [0, 10]
158 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	target_dB: -20
159 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	test_list: dataset/test_list.txt
160 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	train_list: dataset/train_list.txt
161 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	use_dB_normalization: True
162 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:29 - 	use_spec_aug: True
163 | [2023-08-07 22:54:22.203167 INFO   ] utils:print_arguments:22 - model_conf:
164 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	num_class: 10
165 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	pooling_type: ASP
166 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:22 - optimizer_conf:
167 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	learning_rate: 0.001
168 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	optimizer: Adam
169 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	scheduler: WarmupCosineSchedulerLR
170 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:25 - 	scheduler_args:
171 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		max_lr: 0.001
172 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		min_lr: 1e-05
173 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:27 - 		warmup_epoch: 5
174 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	weight_decay: 1e-06
175 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:22 - preprocess_conf:
176 | [2023-08-07 22:54:22.207167 INFO   ] utils:print_arguments:29 - 	feature_method: Fbank
177 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:25 - 	method_args:
178 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:27 - 		num_mel_bins: 80
179 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:27 - 		sample_frequency: 16000
180 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:22 - train_conf:
181 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:29 - 	log_interval: 10
182 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:29 - 	max_epoch: 30
183 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:31 - use_model: EcapaTdnn
184 | [2023-08-07 22:54:22.208167 INFO   ] utils:print_arguments:32 - ------------------------------------------------
185 | [2023-08-07 22:54:22.213166 WARNING] trainer:__init__:67 - Windows系统不支持多线程读取数据，已自动关闭！
186 | ==========================================================================================
187 | Layer (type:depth-idx)                   Output Shape              Param #
188 | ==========================================================================================
189 | EcapaTdnn                                [1, 10]                   --
190 | ├─Conv1dReluBn: 1-1                      [1, 512, 98]              --
191 | │    └─Conv1d: 2-1                       [1, 512, 98]              204,800
192 | │    └─BatchNorm1d: 2-2                  [1, 512, 98]              1,024
193 | ├─Sequential: 1-2                        [1, 512, 98]              --
194 | │    └─Conv1dReluBn: 2-3                 [1, 512, 98]              --
195 | │    │    └─Conv1d: 3-1                  [1, 512, 98]              262,144
196 | │    │    └─BatchNorm1d: 3-2             [1, 512, 98]              1,024
197 | │    └─Res2Conv1dReluBn: 2-4             [1, 512, 98]              --
198 | │    │    └─ModuleList: 3-15             --                        (recursive)
199 | │    │    └─ModuleList: 3-16             --                        (recursive)
200 | │    │    └─ModuleList: 3-15             --                        (recursive)
201 | │    │    └─ModuleList: 3-16             --                        (recursive)
202 | │    │    └─ModuleList: 3-15             --                        (recursive)
203 | │    │    └─ModuleList: 3-16             --                        (recursive)
204 | │    │    └─ModuleList: 3-15             --                        (recursive)
205 | │    │    └─ModuleList: 3-16             --                        (recursive)
206 | │    │    └─ModuleList: 3-15             --                        (recursive)
207 | │    │    └─ModuleList: 3-16             --                        (recursive)
208 | ···································
209 | │    │    └─ModuleList: 3-56             --                        (recursive)
210 | │    │    └─ModuleList: 3-55             --                        (recursive)
211 | │    │    └─ModuleList: 3-56             --                        (recursive)
212 | │    │    └─ModuleList: 3-55             --                        (recursive)
213 | │    │    └─ModuleList: 3-56             --                        (recursive)
214 | │    └─Conv1dReluBn: 2-13                [1, 512, 98]              --
215 | │    │    └─Conv1d: 3-57                 [1, 512, 98]              262,144
216 | │    │    └─BatchNorm1d: 3-58            [1, 512, 98]              1,024
217 | │    └─SE_Connect: 2-14                  [1, 512, 98]              --
218 | │    │    └─Linear: 3-59                 [1, 256]                  131,328
219 | │    │    └─Linear: 3-60                 [1, 512]                  131,584
220 | ├─Conv1d: 1-5                            [1, 1536, 98]             2,360,832
221 | ├─AttentiveStatsPool: 1-6                [1, 3072]                 --
222 | │    └─Conv1d: 2-15                      [1, 128, 98]              196,736
223 | │    └─Conv1d: 2-16                      [1, 1536, 98]             198,144
224 | ├─BatchNorm1d: 1-7                       [1, 3072]                 6,144
225 | ├─Linear: 1-8                            [1, 192]                  590,016
226 | ├─BatchNorm1d: 1-9                       [1, 192]                  384
227 | ├─Linear: 1-10                           [1, 10]                   1,930
228 | ==========================================================================================
229 | Total params: 6,188,490
230 | Trainable params: 6,188,490
231 | Non-trainable params: 0
232 | Total mult-adds (M): 470.96
233 | ==========================================================================================
234 | Input size (MB): 0.03
235 | Forward/backward pass size (MB): 10.28
236 | Params size (MB): 24.75
237 | Estimated Total Size (MB): 35.07
238 | ==========================================================================================
239 | [2023-08-07 22:54:26.726095 INFO   ] trainer:train:344 - 训练数据：8644
240 | [2023-08-07 22:54:30.092504 INFO   ] trainer:__train_epoch:296 - Train epoch: [1/30], batch: [0/4], loss: 2.57033, accuracy: 0.06250, learning rate: 0.00001000, speed: 19.02 data/sec, eta: 0:06:43
241 | ```
242 | 
243 | # Eval
244 | 
245 | At the end of each training round, we can perform an evaluation, which will output the accuracy. We also save the mixture matrix image, and save the path `output/images/` as follows.
246 | ![混合矩阵](docs/images/image1.png)
247 | 
248 | # Inference
249 | 
250 | At the end of the training, we are given a model parameter file, and we use this model to predict the audio.
251 | 
252 | ```shell
253 | python infer.py --audio_path=dataset/UrbanSound8K/audio/fold5/156634-5-2-5.wav
254 | ```
255 | 
256 | # Other Functions
257 | 
258 |  - In order to read the recorded data and make a dataset easily, we provide the recording program `record_audio.py`, which is used to record audio with a sample rate of 16,000, single channel, 16bit.
259 | 
260 | ```shell
261 | python record_audio.py
262 | ```
263 | 
264 |  - `infer_record.py`This program is used to continuously perform recording recognition, and we can roughly understand this program as recording recognition in real time. And this should allow us to do some interesting things, like put a microphone in a place where birds often come, and recognize it by recording it in real time, and once you recognize that there's a bird calling, if your dataset is powerful enough, and you have a dataset of every bird calling, then you can identify exactly which bird is calling. If the target bird is identified, the procedure is initiated, such as taking photos, etc.
265 | 
266 | ```shell
267 | python infer_record.py --record_seconds=3
268 | ```
269 | 
270 | # Reference
271 | 
272 | 1. https://github.com/PaddlePaddle/PaddleSpeech
273 | 2. https://github.com/yeyupiaoling/PaddlePaddle-MobileFaceNets
274 | 3. https://github.com/yeyupiaoling/PPASR
275 | 4. https://github.com/alibaba-damo-academy/3D-Speaker
276 | 


--------------------------------------------------------------------------------
/configs/augmentation.yml:
--------------------------------------------------------------------------------
 1 | # 语速增强
 2 | speed:
 3 |   # 增强概率
 4 |   prob: 1.0
 5 | 
 6 | # 音量增强
 7 | volume:
 8 |   # 增强概率
 9 |   prob: 0.0
10 |   # 最小增益
11 |   min_gain_dBFS: -15
12 |   # 最大增益
13 |   max_gain_dBFS: 15
14 | 
15 | # 噪声增强
16 | noise:
17 |   # 增强概率
18 |   prob: 0.5
19 |   # 噪声增强的噪声文件夹
20 |   noise_dir: 'dataset/noise'
21 |   # 针对噪声的最小音量增益
22 |   min_snr_dB: 10
23 |   # 针对噪声的最大音量增益
24 |   max_snr_dB: 50
25 | 
26 | # 混响增强
27 | reverb:
28 |   # 增强概率
29 |   prob: 0.5
30 |   # 混响增强的混响文件夹
31 |   reverb_dir: 'dataset/reverb'
32 | 
33 | # Spec增强
34 | spec_aug:
35 |   # 增强概率
36 |   prob: 0.5
37 |   # 频域掩蔽的比例
38 |   freq_mask_ratio: 0.1
39 |   # 频域掩蔽次数
40 |   n_freq_masks: 1
41 |   # 频域掩蔽的比例
42 |   time_mask_ratio: 0.05
43 |   # 频域掩蔽次数
44 |   n_time_masks: 1
45 |   # 最大时间扭曲
46 |   max_time_warp: 0
47 | 


--------------------------------------------------------------------------------
/configs/cam++.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 64
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 8
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型
50 |   model: 'CAMPPlus'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/ecapa_tdnn.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 128
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 16
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型
50 |   model: 'EcapaTdnn'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/eres2net.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 32
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 4
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型，支持ERes2Net、ERes2NetV2
50 |   model: 'ERes2Net'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/panns.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 64
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 8
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 | # 所使用的模型，支持PANNS_CNN6、PANNS_CNN10、PANNS_CNN14
50 |   model: 'PANNS_CNN10'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/res2net.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 32
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 4
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型
50 |   model: 'Res2Net'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/resnet_se.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 32
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 4
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型
50 |   model: 'ResNetSE'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/configs/tdnn.yml:
--------------------------------------------------------------------------------
 1 | # 数据集参数
 2 | dataset_conf:
 3 |   dataset:
 4 |     # 过滤最短的音频长度
 5 |     min_duration: 0.4
 6 |     # 最长的音频长度，大于这个长度会裁剪掉
 7 |     max_duration: 3
 8 |     # 音频的采样率
 9 |     sample_rate: 16000
10 |     # 是否对音频进行音量归一化
11 |     use_dB_normalization: True
12 |     # 对音频进行音量归一化的音量分贝值
13 |     target_dB: -20
14 |   dataLoader:
15 |     # 训练的批量大小
16 |     batch_size: 64
17 |     # 是否丢弃最后一个样本
18 |     drop_last: True
19 |     # 读取数据的线程数量
20 |     num_workers: 8
21 |   # 评估的数据要特殊处理
22 |   eval_conf:
23 |     # 评估的批量大小
24 |     batch_size: 8
25 |     # 最长的音频长度
26 |     max_duration: 20
27 |   # 训练数据的数据列表路径
28 |   train_list: 'dataset/train_list.txt'
29 |   # 测试数据的数据列表路径
30 |   test_list: 'dataset/test_list.txt'
31 |   # 标签列表
32 |   label_list_path: 'dataset/label_list.txt'
33 | 
34 | # 数据预处理参数
35 | preprocess_conf:
36 |   # 是否使用HF上的Wav2Vec2类似模型提取音频特征
37 |   use_hf_model: False
38 |   # 音频预处理方法，也可以叫特征提取方法
39 |   # 当use_hf_model为False时，支持：MelSpectrogram、Spectrogram、MFCC、Fbank
40 |   # 当use_hf_model为True时，指定的是HuggingFace的模型或者本地路径，比如facebook/w2v-bert-2.0或者./feature_models/w2v-bert-2.0
41 |   feature_method: 'Fbank'
42 |   # 当use_hf_model为False时，设置API参数，更参数查看对应API，不清楚的可以直接删除该部分，直接使用默认值。
43 |   # 当use_hf_model为True时，可以设置参数use_gpu，指定是否使用GPU提取特征
44 |   method_args:
45 |     sample_frequency: 16000
46 |     num_mel_bins: 80
47 | 
48 | model_conf:
49 |   # 所使用的模型
50 |   model: 'TDNN'
51 |   # 模型参数
52 |   model_args:
53 |     # 分类大小，如果为null，自动通过标签列表获取
54 |     num_class: null
55 | 
56 | optimizer_conf:
57 |   # 优化方法
58 |   optimizer: 'Adam'
59 |   # 优化方法参数
60 |   optimizer_args:
61 |     lr: 0.001
62 |     weight_decay: !!float 1e-5
63 |   # 学习率衰减函数，支持Pytorch支持的和项目提供的WarmupCosineSchedulerLR
64 |   scheduler: 'WarmupCosineSchedulerLR'
65 |   # 学习率衰减函数参数
66 |   scheduler_args:
67 |     min_lr: !!float 1e-5
68 |     max_lr: 0.001
69 |     warmup_epoch: 5
70 | 
71 | train_conf:
72 |   # 是否开启自动混合精度
73 |   enable_amp: False
74 |   # 是否使用Pytorch2.0的编译器
75 |   use_compile: False
76 |   # CrossEntropyLoss类的label_smoothing参数
77 |   label_smoothing: 0.0
78 |   # 训练的轮数
79 |   max_epoch: 60
80 |   log_interval: 10
81 | 


--------------------------------------------------------------------------------
/create_data.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | 
  4 | # 生成数据列表
  5 | def get_data_list(audio_path, list_path):
  6 |     sound_sum = 0
  7 |     audios = os.listdir(audio_path)
  8 |     os.makedirs(list_path, exist_ok=True)
  9 |     f_train = open(os.path.join(list_path, 'train_list.txt'), 'w', encoding='utf-8')
 10 |     f_test = open(os.path.join(list_path, 'test_list.txt'), 'w', encoding='utf-8')
 11 |     f_label = open(os.path.join(list_path, 'label_list.txt'), 'w', encoding='utf-8')
 12 | 
 13 |     for i in range(len(audios)):
 14 |         f_label.write(f'{audios[i]}\n')
 15 |         sounds = os.listdir(os.path.join(audio_path, audios[i]))
 16 |         for sound in sounds:
 17 |             sound_path = os.path.join(audio_path, audios[i], sound).replace('\\', '/')
 18 |             if sound_sum % 10 == 0:
 19 |                 f_test.write(f'{sound_path}\t{i}\n')
 20 |             else:
 21 |                 f_train.write(f'{sound_path}\t{i}\n')
 22 |             sound_sum += 1
 23 |         print(f"Audio：{i + 1}/{len(audios)}")
 24 |     f_label.close()
 25 |     f_test.close()
 26 |     f_train.close()
 27 | 
 28 | 
 29 | # 下载数据方式，执行：./tools/download_3dspeaker_data.sh
 30 | # 生成生成方言数据列表
 31 | def get_language_identification_data_list(audio_path, list_path):
 32 |     labels_dict = {0: 'Standard Mandarin', 3: 'Southwestern Mandarin', 6: 'Central Plains Mandarin',
 33 |                    4: 'JiangHuai Mandarin', 2: 'Wu dialect', 8: 'Gan dialect', 9: 'Jin dialect',
 34 |                    11: 'LiaoJiao Mandarin', 12: 'JiLu Mandarin', 10: 'Min dialect', 7: 'Yue dialect',
 35 |                    5: 'Hakka dialect', 1: 'Xiang dialect', 13: 'Northern Mandarin'}
 36 | 
 37 |     with open(os.path.join(list_path, 'train_list.txt'), 'w', encoding='utf-8') as f:
 38 |         train_dir = os.path.join(audio_path, 'train')
 39 |         for root,  dirs, files in os.walk(train_dir):
 40 |             for file in files:
 41 |                 if not file.endswith('.wav'): continue
 42 |                 label = int(file.split('_')[-1].replace('.wav', '')[-2:])
 43 |                 file = os.path.join(root, file)
 44 |                 f.write(f'{file}\t{label}\n')
 45 | 
 46 |     with open(os.path.join(list_path, 'test_list.txt'), 'w', encoding='utf-8') as f:
 47 |         test_dir = os.path.join(audio_path, 'test')
 48 |         for root,  dirs, files in os.walk(test_dir):
 49 |             for file in files:
 50 |                 if not file.endswith('.wav'): continue
 51 |                 label = int(file.split('_')[-1].replace('.wav', '')[-2:])
 52 |                 file = os.path.join(root, file)
 53 |                 f.write(f'{file}\t{label}\n')
 54 | 
 55 |     with open(os.path.join(list_path, 'label_list.txt'), 'w', encoding='utf-8') as f:
 56 |         for i in range(len(labels_dict)):
 57 |             f.write(f'{labels_dict[i]}\n')
 58 | 
 59 | 
 60 | # 创建UrbanSound8K数据列表
 61 | def create_UrbanSound8K_list(audio_path, metadata_path, list_path):
 62 |     sound_sum = 0
 63 | 
 64 |     f_train = open(os.path.join(list_path, 'train_list.txt'), 'w', encoding='utf-8')
 65 |     f_test = open(os.path.join(list_path, 'test_list.txt'), 'w', encoding='utf-8')
 66 |     f_label = open(os.path.join(list_path, 'label_list.txt'), 'w', encoding='utf-8')
 67 | 
 68 |     with open(metadata_path) as f:
 69 |         lines = f.readlines()
 70 | 
 71 |     labels = {}
 72 |     for i, line in enumerate(lines):
 73 |         if i == 0:continue
 74 |         data = line.replace('\n', '').split(',')
 75 |         class_id = int(data[6])
 76 |         if class_id not in labels.keys():
 77 |             labels[class_id] = data[-1]
 78 |         sound_path = os.path.join(audio_path, f'fold{data[5]}', data[0]).replace('\\', '/')
 79 |         if sound_sum % 10 == 0:
 80 |             f_test.write(f'{sound_path}\t{data[6]}\n')
 81 |         else:
 82 |             f_train.write(f'{sound_path}\t{data[6]}\n')
 83 |         sound_sum += 1
 84 |     for i in range(len(labels)):
 85 |         f_label.write(f'{labels[i]}\n')
 86 |     f_label.close()
 87 |     f_test.close()
 88 |     f_train.close()
 89 | 
 90 | 
 91 | if __name__ == '__main__':
 92 |     # get_data_list('dataset/audio', 'dataset')
 93 |     # 生成生成方言数据列表
 94 |     # get_language_identification_data_list(audio_path='dataset/language',
 95 |     #                                       list_path='dataset/')
 96 |     # 创建UrbanSound8K数据列表
 97 |     create_UrbanSound8K_list(audio_path='dataset/UrbanSound8K/audio',
 98 |                              metadata_path='dataset/UrbanSound8K/metadata/UrbanSound8K.csv',
 99 |                              list_path='dataset')
100 | 


--------------------------------------------------------------------------------
/docs/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyupiaoling/AudioClassification-Pytorch/f9f55968cc1f181c8ebc786f2db880702e8bf79f/docs/images/image1.png


--------------------------------------------------------------------------------
/docs/images/log.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyupiaoling/AudioClassification-Pytorch/f9f55968cc1f181c8ebc786f2db880702e8bf79f/docs/images/log.jpg


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import functools
 3 | import time
 4 | 
 5 | from macls.trainer import MAClsTrainer
 6 | from macls.utils.utils import add_arguments, print_arguments
 7 | 
 8 | parser = argparse.ArgumentParser(description=__doc__)
 9 | add_arg = functools.partial(add_arguments, argparser=parser)
10 | add_arg('configs',          str,   'configs/cam++.yml',         "配置文件")
11 | add_arg("use_gpu",          bool,  True,                        "是否使用GPU评估模型")
12 | add_arg('save_matrix_path', str,   'output/images/',            "保存混合矩阵的路径")
13 | add_arg('resume_model',     str,   'models/CAMPPlus_Fbank/best_model/',  "模型的路径")
14 | add_arg('overwrites',       str,    None,    '覆盖配置文件中的参数，比如"train_conf.max_epoch=100"，多个用逗号隔开')
15 | args = parser.parse_args()
16 | print_arguments(args=args)
17 | 
18 | # 获取训练器
19 | trainer = MAClsTrainer(configs=args.configs, use_gpu=args.use_gpu, overwrites=args.overwrites)
20 | 
21 | # 开始评估
22 | start = time.time()
23 | loss, accuracy = trainer.evaluate(resume_model=args.resume_model,
24 |                                   save_matrix_path=args.save_matrix_path)
25 | end = time.time()
26 | print('评估消耗时间：{}s，loss：{:.5f}，accuracy：{:.5f}'.format(int(end - start), loss, accuracy))
27 | 


--------------------------------------------------------------------------------
/extract_features.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import functools
 3 | 
 4 | from macls.trainer import MAClsTrainer
 5 | from macls.utils.utils import add_arguments, print_arguments
 6 | 
 7 | parser = argparse.ArgumentParser(description=__doc__)
 8 | add_arg = functools.partial(add_arguments, argparser=parser)
 9 | add_arg('configs',          str,    'configs/cam++.yml',        '配置文件')
10 | add_arg('save_dir',         str,    'dataset/features',         '保存特征的路径')
11 | add_arg('max_duration',    int,     100,                        '提取特征的最大时长，避免过长显存不足，单位秒')
12 | args = parser.parse_args()
13 | print_arguments(args=args)
14 | 
15 | # 获取训练器
16 | trainer = MAClsTrainer(configs=args.configs)
17 | 
18 | # 提取特征保存文件
19 | trainer.extract_features(save_dir=args.save_dir, max_duration=args.max_duration)
20 | 


--------------------------------------------------------------------------------
/infer.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import functools
 3 | 
 4 | from macls.predict import MAClsPredictor
 5 | from macls.utils.utils import add_arguments, print_arguments
 6 | 
 7 | parser = argparse.ArgumentParser(description=__doc__)
 8 | add_arg = functools.partial(add_arguments, argparser=parser)
 9 | add_arg('configs',          str,    'configs/cam++.yml',   '配置文件')
10 | add_arg('use_gpu',          bool,   True,                  '是否使用GPU预测')
11 | add_arg('audio_path',       str,    'dataset/UrbanSound8K/audio/fold5/156634-5-2-5.wav', '音频路径')
12 | add_arg('model_path',       str,    'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径')
13 | args = parser.parse_args()
14 | print_arguments(args=args)
15 | 
16 | # 获取识别器
17 | predictor = MAClsPredictor(configs=args.configs,
18 |                            model_path=args.model_path,
19 |                            use_gpu=args.use_gpu)
20 | 
21 | label, score = predictor.predict(audio_data=args.audio_path)
22 | 
23 | print(f'音频：{args.audio_path} 的预测结果标签为：{label}，得分：{score}')
24 | 


--------------------------------------------------------------------------------
/infer_record.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import functools
 3 | import threading
 4 | import time
 5 | 
 6 | import numpy as np
 7 | import soundcard as sc
 8 | 
 9 | from macls.predict import MAClsPredictor
10 | from macls.utils.utils import add_arguments, print_arguments
11 | 
12 | parser = argparse.ArgumentParser(description=__doc__)
13 | add_arg = functools.partial(add_arguments, argparser=parser)
14 | add_arg('configs',          str,    'configs/cam++.yml',   '配置文件')
15 | add_arg('use_gpu',          bool,   True,                  '是否使用GPU预测')
16 | add_arg('record_seconds',   float,  3,                     '录音长度')
17 | add_arg('model_path',       str,    'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径')
18 | args = parser.parse_args()
19 | print_arguments(args=args)
20 | 
21 | # 获取识别器
22 | predictor = MAClsPredictor(configs=args.configs,
23 |                            model_path=args.model_path,
24 |                            use_gpu=args.use_gpu)
25 | 
26 | all_data = []
27 | # 获取默认麦克风
28 | default_mic = sc.default_microphone()
29 | # 录音采样率
30 | samplerate = 16000
31 | # 录音块大小
32 | numframes = 1024
33 | # 模型输入长度
34 | infer_len = int(samplerate * args.record_seconds / numframes)
35 | 
36 | 
37 | def infer_thread():
38 |     global all_data
39 |     s = time.time()
40 |     while True:
41 |         if len(all_data) < infer_len: continue
42 |         # 截取最新的音频数据
43 |         seg_data = all_data[-infer_len:]
44 |         d = np.concatenate(seg_data)
45 |         # 删除旧的音频数据
46 |         del all_data[:len(all_data) - infer_len]
47 |         label, score = predictor.predict(audio_data=d, sample_rate=samplerate)
48 |         print(f'{int(time.time() - s)}s 预测结果标签为：{label}，得分：{score}')
49 | 
50 | 
51 | thread = threading.Thread(target=infer_thread, args=())
52 | thread.start()
53 | 
54 | 
55 | with default_mic.recorder(samplerate=samplerate, channels=1) as mic:
56 |     while True:
57 |         data = mic.record(numframes=numframes)
58 |         all_data.append(data)
59 | 


--------------------------------------------------------------------------------
/macls/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.0.6"
2 | 


--------------------------------------------------------------------------------
/macls/data_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyupiaoling/AudioClassification-Pytorch/f9f55968cc1f181c8ebc786f2db880702e8bf79f/macls/data_utils/__init__.py


--------------------------------------------------------------------------------
/macls/data_utils/collate_fn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | # 对一个batch的数据处理
 5 | def collate_fn(batch):
 6 |     # 找出音频长度最长的
 7 |     batch_sorted = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
 8 |     freq_size = batch_sorted[0][0].size(1)
 9 |     max_freq_length = batch_sorted[0][0].size(0)
10 |     batch_size = len(batch_sorted)
11 |     # 以最大的长度创建0张量
12 |     features = torch.zeros((batch_size, max_freq_length, freq_size), dtype=torch.float32)
13 |     input_lens, labels = [], []
14 |     for x in range(batch_size):
15 |         tensor, label = batch[x]
16 |         seq_length = tensor.size(0)
17 |         # 将数据插入都0张量中，实现了padding
18 |         features[x, :seq_length, :] = tensor[:, :]
19 |         labels.append(label)
20 |         input_lens.append(seq_length)
21 |     labels = torch.tensor(labels, dtype=torch.int64)
22 |     input_lens = torch.tensor(input_lens, dtype=torch.int64)
23 |     return features, labels, input_lens
24 | 


--------------------------------------------------------------------------------
/macls/data_utils/featurizer.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torchaudio.compliance.kaldi as Kaldi
  4 | from torch import nn
  5 | from torchaudio.transforms import MelSpectrogram, Spectrogram, MFCC
  6 | from loguru import logger
  7 | 
  8 | 
  9 | class AudioFeaturizer(nn.Module):
 10 |     """音频特征器
 11 | 
 12 |     :param feature_method: 所使用的预处理方法
 13 |     :type feature_method: str
 14 |     :param use_hf_model: 是否使用HF上的Wav2Vec2类似模型提取音频特征
 15 |     :type use_hf_model: bool
 16 |     :param method_args: 预处理方法的参数
 17 |     :type method_args: dict
 18 |     """
 19 | 
 20 |     def __init__(self, feature_method='MelSpectrogram', use_hf_model=False, method_args={}):
 21 |         super().__init__()
 22 |         self._method_args = method_args
 23 |         self._feature_method = feature_method
 24 |         self.use_hf_model = use_hf_model
 25 |         if self.use_hf_model:
 26 |             from transformers import AutoModel, AutoFeatureExtractor
 27 |             # 判断是否使用GPU提取特征
 28 |             use_gpu = torch.cuda.is_available() and method_args.get('use_gpu', True)
 29 |             self.device = torch.device("cuda") if use_gpu else torch.device("cpu")
 30 |             # 加载Wav2Vec2类似模型
 31 |             self.processor = AutoFeatureExtractor.from_pretrained(feature_method)
 32 |             self.feature_model = AutoModel.from_pretrained(feature_method).to(self.device)
 33 |             logger.info(f'使用模型【{feature_method}】提取特征，使用【{self.device}】设备提取')
 34 |             # 获取模型的输出通道数
 35 |             inputs = self.processor(np.ones(16000 * 1, dtype=np.float32), sampling_rate=16000,
 36 |                                     return_tensors="pt").to(self.device)
 37 |             with torch.no_grad():
 38 |                 outputs = self.feature_model(**inputs)
 39 |                 self.output_channels = outputs.extract_features.shape[2]
 40 |         else:
 41 |             if feature_method == 'MelSpectrogram':
 42 |                 self.feat_fun = MelSpectrogram(**method_args)
 43 |             elif feature_method == 'Spectrogram':
 44 |                 self.feat_fun = Spectrogram(**method_args)
 45 |             elif feature_method == 'MFCC':
 46 |                 self.feat_fun = MFCC(**method_args)
 47 |             elif feature_method == 'Fbank':
 48 |                 self.feat_fun = KaldiFbank(**method_args)
 49 |             else:
 50 |                 raise Exception(f'预处理方法 {self._feature_method} 不存在!')
 51 |             logger.info(f'使用【{feature_method}】提取特征')
 52 | 
 53 |     def forward(self, waveforms, input_lens_ratio=None):
 54 |         """从AudioSegment中提取音频特征
 55 | 
 56 |         :param waveforms: Audio segment to extract features from.
 57 |         :type waveforms: AudioSegment
 58 |         :param input_lens_ratio: input length ratio
 59 |         :type input_lens_ratio: tensor
 60 |         :return: Spectrogram audio feature in 2darray.
 61 |         :rtype: ndarray
 62 |         """
 63 |         if len(waveforms.shape) == 1:
 64 |             waveforms = waveforms.unsqueeze(0)
 65 |         if self.use_hf_model:
 66 |             # 使用HF上的Wav2Vec2类似模型提取音频特征
 67 |             if isinstance(waveforms, torch.Tensor):
 68 |                 waveforms = waveforms.numpy()
 69 |             inputs = self.processor(waveforms, sampling_rate=16000,
 70 |                                     return_tensors="pt").to(self.device)
 71 |             with torch.no_grad():
 72 |                 outputs = self.feature_model(**inputs)
 73 |                 feature = outputs.extract_features.cpu().detach()
 74 |         else:
 75 |             # 使用普通方法提取音频特征
 76 |             feature = self.feat_fun(waveforms)
 77 |             feature = feature.transpose(2, 1)
 78 |         # 归一化
 79 |         feature = feature - feature.mean(1, keepdim=True)
 80 |         if input_lens_ratio is not None:
 81 |             # 对掩码比例进行扩展
 82 |             input_lens = (input_lens_ratio * feature.shape[1])
 83 |             mask_lens = torch.round(input_lens).long()
 84 |             mask_lens = mask_lens.unsqueeze(1)
 85 |             # 生成掩码张量
 86 |             idxs = torch.arange(feature.shape[1], device=feature.device).repeat(feature.shape[0], 1)
 87 |             mask = idxs < mask_lens
 88 |             mask = mask.unsqueeze(-1)
 89 |             # 对特征进行掩码操作
 90 |             feature = torch.where(mask, feature, torch.zeros_like(feature))
 91 |         return feature
 92 | 
 93 |     @property
 94 |     def feature_dim(self):
 95 |         """返回特征大小
 96 | 
 97 |         :return: 特征大小
 98 |         :rtype: int
 99 |         """
100 |         if self.use_hf_model:
101 |             return self.output_channels
102 |         if self._feature_method == 'MelSpectrogram':
103 |             return self._method_args.get('n_mels', 128)
104 |         elif self._feature_method == 'Spectrogram':
105 |             return self._method_args.get('n_fft', 400) // 2 + 1
106 |         elif self._feature_method == 'MFCC':
107 |             return self._method_args.get('n_mfcc', 40)
108 |         elif self._feature_method == 'Fbank':
109 |             return self._method_args.get('num_mel_bins', 23)
110 |         else:
111 |             raise Exception('没有{}预处理方法'.format(self._feature_method))
112 | 
113 | 
114 | class KaldiFbank(nn.Module):
115 |     def __init__(self, **kwargs):
116 |         super(KaldiFbank, self).__init__()
117 |         self.kwargs = kwargs
118 | 
119 |     def forward(self, waveforms):
120 |         """
121 |         :param waveforms: [Batch, Length]
122 |         :return: [Batch, Feature, Length]
123 |         """
124 |         log_fbanks = []
125 |         for waveform in waveforms:
126 |             if len(waveform.shape) == 1:
127 |                 waveform = waveform.unsqueeze(0)
128 |             log_fbank = Kaldi.fbank(waveform, **self.kwargs)
129 |             log_fbank = log_fbank.transpose(0, 1)
130 |             log_fbanks.append(log_fbank)
131 |         log_fbank = torch.stack(log_fbanks)
132 |         return log_fbank
133 | 


--------------------------------------------------------------------------------
/macls/data_utils/reader.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | from torch.utils.data import Dataset
  6 | from tqdm import tqdm
  7 | from yeaudio.audio import AudioSegment
  8 | from yeaudio.augmentation import SpeedPerturbAugmentor, VolumePerturbAugmentor, NoisePerturbAugmentor, \
  9 |     ReverbPerturbAugmentor, SpecAugmentor
 10 | 
 11 | from macls.data_utils.featurizer import AudioFeaturizer
 12 | 
 13 | 
 14 | class MAClsDataset(Dataset):
 15 |     def __init__(self,
 16 |                  data_list_path,
 17 |                  audio_featurizer: AudioFeaturizer,
 18 |                  max_duration=3,
 19 |                  min_duration=0.5,
 20 |                  mode='train',
 21 |                  sample_rate=16000,
 22 |                  aug_conf=None,
 23 |                  use_dB_normalization=True,
 24 |                  target_dB=-20):
 25 |         """音频数据加载器
 26 | 
 27 |         Args:
 28 |             data_list_path: 包含音频路径和标签的数据列表文件的路径
 29 |             audio_featurizer: 声纹特征提取器
 30 |             max_duration: 最长的音频长度，大于这个长度会裁剪掉
 31 |             min_duration: 过滤最短的音频长度
 32 |             aug_conf: 用于指定音频增强的配置
 33 |             mode: 数据集模式。在训练模式下，数据集可能会进行一些数据增强的预处理
 34 |             sample_rate: 采样率
 35 |             use_dB_normalization: 是否对音频进行音量归一化
 36 |             target_dB: 音量归一化的大小
 37 |         """
 38 |         super(MAClsDataset, self).__init__()
 39 |         assert mode in ['train', 'eval', 'extract_feature']
 40 |         self.data_list_path = data_list_path
 41 |         self.max_duration = max_duration
 42 |         self.min_duration = min_duration
 43 |         self.mode = mode
 44 |         self._target_sample_rate = sample_rate
 45 |         self._use_dB_normalization = use_dB_normalization
 46 |         self._target_dB = target_dB
 47 |         self.speed_augment = None
 48 |         self.volume_augment = None
 49 |         self.noise_augment = None
 50 |         self.reverb_augment = None
 51 |         self.spec_augment = None
 52 |         # 获取特征器
 53 |         self.audio_featurizer = audio_featurizer
 54 |         # 获取特征裁剪的大小
 55 |         self.max_feature_len = self.get_crop_feature_len()
 56 |         # 获取数据列表
 57 |         with open(self.data_list_path, 'r', encoding='utf-8') as f:
 58 |             self.lines = f.readlines()
 59 |         if mode == 'train' and aug_conf is not None:
 60 |             # 获取数据增强器
 61 |             self.get_augmentor(aug_conf)
 62 |         # 评估模式下，数据列表需要排序
 63 |         if self.mode == 'eval':
 64 |             self.sort_list()
 65 | 
 66 |     def __getitem__(self, idx):
 67 |         # 分割数据文件路径和标签
 68 |         data_path, label = self.lines[idx].replace('\n', '').split('\t')
 69 |         # 如果后缀名为.npy的文件，那么直接读取
 70 |         if data_path.endswith('.npy'):
 71 |             feature = np.load(data_path)
 72 |             if feature.shape[0] > self.max_feature_len:
 73 |                 crop_start = random.randint(0, feature.shape[0] - self.max_feature_len) if self.mode == 'train' else 0
 74 |                 feature = feature[crop_start:crop_start + self.max_feature_len, :]
 75 |             feature = torch.tensor(feature, dtype=torch.float32)
 76 |         else:
 77 |             audio_path, label = self.lines[idx].strip().split('\t')
 78 |             # 读取音频
 79 |             audio_segment = AudioSegment.from_file(audio_path)
 80 |             # 数据太短不利于训练
 81 |             if self.mode == 'train' or self.mode == 'extract_feature':
 82 |                 if audio_segment.duration < self.min_duration:
 83 |                     return self.__getitem__(idx + 1 if idx < len(self.lines) - 1 else 0)
 84 |             # 音频增强
 85 |             if self.mode == 'train':
 86 |                 audio_segment = self.augment_audio(audio_segment)
 87 |             # 重采样
 88 |             if audio_segment.sample_rate != self._target_sample_rate:
 89 |                 audio_segment.resample(self._target_sample_rate)
 90 |             # 音量归一化
 91 |             if self._use_dB_normalization:
 92 |                 audio_segment.normalize(target_db=self._target_dB)
 93 |             # 裁剪需要的数据
 94 |             if audio_segment.duration > self.max_duration:
 95 |                 audio_segment.crop(duration=self.max_duration, mode=self.mode)
 96 |             samples = torch.tensor(audio_segment.samples, dtype=torch.float32)
 97 |             feature = self.audio_featurizer(samples)
 98 |             feature = feature.squeeze(0)
 99 |         if self.mode == 'train' and self.spec_augment is not None:
100 |             feature = self.spec_augment(feature.cpu().numpy())
101 |             feature = torch.tensor(feature, dtype=torch.float32)
102 |         label = torch.tensor(int(label), dtype=torch.int64)
103 |         return feature, label
104 | 
105 |     def __len__(self):
106 |         return len(self.lines)
107 | 
108 |     # 获取特征裁剪的大小，对应max_duration音频提取特征后的长度
109 |     def get_crop_feature_len(self):
110 |         samples = torch.randn((1, self.max_duration * self._target_sample_rate))
111 |         feature = self.audio_featurizer(samples).squeeze(0)
112 |         freq_len = feature.size(0)
113 |         return freq_len
114 | 
115 |     # 数据列表需要排序
116 |     def sort_list(self):
117 |         lengths = []
118 |         for line in tqdm(self.lines, desc=f"对列表[{self.data_list_path}]进行长度排序"):
119 |             # 分割数据文件路径和标签
120 |             data_path, _ = line.split('\t')
121 |             if data_path.endswith('.npy'):
122 |                 feature = np.load(data_path)
123 |                 length = feature.shape[0]
124 |                 lengths.append(length)
125 |             else:
126 |                 # 读取音频
127 |                 audio_segment = AudioSegment.from_file(data_path)
128 |                 length = audio_segment.duration
129 |                 lengths.append(length)
130 |         # 对长度排序并获取索引
131 |         sorted_indexes = np.argsort(lengths)
132 |         self.lines = [self.lines[i] for i in sorted_indexes]
133 | 
134 |     # 获取数据增强器
135 |     def get_augmentor(self, aug_conf):
136 |         if aug_conf.speed is not None:
137 |             self.speed_augment = SpeedPerturbAugmentor(**aug_conf.speed)
138 |         if aug_conf.volume is not None:
139 |             self.volume_augment = VolumePerturbAugmentor(**aug_conf.volume)
140 |         if aug_conf.noise is not None:
141 |             self.noise_augment = NoisePerturbAugmentor(**aug_conf.noise)
142 |         if aug_conf.reverb is not None:
143 |             self.reverb_augment = ReverbPerturbAugmentor(**aug_conf.reverb)
144 |         if aug_conf.spec_aug is not None:
145 |             self.spec_augment = SpecAugmentor(**aug_conf.spec_aug)
146 | 
147 |     # 音频增强
148 |     def augment_audio(self, audio_segment):
149 |         if self.speed_augment is not None:
150 |             audio_segment = self.speed_augment(audio_segment)
151 |         if self.volume_augment is not None:
152 |             audio_segment = self.volume_augment(audio_segment)
153 |         if self.noise_augment is not None:
154 |             audio_segment = self.noise_augment(audio_segment)
155 |         if self.reverb_augment is not None:
156 |             audio_segment = self.reverb_augment(audio_segment)
157 |         return audio_segment
158 | 


--------------------------------------------------------------------------------
/macls/metric/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyupiaoling/AudioClassification-Pytorch/f9f55968cc1f181c8ebc786f2db880702e8bf79f/macls/metric/__init__.py


--------------------------------------------------------------------------------
/macls/metric/metrics.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | 
 5 | # 计算准确率
 6 | def accuracy(output, label):
 7 |     output = torch.nn.functional.softmax(output, dim=-1)
 8 |     output = output.data.cpu().numpy()
 9 |     output = np.argmax(output, axis=1)
10 |     label = label.data.cpu().numpy()
11 |     acc = np.mean((output == label).astype(int))
12 |     return acc
13 | 


--------------------------------------------------------------------------------
/macls/models/__init__.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | from loguru import logger
 4 | from .campplus import CAMPPlus
 5 | from .ecapa_tdnn import EcapaTdnn
 6 | from .eres2net import ERes2Net, ERes2NetV2
 7 | from .panns import PANNS_CNN6, PANNS_CNN10, PANNS_CNN14
 8 | from .res2net import Res2Net
 9 | from .resnet_se import ResNetSE
10 | from .tdnn import TDNN
11 | 
12 | __all__ = ['build_model']
13 | 
14 | 
15 | def build_model(input_size, configs):
16 |     use_model = configs.model_conf.get('model', 'CAMPPlus')
17 |     model_args = configs.model_conf.get('model_args', {})
18 |     mod = importlib.import_module(__name__)
19 |     model = getattr(mod, use_model)(input_size=input_size, **model_args)
20 |     logger.info(f'成功创建模型：{use_model}，参数为：{model_args}')
21 |     return model
22 | 


--------------------------------------------------------------------------------
/macls/models/campplus.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from collections import OrderedDict
  3 | 
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import torch.utils.checkpoint as cp
  7 | from torch import nn
  8 | 
  9 | 
 10 | def get_nonlinear(config_str, channels):
 11 |     nonlinear = nn.Sequential()
 12 |     for name in config_str.split('-'):
 13 |         if name == 'relu':
 14 |             nonlinear.add_module('relu', nn.ReLU(inplace=True))
 15 |         elif name == 'prelu':
 16 |             nonlinear.add_module('prelu', nn.PReLU(channels))
 17 |         elif name == 'batchnorm':
 18 |             nonlinear.add_module('batchnorm', nn.BatchNorm1d(channels))
 19 |         elif name == 'batchnorm_':
 20 |             nonlinear.add_module('batchnorm',
 21 |                                  nn.BatchNorm1d(channels, affine=False))
 22 |         else:
 23 |             raise ValueError('Unexpected module ({}).'.format(name))
 24 |     return nonlinear
 25 | 
 26 | 
 27 | def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
 28 |     mean = x.mean(dim=dim)
 29 |     std = x.std(dim=dim, unbiased=unbiased)
 30 |     stats = torch.cat([mean, std], dim=-1)
 31 |     if keepdim:
 32 |         stats = stats.unsqueeze(dim=dim)
 33 |     return stats
 34 | 
 35 | 
 36 | class StatsPool(nn.Module):
 37 |     def forward(self, x):
 38 |         return statistics_pooling(x)
 39 | 
 40 | 
 41 | class TDNNLayer(nn.Module):
 42 |     def __init__(self,
 43 |                  in_channels,
 44 |                  out_channels,
 45 |                  kernel_size,
 46 |                  stride=1,
 47 |                  padding=0,
 48 |                  dilation=1,
 49 |                  bias=False,
 50 |                  config_str='batchnorm-relu'):
 51 |         super(TDNNLayer, self).__init__()
 52 |         if padding < 0:
 53 |             assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
 54 |                 kernel_size)
 55 |             padding = (kernel_size - 1) // 2 * dilation
 56 |         self.linear = nn.Conv1d(in_channels,
 57 |                                 out_channels,
 58 |                                 kernel_size,
 59 |                                 stride=stride,
 60 |                                 padding=padding,
 61 |                                 dilation=dilation,
 62 |                                 bias=bias)
 63 |         self.nonlinear = get_nonlinear(config_str, out_channels)
 64 | 
 65 |     def forward(self, x):
 66 |         x = self.linear(x)
 67 |         x = self.nonlinear(x)
 68 |         return x
 69 | 
 70 | 
 71 | class CAMLayer(nn.Module):
 72 |     def __init__(self,
 73 |                  bn_channels,
 74 |                  out_channels,
 75 |                  kernel_size,
 76 |                  stride,
 77 |                  padding,
 78 |                  dilation,
 79 |                  bias,
 80 |                  reduction=2):
 81 |         super(CAMLayer, self).__init__()
 82 |         self.linear_local = nn.Conv1d(bn_channels,
 83 |                                       out_channels,
 84 |                                       kernel_size,
 85 |                                       stride=stride,
 86 |                                       padding=padding,
 87 |                                       dilation=dilation,
 88 |                                       bias=bias)
 89 |         self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
 90 |         self.relu = nn.ReLU(inplace=True)
 91 |         self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
 92 |         self.sigmoid = nn.Sigmoid()
 93 | 
 94 |     def forward(self, x):
 95 |         y = self.linear_local(x)
 96 |         context = x.mean(-1, keepdim=True) + self.seg_pooling(x)
 97 |         context = self.relu(self.linear1(context))
 98 |         m = self.sigmoid(self.linear2(context))
 99 |         return y * m
100 | 
101 |     def seg_pooling(self, x, seg_len=100, stype='avg'):
102 |         if stype == 'avg':
103 |             seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
104 |         elif stype == 'max':
105 |             seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
106 |         else:
107 |             raise ValueError('Wrong segment pooling type.')
108 |         shape = seg.shape
109 |         seg = seg.unsqueeze(-1).expand(*shape, seg_len).reshape(*shape[:-1], -1)
110 |         seg = seg[..., :x.shape[-1]]
111 |         return seg
112 | 
113 | 
114 | class CAMDenseTDNNLayer(nn.Module):
115 |     def __init__(self,
116 |                  in_channels,
117 |                  out_channels,
118 |                  bn_channels,
119 |                  kernel_size,
120 |                  stride=1,
121 |                  dilation=1,
122 |                  bias=False,
123 |                  config_str='batchnorm-relu',
124 |                  memory_efficient=False):
125 |         super(CAMDenseTDNNLayer, self).__init__()
126 |         assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
127 |             kernel_size)
128 |         padding = (kernel_size - 1) // 2 * dilation
129 |         self.memory_efficient = memory_efficient
130 |         self.nonlinear1 = get_nonlinear(config_str, in_channels)
131 |         self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
132 |         self.nonlinear2 = get_nonlinear(config_str, bn_channels)
133 |         self.cam_layer = CAMLayer(bn_channels,
134 |                                   out_channels,
135 |                                   kernel_size,
136 |                                   stride=stride,
137 |                                   padding=padding,
138 |                                   dilation=dilation,
139 |                                   bias=bias)
140 | 
141 |     def bn_function(self, x):
142 |         return self.linear1(self.nonlinear1(x))
143 | 
144 |     def forward(self, x):
145 |         if self.training and self.memory_efficient:
146 |             x = cp.checkpoint(self.bn_function, x, use_reentrant=False)
147 |         else:
148 |             x = self.bn_function(x)
149 |         x = self.cam_layer(self.nonlinear2(x))
150 |         return x
151 | 
152 | 
153 | class CAMDenseTDNNBlock(nn.ModuleList):
154 |     def __init__(self,
155 |                  num_layers,
156 |                  in_channels,
157 |                  out_channels,
158 |                  bn_channels,
159 |                  kernel_size,
160 |                  stride=1,
161 |                  dilation=1,
162 |                  bias=False,
163 |                  config_str='batchnorm-relu',
164 |                  memory_efficient=False):
165 |         super(CAMDenseTDNNBlock, self).__init__()
166 |         for i in range(num_layers):
167 |             layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels,
168 |                                       out_channels=out_channels,
169 |                                       bn_channels=bn_channels,
170 |                                       kernel_size=kernel_size,
171 |                                       stride=stride,
172 |                                       dilation=dilation,
173 |                                       bias=bias,
174 |                                       config_str=config_str,
175 |                                       memory_efficient=memory_efficient)
176 |             self.add_module('tdnnd%d' % (i + 1), layer)
177 | 
178 |     def forward(self, x):
179 |         for layer in self:
180 |             x = torch.cat([x, layer(x)], dim=1)
181 |         return x
182 | 
183 | 
184 | class TransitLayer(nn.Module):
185 |     def __init__(self,
186 |                  in_channels,
187 |                  out_channels,
188 |                  bias=True,
189 |                  config_str='batchnorm-relu'):
190 |         super(TransitLayer, self).__init__()
191 |         self.nonlinear = get_nonlinear(config_str, in_channels)
192 |         self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
193 | 
194 |     def forward(self, x):
195 |         x = self.nonlinear(x)
196 |         x = self.linear(x)
197 |         return x
198 | 
199 | 
200 | class DenseLayer(nn.Module):
201 |     def __init__(self,
202 |                  in_channels,
203 |                  out_channels,
204 |                  bias=False,
205 |                  config_str='batchnorm-relu'):
206 |         super(DenseLayer, self).__init__()
207 |         self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
208 |         self.nonlinear = get_nonlinear(config_str, out_channels)
209 | 
210 |     def forward(self, x):
211 |         if len(x.shape) == 2:
212 |             x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
213 |         else:
214 |             x = self.linear(x)
215 |         x = self.nonlinear(x)
216 |         return x
217 | 
218 | 
219 | class BasicResBlock(nn.Module):
220 |     expansion = 1
221 | 
222 |     def __init__(self, in_planes, planes, stride=1):
223 |         super(BasicResBlock, self).__init__()
224 |         self.conv1 = nn.Conv2d(in_planes,
225 |                                planes,
226 |                                kernel_size=3,
227 |                                stride=(stride, 1),
228 |                                padding=1,
229 |                                bias=False)
230 |         self.bn1 = nn.BatchNorm2d(planes)
231 |         self.conv2 = nn.Conv2d(planes,
232 |                                planes,
233 |                                kernel_size=3,
234 |                                stride=1,
235 |                                padding=1,
236 |                                bias=False)
237 |         self.bn2 = nn.BatchNorm2d(planes)
238 | 
239 |         self.shortcut = nn.Sequential()
240 |         if stride != 1 or in_planes != self.expansion * planes:
241 |             self.shortcut = nn.Sequential(
242 |                 nn.Conv2d(in_planes,
243 |                           self.expansion * planes,
244 |                           kernel_size=1,
245 |                           stride=(stride, 1),
246 |                           bias=False),
247 |                 nn.BatchNorm2d(self.expansion * planes))
248 | 
249 |     def forward(self, x):
250 |         out = F.relu(self.bn1(self.conv1(x)))
251 |         out = self.bn2(self.conv2(out))
252 |         out += self.shortcut(x)
253 |         out = F.relu(out)
254 |         return out
255 | 
256 | 
257 | class FCM(nn.Module):
258 |     def __init__(self,
259 |                  block=BasicResBlock,
260 |                  num_blocks=[2, 2],
261 |                  m_channels=32,
262 |                  feat_dim=80):
263 |         super(FCM, self).__init__()
264 |         self.in_planes = m_channels
265 |         self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
266 |         self.bn1 = nn.BatchNorm2d(m_channels)
267 | 
268 |         self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
269 |         self.layer2 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
270 | 
271 |         self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
272 |         self.bn2 = nn.BatchNorm2d(m_channels)
273 |         self.out_channels = m_channels * (math.ceil(feat_dim / 8))
274 | 
275 |     def _make_layer(self, block, planes, num_blocks, stride):
276 |         strides = [stride] + [1] * (num_blocks - 1)
277 |         layers = []
278 |         for stride in strides:
279 |             layers.append(block(self.in_planes, planes, stride))
280 |             self.in_planes = planes * block.expansion
281 |         return nn.Sequential(*layers)
282 | 
283 |     def forward(self, x):
284 |         x = x.unsqueeze(1)
285 |         out = F.relu(self.bn1(self.conv1(x)))
286 |         out = self.layer1(out)
287 |         out = self.layer2(out)
288 |         out = F.relu(self.bn2(self.conv2(out)))
289 | 
290 |         shape = out.shape
291 |         out = out.reshape(shape[0], shape[1] * shape[2], shape[3])
292 |         return out
293 | 
294 | 
295 | class CAMPPlus(nn.Module):
296 |     def __init__(self,
297 |                  num_class,
298 |                  input_size,
299 |                  embd_dim=512,
300 |                  growth_rate=32,
301 |                  bn_size=4,
302 |                  init_channels=128,
303 |                  config_str='batchnorm-relu',
304 |                  memory_efficient=True):
305 |         super(CAMPPlus, self).__init__()
306 | 
307 |         self.head = FCM(feat_dim=input_size)
308 |         channels = self.head.out_channels
309 |         self.embd_dim = embd_dim
310 | 
311 |         self.xvector = nn.Sequential(
312 |             OrderedDict([('tdnn', TDNNLayer(channels,
313 |                                             init_channels,
314 |                                             5,
315 |                                             stride=2,
316 |                                             dilation=1,
317 |                                             padding=-1,
318 |                                             config_str=config_str)),
319 |                          ]))
320 |         channels = init_channels
321 |         for i, (num_layers, kernel_size,
322 |                 dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 2))):
323 |             block = CAMDenseTDNNBlock(num_layers=num_layers,
324 |                                       in_channels=channels,
325 |                                       out_channels=growth_rate,
326 |                                       bn_channels=bn_size * growth_rate,
327 |                                       kernel_size=kernel_size,
328 |                                       dilation=dilation,
329 |                                       config_str=config_str,
330 |                                       memory_efficient=memory_efficient)
331 |             self.xvector.add_module('block%d' % (i + 1), block)
332 |             channels = channels + num_layers * growth_rate
333 |             self.xvector.add_module('transit%d' % (i + 1),
334 |                                     TransitLayer(channels,
335 |                                                  channels // 2,
336 |                                                  bias=False,
337 |                                                  config_str=config_str))
338 |             channels //= 2
339 | 
340 |         self.xvector.add_module(
341 |             'out_nonlinear', get_nonlinear(config_str, channels))
342 | 
343 |         self.xvector.add_module('stats', StatsPool())
344 |         self.xvector.add_module('dense', DenseLayer(channels * 2, embd_dim, config_str='batchnorm_'))
345 |         # 分类层
346 |         self.fc = nn.Linear(embd_dim, num_class)
347 | 
348 |         for m in self.modules():
349 |             if isinstance(m, (nn.Conv1d, nn.Linear)):
350 |                 nn.init.kaiming_normal_(m.weight.data)
351 |                 if m.bias is not None:
352 |                     nn.init.zeros_(m.bias)
353 | 
354 |     def forward(self, x):
355 |         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
356 |         x = self.head(x)
357 |         x = self.xvector(x)
358 |         x = self.fc(x)
359 |         return x
360 | 


--------------------------------------------------------------------------------
/macls/models/ecapa_tdnn.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | 
  4 | from macls.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling
  5 | from macls.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling
  6 | from macls.models.utils import Conv1d, length_to_mask, TDNNBlock, BatchNorm1d
  7 | 
  8 | 
  9 | class Res2NetBlock(torch.nn.Module):
 10 |     def __init__(self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1):
 11 |         """Implementation of Res2Net Block with dilation
 12 |            The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture",
 13 |            whose url is https://arxiv.org/abs/1904.01169
 14 |         Args:
 15 |             in_channels (int): input channels or input dimensions
 16 |             out_channels (int): output channels or output dimensions
 17 |             scale (int, optional): scale in res2net bolck. Defaults to 8.
 18 |             dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1.
 19 |         """
 20 |         super(Res2NetBlock, self).__init__()
 21 |         assert in_channels % scale == 0
 22 |         assert out_channels % scale == 0
 23 | 
 24 |         in_channel = in_channels // scale
 25 |         hidden_channel = out_channels // scale
 26 | 
 27 |         self.blocks = nn.ModuleList(
 28 |             [
 29 |                 TDNNBlock(
 30 |                     in_channel,
 31 |                     hidden_channel,
 32 |                     kernel_size=kernel_size,
 33 |                     dilation=dilation,
 34 |                 )
 35 |                 for i in range(scale - 1)
 36 |             ]
 37 |         )
 38 |         self.scale = scale
 39 | 
 40 |     def forward(self, x):
 41 |         y = []
 42 |         for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
 43 |             if i == 0:
 44 |                 y_i = x_i
 45 |             elif i == 1:
 46 |                 y_i = self.blocks[i - 1](x_i)
 47 |             else:
 48 |                 y_i = self.blocks[i - 1](x_i + y_i)
 49 |             y.append(y_i)
 50 |         y = torch.cat(y, dim=1)
 51 |         return y
 52 | 
 53 | 
 54 | class SEBlock(nn.Module):
 55 |     def __init__(self, in_channels, se_channels, out_channels):
 56 |         """Implementation of SEBlock
 57 |            The paper is refered as "Squeeze-and-Excitation Networks"
 58 |            whose url is https://arxiv.org/abs/1709.01507
 59 |         Args:
 60 |             in_channels (int): input channels or input data dimensions
 61 |             se_channels (_type_): _description_
 62 |             out_channels (int): output channels or output data dimensions
 63 |         """
 64 |         super(SEBlock, self).__init__()
 65 | 
 66 |         self.conv1 = Conv1d(in_channels=in_channels, out_channels=se_channels, kernel_size=1)
 67 |         self.relu = torch.nn.ReLU(inplace=True)
 68 |         self.conv2 = Conv1d(in_channels=se_channels, out_channels=out_channels, kernel_size=1)
 69 |         self.sigmoid = torch.nn.Sigmoid()
 70 | 
 71 |     def forward(self, x, lengths=None):
 72 |         L = x.shape[-1]
 73 |         if lengths is not None:
 74 |             mask = length_to_mask(lengths * L, max_len=L, device=x.device)
 75 |             mask = mask.unsqueeze(1)
 76 |             total = mask.sum(dim=2, keepdim=True)
 77 |             s = (x * mask).sum(dim=2, keepdim=True) / total
 78 |         else:
 79 |             s = x.mean(dim=2, keepdim=True)
 80 | 
 81 |         s = self.relu(self.conv1(s))
 82 |         s = self.sigmoid(self.conv2(s))
 83 | 
 84 |         return s * x
 85 | 
 86 | 
 87 | class SERes2NetBlock(nn.Module):
 88 |     def __init__(
 89 |             self,
 90 |             in_channels,
 91 |             out_channels,
 92 |             res2net_scale=8,
 93 |             se_channels=128,
 94 |             kernel_size=1,
 95 |             dilation=1,
 96 |             activation=torch.nn.ReLU,
 97 |             groups=1, ):
 98 |         """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model
 99 |            The paper is refered "Squeeze-and-Excitation Networks"
100 |            whose url is: https://arxiv.org/pdf/1709.01507.pdf
101 |         Args:
102 |             in_channels (int): input channels or input data dimensions
103 |             out_channels (int): output channels or output data dimensions
104 |             res2net_scale (int, optional): scale in the res2net block. Defaults to 8.
105 |             se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128.
106 |             kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1.
107 |             dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1.
108 |             activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU.
109 |         """
110 |         super().__init__()
111 |         self.out_channels = out_channels
112 |         self.tdnn1 = TDNNBlock(in_channels,
113 |                                out_channels,
114 |                                kernel_size=1,
115 |                                dilation=1,
116 |                                activation=activation,
117 |                                groups=groups, )
118 |         self.res2net_block = Res2NetBlock(out_channels, out_channels, res2net_scale, kernel_size, dilation)
119 |         self.tdnn2 = TDNNBlock(out_channels,
120 |                                out_channels,
121 |                                kernel_size=1,
122 |                                dilation=1,
123 |                                activation=activation,
124 |                                groups=groups, )
125 |         self.se_block = SEBlock(out_channels, se_channels, out_channels)
126 | 
127 |         self.shortcut = None
128 |         if in_channels != out_channels:
129 |             self.shortcut = Conv1d(in_channels=in_channels,
130 |                                    out_channels=out_channels,
131 |                                    kernel_size=1, )
132 | 
133 |     def forward(self, x, lengths=None):
134 |         residual = x
135 |         if self.shortcut:
136 |             residual = self.shortcut(x)
137 | 
138 |         x = self.tdnn1(x)
139 |         x = self.res2net_block(x)
140 |         x = self.tdnn2(x)
141 |         x = self.se_block(x, lengths)
142 | 
143 |         return x + residual
144 | 
145 | 
146 | class EcapaTdnn(torch.nn.Module):
147 |     def __init__(
148 |             self,
149 |             num_class,
150 |             input_size,
151 |             embd_dim=192,
152 |             pooling_type="ASP",
153 |             activation=nn.ReLU,
154 |             channels=[512, 512, 512, 512, 1536],
155 |             kernel_sizes=[5, 3, 3, 3, 1],
156 |             dilations=[1, 2, 3, 4, 1],
157 |             attention_channels=128,
158 |             res2net_scale=8,
159 |             se_channels=128,
160 |             global_context=True,
161 |             groups=[1, 1, 1, 1, 1], ):
162 |         """Implementation of ECAPA-TDNN backbone model network
163 |            The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification"
164 |            whose url is: https://arxiv.org/abs/2005.07143
165 |         Args:
166 |             input_size (_type_): input fature dimension
167 |             embd_dim (int, optional): speaker embedding size. Defaults to 192.
168 |             activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU.
169 |             channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536].
170 |             kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1].
171 |             dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1].
172 |             attention_channels (int, optional): attention dimensions. Defaults to 128.
173 |             res2net_scale (int, optional): scale value in res2net. Defaults to 8.
174 |             se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128.
175 |             global_context (bool, optional): global context flag. Defaults to True.
176 |         """
177 |         super().__init__()
178 |         assert len(channels) == len(kernel_sizes)
179 |         assert len(channels) == len(dilations)
180 |         self.channels = channels
181 |         self.blocks = nn.ModuleList()
182 | 
183 |         # The initial TDNN layer
184 |         self.blocks.append(
185 |             TDNNBlock(
186 |                 input_size,
187 |                 channels[0],
188 |                 kernel_sizes[0],
189 |                 dilations[0],
190 |                 activation,
191 |                 groups[0],
192 |             )
193 |         )
194 | 
195 |         # SE-Res2Net layers
196 |         for i in range(1, len(channels) - 1):
197 |             self.blocks.append(
198 |                 SERes2NetBlock(
199 |                     channels[i - 1],
200 |                     channels[i],
201 |                     res2net_scale=res2net_scale,
202 |                     se_channels=se_channels,
203 |                     kernel_size=kernel_sizes[i],
204 |                     dilation=dilations[i],
205 |                     activation=activation,
206 |                     groups=groups[i],
207 |                 )
208 |             )
209 | 
210 |         # Multi-layer feature aggregation
211 |         self.mfa = TDNNBlock(channels[-1],
212 |                              channels[-1],
213 |                              kernel_sizes[-1],
214 |                              dilations[-1],
215 |                              activation,
216 |                              groups=groups[-1], )
217 | 
218 |         # Attentive Statistical Pooling
219 |         cat_channels = channels[-1]
220 |         self.embd_dim = embd_dim
221 |         if pooling_type == "ASP":
222 |             self.asp = AttentiveStatisticsPooling(channels[-1],
223 |                                                   attention_channels=attention_channels,
224 |                                                   global_context=global_context)
225 |             self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
226 |             # Final linear transformation
227 |             self.fc = Conv1d(in_channels=channels[-1] * 2,
228 |                              out_channels=self.embd_dim,
229 |                              kernel_size=1)
230 |         elif pooling_type == "SAP":
231 |             self.asp = SelfAttentivePooling(cat_channels, 128)
232 |             self.asp_bn = nn.BatchNorm1d(cat_channels)
233 |             # Final linear transformation
234 |             self.fc = Conv1d(in_channels=cat_channels,
235 |                              out_channels=self.embd_dim,
236 |                              kernel_size=1)
237 |         elif pooling_type == "TAP":
238 |             self.asp = TemporalAveragePooling()
239 |             self.asp_bn = nn.BatchNorm1d(cat_channels)
240 |             # Final linear transformation
241 |             self.fc = Conv1d(in_channels=cat_channels,
242 |                              out_channels=self.embd_dim,
243 |                              kernel_size=1)
244 |         elif pooling_type == "TSP":
245 |             self.asp = TemporalStatisticsPooling()
246 |             self.asp_bn = nn.BatchNorm1d(cat_channels * 2)
247 |             # Final linear transformation
248 |             self.fc = Conv1d(in_channels=cat_channels * 2,
249 |                              out_channels=self.embd_dim,
250 |                              kernel_size=1)
251 |         else:
252 |             raise Exception(f'没有{pooling_type}池化层！')
253 |         self.output = nn.Linear(self.embd_dim, num_class)
254 | 
255 |     def forward(self, x, lengths=None):
256 |         """Returns the embedding vector.
257 | 
258 |         Arguments
259 |         ---------
260 |         x : torch.Tensor
261 |             Tensor of shape (batch, time, channel).
262 |         """
263 |         # Minimize transpose for efficiency
264 |         x = x.transpose(1, 2)
265 | 
266 |         xl = []
267 |         for layer in self.blocks:
268 |             try:
269 |                 x = layer(x, lengths=lengths)
270 |             except TypeError:
271 |                 x = layer(x)
272 |             xl.append(x)
273 | 
274 |         # Multi-layer feature aggregation
275 |         x = torch.cat(xl[1:], dim=1)
276 |         x = self.mfa(x)
277 | 
278 |         # Attentive Statistical Pooling
279 |         x = self.asp(x)
280 |         x = self.asp_bn(x)
281 |         x = x.unsqueeze(2)
282 |         # Final linear transformation
283 |         x = self.fc(x).squeeze(-1)  # (N, emb_size, 1) -> (N, emb_size)
284 |         x = self.output(x)
285 |         return x
286 | 


--------------------------------------------------------------------------------
/macls/models/eres2net.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | 
  7 | from macls.models.pooling import TemporalStatsPool
  8 | 
  9 | __all__ = ['ERes2Net', 'ERes2NetV2']
 10 | 
 11 | 
 12 | class ReLU(nn.Hardtanh):
 13 | 
 14 |     def __init__(self, inplace=False):
 15 |         super(ReLU, self).__init__(0, 20, inplace)
 16 | 
 17 |     def __repr__(self):
 18 |         inplace_str = 'inplace' if self.inplace else ''
 19 |         return self.__class__.__name__ + ' (' + inplace_str + ')'
 20 | 
 21 | 
 22 | def conv1x1(in_planes, out_planes, stride=1):
 23 |     "1x1 convolution without padding"
 24 |     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False)
 25 | 
 26 | 
 27 | def conv3x3(in_planes, out_planes, stride=1):
 28 |     "3x3 convolution with padding"
 29 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
 30 | 
 31 | 
 32 | class AFF(nn.Module):
 33 | 
 34 |     def __init__(self, channels=64, r=4):
 35 |         super(AFF, self).__init__()
 36 |         inter_channels = int(channels // r)
 37 | 
 38 |         self.local_att = nn.Sequential(
 39 |             nn.Conv2d(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0),
 40 |             nn.BatchNorm2d(inter_channels),
 41 |             nn.SiLU(inplace=True),
 42 |             nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
 43 |             nn.BatchNorm2d(channels),
 44 |         )
 45 | 
 46 |     def forward(self, x, ds_y):
 47 |         xa = torch.cat((x, ds_y), dim=1)
 48 |         x_att = self.local_att(xa)
 49 |         x_att = 1.0 + torch.tanh(x_att)
 50 |         xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
 51 | 
 52 |         return xo
 53 | 
 54 | 
 55 | class BasicBlockERes2Net(nn.Module):
 56 | 
 57 |     def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2):
 58 |         super(BasicBlockERes2Net, self).__init__()
 59 |         self.expansion = expansion
 60 |         width = int(math.floor(planes * (base_width / 64.0)))
 61 |         self.conv1 = conv1x1(in_planes, width * scale, stride)
 62 |         self.bn1 = nn.BatchNorm2d(width * scale)
 63 |         self.nums = scale
 64 | 
 65 |         convs = []
 66 |         bns = []
 67 |         for i in range(self.nums):
 68 |             convs.append(conv3x3(width, width))
 69 |             bns.append(nn.BatchNorm2d(width))
 70 |         self.convs = nn.ModuleList(convs)
 71 |         self.bns = nn.ModuleList(bns)
 72 |         self.relu = ReLU(inplace=True)
 73 | 
 74 |         self.conv3 = conv1x1(width * scale, planes * self.expansion)
 75 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
 76 |         self.shortcut = nn.Sequential()
 77 |         if stride != 1 or in_planes != self.expansion * planes:
 78 |             self.shortcut = nn.Sequential(
 79 |                 nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
 80 |                 nn.BatchNorm2d(self.expansion * planes))
 81 |         self.stride = stride
 82 |         self.width = width
 83 |         self.scale = scale
 84 | 
 85 |     def forward(self, x):
 86 |         out = self.conv1(x)
 87 |         out = self.bn1(out)
 88 |         out = self.relu(out)
 89 |         spx = torch.split(out, self.width, 1)
 90 |         for i in range(self.nums):
 91 |             if i == 0:
 92 |                 sp = spx[i]
 93 |             else:
 94 |                 sp = sp + spx[i]
 95 |             sp = self.convs[i](sp)
 96 |             sp = self.relu(self.bns[i](sp))
 97 |             if i == 0:
 98 |                 out = sp
 99 |             else:
100 |                 out = torch.cat((out, sp), 1)
101 |         out = self.conv3(out)
102 |         out = self.bn3(out)
103 | 
104 |         residual = self.shortcut(x)
105 |         out += residual
106 |         out = self.relu(out)
107 | 
108 |         return out
109 | 
110 | 
111 | class BasicBlockERes2Net_diff_AFF(nn.Module):
112 | 
113 |     def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2):
114 |         super(BasicBlockERes2Net_diff_AFF, self).__init__()
115 |         self.expansion = expansion
116 |         width = int(math.floor(planes * (base_width / 64.0)))
117 |         self.conv1 = conv1x1(in_planes, width * scale, stride)
118 |         self.bn1 = nn.BatchNorm2d(width * scale)
119 | 
120 |         self.nums = scale
121 | 
122 |         convs = []
123 |         fuse_models = []
124 |         bns = []
125 |         for i in range(self.nums):
126 |             convs.append(conv3x3(width, width))
127 |             bns.append(nn.BatchNorm2d(width))
128 |         for j in range(self.nums - 1):
129 |             fuse_models.append(AFF(channels=width))
130 |         self.convs = nn.ModuleList(convs)
131 |         self.bns = nn.ModuleList(bns)
132 |         self.fuse_models = nn.ModuleList(fuse_models)
133 |         self.relu = ReLU(inplace=True)
134 | 
135 |         self.conv3 = conv1x1(width * scale, planes * self.expansion)
136 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
137 |         self.shortcut = nn.Sequential()
138 |         if stride != 1 or in_planes != self.expansion * planes:
139 |             self.shortcut = nn.Sequential(
140 |                 nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
141 |                 nn.BatchNorm2d(self.expansion * planes))
142 |         self.stride = stride
143 |         self.width = width
144 |         self.scale = scale
145 | 
146 |     def forward(self, x):
147 |         out = self.conv1(x)
148 |         out = self.bn1(out)
149 |         out = self.relu(out)
150 |         spx = torch.split(out, self.width, 1)
151 |         for i in range(self.nums):
152 |             if i == 0:
153 |                 sp = spx[i]
154 |             else:
155 |                 sp = self.fuse_models[i - 1](sp, spx[i])
156 |             sp = self.convs[i](sp)
157 |             sp = self.relu(self.bns[i](sp))
158 |             if i == 0:
159 |                 out = sp
160 |             else:
161 |                 out = torch.cat((out, sp), 1)
162 |         out = self.conv3(out)
163 |         out = self.bn3(out)
164 | 
165 |         residual = self.shortcut(x)
166 |         out += residual
167 |         out = self.relu(out)
168 | 
169 |         return out
170 | 
171 | 
172 | class ERes2Net(nn.Module):
173 |     def __init__(self,
174 |                  num_class,
175 |                  input_size,
176 |                  block=BasicBlockERes2Net,
177 |                  block_fuse=BasicBlockERes2Net_diff_AFF,
178 |                  num_blocks=[3, 4, 6, 3],
179 |                  m_channels=32,
180 |                  mul_channel=1,
181 |                  expansion=2,
182 |                  base_width=32,
183 |                  scale=2,
184 |                  embd_dim=192,
185 |                  pooling_type='TSTP',
186 |                  two_emb_layer=False):
187 |         super(ERes2Net, self).__init__()
188 |         self.in_planes = m_channels
189 |         self.expansion = expansion
190 |         self.feat_dim = input_size
191 |         self.embd_dim = embd_dim
192 |         self.stats_dim = int(input_size / 8) * m_channels * 8
193 |         self.two_emb_layer = two_emb_layer
194 | 
195 |         self.conv1 = nn.Conv2d(1,
196 |                                m_channels,
197 |                                kernel_size=3,
198 |                                stride=1,
199 |                                padding=1,
200 |                                bias=False)
201 |         self.bn1 = nn.BatchNorm2d(m_channels)
202 |         self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1,
203 |                                        base_width=base_width, scale=scale)
204 |         self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2,
205 |                                        base_width=base_width, scale=scale)
206 |         self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2,
207 |                                        base_width=base_width, scale=scale)
208 |         self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2,
209 |                                        base_width=base_width, scale=scale)
210 | 
211 |         # Downsampling module for each layer
212 |         self.layer1_downsample = nn.Conv2d(m_channels * 2 * mul_channel, m_channels * 4 * mul_channel, kernel_size=3,
213 |                                            padding=1, stride=2, bias=False)
214 |         self.layer2_downsample = nn.Conv2d(m_channels * 4 * mul_channel, m_channels * 8 * mul_channel, kernel_size=3,
215 |                                            padding=1, stride=2, bias=False)
216 |         self.layer3_downsample = nn.Conv2d(m_channels * 8 * mul_channel, m_channels * 16 * mul_channel, kernel_size=3,
217 |                                            padding=1, stride=2, bias=False)
218 |         self.fuse_mode12 = AFF(channels=m_channels * 4 * mul_channel)
219 |         self.fuse_mode123 = AFF(channels=m_channels * 8 * mul_channel)
220 |         self.fuse_mode1234 = AFF(channels=m_channels * 16 * mul_channel)
221 | 
222 |         self.n_stats = 2
223 |         self.pooling = TemporalStatsPool()
224 | 
225 |         self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim)
226 |         if self.two_emb_layer:
227 |             self.seg_bn_1 = nn.BatchNorm1d(embd_dim, affine=False)
228 |             self.seg_2 = nn.Linear(embd_dim, embd_dim)
229 |         else:
230 |             self.seg_bn_1 = nn.Identity()
231 |             self.seg_2 = nn.Identity()
232 |         # 分类层
233 |         self.fc = nn.Linear(embd_dim, num_class)
234 | 
235 |     def _make_layer(self, block, planes, num_blocks, stride, base_width, scale):
236 |         strides = [stride] + [1] * (num_blocks - 1)
237 |         layers = []
238 |         for stride in strides:
239 |             layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale))
240 |             self.in_planes = planes * self.expansion
241 |         return nn.Sequential(*layers)
242 | 
243 |     def forward(self, x):
244 |         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
245 | 
246 |         x = x.unsqueeze_(1)
247 |         out = F.relu(self.bn1(self.conv1(x)))
248 |         out1 = self.layer1(out)
249 |         out2 = self.layer2(out1)
250 |         out1_downsample = self.layer1_downsample(out1)
251 |         fuse_out12 = self.fuse_mode12(out2, out1_downsample)
252 |         out3 = self.layer3(out2)
253 |         fuse_out12_downsample = self.layer2_downsample(fuse_out12)
254 |         fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
255 |         out4 = self.layer4(out3)
256 |         fuse_out123_downsample = self.layer3_downsample(fuse_out123)
257 |         fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
258 |         stats = self.pooling(fuse_out1234)
259 | 
260 |         embed_a = self.seg_1(stats)
261 |         if self.two_emb_layer:
262 |             out = F.relu(embed_a)
263 |             out = self.seg_bn_1(out)
264 |             embed_b = self.seg_2(out)
265 | 
266 |             out = self.fc(embed_b)
267 |             return out
268 |         else:
269 |             out = self.fc(embed_a)
270 |             return out
271 | 
272 | 
273 | class BasicBlockERes2NetV2(nn.Module):
274 | 
275 |     def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2):
276 |         super(BasicBlockERes2NetV2, self).__init__()
277 |         self.expansion = expansion
278 |         width = int(math.floor(planes * (base_width / 64.0)))
279 |         self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
280 |         self.bn1 = nn.BatchNorm2d(width * scale)
281 |         self.nums = scale
282 | 
283 |         convs = []
284 |         bns = []
285 |         for i in range(self.nums):
286 |             convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
287 |             bns.append(nn.BatchNorm2d(width))
288 |         self.convs = nn.ModuleList(convs)
289 |         self.bns = nn.ModuleList(bns)
290 |         self.relu = ReLU(inplace=True)
291 | 
292 |         self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
293 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
294 |         self.shortcut = nn.Sequential()
295 |         if stride != 1 or in_planes != self.expansion * planes:
296 |             self.shortcut = nn.Sequential(
297 |                 nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
298 |                 nn.BatchNorm2d(self.expansion * planes))
299 |         self.stride = stride
300 |         self.width = width
301 |         self.scale = scale
302 | 
303 |     def forward(self, x):
304 |         out = self.conv1(x)
305 |         out = self.bn1(out)
306 |         out = self.relu(out)
307 |         spx = torch.split(out, self.width, 1)
308 |         for i in range(self.nums):
309 |             if i == 0:
310 |                 sp = spx[i]
311 |             else:
312 |                 sp = sp + spx[i]
313 |             sp = self.convs[i](sp)
314 |             sp = self.relu(self.bns[i](sp))
315 |             if i == 0:
316 |                 out = sp
317 |             else:
318 |                 out = torch.cat((out, sp), 1)
319 |         out = self.conv3(out)
320 |         out = self.bn3(out)
321 | 
322 |         residual = self.shortcut(x)
323 |         out += residual
324 |         out = self.relu(out)
325 | 
326 |         return out
327 | 
328 | 
329 | class BasicBlockERes2NetV2_AFF(nn.Module):
330 | 
331 |     def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2):
332 |         super(BasicBlockERes2NetV2_AFF, self).__init__()
333 |         self.expansion = expansion
334 |         width = int(math.floor(planes * (base_width / 64.0)))
335 |         self.conv1 = nn.Conv2d(in_planes, width * scale, kernel_size=1, stride=stride, bias=False)
336 |         self.bn1 = nn.BatchNorm2d(width * scale)
337 |         self.nums = scale
338 | 
339 |         convs = []
340 |         fuse_models = []
341 |         bns = []
342 |         for i in range(self.nums):
343 |             convs.append(nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False))
344 |             bns.append(nn.BatchNorm2d(width))
345 |         for j in range(self.nums - 1):
346 |             fuse_models.append(AFF(channels=width, r=4))
347 | 
348 |         self.convs = nn.ModuleList(convs)
349 |         self.bns = nn.ModuleList(bns)
350 |         self.fuse_models = nn.ModuleList(fuse_models)
351 |         self.relu = ReLU(inplace=True)
352 | 
353 |         self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
354 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
355 |         self.shortcut = nn.Sequential()
356 |         if stride != 1 or in_planes != self.expansion * planes:
357 |             self.shortcut = nn.Sequential(
358 |                 nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
359 |                 nn.BatchNorm2d(self.expansion * planes))
360 |         self.stride = stride
361 |         self.width = width
362 |         self.scale = scale
363 | 
364 |     def forward(self, x):
365 |         out = self.conv1(x)
366 |         out = self.bn1(out)
367 |         out = self.relu(out)
368 |         spx = torch.split(out, self.width, 1)
369 |         for i in range(self.nums):
370 |             if i == 0:
371 |                 sp = spx[i]
372 |             else:
373 |                 sp = self.fuse_models[i - 1](sp, spx[i])
374 |             sp = self.convs[i](sp)
375 |             sp = self.relu(self.bns[i](sp))
376 |             if i == 0:
377 |                 out = sp
378 |             else:
379 |                 out = torch.cat((out, sp), 1)
380 |         out = self.conv3(out)
381 |         out = self.bn3(out)
382 | 
383 |         residual = self.shortcut(x)
384 |         out += residual
385 |         out = self.relu(out)
386 | 
387 |         return out
388 | 
389 | 
390 | class ERes2NetV2(nn.Module):
391 |     def __init__(self,
392 |                  num_class,
393 |                  input_size,
394 |                  block=BasicBlockERes2NetV2,
395 |                  block_fuse=BasicBlockERes2NetV2_AFF,
396 |                  num_blocks=[3, 4, 6, 3],
397 |                  m_channels=32,
398 |                  expansion=2,
399 |                  base_width=26,
400 |                  scale=2,
401 |                  embd_dim=192,
402 |                  pooling_type='TSTP',
403 |                  two_emb_layer=False):
404 |         super(ERes2NetV2, self).__init__()
405 |         self.in_planes = m_channels
406 |         self.expansion = expansion
407 |         self.embd_dim = embd_dim
408 |         self.stats_dim = int(input_size / 8) * m_channels * 8
409 |         self.two_emb_layer = two_emb_layer
410 | 
411 |         self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
412 |         self.bn1 = nn.BatchNorm2d(m_channels)
413 |         self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1,
414 |                                        base_width=base_width, scale=scale)
415 |         self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2,
416 |                                        base_width=base_width, scale=scale)
417 |         self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2,
418 |                                        base_width=base_width, scale=scale)
419 |         self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2,
420 |                                        base_width=base_width, scale=scale)
421 | 
422 |         # Downsampling module
423 |         self.layer3_ds = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
424 | 
425 |         # Bottom-up fusion module
426 |         self.fuse34 = AFF(channels=m_channels * 16, r=4)
427 | 
428 |         self.n_stats = 2
429 |         self.pooling = TemporalStatsPool()
430 | 
431 |         self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim)
432 |         if self.two_emb_layer:
433 |             self.seg_bn_1 = nn.BatchNorm1d(embd_dim, affine=False)
434 |             self.seg_2 = nn.Linear(embd_dim, embd_dim)
435 |         else:
436 |             self.seg_bn_1 = nn.Identity()
437 |             self.seg_2 = nn.Identity()
438 |         # 分类层
439 |         self.fc = nn.Linear(embd_dim, num_class)
440 | 
441 |     def _make_layer(self, block, planes, num_blocks, stride, base_width, scale):
442 |         strides = [stride] + [1] * (num_blocks - 1)
443 |         layers = []
444 |         for stride in strides:
445 |             layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale))
446 |             self.in_planes = planes * self.expansion
447 |         return nn.Sequential(*layers)
448 | 
449 |     def forward(self, x):
450 |         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
451 |         x = x.unsqueeze_(1)
452 |         out = F.relu(self.bn1(self.conv1(x)))
453 |         out1 = self.layer1(out)
454 |         out2 = self.layer2(out1)
455 |         out3 = self.layer3(out2)
456 |         out4 = self.layer4(out3)
457 |         out3_ds = self.layer3_ds(out3)
458 |         fuse_out34 = self.fuse34(out4, out3_ds)
459 |         stats = self.pooling(fuse_out34)
460 | 
461 |         embed_a = self.seg_1(stats)
462 |         if self.two_emb_layer:
463 |             out = F.relu(embed_a)
464 |             out = self.seg_bn_1(out)
465 |             embed_b = self.seg_2(out)
466 |             out = self.fc(embed_b)
467 |             return out
468 |         else:
469 |             out = self.fc(embed_a)
470 |             return out
471 | 


--------------------------------------------------------------------------------
/macls/models/panns.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.nn.functional as F
  3 | 
  4 | 
  5 | class ConvBlock(nn.Module):
  6 |     def __init__(self, in_channels, out_channels):
  7 |         super(ConvBlock, self).__init__()
  8 | 
  9 |         self.conv1 = nn.Conv2d(in_channels=in_channels,
 10 |                                out_channels=out_channels,
 11 |                                kernel_size=(3, 3),
 12 |                                stride=(1, 1),
 13 |                                padding=(1, 1))
 14 |         self.conv2 = nn.Conv2d(in_channels=out_channels,
 15 |                                out_channels=out_channels,
 16 |                                kernel_size=(3, 3),
 17 |                                stride=(1, 1),
 18 |                                padding=(1, 1))
 19 |         self.bn1 = nn.BatchNorm2d(out_channels)
 20 |         self.bn2 = nn.BatchNorm2d(out_channels)
 21 | 
 22 |     def forward(self, x, pool_size=(2, 2), pool_type='avg'):
 23 |         x = self.conv1(x)
 24 |         x = self.bn1(x)
 25 |         x = F.relu(x)
 26 | 
 27 |         x = self.conv2(x)
 28 |         x = self.bn2(x)
 29 |         x = F.relu(x)
 30 | 
 31 |         if pool_type == 'max':
 32 |             x = F.max_pool2d(x, kernel_size=pool_size)
 33 |         elif pool_type == 'avg':
 34 |             x = F.avg_pool2d(x, kernel_size=pool_size)
 35 |         elif pool_type == 'avg+max':
 36 |             x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size)
 37 |         else:
 38 |             raise Exception(
 39 |                 f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".')
 40 |         return x
 41 | 
 42 | 
 43 | class ConvBlock5x5(nn.Module):
 44 |     def __init__(self, in_channels, out_channels):
 45 |         super(ConvBlock5x5, self).__init__()
 46 | 
 47 |         self.conv1 = nn.Conv2d(in_channels=in_channels,
 48 |                                out_channels=out_channels,
 49 |                                kernel_size=(5, 5),
 50 |                                stride=(1, 1),
 51 |                                padding=(2, 2))
 52 |         self.bn1 = nn.BatchNorm2d(out_channels)
 53 | 
 54 |     def forward(self, x, pool_size=(2, 2), pool_type='avg'):
 55 |         x = self.conv1(x)
 56 |         x = self.bn1(x)
 57 |         x = F.relu(x)
 58 | 
 59 |         if pool_type == 'max':
 60 |             x = F.max_pool2d(x, kernel_size=pool_size)
 61 |         elif pool_type == 'avg':
 62 |             x = F.avg_pool2d(x, kernel_size=pool_size)
 63 |         elif pool_type == 'avg+max':
 64 |             x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size)
 65 |         else:
 66 |             raise Exception(
 67 |                 f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".')
 68 |         return x
 69 | 
 70 | 
 71 | class PANNS_CNN6(nn.Module):
 72 |     """
 73 |     The CNN14(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional
 74 |     block consists of 1 convolutional layers with a kernel size of 5 × 5.
 75 | 
 76 |     Reference:
 77 |         PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
 78 |         https://arxiv.org/pdf/1912.10211.pdf
 79 |     """
 80 |     emb_size = 512
 81 | 
 82 |     def __init__(self, num_class, input_size, dropout=0.1, extract_embedding: bool = True):
 83 | 
 84 |         super(PANNS_CNN6, self).__init__()
 85 |         self.bn0 = nn.BatchNorm2d(input_size)
 86 |         self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
 87 |         self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
 88 |         self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
 89 |         self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)
 90 | 
 91 |         self.fc1 = nn.Linear(512, self.emb_size)
 92 |         self.fc_audioset = nn.Linear(self.emb_size, 527)
 93 |         self.extract_embedding = extract_embedding
 94 | 
 95 |         self.dropout = nn.Dropout(dropout)
 96 |         self.fc = nn.Linear(self.emb_size, num_class)
 97 | 
 98 |     def forward(self, x):
 99 |         x = x.unsqueeze(1)
100 |         x = x.permute([0, 3, 2, 1])
101 |         x = self.bn0(x)
102 |         x = x.permute([0, 3, 2, 1])
103 | 
104 |         x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
105 |         x = F.dropout(x, p=0.2, training=self.training)
106 | 
107 |         x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
108 |         x = F.dropout(x, p=0.2, training=self.training)
109 | 
110 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
111 |         x = F.dropout(x, p=0.2, training=self.training)
112 | 
113 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
114 |         x = F.dropout(x, p=0.2, training=self.training)
115 | 
116 |         x = x.mean(dim=3)
117 |         x = x.max(dim=2)[0] + x.mean(dim=2)
118 | 
119 |         x = F.dropout(x, p=0.5, training=self.training)
120 |         x = F.relu(self.fc1(x))
121 | 
122 |         if self.extract_embedding:
123 |             output = F.dropout(x, p=0.5, training=self.training)
124 |         else:
125 |             output = F.sigmoid(self.fc_audioset(x))
126 | 
127 |         x = self.dropout(output)
128 |         logits = self.fc(x)
129 | 
130 |         return logits
131 | 
132 | 
133 | class PANNS_CNN10(nn.Module):
134 |     """
135 |     The CNN10(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional
136 |     block consists of 2 convolutional layers with a kernel size of 3 × 3.
137 | 
138 |     Reference:
139 |         PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
140 |         https://arxiv.org/pdf/1912.10211.pdf
141 |     """
142 |     emb_size = 512
143 | 
144 |     def __init__(self, num_class, input_size, dropout=0.1, extract_embedding: bool = True):
145 | 
146 |         super(PANNS_CNN10, self).__init__()
147 |         self.bn0 = nn.BatchNorm2d(input_size)
148 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
149 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
150 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
151 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
152 | 
153 |         self.fc1 = nn.Linear(512, self.emb_size)
154 |         self.fc_audioset = nn.Linear(self.emb_size, 527)
155 |         self.extract_embedding = extract_embedding
156 | 
157 |         self.dropout = nn.Dropout(dropout)
158 |         self.fc = nn.Linear(self.emb_size, num_class)
159 | 
160 |     def forward(self, x):
161 |         x = x.unsqueeze(1)
162 |         x = x.permute([0, 3, 2, 1])
163 |         x = self.bn0(x)
164 |         x = x.permute([0, 3, 2, 1])
165 | 
166 |         x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
167 |         x = F.dropout(x, p=0.2, training=self.training)
168 | 
169 |         x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
170 |         x = F.dropout(x, p=0.2, training=self.training)
171 | 
172 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
173 |         x = F.dropout(x, p=0.2, training=self.training)
174 | 
175 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
176 |         x = F.dropout(x, p=0.2, training=self.training)
177 | 
178 |         x = x.mean(dim=3)
179 |         x = x.max(dim=2)[0] + x.mean(dim=2)
180 | 
181 |         x = F.dropout(x, p=0.5, training=self.training)
182 |         x = F.relu(self.fc1(x))
183 | 
184 |         if self.extract_embedding:
185 |             output = F.dropout(x, p=0.5, training=self.training)
186 |         else:
187 |             output = F.sigmoid(self.fc_audioset(x))
188 | 
189 |         x = self.dropout(output)
190 |         logits = self.fc(x)
191 | 
192 |         return logits
193 | 
194 | 
195 | class PANNS_CNN14(nn.Module):
196 |     """
197 |     The CNN14(14-layer CNNs) mainly consist of 6 convolutional blocks while each convolutional
198 |     block consists of 2 convolutional layers with a kernel size of 3 × 3.
199 | 
200 |     Reference:
201 |         PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
202 |         https://arxiv.org/pdf/1912.10211.pdf
203 |     """
204 |     emb_size = 2048
205 | 
206 |     def __init__(self, num_class, input_size, dropout=0.1, extract_embedding: bool = True):
207 | 
208 |         super(PANNS_CNN14, self).__init__()
209 |         self.bn0 = nn.BatchNorm2d(input_size)
210 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
211 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
212 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
213 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
214 |         self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
215 |         self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
216 | 
217 |         self.fc1 = nn.Linear(2048, self.emb_size)
218 |         self.fc_audioset = nn.Linear(self.emb_size, 527)
219 |         self.extract_embedding = extract_embedding
220 | 
221 |         self.dropout = nn.Dropout(dropout)
222 |         self.fc = nn.Linear(self.emb_size, num_class)
223 | 
224 |     def forward(self, x):
225 |         x = x.unsqueeze(1)
226 |         x = x.permute([0, 3, 2, 1])
227 |         x = self.bn0(x)
228 |         x = x.permute([0, 3, 2, 1])
229 | 
230 |         x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
231 |         x = F.dropout(x, p=0.2, training=self.training)
232 | 
233 |         x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
234 |         x = F.dropout(x, p=0.2, training=self.training)
235 | 
236 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
237 |         x = F.dropout(x, p=0.2, training=self.training)
238 | 
239 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
240 |         x = F.dropout(x, p=0.2, training=self.training)
241 | 
242 |         x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
243 |         x = F.dropout(x, p=0.2, training=self.training)
244 | 
245 |         x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
246 |         x = F.dropout(x, p=0.2, training=self.training)
247 | 
248 |         x = x.mean(dim=3)
249 |         x = x.max(dim=2)[0] + x.mean(dim=2)
250 | 
251 |         x = F.dropout(x, p=0.5, training=self.training)
252 |         x = F.relu(self.fc1(x))
253 | 
254 |         if self.extract_embedding:
255 |             output = F.dropout(x, p=0.5, training=self.training)
256 |         else:
257 |             output = F.sigmoid(self.fc_audioset(x))
258 | 
259 |         x = self.dropout(output)
260 |         logits = self.fc(x)
261 | 
262 |         return logits
263 | 


--------------------------------------------------------------------------------
/macls/models/pooling.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | from macls.models.utils import TDNNBlock, Conv1d, length_to_mask
  6 | 
  7 | 
  8 | class TemporalAveragePooling(nn.Module):
  9 |     def __init__(self):
 10 |         """TAP
 11 |         Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification
 12 |         Link: https://arxiv.org/pdf/1903.12058.pdf
 13 |         """
 14 |         super(TemporalAveragePooling, self).__init__()
 15 | 
 16 |     def forward(self, x):
 17 |         """Computes Temporal Average Pooling Module
 18 |         Args:
 19 |             x (torch.Tensor): Input tensor (#batch, channels, frames).
 20 |         Returns:
 21 |             torch.Tensor: Output tensor (#batch, channels)
 22 |         """
 23 |         x = x.mean(dim=-1)
 24 |         # To be compatable with 2D input
 25 |         x = x.flatten(start_dim=1)
 26 |         return x
 27 | 
 28 | 
 29 | class TemporalStatisticsPooling(nn.Module):
 30 |     def __init__(self):
 31 |         """TSP
 32 |         Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition
 33 |         Link： http://www.danielpovey.com/files/2018_icassp_xvectors.pdf
 34 |         """
 35 |         super(TemporalStatisticsPooling, self).__init__()
 36 | 
 37 |     def forward(self, x):
 38 |         """Computes Temporal Statistics Pooling Module
 39 |         Args:
 40 |             x (torch.Tensor): Input tensor (#batch, channels, frames).
 41 |         Returns:
 42 |             torch.Tensor: Output tensor (#batch, channels*2)
 43 |         """
 44 |         mean = torch.mean(x, dim=2)
 45 |         var = torch.var(x, dim=2)
 46 |         x = torch.cat((mean, var), dim=1)
 47 |         return x
 48 | 
 49 | 
 50 | class SelfAttentivePooling(nn.Module):
 51 |     """SAP"""
 52 | 
 53 |     def __init__(self, in_dim, bottleneck_dim=128):
 54 |         # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
 55 |         # attention dim = 128
 56 |         super(SelfAttentivePooling, self).__init__()
 57 |         self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
 58 |         self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper
 59 | 
 60 |     def forward(self, x):
 61 |         # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
 62 |         alpha = torch.tanh(self.linear1(x))
 63 |         alpha = torch.softmax(self.linear2(alpha), dim=2)
 64 |         mean = torch.sum(alpha * x, dim=2)
 65 |         return mean
 66 | 
 67 | 
 68 | class AttentiveStatisticsPooling(nn.Module):
 69 |     """ASP
 70 |     This class implements an attentive statistic pooling layer for each channel.
 71 |     It returns the concatenated mean and std of the input tensor.
 72 |     """
 73 | 
 74 |     def __init__(self, channels, attention_channels=128, global_context=True):
 75 |         super().__init__()
 76 | 
 77 |         self.eps = 1e-12
 78 |         self.global_context = global_context
 79 |         if global_context:
 80 |             self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
 81 |         else:
 82 |             self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
 83 |         self.tanh = nn.Tanh()
 84 |         self.conv = Conv1d(in_channels=attention_channels, out_channels=channels, kernel_size=1)
 85 | 
 86 |     def forward(self, x, lengths=None):
 87 |         """Calculates mean and std for a batch (input tensor).
 88 |         """
 89 |         L = x.shape[-1]
 90 | 
 91 |         def _compute_statistics(x, m, dim=2, eps=self.eps):
 92 |             mean = (m * x).sum(dim)
 93 |             std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
 94 |             return mean, std
 95 | 
 96 |         if lengths is None:
 97 |             lengths = torch.ones(x.shape[0], device=x.device)
 98 | 
 99 |         # Make binary mask of shape [N, 1, L]
100 |         mask = length_to_mask(lengths * L, max_len=L, device=x.device)
101 |         mask = mask.unsqueeze(1)
102 | 
103 |         # Expand the temporal context of the pooling layer by allowing the
104 |         # self-attention to look at global properties of the utterance.
105 |         if self.global_context:
106 |             # torch.std is unstable for backward computation
107 |             # https://github.com/pytorch/pytorch/issues/4320
108 |             total = mask.sum(dim=2, keepdim=True).float()
109 |             mean, std = _compute_statistics(x, mask / total)
110 |             mean = mean.unsqueeze(2).repeat(1, 1, L)
111 |             std = std.unsqueeze(2).repeat(1, 1, L)
112 |             attn = torch.cat([x, mean, std], dim=1)
113 |         else:
114 |             attn = x
115 | 
116 |         # Apply layers
117 |         attn = self.conv(self.tanh(self.tdnn(attn)))
118 | 
119 |         # Filter out zero-paddings
120 |         attn = attn.masked_fill(mask == 0, float("-inf"))
121 | 
122 |         attn = F.softmax(attn, dim=2)
123 |         mean, std = _compute_statistics(x, attn)
124 |         # Append mean and std of the batch
125 |         pooled_stats = torch.cat((mean, std), dim=1)
126 | 
127 |         return pooled_stats
128 | 
129 | 
130 | class TemporalStatsPool(nn.Module):
131 |     """TSTP
132 |     Temporal statistics pooling, concatenate mean and std, which is used in
133 |     x-vector
134 |     Comment: simple concatenation can not make full use of both statistics
135 |     """
136 | 
137 |     def __init__(self):
138 |         super(TemporalStatsPool, self).__init__()
139 | 
140 |     def forward(self, x):
141 |         # The last dimension is the temporal axis
142 |         pooling_mean = x.mean(dim=-1)
143 |         pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
144 |         pooling_mean = pooling_mean.flatten(start_dim=1)
145 |         pooling_std = pooling_std.flatten(start_dim=1)
146 | 
147 |         stats = torch.cat((pooling_mean, pooling_std), 1)
148 |         return stats
149 | 


--------------------------------------------------------------------------------
/macls/models/res2net.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | 
  6 | from macls.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling
  7 | from macls.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling
  8 | 
  9 | 
 10 | class Bottle2neck(nn.Module):
 11 |     expansion = 4
 12 | 
 13 |     def __init__(self, inplanes, planes, stride=1, downsample=None, baseWidth=26, scale=4, stype='normal'):
 14 |         """ Constructor
 15 |         Args:
 16 |             inplanes: input channel dimensionality
 17 |             planes: output channel dimensionality
 18 |             stride: conv stride. Replaces pooling layer.
 19 |             downsample: None when stride = 1
 20 |             baseWidth: basic width of conv3x3
 21 |             scale: number of scale.
 22 |             type: 'normal': normal set. 'stage': first block of a new stage.
 23 |         """
 24 |         super(Bottle2neck, self).__init__()
 25 | 
 26 |         width = int(math.floor(planes * (baseWidth / 64.0)))
 27 |         self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False)
 28 |         self.bn1 = nn.BatchNorm2d(width * scale)
 29 | 
 30 |         if scale == 1:
 31 |             self.nums = 1
 32 |         else:
 33 |             self.nums = scale - 1
 34 |         if stype == 'stage':
 35 |             self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)
 36 |         convs = []
 37 |         bns = []
 38 |         for i in range(self.nums):
 39 |             convs.append(nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1, bias=False))
 40 |             bns.append(nn.BatchNorm2d(width))
 41 |         self.convs = nn.ModuleList(convs)
 42 |         self.bns = nn.ModuleList(bns)
 43 | 
 44 |         self.conv3 = nn.Conv2d(width * scale, planes * self.expansion, kernel_size=1, bias=False)
 45 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
 46 | 
 47 |         self.relu = nn.ReLU(inplace=True)
 48 |         self.downsample = downsample
 49 |         self.stype = stype
 50 |         self.scale = scale
 51 |         self.width = width
 52 | 
 53 |     def forward(self, x):
 54 |         residual = x
 55 | 
 56 |         out = self.conv1(x)
 57 |         out = self.bn1(out)
 58 |         out = self.relu(out)
 59 | 
 60 |         spx = torch.split(out, self.width, 1)
 61 |         for i in range(self.nums):
 62 |             if i == 0 or self.stype == 'stage':
 63 |                 sp = spx[i]
 64 |             else:
 65 |                 sp = sp + spx[i]
 66 |             sp = self.convs[i](sp)
 67 |             sp = self.relu(self.bns[i](sp))
 68 |             if i == 0:
 69 |                 out = sp
 70 |             else:
 71 |                 out = torch.cat((out, sp), 1)
 72 |         if self.scale != 1 and self.stype == 'normal':
 73 |             out = torch.cat((out, spx[self.nums]), 1)
 74 |         elif self.scale != 1 and self.stype == 'stage':
 75 |             out = torch.cat((out, self.pool(spx[self.nums])), 1)
 76 | 
 77 |         out = self.conv3(out)
 78 |         out = self.bn3(out)
 79 | 
 80 |         if self.downsample is not None:
 81 |             residual = self.downsample(x)
 82 | 
 83 |         out += residual
 84 |         out = self.relu(out)
 85 | 
 86 |         return out
 87 | 
 88 | 
 89 | class Res2Net(nn.Module):
 90 | 
 91 |     def __init__(self, num_class, input_size, m_channels=32, layers=[3, 4, 6, 3], base_width=32, scale=2, embd_dim=192,
 92 |                  pooling_type="ASP"):
 93 |         super(Res2Net, self).__init__()
 94 |         self.inplanes = m_channels
 95 |         self.base_width = base_width
 96 |         self.scale = scale
 97 |         self.embd_dim = embd_dim
 98 |         self.conv1 = nn.Conv2d(1, m_channels, kernel_size=7, stride=3, padding=1, bias=False)
 99 |         self.bn1 = nn.BatchNorm2d(m_channels)
100 |         self.relu = nn.ReLU(inplace=True)
101 |         self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
102 |         self.layer1 = self._make_layer(Bottle2neck, m_channels, layers[0])
103 |         self.layer2 = self._make_layer(Bottle2neck, m_channels * 2, layers[1], stride=2)
104 |         self.layer3 = self._make_layer(Bottle2neck, m_channels * 4, layers[2], stride=2)
105 |         self.layer4 = self._make_layer(Bottle2neck, m_channels * 8, layers[3], stride=2)
106 | 
107 |         if input_size < 96:
108 |             cat_channels = m_channels * 8 * Bottle2neck.expansion * (input_size // self.base_width)
109 |         else:
110 |             cat_channels = m_channels * 8 * Bottle2neck.expansion * (
111 |                         input_size // self.base_width - int(math.sqrt(input_size / 64)))
112 |         if pooling_type == "ASP":
113 |             self.pooling = AttentiveStatisticsPooling(cat_channels, 128)
114 |             self.bn2 = nn.BatchNorm1d(cat_channels * 2)
115 |             self.linear = nn.Linear(cat_channels * 2, embd_dim)
116 |             self.bn3 = nn.BatchNorm1d(embd_dim)
117 |         elif pooling_type == "SAP":
118 |             self.pooling = SelfAttentivePooling(cat_channels, 128)
119 |             self.bn2 = nn.BatchNorm1d(cat_channels)
120 |             self.linear = nn.Linear(cat_channels, embd_dim)
121 |             self.bn3 = nn.BatchNorm1d(embd_dim)
122 |         elif pooling_type == "TAP":
123 |             self.pooling = TemporalAveragePooling()
124 |             self.bn2 = nn.BatchNorm1d(cat_channels)
125 |             self.linear = nn.Linear(cat_channels, embd_dim)
126 |             self.bn3 = nn.BatchNorm1d(embd_dim)
127 |         elif pooling_type == "TSP":
128 |             self.pooling = TemporalStatisticsPooling()
129 |             self.bn2 = nn.BatchNorm1d(cat_channels * 2)
130 |             self.linear = nn.Linear(cat_channels * 2, embd_dim)
131 |             self.bn3 = nn.BatchNorm1d(embd_dim)
132 |         else:
133 |             raise Exception(f'没有{pooling_type}池化层！')
134 | 
135 |         self.fc = nn.Linear(embd_dim, num_class)
136 | 
137 |         for m in self.modules():
138 |             if isinstance(m, nn.Conv2d):
139 |                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
140 |             elif isinstance(m, nn.BatchNorm2d):
141 |                 nn.init.constant_(m.weight, 1)
142 |                 nn.init.constant_(m.bias, 0)
143 | 
144 |     def _make_layer(self, block, planes, blocks, stride=1):
145 |         downsample = None
146 |         if stride != 1 or self.inplanes != planes * block.expansion:
147 |             downsample = nn.Sequential(
148 |                 nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
149 |                 nn.BatchNorm2d(planes * block.expansion),
150 |             )
151 | 
152 |         layers = [block(self.inplanes, planes, stride, downsample=downsample,
153 |                         stype='stage', baseWidth=self.base_width, scale=self.scale)]
154 |         self.inplanes = planes * block.expansion
155 |         for i in range(1, blocks):
156 |             layers.append(block(self.inplanes, planes, baseWidth=self.base_width, scale=self.scale))
157 | 
158 |         return nn.Sequential(*layers)
159 | 
160 |     def forward(self, x):
161 |         x = x.transpose(2, 1)
162 |         x = x.unsqueeze(1)
163 |         x = self.conv1(x)
164 |         x = self.bn1(x)
165 |         x = self.relu(x)
166 |         x = self.max_pool(x)
167 | 
168 |         x = self.layer1(x)
169 |         x = self.layer2(x)
170 |         x = self.layer3(x)
171 |         x = self.layer4(x)
172 | 
173 |         x = x.reshape(x.shape[0], -1, x.shape[-1])
174 | 
175 |         x = self.pooling(x)
176 |         x = self.bn2(x)
177 |         x = self.linear(x)
178 |         x = self.bn3(x)
179 | 
180 |         out = self.fc(x)
181 |         return out
182 | 


--------------------------------------------------------------------------------
/macls/models/resnet_se.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | 
  3 | from macls.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling
  4 | from macls.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling
  5 | 
  6 | 
  7 | class SEBottleneck(nn.Module):
  8 |     expansion = 2
  9 | 
 10 |     def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
 11 |         super(SEBottleneck, self).__init__()
 12 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
 13 |         self.bn1 = nn.BatchNorm2d(planes)
 14 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
 15 |         self.bn2 = nn.BatchNorm2d(planes)
 16 |         self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
 17 |         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
 18 |         self.relu = nn.ReLU(inplace=True)
 19 |         self.se = SELayer(planes * self.expansion, reduction)
 20 |         self.downsample = downsample
 21 |         self.stride = stride
 22 | 
 23 |     def forward(self, x):
 24 |         residual = x
 25 | 
 26 |         out = self.conv1(x)
 27 |         out = self.bn1(out)
 28 |         out = self.relu(out)
 29 | 
 30 |         out = self.conv2(out)
 31 |         out = self.bn2(out)
 32 |         out = self.relu(out)
 33 | 
 34 |         out = self.conv3(out)
 35 |         out = self.bn3(out)
 36 |         out = self.se(out)
 37 | 
 38 |         if self.downsample is not None:
 39 |             residual = self.downsample(x)
 40 | 
 41 |         out += residual
 42 |         out = self.relu(out)
 43 | 
 44 |         return out
 45 | 
 46 | 
 47 | class SELayer(nn.Module):
 48 |     def __init__(self, channel, reduction=8):
 49 |         super(SELayer, self).__init__()
 50 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
 51 |         self.fc = nn.Sequential(
 52 |             nn.Linear(channel, channel // reduction),
 53 |             nn.ReLU(inplace=True),
 54 |             nn.Linear(channel // reduction, channel),
 55 |             nn.Sigmoid()
 56 |         )
 57 | 
 58 |     def forward(self, x):
 59 |         b, c, _, _ = x.size()
 60 |         y = self.avg_pool(x).view(b, c)
 61 |         y = self.fc(y).view(b, c, 1, 1)
 62 |         return x * y
 63 | 
 64 | 
 65 | class ResNetSE(nn.Module):
 66 |     def __init__(self, num_class, input_size, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], embd_dim=192,
 67 |                  pooling_type="ASP"):
 68 |         super(ResNetSE, self).__init__()
 69 |         self.inplanes = num_filters[0]
 70 |         self.emb_size = embd_dim
 71 |         self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=(1, 1), padding=1, bias=False)
 72 |         self.bn1 = nn.BatchNorm2d(num_filters[0])
 73 |         self.relu = nn.ReLU(inplace=True)
 74 | 
 75 |         self.layer1 = self._make_layer(SEBottleneck, num_filters[0], layers[0])
 76 |         self.layer2 = self._make_layer(SEBottleneck, num_filters[1], layers[1], stride=(2, 2))
 77 |         self.layer3 = self._make_layer(SEBottleneck, num_filters[2], layers[2], stride=(2, 2))
 78 |         self.layer4 = self._make_layer(SEBottleneck, num_filters[3], layers[3], stride=(2, 2))
 79 | 
 80 |         cat_channels = num_filters[3] * SEBottleneck.expansion * (input_size // 8)
 81 |         if pooling_type == "ASP":
 82 |             self.pooling = AttentiveStatisticsPooling(cat_channels, 128)
 83 |             self.bn2 = nn.BatchNorm1d(cat_channels * 2)
 84 |             self.linear = nn.Linear(cat_channels * 2, embd_dim)
 85 |             self.bn3 = nn.BatchNorm1d(embd_dim)
 86 |         elif pooling_type == "SAP":
 87 |             self.pooling = SelfAttentivePooling(cat_channels, 128)
 88 |             self.bn2 = nn.BatchNorm1d(cat_channels)
 89 |             self.linear = nn.Linear(cat_channels, embd_dim)
 90 |             self.bn3 = nn.BatchNorm1d(embd_dim)
 91 |         elif pooling_type == "TAP":
 92 |             self.pooling = TemporalAveragePooling()
 93 |             self.bn2 = nn.BatchNorm1d(cat_channels)
 94 |             self.linear = nn.Linear(cat_channels, embd_dim)
 95 |             self.bn3 = nn.BatchNorm1d(embd_dim)
 96 |         elif pooling_type == "TSP":
 97 |             self.pooling = TemporalStatisticsPooling()
 98 |             self.bn2 = nn.BatchNorm1d(cat_channels * 2)
 99 |             self.linear = nn.Linear(cat_channels * 2, embd_dim)
100 |             self.bn3 = nn.BatchNorm1d(embd_dim)
101 |         else:
102 |             raise Exception(f'没有{pooling_type}池化层！')
103 | 
104 |         self.fc = nn.Linear(embd_dim, num_class)
105 | 
106 |         for m in self.modules():
107 |             if isinstance(m, nn.Conv2d):
108 |                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
109 |             elif isinstance(m, nn.BatchNorm2d):
110 |                 nn.init.constant_(m.weight, 1)
111 |                 nn.init.constant_(m.bias, 0)
112 | 
113 |     def _make_layer(self, block, planes, blocks, stride=1):
114 |         downsample = None
115 |         if stride != 1 or self.inplanes != planes * block.expansion:
116 |             downsample = nn.Sequential(
117 |                 nn.Conv2d(self.inplanes, planes * block.expansion,
118 |                           kernel_size=1, stride=stride, bias=False),
119 |                 nn.BatchNorm2d(planes * block.expansion),
120 |             )
121 | 
122 |         layers = [block(self.inplanes, planes, stride, downsample)]
123 |         self.inplanes = planes * block.expansion
124 |         for i in range(1, blocks):
125 |             layers.append(block(self.inplanes, planes))
126 | 
127 |         return nn.Sequential(*layers)
128 | 
129 |     def forward(self, x):
130 |         x = x.transpose(2, 1)
131 |         x = x.unsqueeze(1)
132 |         x = self.conv1(x)
133 |         x = self.bn1(x)
134 |         x = self.relu(x)
135 | 
136 |         x = self.layer1(x)
137 |         x = self.layer2(x)
138 |         x = self.layer3(x)
139 |         x = self.layer4(x)
140 | 
141 |         x = x.reshape(x.shape[0], -1, x.shape[-1])
142 | 
143 |         x = self.pooling(x)
144 |         x = self.bn2(x)
145 |         x = self.linear(x)
146 |         x = self.bn3(x)
147 |         out = self.fc(x)
148 |         return out
149 | 


--------------------------------------------------------------------------------
/macls/models/tdnn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | from macls.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling
 6 | from macls.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling
 7 | 
 8 | 
 9 | class TDNN(nn.Module):
10 |     def __init__(self, num_class, input_size, channels=512, embd_dim=192, pooling_type="ASP"):
11 |         super(TDNN, self).__init__()
12 |         self.emb_size = embd_dim
13 |         self.td_layer1 = torch.nn.Conv1d(in_channels=input_size, out_channels=channels, dilation=1, kernel_size=5, stride=1)
14 |         self.bn1 = nn.BatchNorm1d(channels)
15 |         self.td_layer2 = torch.nn.Conv1d(in_channels=channels, out_channels=channels, dilation=2, kernel_size=3, stride=1)
16 |         self.bn2 = nn.BatchNorm1d(channels)
17 |         self.td_layer3 = torch.nn.Conv1d(in_channels=channels, out_channels=channels, dilation=3, kernel_size=3, stride=1)
18 |         self.bn3 = nn.BatchNorm1d(channels)
19 |         self.td_layer4 = torch.nn.Conv1d(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1)
20 |         self.bn4 = nn.BatchNorm1d(channels)
21 |         self.td_layer5 = torch.nn.Conv1d(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1)
22 | 
23 |         if pooling_type == "ASP":
24 |             self.pooling = AttentiveStatisticsPooling(channels, 128)
25 |             self.bn5 = nn.BatchNorm1d(channels * 2)
26 |             self.linear = nn.Linear(channels * 2, embd_dim)
27 |             self.bn6 = nn.BatchNorm1d(embd_dim)
28 |         elif pooling_type == "SAP":
29 |             self.pooling = SelfAttentivePooling(channels, 128)
30 |             self.bn5 = nn.BatchNorm1d(channels)
31 |             self.linear = nn.Linear(channels, embd_dim)
32 |             self.bn6 = nn.BatchNorm1d(embd_dim)
33 |         elif pooling_type == "TAP":
34 |             self.pooling = TemporalAveragePooling()
35 |             self.bn5 = nn.BatchNorm1d(channels)
36 |             self.linear = nn.Linear(channels, embd_dim)
37 |             self.bn6 = nn.BatchNorm1d(embd_dim)
38 |         elif pooling_type == "TSP":
39 |             self.pooling = TemporalStatisticsPooling()
40 |             self.bn5 = nn.BatchNorm1d(channels * 2)
41 |             self.linear = nn.Linear(channels * 2, embd_dim)
42 |             self.bn6 = nn.BatchNorm1d(embd_dim)
43 |         else:
44 |             raise Exception(f'没有{pooling_type}池化层！')
45 | 
46 |         self.fc = nn.Linear(embd_dim, num_class)
47 | 
48 |     def forward(self, x):
49 |         """
50 |         Compute embeddings.
51 | 
52 |         Args:
53 |             x (torch.Tensor): Input data with shape (N, time, freq).
54 | 
55 |         Returns:
56 |             torch.Tensor: Output embeddings with shape (N, self.emb_size, 1)
57 |         """
58 |         x = x.transpose(2, 1)
59 |         x = F.relu(self.td_layer1(x))
60 |         x = self.bn1(x)
61 |         x = F.relu(self.td_layer2(x))
62 |         x = self.bn2(x)
63 |         x = F.relu(self.td_layer3(x))
64 |         x = self.bn3(x)
65 |         x = F.relu(self.td_layer4(x))
66 |         x = self.bn4(x)
67 |         x = F.relu(self.td_layer5(x))
68 |         out = self.bn5(self.pooling(x))
69 |         out = self.bn6(self.linear(out))
70 |         out = self.fc(out)
71 |         return out
72 | 


--------------------------------------------------------------------------------
/macls/models/utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import math
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.nn.functional as F
  7 | 
  8 | 
  9 | def length_to_mask(length, max_len=None, dtype=None, device=None):
 10 |     assert len(length.shape) == 1
 11 | 
 12 |     if max_len is None:
 13 |         max_len = length.max().long().item()
 14 |     mask = torch.arange(
 15 |         max_len, device=length.device, dtype=length.dtype).expand(
 16 |         len(length), max_len) < length.unsqueeze(1)
 17 | 
 18 |     if dtype is None:
 19 |         dtype = length.dtype
 20 | 
 21 |     if device is None:
 22 |         device = length.device
 23 | 
 24 |     mask = torch.as_tensor(mask, dtype=dtype, device=device)
 25 |     return mask
 26 | 
 27 | 
 28 | def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
 29 |     if stride > 1:
 30 |         n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
 31 |         padding = [kernel_size // 2, kernel_size // 2]
 32 | 
 33 |     else:
 34 |         L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
 35 |         padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
 36 |     return padding
 37 | 
 38 | 
 39 | class Conv1d(nn.Module):
 40 | 
 41 |     def __init__(
 42 |             self,
 43 |             out_channels,
 44 |             kernel_size,
 45 |             in_channels,
 46 |             stride=1,
 47 |             dilation=1,
 48 |             padding='same',
 49 |             groups=1,
 50 |             bias=True,
 51 |             padding_mode='reflect', ):
 52 |         """_summary_
 53 | 
 54 |         Args:
 55 |             in_channels (int): intput channel or input data dimensions
 56 |             out_channels (int): output channel or output data dimensions
 57 |             kernel_size (int): kernel size of 1-d convolution
 58 |             stride (int, optional): strid in 1-d convolution . Defaults to 1.
 59 |             padding (str, optional): padding value. Defaults to "same".
 60 |             dilation (int, optional): dilation in 1-d convolution. Defaults to 1.
 61 |             groups (int, optional): groups in 1-d convolution. Defaults to 1.
 62 |             bias (bool, optional): bias in 1-d convolution . Defaults to True.
 63 |             padding_mode (str, optional): padding mode. Defaults to "reflect".
 64 |         """
 65 |         super().__init__()
 66 |         self.kernel_size = kernel_size
 67 |         self.stride = stride
 68 |         self.dilation = dilation
 69 |         self.padding = padding
 70 |         self.padding_mode = padding_mode
 71 | 
 72 |         self.conv = nn.Conv1d(
 73 |             in_channels,
 74 |             out_channels,
 75 |             self.kernel_size,
 76 |             stride=self.stride,
 77 |             dilation=self.dilation,
 78 |             padding=0,
 79 |             groups=groups,
 80 |             bias=bias,
 81 |         )
 82 | 
 83 |     def forward(self, x):
 84 |         if self.padding == 'same':
 85 |             x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride)
 86 |         elif self.padding == 'causal':
 87 |             num_pad = (self.kernel_size - 1) * self.dilation
 88 |             x = F.pad(x, (num_pad, 0))
 89 |         elif self.padding == 'valid':
 90 |             pass
 91 |         else:
 92 |             raise ValueError(f"Padding must be 'same', 'valid' or 'causal'. Got {self.padding}")
 93 | 
 94 |         wx = self.conv(x)
 95 | 
 96 |         return wx
 97 | 
 98 |     def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
 99 |         L_in = x.shape[-1]
100 |         padding = get_padding_elem(L_in, stride, kernel_size, dilation)
101 |         x = F.pad(x, padding, mode=self.padding_mode)
102 | 
103 |         return x
104 | 
105 | 
106 | class BatchNorm1d(nn.Module):
107 |     def __init__(self, input_size, eps=1e-05, momentum=0.1, ):
108 |         super().__init__()
109 |         self.norm = nn.BatchNorm1d(input_size, eps=eps, momentum=momentum)
110 | 
111 |     def forward(self, x):
112 |         return self.norm(x)
113 | 
114 | 
115 | class TDNNBlock(nn.Module):
116 |     """An implementation of TDNN.
117 |     """
118 | 
119 |     def __init__(
120 |             self,
121 |             in_channels,
122 |             out_channels,
123 |             kernel_size,
124 |             dilation,
125 |             activation=nn.ReLU,
126 |             groups=1,
127 |     ):
128 |         super(TDNNBlock, self).__init__()
129 |         self.conv = Conv1d(in_channels=in_channels,
130 |                            out_channels=out_channels,
131 |                            kernel_size=kernel_size,
132 |                            dilation=dilation,
133 |                            groups=groups)
134 |         self.activation = activation()
135 |         self.norm = BatchNorm1d(input_size=out_channels)
136 | 
137 |     def forward(self, x):
138 |         return self.norm(self.activation(self.conv(x)))
139 | 


--------------------------------------------------------------------------------
/macls/optimizer/__init__.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | from loguru import logger
 4 | from torch.optim import *
 5 | from .scheduler import WarmupCosineSchedulerLR
 6 | from torch.optim.lr_scheduler import *
 7 | 
 8 | __all__ = ['build_optimizer', 'build_lr_scheduler']
 9 | 
10 | 
11 | def build_optimizer(params, configs):
12 |     use_optimizer = configs.optimizer_conf.get('optimizer', 'Adam')
13 |     optimizer_args = configs.optimizer_conf.get('optimizer_args', {})
14 |     optim = importlib.import_module(__name__)
15 |     optimizer = getattr(optim, use_optimizer)(params=params, **optimizer_args)
16 |     logger.info(f'成功创建优化方法：{use_optimizer}，参数为：{optimizer_args}')
17 |     return optimizer
18 | 
19 | 
20 | def build_lr_scheduler(optimizer, step_per_epoch, configs):
21 |     use_scheduler = configs.optimizer_conf.get('scheduler', 'WarmupCosineSchedulerLR')
22 |     scheduler_args = configs.optimizer_conf.get('scheduler_args', {})
23 |     if configs.optimizer_conf.scheduler == 'CosineAnnealingLR' and 'T_max' not in scheduler_args:
24 |         scheduler_args.T_max = int(configs.train_conf.max_epoch * 1.2) * step_per_epoch
25 |     if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'fix_epoch' not in scheduler_args:
26 |         scheduler_args.fix_epoch = configs.train_conf.max_epoch
27 |     if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'step_per_epoch' not in scheduler_args:
28 |         scheduler_args.step_per_epoch = step_per_epoch
29 |     optim = importlib.import_module(__name__)
30 |     scheduler = getattr(optim, use_scheduler)(optimizer=optimizer, **scheduler_args)
31 |     logger.info(f'成功创建学习率衰减：{use_scheduler}，参数为：{scheduler_args}')
32 |     return scheduler
33 | 


--------------------------------------------------------------------------------
/macls/optimizer/scheduler.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from typing import List
 3 | 
 4 | 
 5 | class WarmupCosineSchedulerLR:
 6 |     def __init__(
 7 |             self,
 8 |             optimizer,
 9 |             min_lr,
10 |             max_lr,
11 |             warmup_epoch,
12 |             fix_epoch,
13 |             step_per_epoch
14 |     ):
15 |         self.optimizer = optimizer
16 |         assert min_lr <= max_lr
17 |         self.min_lr = min_lr
18 |         self.max_lr = max_lr
19 |         self.warmup_step = warmup_epoch * step_per_epoch
20 |         self.fix_step = fix_epoch * step_per_epoch
21 |         self.current_step = 0.0
22 | 
23 |     def set_lr(self, ):
24 |         new_lr = self.clr(self.current_step)
25 |         for param_group in self.optimizer.param_groups:
26 |             param_group['lr'] = new_lr
27 |         return new_lr
28 | 
29 |     def step(self, step=None):
30 |         if step is not None:
31 |             self.current_step = step
32 |         new_lr = self.set_lr()
33 |         self.current_step += 1
34 |         return new_lr
35 | 
36 |     def clr(self, step):
37 |         if step < self.warmup_step:
38 |             return self.min_lr + (self.max_lr - self.min_lr) * \
39 |                 (step / self.warmup_step)
40 |         elif self.warmup_step <= step < self.fix_step:
41 |             return self.min_lr + 0.5 * (self.max_lr - self.min_lr) * \
42 |                 (1 + math.cos(math.pi * (step - self.warmup_step) /
43 |                               (self.fix_step - self.warmup_step)))
44 |         else:
45 |             return self.min_lr
46 | 
47 |     def get_last_lr(self) -> List[float]:
48 |         return [self.clr(self.current_step)]
49 | 


--------------------------------------------------------------------------------
/macls/predict.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from io import BufferedReader
  4 | from typing import List
  5 | 
  6 | import numpy as np
  7 | import torch
  8 | import yaml
  9 | from loguru import logger
 10 | from yeaudio.audio import AudioSegment
 11 | from macls.data_utils.featurizer import AudioFeaturizer
 12 | from macls.models import build_model
 13 | from macls.utils.utils import dict_to_object, print_arguments, convert_string_based_on_type
 14 | 
 15 | 
 16 | class MAClsPredictor:
 17 |     def __init__(self,
 18 |                  configs,
 19 |                  model_path='models/CAMPPlus_Fbank/best_model/',
 20 |                  use_gpu=True,
 21 |                  overwrites=None,
 22 |                  log_level="info"):
 23 |         """声音分类预测工具
 24 | 
 25 |         :param configs: 配置文件路径，或者模型名称，如果是模型名称则会使用默认的配置文件
 26 |         :param model_path: 导出的预测模型文件夹路径
 27 |         :param use_gpu: 是否使用GPU预测
 28 |         :param overwrites: 覆盖配置文件中的参数，比如"train_conf.max_epoch=100"，多个用逗号隔开
 29 |         :param log_level: 打印的日志等级，可选值有："debug", "info", "warning", "error"
 30 |         """
 31 |         if use_gpu:
 32 |             assert (torch.cuda.is_available()), 'GPU不可用'
 33 |             self.device = torch.device("cuda")
 34 |         else:
 35 |             os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 36 |             self.device = torch.device("cpu")
 37 |         self.log_level = log_level.upper()
 38 |         logger.remove()
 39 |         logger.add(sink=sys.stdout, level=self.log_level)
 40 |         # 读取配置文件
 41 |         if isinstance(configs, str):
 42 |             # 获取当前程序绝对路径
 43 |             absolute_path = os.path.dirname(__file__)
 44 |             # 获取默认配置文件路径
 45 |             config_path = os.path.join(absolute_path, f"configs/{configs}.yml")
 46 |             configs = config_path if os.path.exists(config_path) else configs
 47 |             with open(configs, 'r', encoding='utf-8') as f:
 48 |                 configs = yaml.load(f.read(), Loader=yaml.FullLoader)
 49 |         self.configs = dict_to_object(configs)
 50 |         # 覆盖配置文件中的参数
 51 |         if overwrites:
 52 |             overwrites = overwrites.split(",")
 53 |             for overwrite in overwrites:
 54 |                 keys, value = overwrite.strip().split("=")
 55 |                 attrs = keys.split('.')
 56 |                 current_level = self.configs
 57 |                 for attr in attrs[:-1]:
 58 |                     current_level = getattr(current_level, attr)
 59 |                 before_value = getattr(current_level, attrs[-1])
 60 |                 setattr(current_level, attrs[-1], convert_string_based_on_type(before_value, value))
 61 |         # 打印配置信息
 62 |         print_arguments(configs=self.configs)
 63 |         # 获取特征器
 64 |         self._audio_featurizer = AudioFeaturizer(feature_method=self.configs.preprocess_conf.feature_method,
 65 |                                                  use_hf_model=self.configs.preprocess_conf.get('use_hf_model', False),
 66 |                                                  method_args=self.configs.preprocess_conf.get('method_args', {}))
 67 |         # 获取分类标签
 68 |         with open(self.configs.dataset_conf.label_list_path, 'r', encoding='utf-8') as f:
 69 |             lines = f.readlines()
 70 |         self.class_labels = [l.replace('\n', '') for l in lines]
 71 |         # 自动获取列表数量
 72 |         if self.configs.model_conf.model_args.get('num_class', None) is None:
 73 |             self.configs.model_conf.model_args.num_class = len(self.class_labels)
 74 |         # 获取模型
 75 |         self.predictor = build_model(input_size=self._audio_featurizer.feature_dim, configs=self.configs)
 76 |         self.predictor.to(self.device)
 77 |         # 加载模型
 78 |         if os.path.isdir(model_path):
 79 |             model_path = os.path.join(model_path, 'model.pth')
 80 |         assert os.path.exists(model_path), f"{model_path} 模型不存在！"
 81 |         if torch.cuda.is_available() and use_gpu:
 82 |             model_state_dict = torch.load(model_path, weights_only=False)
 83 |         else:
 84 |             model_state_dict = torch.load(model_path, weights_only=False, map_location='cpu')
 85 |         self.predictor.load_state_dict(model_state_dict)
 86 |         logger.info(f"成功加载模型参数：{model_path}")
 87 |         self.predictor.eval()
 88 | 
 89 |     def _load_audio(self, audio_data, sample_rate=16000):
 90 |         """加载音频
 91 |         :param audio_data: 需要识别的数据，支持文件路径，文件对象，字节，numpy。如果是字节的话，必须是完整的字节文件
 92 |         :param sample_rate: 如果传入的事numpy数据，需要指定采样率
 93 |         :return: 识别的文本结果和解码的得分数
 94 |         """
 95 |         # 加载音频文件，并进行预处理
 96 |         if isinstance(audio_data, str):
 97 |             audio_segment = AudioSegment.from_file(audio_data)
 98 |         elif isinstance(audio_data, BufferedReader):
 99 |             audio_segment = AudioSegment.from_file(audio_data)
100 |         elif isinstance(audio_data, np.ndarray):
101 |             audio_segment = AudioSegment.from_ndarray(audio_data, sample_rate)
102 |         elif isinstance(audio_data, bytes):
103 |             audio_segment = AudioSegment.from_bytes(audio_data)
104 |         else:
105 |             raise Exception(f'不支持该数据类型，当前数据类型为：{type(audio_data)}')
106 |         # 重采样
107 |         if audio_segment.sample_rate != self.configs.dataset_conf.dataset.sample_rate:
108 |             audio_segment.resample(self.configs.dataset_conf.dataset.sample_rate)
109 |         # decibel normalization
110 |         if self.configs.dataset_conf.dataset.use_dB_normalization:
111 |             audio_segment.normalize(target_db=self.configs.dataset_conf.dataset.target_dB)
112 |         assert audio_segment.duration >= self.configs.dataset_conf.dataset.min_duration, \
113 |             f'音频太短，最小应该为{self.configs.dataset_conf.dataset.min_duration}s，当前音频为{audio_segment.duration}s'
114 |         return audio_segment
115 | 
116 |     # 预测一个音频的特征
117 |     def predict(self,
118 |                 audio_data,
119 |                 sample_rate=16000):
120 |         """预测一个音频
121 | 
122 |         :param audio_data: 需要识别的数据，支持文件路径，文件对象，字节，numpy。如果是字节的话，必须是完整并带格式的字节文件
123 |         :param sample_rate: 如果传入的事numpy数据，需要指定采样率
124 |         :return: 结果标签和对应的得分
125 |         """
126 |         # 加载音频文件，并进行预处理
127 |         input_data = self._load_audio(audio_data=audio_data, sample_rate=sample_rate)
128 |         input_data = torch.tensor(input_data.samples, dtype=torch.float32).unsqueeze(0)
129 |         audio_feature = self._audio_featurizer(input_data).to(self.device)
130 |         # 执行预测
131 |         output = self.predictor(audio_feature)
132 |         result = torch.nn.functional.softmax(output, dim=-1)[0]
133 |         result = result.data.cpu().numpy()
134 |         # 最大概率的label
135 |         lab = np.argsort(result)[-1]
136 |         score = result[lab]
137 |         return self.class_labels[lab], round(float(score), 5)
138 | 
139 |     def predict_batch(self, audios_data: List, sample_rate=16000):
140 |         """预测一批音频的特征
141 | 
142 |         :param audios_data: 需要识别的数据，支持文件路径，文件对象，字节，numpy。如果是字节的话，必须是完整并带格式的字节文件
143 |         :param sample_rate: 如果传入的事numpy数据，需要指定采样率
144 |         :return: 结果标签和对应的得分
145 |         """
146 |         audios_data1 = []
147 |         for audio_data in audios_data:
148 |             # 加载音频文件，并进行预处理
149 |             input_data = self._load_audio(audio_data=audio_data, sample_rate=sample_rate)
150 |             audios_data1.append(input_data.samples)
151 |         # 找出音频长度最长的
152 |         batch = sorted(audios_data1, key=lambda a: a.shape[0], reverse=True)
153 |         max_audio_length = batch[0].shape[0]
154 |         batch_size = len(batch)
155 |         # 以最大的长度创建0张量
156 |         inputs = np.zeros((batch_size, max_audio_length), dtype=np.float32)
157 |         input_lens_ratio = []
158 |         for x in range(batch_size):
159 |             tensor = audios_data1[x]
160 |             seq_length = tensor.shape[0]
161 |             # 将数据插入都0张量中，实现了padding
162 |             inputs[x, :seq_length] = tensor[:]
163 |             input_lens_ratio.append(seq_length / max_audio_length)
164 |         inputs = torch.tensor(inputs, dtype=torch.float32)
165 |         input_lens_ratio = torch.tensor(input_lens_ratio, dtype=torch.float32)
166 |         audio_feature = self._audio_featurizer(inputs, input_lens_ratio).to(self.device)
167 |         # 执行预测
168 |         output = self.predictor(audio_feature)
169 |         results = torch.nn.functional.softmax(output, dim=-1)
170 |         results = results.data.cpu().numpy()
171 |         labels, scores = [], []
172 |         for result in results:
173 |             lab = np.argsort(result)[-1]
174 |             score = result[lab]
175 |             labels.append(self.class_labels[lab])
176 |             scores.append(round(float(score), 5))
177 |         return labels, scores
178 | 


--------------------------------------------------------------------------------
/macls/trainer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import platform
  3 | import sys
  4 | import time
  5 | import uuid
  6 | from datetime import timedelta
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | import torch.distributed as dist
 11 | import yaml
 12 | from sklearn.metrics import confusion_matrix
 13 | from torch.utils.data import DataLoader, RandomSampler
 14 | from torch.utils.data.distributed import DistributedSampler
 15 | from torchinfo import summary
 16 | from tqdm import tqdm
 17 | from loguru import logger
 18 | from visualdl import LogWriter
 19 | 
 20 | from macls.data_utils.collate_fn import collate_fn
 21 | from macls.data_utils.featurizer import AudioFeaturizer
 22 | from macls.data_utils.reader import MAClsDataset
 23 | from macls.metric.metrics import accuracy
 24 | from macls.models import build_model
 25 | from macls.optimizer import build_optimizer, build_lr_scheduler
 26 | from macls.utils.checkpoint import load_pretrained, load_checkpoint, save_checkpoint
 27 | from macls.utils.utils import dict_to_object, plot_confusion_matrix, print_arguments, convert_string_based_on_type
 28 | 
 29 | 
 30 | class MAClsTrainer(object):
 31 |     def __init__(self,
 32 |                  configs,
 33 |                  use_gpu=True,
 34 |                  data_augment_configs=None,
 35 |                  num_class=None,
 36 |                  overwrites=None,
 37 |                  log_level="info"):
 38 |         """声音分类训练工具类
 39 | 
 40 |         :param configs: 配置文件路径，或者模型名称，如果是模型名称则会使用默认的配置文件
 41 |         :param use_gpu: 是否使用GPU训练模型
 42 |         :param data_augment_configs: 数据增强配置字典或者其文件路径
 43 |         :param num_class: 分类大小，对应配置文件中的model_conf.model_args.num_class
 44 |         :param overwrites: 覆盖配置文件中的参数，比如"train_conf.max_epoch=100"，多个用逗号隔开
 45 |         :param log_level: 打印的日志等级，可选值有："debug", "info", "warning", "error"
 46 |         """
 47 |         if use_gpu:
 48 |             assert (torch.cuda.is_available()), 'GPU不可用'
 49 |             self.device = torch.device("cuda")
 50 |         else:
 51 |             os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 52 |             self.device = torch.device("cpu")
 53 |         self.use_gpu = use_gpu
 54 |         self.log_level = log_level.upper()
 55 |         logger.remove()
 56 |         logger.add(sink=sys.stdout, level=self.log_level)
 57 |         # 读取配置文件
 58 |         if isinstance(configs, str):
 59 |             # 获取当前程序绝对路径
 60 |             absolute_path = os.path.dirname(__file__)
 61 |             # 获取默认配置文件路径
 62 |             config_path = os.path.join(absolute_path, f"configs/{configs}.yml")
 63 |             configs = config_path if os.path.exists(config_path) else configs
 64 |             with open(configs, 'r', encoding='utf-8') as f:
 65 |                 configs = yaml.load(f.read(), Loader=yaml.FullLoader)
 66 |         self.configs = dict_to_object(configs)
 67 |         if num_class is not None:
 68 |             self.configs.model_conf.model_args.num_class = num_class
 69 |         # 覆盖配置文件中的参数
 70 |         if overwrites:
 71 |             overwrites = overwrites.split(",")
 72 |             for overwrite in overwrites:
 73 |                 keys, value = overwrite.strip().split("=")
 74 |                 attrs = keys.split('.')
 75 |                 current_level = self.configs
 76 |                 for attr in attrs[:-1]:
 77 |                     current_level = getattr(current_level, attr)
 78 |                 before_value = getattr(current_level, attrs[-1])
 79 |                 setattr(current_level, attrs[-1], convert_string_based_on_type(before_value, value))
 80 |         # 打印配置信息
 81 |         print_arguments(configs=self.configs)
 82 |         self.model = None
 83 |         self.optimizer = None
 84 |         self.scheduler = None
 85 |         self.audio_featurizer = None
 86 |         self.train_dataset = None
 87 |         self.train_loader = None
 88 |         self.test_dataset = None
 89 |         self.test_loader = None
 90 |         self.amp_scaler = None
 91 |         # 读取数据增强配置文件
 92 |         if isinstance(data_augment_configs, str):
 93 |             with open(data_augment_configs, 'r', encoding='utf-8') as f:
 94 |                 data_augment_configs = yaml.load(f.read(), Loader=yaml.FullLoader)
 95 |             print_arguments(configs=data_augment_configs, title='数据增强配置')
 96 |         self.data_augment_configs = dict_to_object(data_augment_configs)
 97 |         # 获取分类标签
 98 |         with open(self.configs.dataset_conf.label_list_path, 'r', encoding='utf-8') as f:
 99 |             lines = f.readlines()
100 |         self.class_labels = [l.replace('\n', '') for l in lines]
101 |         if platform.system().lower() == 'windows':
102 |             self.configs.dataset_conf.dataLoader.num_workers = 0
103 |             logger.warning('Windows系统不支持多线程读取数据，已自动关闭！')
104 |         if self.configs.preprocess_conf.get('use_hf_model', False):
105 |             self.configs.dataset_conf.dataLoader.num_workers = 0
106 |             logger.warning('使用HuggingFace模型不支持多线程进行特征提取，已自动关闭！')
107 |         self.max_step, self.train_step = None, None
108 |         self.train_loss, self.train_acc = None, None
109 |         self.train_eta_sec = None
110 |         self.eval_loss, self.eval_acc = None, None
111 |         self.test_log_step, self.train_log_step = 0, 0
112 |         self.stop_train, self.stop_eval = False, False
113 | 
114 |     def __setup_dataloader(self, is_train=False):
115 |         """ 获取数据加载器
116 | 
117 |         :param is_train: 是否获取训练数据
118 |         """
119 |         # 获取特征器
120 |         self.audio_featurizer = AudioFeaturizer(feature_method=self.configs.preprocess_conf.feature_method,
121 |                                                 use_hf_model=self.configs.preprocess_conf.get('use_hf_model', False),
122 |                                                 method_args=self.configs.preprocess_conf.get('method_args', {}))
123 | 
124 |         dataset_args = self.configs.dataset_conf.get('dataset', {})
125 |         data_loader_args = self.configs.dataset_conf.get('dataLoader', {})
126 |         if is_train:
127 |             self.train_dataset = MAClsDataset(data_list_path=self.configs.dataset_conf.train_list,
128 |                                               audio_featurizer=self.audio_featurizer,
129 |                                               aug_conf=self.data_augment_configs,
130 |                                               mode='train',
131 |                                               **dataset_args)
132 |             # 设置支持多卡训练
133 |             train_sampler = RandomSampler(self.train_dataset)
134 |             if torch.cuda.device_count() > 1:
135 |                 # 设置支持多卡训练
136 |                 train_sampler = DistributedSampler(dataset=self.train_dataset)
137 |             self.train_loader = DataLoader(dataset=self.train_dataset,
138 |                                            collate_fn=collate_fn,
139 |                                            sampler=train_sampler,
140 |                                            **data_loader_args)
141 |         # 获取测试数据
142 |         data_loader_args.drop_last = False
143 |         dataset_args.max_duration = self.configs.dataset_conf.eval_conf.max_duration
144 |         data_loader_args.batch_size = self.configs.dataset_conf.eval_conf.batch_size
145 |         self.test_dataset = MAClsDataset(data_list_path=self.configs.dataset_conf.test_list,
146 |                                          audio_featurizer=self.audio_featurizer,
147 |                                          mode='eval',
148 |                                          **dataset_args)
149 |         self.test_loader = DataLoader(dataset=self.test_dataset,
150 |                                       collate_fn=collate_fn,
151 |                                       shuffle=False,
152 |                                       **data_loader_args)
153 | 
154 |     def extract_features(self, save_dir='dataset/features', max_duration=100):
155 |         """ 提取特征保存文件
156 | 
157 |         :param save_dir: 保存路径
158 |         :param max_duration: 提取特征的最大时长，避免过长显存不足，单位秒
159 |         """
160 |         self.audio_featurizer = AudioFeaturizer(feature_method=self.configs.preprocess_conf.feature_method,
161 |                                                 use_hf_model=self.configs.preprocess_conf.get('use_hf_model', False),
162 |                                                 method_args=self.configs.preprocess_conf.get('method_args', {}))
163 |         dataset_args = self.configs.dataset_conf.get('dataset', {})
164 |         dataset_args.max_duration = max_duration
165 |         data_loader_args = self.configs.dataset_conf.get('dataLoader', {})
166 |         data_loader_args.drop_last = False
167 |         for data_list in [self.configs.dataset_conf.train_list, self.configs.dataset_conf.test_list]:
168 |             test_dataset = MAClsDataset(data_list_path=data_list,
169 |                                         audio_featurizer=self.audio_featurizer,
170 |                                         mode='extract_feature',
171 |                                         **dataset_args)
172 |             test_loader = DataLoader(dataset=test_dataset,
173 |                                      collate_fn=collate_fn,
174 |                                      shuffle=False,
175 |                                      **data_loader_args)
176 |             save_data_list = data_list.replace('.txt', '_features.txt')
177 |             with open(save_data_list, 'w', encoding='utf-8') as f:
178 |                 for features, labels, input_lens in tqdm(test_loader):
179 |                     for i in range(len(features)):
180 |                         feature, label, input_len = features[i], labels[i], input_lens[i]
181 |                         feature = feature.numpy()[:input_len]
182 |                         label = int(label)
183 |                         save_path = os.path.join(save_dir, str(label),
184 |                                                  f'{str(uuid.uuid4())}.npy').replace('\\', '/')
185 |                         os.makedirs(os.path.dirname(save_path), exist_ok=True)
186 |                         np.save(save_path, feature)
187 |                         f.write(f'{save_path}\t{label}\n')
188 |             logger.info(f'{data_list}列表中的数据已提取特征完成，新列表为：{save_data_list}')
189 | 
190 |     def __setup_model(self, input_size, is_train=False):
191 |         """ 获取模型
192 | 
193 |         :param input_size: 模型输入特征大小
194 |         :param is_train: 是否获取训练模型
195 |         """
196 |         # 自动获取列表数量
197 |         if self.configs.model_conf.model_args.get('num_class', None) is None:
198 |             self.configs.model_conf.model_args.num_class = len(self.class_labels)
199 |         # 获取模型
200 |         self.model = build_model(input_size=input_size, configs=self.configs)
201 |         self.model.to(self.device)
202 |         if self.log_level == "DEBUG" or self.log_level == "INFO":
203 |             # 打印模型信息，98是长度，这个取决于输入的音频长度
204 |             summary(self.model, input_size=(1, 98, input_size))
205 |         # 使用Pytorch2.0的编译器
206 |         if self.configs.train_conf.use_compile and torch.__version__ >= "2" and platform.system().lower() == 'windows':
207 |             self.model = torch.compile(self.model, mode="reduce-overhead")
208 |         # print(self.model)
209 |         # 获取损失函数
210 |         label_smoothing = self.configs.train_conf.get('label_smoothing', 0.0)
211 |         self.loss = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)
212 |         if is_train:
213 |             if self.configs.train_conf.enable_amp:
214 |                 self.amp_scaler = torch.GradScaler(init_scale=1024)
215 |             # 获取优化方法
216 |             self.optimizer = build_optimizer(params=self.model.parameters(), configs=self.configs)
217 |             # 学习率衰减函数
218 |             self.scheduler = build_lr_scheduler(optimizer=self.optimizer, step_per_epoch=len(self.train_loader),
219 |                                                 configs=self.configs)
220 | 
221 |     def __train_epoch(self, epoch_id, local_rank, writer, nranks=0):
222 |         """训练一个epoch
223 | 
224 |         :param epoch_id: 当前epoch
225 |         :param local_rank: 当前显卡id
226 |         :param writer: VisualDL对象
227 |         :param nranks: 所使用显卡的数量
228 |         """
229 |         train_times, accuracies, loss_sum = [], [], []
230 |         start = time.time()
231 |         for batch_id, (features, label, input_len) in enumerate(self.train_loader):
232 |             if self.stop_train: break
233 |             if nranks > 1:
234 |                 features = features.to(local_rank)
235 |                 label = label.to(local_rank).long()
236 |             else:
237 |                 features = features.to(self.device)
238 |                 label = label.to(self.device).long()
239 |             # 执行模型计算，是否开启自动混合精度
240 |             with torch.autocast('cuda', enabled=self.configs.train_conf.enable_amp):
241 |                 output = self.model(features)
242 |             # 计算损失值
243 |             los = self.loss(output, label)
244 |             # 是否开启自动混合精度
245 |             if self.configs.train_conf.enable_amp:
246 |                 # loss缩放，乘以系数loss_scaling
247 |                 scaled = self.amp_scaler.scale(los)
248 |                 scaled.backward()
249 |             else:
250 |                 los.backward()
251 |             # 是否开启自动混合精度
252 |             if self.configs.train_conf.enable_amp:
253 |                 self.amp_scaler.unscale_(self.optimizer)
254 |                 self.amp_scaler.step(self.optimizer)
255 |                 self.amp_scaler.update()
256 |             else:
257 |                 self.optimizer.step()
258 |             self.optimizer.zero_grad()
259 | 
260 |             # 计算准确率
261 |             acc = accuracy(output, label)
262 |             accuracies.append(acc)
263 |             loss_sum.append(los.data.cpu().numpy())
264 |             train_times.append((time.time() - start) * 1000)
265 |             self.train_step += 1
266 | 
267 |             # 多卡训练只使用一个进程打印
268 |             if batch_id % self.configs.train_conf.log_interval == 0 and local_rank == 0:
269 |                 batch_id = batch_id + 1
270 |                 # 计算每秒训练数据量
271 |                 train_speed = self.configs.dataset_conf.dataLoader.batch_size / (
272 |                         sum(train_times) / len(train_times) / 1000)
273 |                 # 计算剩余时间
274 |                 self.train_eta_sec = (sum(train_times) / len(train_times)) * (self.max_step - self.train_step) / 1000
275 |                 eta_str = str(timedelta(seconds=int(self.train_eta_sec)))
276 |                 self.train_loss = sum(loss_sum) / len(loss_sum)
277 |                 self.train_acc = sum(accuracies) / len(accuracies)
278 |                 logger.info(f'Train epoch: [{epoch_id}/{self.configs.train_conf.max_epoch}], '
279 |                             f'batch: [{batch_id}/{len(self.train_loader)}], '
280 |                             f'loss: {self.train_loss:.5f}, accuracy: {self.train_acc:.5f}, '
281 |                             f'learning rate: {self.scheduler.get_last_lr()[0]:>.8f}, '
282 |                             f'speed: {train_speed:.2f} data/sec, eta: {eta_str}')
283 |                 writer.add_scalar('Train/Loss', self.train_loss, self.train_log_step)
284 |                 writer.add_scalar('Train/Accuracy', self.train_acc, self.train_log_step)
285 |                 # 记录学习率
286 |                 writer.add_scalar('Train/lr', self.scheduler.get_last_lr()[0], self.train_log_step)
287 |                 train_times, accuracies, loss_sum = [], [], []
288 |                 self.train_log_step += 1
289 |             start = time.time()
290 |             self.scheduler.step()
291 | 
292 |     def train(self,
293 |               save_model_path='models/',
294 |               log_dir='log/',
295 |               max_epoch=None,
296 |               resume_model=None,
297 |               pretrained_model=None):
298 |         """
299 |         训练模型
300 |         :param save_model_path: 模型保存的路径
301 |         :param log_dir: 保存VisualDL日志文件的路径
302 |         :param max_epoch: 最大训练轮数，对应配置文件中的train_conf.max_epoch
303 |         :param resume_model: 恢复训练，当为None则不使用预训练模型
304 |         :param pretrained_model: 预训练模型的路径，当为None则不使用预训练模型
305 |         """
306 |         # 获取有多少张显卡训练
307 |         nranks = torch.cuda.device_count()
308 |         local_rank = 0
309 |         writer = None
310 |         if local_rank == 0:
311 |             # 日志记录器
312 |             writer = LogWriter(logdir=log_dir)
313 | 
314 |         if nranks > 1 and self.use_gpu:
315 |             # 初始化NCCL环境
316 |             dist.init_process_group(backend='nccl')
317 |             local_rank = int(os.environ["LOCAL_RANK"])
318 | 
319 |         # 获取数据
320 |         self.__setup_dataloader(is_train=True)
321 |         # 获取模型
322 |         self.__setup_model(input_size=self.audio_featurizer.feature_dim, is_train=True)
323 |         # 加载预训练模型
324 |         self.model = load_pretrained(model=self.model, pretrained_model=pretrained_model, use_gpu=self.use_gpu)
325 |         # 加载恢复模型
326 |         self.model, self.optimizer, self.amp_scaler, self.scheduler, last_epoch, best_acc = \
327 |             load_checkpoint(configs=self.configs, model=self.model, optimizer=self.optimizer,
328 |                             amp_scaler=self.amp_scaler, scheduler=self.scheduler, step_epoch=len(self.train_loader),
329 |                             save_model_path=save_model_path, resume_model=resume_model)
330 | 
331 |         # 支持多卡训练
332 |         if nranks > 1 and self.use_gpu:
333 |             self.model.to(local_rank)
334 |             self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[local_rank])
335 |         logger.info('训练数据：{}'.format(len(self.train_dataset)))
336 | 
337 |         self.train_loss, self.train_acc = None, None
338 |         self.eval_loss, self.eval_acc = None, None
339 |         self.test_log_step, self.train_log_step = 0, 0
340 |         if local_rank == 0:
341 |             writer.add_scalar('Train/lr', self.scheduler.get_last_lr()[0], last_epoch)
342 |         if max_epoch is not None:
343 |             self.configs.train_conf.max_epoch = max_epoch
344 |         # 最大步数
345 |         self.max_step = len(self.train_loader) * self.configs.train_conf.max_epoch
346 |         self.train_step = max(last_epoch, 0) * len(self.train_loader)
347 |         # 开始训练
348 |         for epoch_id in range(last_epoch, self.configs.train_conf.max_epoch):
349 |             if self.stop_train: break
350 |             epoch_id += 1
351 |             start_epoch = time.time()
352 |             # 训练一个epoch
353 |             self.__train_epoch(epoch_id=epoch_id, local_rank=local_rank, writer=writer, nranks=nranks)
354 |             # 多卡训练只使用一个进程执行评估和保存模型
355 |             if local_rank == 0:
356 |                 if self.stop_eval: continue
357 |                 logger.info('=' * 70)
358 |                 self.eval_loss, self.eval_acc = self.evaluate()
359 |                 logger.info('Test epoch: {}, time/epoch: {}, loss: {:.5f}, accuracy: {:.5f}'.format(
360 |                     epoch_id, str(timedelta(seconds=(time.time() - start_epoch))), self.eval_loss, self.eval_acc))
361 |                 logger.info('=' * 70)
362 |                 writer.add_scalar('Test/Accuracy', self.eval_acc, self.test_log_step)
363 |                 writer.add_scalar('Test/Loss', self.eval_loss, self.test_log_step)
364 |                 self.test_log_step += 1
365 |                 self.model.train()
366 |                 # # 保存最优模型
367 |                 if self.eval_acc >= best_acc:
368 |                     best_acc = self.eval_acc
369 |                     save_checkpoint(configs=self.configs, model=self.model, optimizer=self.optimizer,
370 |                                     amp_scaler=self.amp_scaler, save_model_path=save_model_path, epoch_id=epoch_id,
371 |                                     accuracy=self.eval_acc, best_model=True)
372 |                 # 保存模型
373 |                 save_checkpoint(configs=self.configs, model=self.model, optimizer=self.optimizer,
374 |                                 amp_scaler=self.amp_scaler, save_model_path=save_model_path, epoch_id=epoch_id,
375 |                                 accuracy=self.eval_acc)
376 | 
377 |     def evaluate(self, resume_model=None, save_matrix_path=None):
378 |         """
379 |         评估模型
380 |         :param resume_model: 所使用的模型
381 |         :param save_matrix_path: 保存混合矩阵的路径
382 |         :return: 评估结果
383 |         """
384 |         if self.test_loader is None:
385 |             self.__setup_dataloader()
386 |         if self.model is None:
387 |             self.__setup_model(input_size=self.audio_featurizer.feature_dim)
388 |         if resume_model is not None:
389 |             if os.path.isdir(resume_model):
390 |                 resume_model = os.path.join(resume_model, 'model.pth')
391 |             assert os.path.exists(resume_model), f"{resume_model} 模型不存在！"
392 |             model_state_dict = torch.load(resume_model, weights_only=False)
393 |             self.model.load_state_dict(model_state_dict)
394 |             logger.info(f'成功加载模型：{resume_model}')
395 |         self.model.eval()
396 |         if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
397 |             eval_model = self.model.module
398 |         else:
399 |             eval_model = self.model
400 | 
401 |         accuracies, losses, preds, labels = [], [], [], []
402 |         with torch.no_grad():
403 |             for batch_id, (features, label, input_lens) in enumerate(tqdm(self.test_loader, desc='执行评估')):
404 |                 if self.stop_eval: break
405 |                 features = features.to(self.device)
406 |                 label = label.to(self.device).long()
407 |                 output = eval_model(features)
408 |                 los = self.loss(output, label)
409 |                 # 计算准确率
410 |                 acc = accuracy(output, label)
411 |                 accuracies.append(acc)
412 |                 # 模型预测标签
413 |                 label = label.data.cpu().numpy()
414 |                 output = output.data.cpu().numpy()
415 |                 pred = np.argmax(output, axis=1)
416 |                 preds.extend(pred.tolist())
417 |                 # 真实标签
418 |                 labels.extend(label.tolist())
419 |                 losses.append(los.data.cpu().numpy())
420 |         loss = float(sum(losses) / len(losses)) if len(losses) > 0 else -1
421 |         acc = float(sum(accuracies) / len(accuracies)) if len(accuracies) > 0 else -1
422 |         # 保存混合矩阵
423 |         if save_matrix_path is not None:
424 |             try:
425 |                 cm = confusion_matrix(labels, preds)
426 |                 plot_confusion_matrix(cm=cm, save_path=os.path.join(save_matrix_path, f'{int(time.time())}.png'),
427 |                                       class_labels=self.class_labels)
428 |             except Exception as e:
429 |                 logger.error(f'保存混淆矩阵失败：{e}')
430 |         self.model.train()
431 |         return loss, acc
432 | 
433 |     def export(self, save_model_path='models/', resume_model='models/EcapaTdnn_Fbank/best_model/'):
434 |         """
435 |         导出预测模型
436 |         :param save_model_path: 模型保存的路径
437 |         :param resume_model: 准备转换的模型路径
438 |         :return:
439 |         """
440 |         self.__setup_model(input_size=self.audio_featurizer.feature_dim)
441 |         # 加载预训练模型
442 |         if os.path.isdir(resume_model):
443 |             resume_model = os.path.join(resume_model, 'model.pth')
444 |         assert os.path.exists(resume_model), f"{resume_model} 模型不存在！"
445 |         model_state_dict = torch.load(resume_model)
446 |         self.model.load_state_dict(model_state_dict)
447 |         logger.info('成功恢复模型参数和优化方法参数：{}'.format(resume_model))
448 |         self.model.eval()
449 |         # 获取静态模型
450 |         infer_model = self.model.export()
451 |         infer_model_path = os.path.join(save_model_path,
452 |                                         f'{self.configs.use_model}_{self.configs.preprocess_conf.feature_method}',
453 |                                         'inference.pth')
454 |         os.makedirs(os.path.dirname(infer_model_path), exist_ok=True)
455 |         torch.jit.save(infer_model, infer_model_path)
456 |         logger.info("预测模型已保存：{}".format(infer_model_path))
457 | 


--------------------------------------------------------------------------------
/macls/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yeyupiaoling/AudioClassification-Pytorch/f9f55968cc1f181c8ebc786f2db880702e8bf79f/macls/utils/__init__.py


--------------------------------------------------------------------------------
/macls/utils/checkpoint.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import shutil
  4 | 
  5 | import torch
  6 | from loguru import logger
  7 | from macls import __version__
  8 | 
  9 | 
 10 | def load_pretrained(model, pretrained_model, use_gpu=True):
 11 |     """加载预训练模型
 12 | 
 13 |     :param model: 使用的模型
 14 |     :param pretrained_model: 预训练模型路径
 15 |     :param use_gpu: 模型是否使用GPU
 16 |     :return: 加载的模型
 17 |     """
 18 |     # 加载预训练模型
 19 |     if pretrained_model is None: return model
 20 |     if os.path.isdir(pretrained_model):
 21 |         pretrained_model = os.path.join(pretrained_model, 'model.pth')
 22 |     assert os.path.exists(pretrained_model), f"{pretrained_model} 模型不存在！"
 23 |     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
 24 |         model_dict = model.module.state_dict()
 25 |     else:
 26 |         model_dict = model.state_dict()
 27 |     if torch.cuda.is_available() and use_gpu:
 28 |         model_state_dict = torch.load(pretrained_model, weights_only=False)
 29 |     else:
 30 |         model_state_dict = torch.load(pretrained_model, weights_only=False, map_location='cpu')
 31 |     # 过滤不存在的参数
 32 |     for name, weight in model_dict.items():
 33 |         if name in model_state_dict.keys():
 34 |             if list(weight.shape) != list(model_state_dict[name].shape):
 35 |                 logger.warning(f'{name} not used, shape {list(model_state_dict[name].shape)} '
 36 |                                f'unmatched with {list(weight.shape)} in model.')
 37 |                 model_state_dict.pop(name, None)
 38 |     # 加载权重
 39 |     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
 40 |         missing_keys, unexpected_keys = model.module.load_state_dict(model_state_dict, strict=False)
 41 |     else:
 42 |         missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False)
 43 |     if len(unexpected_keys) > 0:
 44 |         logger.warning('Unexpected key(s) in state_dict: {}. '
 45 |                        .format(', '.join('"{}"'.format(k) for k in unexpected_keys)))
 46 |     if len(missing_keys) > 0:
 47 |         logger.warning('Missing key(s) in state_dict: {}. '
 48 |                        .format(', '.join('"{}"'.format(k) for k in missing_keys)))
 49 |     logger.info('成功加载预训练模型：{}'.format(pretrained_model))
 50 |     return model
 51 | 
 52 | 
 53 | def load_checkpoint(configs, model, optimizer, amp_scaler, scheduler,
 54 |                     step_epoch, save_model_path, resume_model):
 55 |     """加载模型
 56 | 
 57 |     :param configs: 配置信息
 58 |     :param model: 使用的模型
 59 |     :param optimizer: 使用的优化方法
 60 |     :param amp_scaler: 使用的自动混合精度
 61 |     :param scheduler: 使用的学习率调整策略
 62 |     :param step_epoch: 每个epoch的step数量
 63 |     :param save_model_path: 模型保存路径
 64 |     :param resume_model: 恢复训练的模型路径
 65 |     """
 66 |     last_epoch1 = 0
 67 |     accuracy1 = 0.
 68 | 
 69 |     def load_model(model_path):
 70 |         assert os.path.exists(os.path.join(model_path, 'model.pth')), "模型参数文件不存在！"
 71 |         assert os.path.exists(os.path.join(model_path, 'optimizer.pth')), "优化方法参数文件不存在！"
 72 |         state_dict = torch.load(os.path.join(model_path, 'model.pth'), weights_only=False)
 73 |         if isinstance(model, torch.nn.parallel.DistributedDataParallel):
 74 |             model.module.load_state_dict(state_dict)
 75 |         else:
 76 |             model.load_state_dict(state_dict)
 77 |         optimizer.load_state_dict(torch.load(os.path.join(model_path, 'optimizer.pth'), weights_only=False))
 78 |         # 自动混合精度参数
 79 |         if amp_scaler is not None and os.path.exists(os.path.join(model_path, 'scaler.pth')):
 80 |             amp_scaler.load_state_dict(torch.load(os.path.join(model_path, 'scaler.pth')), weights_only=False)
 81 |         with open(os.path.join(model_path, 'model.state'), 'r', encoding='utf-8') as f:
 82 |             json_data = json.load(f)
 83 |             last_epoch = json_data['last_epoch']
 84 |             accuracy = json_data['accuracy']
 85 |         logger.info('成功恢复模型参数和优化方法参数：{}'.format(model_path))
 86 |         optimizer.step()
 87 |         [scheduler.step() for _ in range(last_epoch * step_epoch)]
 88 |         return last_epoch, accuracy
 89 | 
 90 |     # 获取最后一个保存的模型
 91 |     save_feature_method = configs.preprocess_conf.feature_method
 92 |     if configs.preprocess_conf.get('use_hf_model', False):
 93 |         save_feature_method = save_feature_method[:-1] if save_feature_method[-1] == '/' else save_feature_method
 94 |         save_feature_method = os.path.basename(save_feature_method)
 95 |     last_model_dir = os.path.join(save_model_path,
 96 |                                   f'{configs.model_conf.model}_{save_feature_method}',
 97 |                                   'last_model')
 98 |     if resume_model is not None or (os.path.exists(os.path.join(last_model_dir, 'model.pth'))
 99 |                                     and os.path.exists(os.path.join(last_model_dir, 'optimizer.pth'))):
100 |         if resume_model is not None:
101 |             last_epoch1, accuracy1 = load_model(resume_model)
102 |         else:
103 |             try:
104 |                 # 自动获取最新保存的模型
105 |                 last_epoch1, accuracy1 = load_model(last_model_dir)
106 |             except Exception as e:
107 |                 logger.warning(f'尝试自动恢复最新模型失败，错误信息：{e}')
108 |     return model, optimizer, amp_scaler, scheduler, last_epoch1, accuracy1
109 | 
110 | 
111 | # 保存模型
112 | def save_checkpoint(configs, model, optimizer, amp_scaler, save_model_path, epoch_id,
113 |                     accuracy=0., best_model=False):
114 |     """保存模型
115 | 
116 |     :param configs: 配置信息
117 |     :param model: 使用的模型
118 |     :param optimizer: 使用的优化方法
119 |     :param amp_scaler: 使用的自动混合精度
120 |     :param save_model_path: 模型保存路径
121 |     :param epoch_id: 当前epoch
122 |     :param accuracy: 当前准确率
123 |     :param best_model: 是否为最佳模型
124 |     """
125 |     if isinstance(model, torch.nn.parallel.DistributedDataParallel):
126 |         state_dict = model.module.state_dict()
127 |     else:
128 |         state_dict = model.state_dict()
129 |     # 保存模型的路径
130 |     save_feature_method = configs.preprocess_conf.feature_method
131 |     if configs.preprocess_conf.get('use_hf_model', False):
132 |         save_feature_method = save_feature_method[:-1] if save_feature_method[-1] == '/' else save_feature_method
133 |         save_feature_method = os.path.basename(save_feature_method)
134 |     if best_model:
135 |         model_path = os.path.join(save_model_path,
136 |                                   f'{configs.model_conf.model}_{save_feature_method}', 'best_model')
137 |     else:
138 |         model_path = os.path.join(save_model_path,
139 |                                   f'{configs.model_conf.model}_{save_feature_method}', 'epoch_{}'.format(epoch_id))
140 |     os.makedirs(model_path, exist_ok=True)
141 |     # 保存模型参数
142 |     torch.save(optimizer.state_dict(), os.path.join(model_path, 'optimizer.pth'))
143 |     torch.save(state_dict, os.path.join(model_path, 'model.pth'))
144 |     # 自动混合精度参数
145 |     if amp_scaler is not None:
146 |         torch.save(amp_scaler.state_dict(), os.path.join(model_path, 'scaler.pth'))
147 |     with open(os.path.join(model_path, 'model.state'), 'w', encoding='utf-8') as f:
148 |         data = {"last_epoch": epoch_id, "accuracy": accuracy, "version": __version__,
149 |                 "model": configs.model_conf.model, "feature_method": save_feature_method}
150 |         f.write(json.dumps(data, indent=4, ensure_ascii=False))
151 |     if not best_model:
152 |         last_model_path = os.path.join(save_model_path,
153 |                                        f'{configs.model_conf.model}_{save_feature_method}', 'last_model')
154 |         shutil.rmtree(last_model_path, ignore_errors=True)
155 |         shutil.copytree(model_path, last_model_path)
156 |         # 删除旧的模型
157 |         old_model_path = os.path.join(save_model_path,
158 |                                       f'{configs.model_conf.model}_{save_feature_method}',
159 |                                       'epoch_{}'.format(epoch_id - 3))
160 |         if os.path.exists(old_model_path):
161 |             shutil.rmtree(old_model_path)
162 |     logger.info('已保存模型：{}'.format(model_path))
163 | 


--------------------------------------------------------------------------------
/macls/utils/record.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import soundcard
 4 | import soundfile
 5 | 
 6 | 
 7 | class RecordAudio:
 8 |     def __init__(self, channels=1, sample_rate=16000):
 9 |         # 录音参数
10 |         self.channels = channels
11 |         self.sample_rate = sample_rate
12 | 
13 |         # 获取麦克风
14 |         self.default_mic = soundcard.default_microphone()
15 | 
16 |     def record(self, record_seconds=3, save_path=None):
17 |         """录音
18 | 
19 |         :param record_seconds: 录音时间，默认3秒
20 |         :param save_path: 录音保存的路径，后缀名为wav
21 |         :return: 音频的numpy数据
22 |         """
23 |         print("开始录音......")
24 |         num_frames = int(record_seconds * self.sample_rate)
25 |         data = self.default_mic.record(samplerate=self.sample_rate, numframes=num_frames, channels=self.channels)
26 |         audio_data = data.squeeze()
27 |         print("录音已结束!")
28 |         if save_path is not None:
29 |             os.makedirs(os.path.dirname(save_path), exist_ok=True)
30 |             soundfile.write(save_path, data=data, samplerate=self.sample_rate)
31 |         return audio_data
32 | 


--------------------------------------------------------------------------------
/macls/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import distutils.util
  2 | import os
  3 | 
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | 
  7 | from loguru import logger
  8 | 
  9 | 
 10 | def print_arguments(args=None, configs=None, title=None):
 11 |     if args:
 12 |         logger.info("----------- 额外配置参数 -----------")
 13 |         for arg, value in sorted(vars(args).items()):
 14 |             logger.info("%s: %s" % (arg, value))
 15 |         logger.info("------------------------------------------------")
 16 |     if configs:
 17 |         title = title if title else "配置文件参数"
 18 |         logger.info(f"----------- {title} -----------")
 19 |         for arg, value in sorted(configs.items()):
 20 |             if isinstance(value, dict):
 21 |                 logger.info(f"{arg}:")
 22 |                 for a, v in sorted(value.items()):
 23 |                     if isinstance(v, dict):
 24 |                         logger.info(f"\t{a}:")
 25 |                         for a1, v1 in sorted(v.items()):
 26 |                             logger.info("\t\t%s: %s" % (a1, v1))
 27 |                     else:
 28 |                         logger.info("\t%s: %s" % (a, v))
 29 |             else:
 30 |                 logger.info("%s: %s" % (arg, value))
 31 |         logger.info("------------------------------------------------")
 32 | 
 33 | 
 34 | def add_arguments(argname, type, default, help, argparser, **kwargs):
 35 |     type = distutils.util.strtobool if type == bool else type
 36 |     argparser.add_argument("--" + argname,
 37 |                            default=default,
 38 |                            type=type,
 39 |                            help=help + ' 默认: %(default)s.',
 40 |                            **kwargs)
 41 | 
 42 | 
 43 | class Dict(dict):
 44 |     __setattr__ = dict.__setitem__
 45 |     __getattr__ = dict.__getitem__
 46 | 
 47 | 
 48 | def dict_to_object(dict_obj):
 49 |     if not isinstance(dict_obj, dict):
 50 |         return dict_obj
 51 |     inst = Dict()
 52 |     for k, v in dict_obj.items():
 53 |         inst[k] = dict_to_object(v)
 54 |     return inst
 55 | 
 56 | 
 57 | def plot_confusion_matrix(cm, save_path, class_labels, show=False):
 58 |     """
 59 |     绘制混淆矩阵
 60 |     @param cm: 混淆矩阵, 一个二维数组，表示预测结果与真实结果的混淆情况。
 61 |     @param save_path: 保存路径, 字符串，指定混淆矩阵图像的保存位置。
 62 |     @param class_labels: 类别名称, 一个列表，包含各个类别的名称。
 63 |     @param show: 是否显示图像, 布尔值，控制是否在绘图窗口显示混淆矩阵图像。
 64 |     """
 65 |     # 检测类别名称是否包含中文，是则设置相应字体
 66 |     s = ''.join(class_labels)
 67 |     is_ascii = all(ord(c) < 128 for c in s)
 68 |     if not is_ascii:
 69 |         plt.rcParams['font.sans-serif'] = ['SimHei']
 70 |         plt.rcParams['axes.unicode_minus'] = False
 71 | 
 72 |     # 初始化绘图参数并绘制混淆矩阵
 73 |     plt.figure(figsize=(12, 8), dpi=100)
 74 |     np.set_printoptions(precision=2)
 75 |     # 在混淆矩阵中绘制每个格子的概率值
 76 |     ind_array = np.arange(len(class_labels))
 77 |     x, y = np.meshgrid(ind_array, ind_array)
 78 |     for x_val, y_val in zip(x.flatten(), y.flatten()):
 79 |         c = cm[y_val][x_val] / (np.sum(cm[:, x_val]) + 1e-6)
 80 |         # 忽略概率值太小的格子
 81 |         if c < 1e-4: continue
 82 |         plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')
 83 |     m = np.sum(cm, axis=0) + 1e-6
 84 |     plt.imshow(cm / m, interpolation='nearest', cmap=plt.cm.binary)
 85 |     plt.title('Confusion Matrix' if is_ascii else '混合矩阵')
 86 |     plt.colorbar()
 87 |     # 设置类别标签
 88 |     xlocations = np.array(range(len(class_labels)))
 89 |     plt.xticks(xlocations, class_labels, rotation=90)
 90 |     plt.yticks(xlocations, class_labels)
 91 |     plt.ylabel('Actual label' if is_ascii else '实际标签')
 92 |     plt.xlabel('Predict label' if is_ascii else '预测标签')
 93 | 
 94 |     # 调整刻度标记位置，提高可视化效果
 95 |     tick_marks = np.array(range(len(class_labels))) + 0.5
 96 |     plt.gca().set_xticks(tick_marks, minor=True)
 97 |     plt.gca().set_yticks(tick_marks, minor=True)
 98 |     plt.gca().xaxis.set_ticks_position('none')
 99 |     plt.gca().yaxis.set_ticks_position('none')
100 |     plt.grid(True, which='minor', linestyle='-')
101 |     plt.gcf().subplots_adjust(bottom=0.15)
102 |     # 保存图片
103 |     os.makedirs(os.path.dirname(save_path), exist_ok=True)
104 |     plt.savefig(save_path, format='png')
105 |     if show:
106 |         # 显示图片
107 |         plt.show()
108 | 
109 | 
110 | # 根据a的类型，将b转换为相应的类型
111 | def convert_string_based_on_type(a, b):
112 |     if isinstance(a, int):
113 |         try:
114 |             b = int(b)
115 |         except ValueError:
116 |             logger.error("无法将字符串转换为整数")
117 |     elif isinstance(a, float):
118 |         try:
119 |             b = float(b)
120 |         except ValueError:
121 |             logger.error("无法将字符串转换为浮点数")
122 |     elif isinstance(a, str):
123 |         return b
124 |     elif isinstance(a, bool):
125 |         b = b.lower() == 'true'
126 |     else:
127 |         try:
128 |             b = eval(b)
129 |         except Exception as e:
130 |             logger.exception("无法将字符串转换为其他类型，将忽略该参数类型转换")
131 |     return b
132 | 


--------------------------------------------------------------------------------
/record_audio.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | from macls.utils.record import RecordAudio
 4 | 
 5 | s = input('请输入你计划录音多少秒：')
 6 | record_seconds = int(s)
 7 | save_path = "dataset/save_audio/%s.wav" % str(int(time.time()*1000))
 8 | 
 9 | record_audio = RecordAudio()
10 | input(f"按下回车键开机录音，录音{record_seconds}秒中：")
11 | record_audio.record(record_seconds=record_seconds,
12 |                     save_path=save_path)
13 | 
14 | print('文件保存在：%s' % save_path)
15 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy>=1.19.2
 2 | scipy>=1.6.3
 3 | librosa>=0.9.1
 4 | soundfile>=0.12.1
 5 | soundcard>=0.4.2
 6 | resampy>=0.2.2
 7 | numba>=0.53.0
 8 | pydub~=0.25.1
 9 | matplotlib>=3.5.2
10 | pillow>=10.3.0
11 | tqdm>=4.66.3
12 | visualdl==2.5.3
13 | pyyaml>=5.4.1
14 | scikit-learn>=1.0.2
15 | torchinfo>=1.7.2
16 | loguru>=0.7.2
17 | yeaudio>=0.0.7
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | import macls
 6 | 
 7 | VERSION = macls.__version__
 8 | 
 9 | # 复制配置文件到项目目录下
10 | shutil.rmtree('./macls/configs/', ignore_errors=True)
11 | shutil.copytree('./configs/', './macls/configs/')
12 | 
13 | 
14 | def readme():
15 |     with open('README.md', encoding='utf-8') as f:
16 |         content = f.read()
17 |     return content
18 | 
19 | 
20 | def parse_requirements():
21 |     with open('./requirements.txt', encoding="utf-8") as f:
22 |         requirements = f.readlines()
23 |     return requirements
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     setup(
28 |         name='macls',
29 |         packages=find_packages(),
30 |         package_data={'': ['configs/*']},
31 |         author='yeyupiaoling',
32 |         version=VERSION,
33 |         install_requires=parse_requirements(),
34 |         description='Audio Classification toolkit on Pytorch',
35 |         long_description=readme(),
36 |         long_description_content_type='text/markdown',
37 |         url='https://github.com/yeyupiaoling/AudioClassification-Pytorch',
38 |         download_url='https://github.com/yeyupiaoling/AudioClassification-Pytorch.git',
39 |         keywords=['audio', 'pytorch'],
40 |         classifiers=[
41 |             'Intended Audience :: Developers',
42 |             'License :: OSI Approved :: Apache Software License',
43 |             'Operating System :: OS Independent',
44 |             'Natural Language :: Chinese (Simplified)',
45 |             'Programming Language :: Python :: 3',
46 |             'Programming Language :: Python :: 3.5',
47 |             'Programming Language :: Python :: 3.6',
48 |             'Programming Language :: Python :: 3.7',
49 |             'Programming Language :: Python :: 3.8',
50 |             'Programming Language :: Python :: 3.9', 'Topic :: Utilities'
51 |         ],
52 |         license='Apache License 2.0',
53 |         ext_modules=[])
54 |     shutil.rmtree('./macls/configs/', ignore_errors=True)
55 | 


--------------------------------------------------------------------------------
/tools/download_language_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | download_dir=dataset/language
 4 | 
 5 | 
 6 | [ ! -d ${download_dir} ] && mkdir -p ${download_dir}
 7 | 
 8 | if [ ! -f ${download_dir}/test.tar.gz ]; then
 9 |     echo "准备下载测试集"
10 |     wget --no-check-certificate https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/3D-Speaker/test.tar.gz -P ${download_dir}
11 |     md5=$(md5sum ${download_dir}/test.tar.gz | awk '{print $1}')
12 |     [ $md5 != "45972606dd10d3f7c1c31f27acdfbed7" ] && echo "Wrong md5sum of 3dspeaker test.tar.gz" && exit 1
13 | fi
14 | 
15 | if [ ! -f ${download_dir}/train.tar.gz ]; then
16 |     echo "准备下载训练集"
17 |     wget --no-check-certificate https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/3D-Speaker/train.tar.gz -P ${download_dir}
18 |     md5=$(md5sum ${download_dir}/train.tar.gz | awk '{print $1}')
19 |     [ $md5 != "c2cea55fd22a2b867d295fb35a2d3340" ] && echo "Wrong md5sum of 3dspeaker train.tar.gz" && exit 1
20 | fi
21 | 
22 | echo "下载完成！"
23 | 
24 | echo "准备解压"
25 | 
26 | tar -zxvf ${download_dir}/train.tar.gz -C ${rawdata_dir}/
27 | tar -xzvf ${download_dir}/test.tar.gz -C ${rawdata_dir}/
28 | 
29 | echo "解压完成！"
30 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import functools
 3 | 
 4 | from macls.trainer import MAClsTrainer
 5 | from macls.utils.utils import add_arguments, print_arguments
 6 | 
 7 | parser = argparse.ArgumentParser(description=__doc__)
 8 | add_arg = functools.partial(add_arguments, argparser=parser)
 9 | add_arg('configs',              str,    'configs/cam++.yml',        '配置文件')
10 | add_arg('data_augment_configs', str,    'configs/augmentation.yml', '数据增强配置文件')
11 | add_arg("local_rank",           int,    0,                          '多卡训练需要的参数')
12 | add_arg("use_gpu",              bool,   True,                       '是否使用GPU训练')
13 | add_arg('save_model_path',      str,    'models/',                  '模型保存的路径')
14 | add_arg('log_dir',              str,    'log/',                     '保存VisualDL日志文件的路径')
15 | add_arg('resume_model',         str,    None,                       '恢复训练，当为None则不使用预训练模型')
16 | add_arg('pretrained_model',     str,    None,                       '预训练模型的路径，当为None则不使用预训练模型')
17 | add_arg('overwrites',           str,    None,    '覆盖配置文件中的参数，比如"train_conf.max_epoch=100"，多个用逗号隔开')
18 | args = parser.parse_args()
19 | print_arguments(args=args)
20 | 
21 | # 获取训练器
22 | trainer = MAClsTrainer(configs=args.configs,
23 |                        use_gpu=args.use_gpu,
24 |                        data_augment_configs=args.data_augment_configs,
25 |                        overwrites=args.overwrites)
26 | 
27 | trainer.train(save_model_path=args.save_model_path,
28 |               log_dir=args.log_dir,
29 |               resume_model=args.resume_model,
30 |               pretrained_model=args.pretrained_model)
31 | 


--------------------------------------------------------------------------------