├── .gitignore ├── LICENSE ├── README.md ├── audio_db ├── 李达康 │ └── 0.wav └── 沙瑞金 │ └── 0.wav ├── configs ├── augmentation.yml ├── cam++.yml ├── ecapa_tdnn.yml ├── eres2net.yml ├── res2net.yml ├── resnet_se.yml └── tdnn.yml ├── create_data.py ├── dataset ├── a_1.wav ├── a_2.wav ├── b_1.wav ├── b_2.wav └── test_long.wav ├── docs └── images │ ├── contrast.jpg │ ├── log.jpg │ ├── recognition.jpg │ ├── speaker_diarization.jpg │ └── speaker_diarization_gui.png ├── eval.py ├── extract_features.py ├── infer_contrast.py ├── infer_contrast_gui.py ├── infer_recognition.py ├── infer_recognition_gui.py ├── infer_speaker_diarization.py ├── infer_speaker_diarization_gui.py ├── ppvector ├── __init__.py ├── data_utils │ ├── __init__.py │ ├── collate_fn.py │ ├── featurizer.py │ ├── pk_sampler.py │ └── reader.py ├── infer_utils │ ├── __init__.py │ ├── player.py │ ├── speaker_diarization.py │ └── viewer.py ├── loss │ ├── __init__.py │ ├── aamloss.py │ ├── amloss.py │ ├── armloss.py │ ├── celoss.py │ ├── sphereface2.py │ ├── subcenterloss.py │ └── tripletangularmarginloss.py ├── metric │ ├── __init__.py │ └── metrics.py ├── models │ ├── __init__.py │ ├── campplus.py │ ├── ecapa_tdnn.py │ ├── eres2net.py │ ├── fc.py │ ├── pooling.py │ ├── res2net.py │ ├── resnet_se.py │ ├── tdnn.py │ └── utils.py ├── optimizer │ ├── __init__.py │ └── scheduler.py ├── predict.py ├── trainer.py └── utils │ ├── __init__.py │ ├── checkpoint.py │ ├── record.py │ └── utils.py ├── requirements.txt ├── setup.py ├── tools └── eval_speaker_diarization │ ├── README.md │ ├── compute_metrics.py │ ├── create_aishell4_test_rttm.py │ └── infer_data.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | build/ 4 | dist/ 5 | ppvector.egg-info/ 6 | log/ 7 | models/ 8 | test*.py 9 | dataset/ 10 | audio_db/ 11 | output/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /audio_db/李达康/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/audio_db/李达康/0.wav -------------------------------------------------------------------------------- /audio_db/沙瑞金/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/audio_db/沙瑞金/0.wav -------------------------------------------------------------------------------- /configs/augmentation.yml: -------------------------------------------------------------------------------- 1 | # 语速增强 2 | speed: 3 | # 增强概率 4 | prob: 1.0 5 | # 使用语速增强是否分类大小翻三倍 6 | speed_perturb_3_class: False 7 | 8 | # 音量增强 9 | volume: 10 | # 增强概率 11 | prob: 0.0 12 | # 最小增益 13 | min_gain_dBFS: -15 14 | # 最大增益 15 | max_gain_dBFS: 15 16 | 17 | # 噪声增强 18 | noise: 19 | # 增强概率 20 | prob: 0.5 21 | # 噪声增强的噪声文件夹 22 | noise_dir: 'dataset/noise' 23 | # 针对噪声的最小音量增益 24 | min_snr_dB: 10 25 | # 针对噪声的最大音量增益 26 | max_snr_dB: 50 27 | 28 | # 混响增强 29 | reverb: 30 | # 增强概率 31 | prob: 0.5 32 | # 混响增强的混响文件夹 33 | reverb_dir: 'dataset/reverb' 34 | 35 | # Spec增强 36 | spec_aug: 37 | # 增强概率 38 | prob: 0.5 39 | # 频域掩蔽的比例 40 | freq_mask_ratio: 0.1 41 | # 频域掩蔽次数 42 | n_freq_masks: 1 43 | # 频域掩蔽的比例 44 | time_mask_ratio: 0.05 45 | # 频域掩蔽次数 46 | n_time_masks: 1 47 | # 最大时间扭曲 48 | max_time_warp: 0 49 | -------------------------------------------------------------------------------- /configs/cam++.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'CAMPPlus' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 分类器参数 57 | classifier: 58 | # 分类器类型,支持Cosine、Linear 59 | classifier_type: 'Cosine' 60 | # 说话人数量,即分类大小 61 | num_speakers: 2796 62 | num_blocks: 0 63 | 64 | loss_conf: 65 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 66 | loss: 'AAMLoss' 67 | # 损失函数参数 68 | loss_args: 69 | margin: 0.2 70 | scale: 32 71 | easy_margin: False 72 | label_smoothing: 0.0 73 | # 是否使用损失函数margin调度器 74 | use_margin_scheduler: True 75 | # margin调度器参数 76 | margin_scheduler_args: 77 | 
initial_margin: 0.0 78 | final_margin: 0.3 79 | 80 | optimizer_conf: 81 | # 优化方法 82 | optimizer: 'Adam' 83 | # 优化方法参数 84 | optimizer_args: 85 | weight_decay: !!float 1e-6 86 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 87 | scheduler: 'WarmupCosineSchedulerLR' 88 | # 学习率衰减函数参数 89 | scheduler_args: 90 | # 学习率的大小 91 | learning_rate: 0.001 92 | min_lr: !!float 1e-5 93 | warmup_epoch: 5 94 | 95 | train_conf: 96 | # 是否开启自动混合精度 97 | enable_amp: False 98 | # 训练的轮数 99 | max_epoch: 60 100 | log_interval: 10 101 | -------------------------------------------------------------------------------- /configs/ecapa_tdnn.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'EcapaTdnn' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | channels: [512, 512, 512, 512, 1536] 59 | # 分类器参数 60 | classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /configs/eres2net.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 
训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型,支持ERes2Net、ERes2NetV2 52 | model: 'ERes2Net' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | m_channels: 32 57 | # 分类器参数 58 | classifier: 59 | # 分类器类型,支持Cosine、Linear 60 | classifier_type: 'Cosine' 61 | # 说话人数量,即分类大小 62 | num_speakers: 2796 63 | num_blocks: 0 64 | loss_conf: 65 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 66 | loss: 'AAMLoss' 67 | # 损失函数参数 68 | loss_args: 69 | margin: 0.2 70 | scale: 32 71 | easy_margin: False 72 | label_smoothing: 0.0 73 | # 是否使用损失函数margin调度器 74 | use_margin_scheduler: True 75 | # margin调度器参数 76 | margin_scheduler_args: 77 | initial_margin: 0.0 78 | final_margin: 0.3 79 | 80 | optimizer_conf: 81 | # 优化方法 82 | optimizer: 'Adam' 83 | # 优化方法参数 84 | optimizer_args: 85 | weight_decay: !!float 1e-6 86 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 87 | scheduler: 'WarmupCosineSchedulerLR' 88 | # 学习率衰减函数参数 89 | scheduler_args: 90 | # 学习率的大小 91 | learning_rate: 0.001 92 | min_lr: !!float 1e-5 93 | warmup_epoch: 5 94 | 95 | train_conf: 96 | # 是否开启自动混合精度 97 | enable_amp: False 98 | # 训练的轮数 99 | max_epoch: 60 100 | log_interval: 10 101 | -------------------------------------------------------------------------------- /configs/res2net.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'Res2Net' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | m_channels: 32 59 | # 分类器参数 60 | 
classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /configs/resnet_se.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'ResNetSE' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | # 分类器参数 59 | classifier: 60 | # 分类器类型,支持Cosine、Linear 61 | classifier_type: 'Cosine' 62 | # 说话人数量,即分类大小 63 | num_speakers: 2796 64 | num_blocks: 0 65 | 66 | loss_conf: 67 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 68 | loss: 'AAMLoss' 69 | # 损失函数参数 70 | loss_args: 71 | margin: 0.2 72 | scale: 32 73 | easy_margin: False 74 | label_smoothing: 0.0 75 | # 是否使用损失函数margin调度器 76 | use_margin_scheduler: True 77 | # margin调度器参数 78 | margin_scheduler_args: 79 | initial_margin: 0.0 80 | final_margin: 0.3 81 | 82 | optimizer_conf: 83 | # 优化方法 84 | optimizer: 'Adam' 85 | # 优化方法参数 86 | optimizer_args: 87 | weight_decay: !!float 1e-6 88 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 89 | scheduler: 'WarmupCosineSchedulerLR' 90 | # 学习率衰减函数参数 91 | scheduler_args: 92 | # 学习率的大小 93 | learning_rate: 0.001 94 | min_lr: !!float 1e-5 95 | warmup_epoch: 5 96 | 97 | train_conf: 98 | # 是否开启自动混合精度 99 | enable_amp: False 100 | # 训练的轮数 101 | max_epoch: 60 102 | log_interval: 10 103 | 
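# Illustrative sketch (not part of any config file above): every YAML config in this
# directory is consumed the same way — it is handed to PPVectorTrainer, and individual
# keys can be overridden at run time through the `overwrites` string instead of editing
# the file (eval.py further down exposes exactly this mechanism, with the documented
# example "train_conf.max_epoch=100"). The extra log_interval override below follows the
# same pattern but is an assumption; model and image paths mirror the eval.py defaults.
from ppvector.trainer import PPVectorTrainer

trainer = PPVectorTrainer(configs='configs/cam++.yml',
                          use_gpu=True,
                          overwrites='train_conf.max_epoch=100,train_conf.log_interval=20')
# Evaluate a trained model with the (possibly overridden) configuration, as eval.py does.
eer, min_dcf, threshold = trainer.evaluate(resume_model='models/CAMPPlus_Fbank/best_model/',
                                           save_image_path='output/images/')
print(f'threshold: {threshold:.2f}, EER: {eer:.5f}, MinDCF: {min_dcf:.5f}')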
-------------------------------------------------------------------------------- /configs/tdnn.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'TDNN' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | channels: 512 57 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 58 | pooling_type: 'ASP' 59 | # 分类器参数 60 | classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /create_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # 制作CN-Celeb数据集列表 5 | # 下载地址:https://openslr.trmal.net/resources/82/cn-celeb_v2.tar.gz 6 | # 下载并解压到dataset目录,解压命令:tar -zxvf cn-celeb_v2.tar.gz 7 | def create_cn_celeb(list_path, data_path='dataset/'): 8 | f_train = open(list_path, 'w', encoding='utf-8') 9 | data_dir = os.path.join(data_path, 'CN-Celeb_flac/data/') 10 | dirs = sorted(os.listdir(data_dir)) 11 | for label, d in enumerate(dirs): 12 | # 跳过测试集 13 | if label >= 800:continue 14 | for file in os.listdir(os.path.join(data_dir, d)): 15 | sound_path = os.path.join(data_dir, d, file).replace('\\', '/') 16 | f_train.write(f'{sound_path}\t{label}\n') 17 | f_train.close() 18 | 19 | 20 | # 制作CN-Celeb2数据集列表,如果是Windows,请跳过这个数据集 21 | # 下载分包1地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzaa 22 | # 
下载分包2地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzab 23 | # 下载分包3地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzac 24 | # 下载并解压到dataset目录,合并压缩包命令:cat cn-celeb2_v2.tar.gza* > cn-celeb2_v2.tar.gz,解压命令:tar -zxvf cn-celeb2_v2.tar.gz 25 | def create_cn_celeb2(list_path, data_path='dataset/'): 26 | data_dir = os.path.join(data_path, 'CN-Celeb2_flac/data/') 27 | if not os.path.exists(data_dir): 28 | print('CN-Celeb2数据集不存在,请先下载并解压到dataset目录,目前忽略,你也可继续正常训练') 29 | return 30 | f_train = open(list_path, 'a', encoding='utf-8') 31 | dirs = sorted(os.listdir(data_dir)) 32 | last_label = 800 33 | for label, d in enumerate(dirs): 34 | for file in os.listdir(os.path.join(data_dir, d)): 35 | sound_path = os.path.join(data_dir, d, file).replace('\\', '/') 36 | f_train.write(f'{sound_path}\t{label + last_label}\n') 37 | f_train.close() 38 | 39 | 40 | if __name__ == '__main__': 41 | create_cn_celeb(list_path='dataset/train_list.txt', data_path='dataset') 42 | create_cn_celeb2(list_path='dataset/train_list.txt', data_path='dataset') 43 | -------------------------------------------------------------------------------- /dataset/a_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/a_1.wav -------------------------------------------------------------------------------- /dataset/a_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/a_2.wav -------------------------------------------------------------------------------- /dataset/b_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/b_1.wav -------------------------------------------------------------------------------- /dataset/b_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/b_2.wav -------------------------------------------------------------------------------- /dataset/test_long.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/test_long.wav -------------------------------------------------------------------------------- /docs/images/contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/contrast.jpg -------------------------------------------------------------------------------- /docs/images/log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/log.jpg -------------------------------------------------------------------------------- /docs/images/recognition.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/recognition.jpg -------------------------------------------------------------------------------- /docs/images/speaker_diarization.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/speaker_diarization.jpg -------------------------------------------------------------------------------- /docs/images/speaker_diarization_gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/speaker_diarization_gui.png -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import time 4 | 5 | from ppvector.trainer import PPVectorTrainer 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 'configs/cam++.yml', "配置文件") 11 | add_arg("use_gpu", bool, True, "是否使用GPU评估模型") 12 | add_arg('save_image_path', str, 'output/images/', "保存结果图的路径") 13 | add_arg('resume_model', str, 'models/CAMPPlus_Fbank/best_model/', "模型的路径") 14 | add_arg('overwrites', str, None, '覆盖配置文件中的参数,比如"train_conf.max_epoch=100",多个用逗号隔开') 15 | args = parser.parse_args() 16 | print_arguments(args=args) 17 | 18 | # 获取训练器 19 | trainer = PPVectorTrainer(configs=args.configs, use_gpu=args.use_gpu, overwrites=args.overwrites) 20 | 21 | # 开始评估 22 | start = time.time() 23 | eer, min_dcf, threshold = trainer.evaluate(resume_model=args.resume_model, save_image_path=args.save_image_path) 24 | end = time.time() 25 | print('评估消耗时间:{}s,threshold:{:.2f},EER: {:.5f}, MinDCF: {:.5f}' 26 | .format(int(end - start), threshold, eer, min_dcf)) 27 | -------------------------------------------------------------------------------- /extract_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.trainer import PPVectorTrainer 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('save_dir', str, 'dataset/features', '保存特征的路径') 11 | add_arg('max_duration', int, 100, '提取特征的最大时长,单位秒') 12 | args = parser.parse_args() 13 | print_arguments(args=args) 14 | 15 | # 获取训练器 16 | trainer = PPVectorTrainer(configs=args.configs) 17 | 18 | # 提取特征保存文件 19 | trainer.extract_features(save_dir=args.save_dir, max_duration=args.max_duration) 20 | -------------------------------------------------------------------------------- /infer_contrast.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.predict import PPVectorPredictor 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, 
argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 11 | add_arg('audio_path1', str, 'dataset/a_1.wav', '预测第一个音频') 12 | add_arg('audio_path2', str, 'dataset/b_2.wav', '预测第二个音频') 13 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 14 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 15 | args = parser.parse_args() 16 | print_arguments(args=args) 17 | 18 | # 获取识别器 19 | predictor = PPVectorPredictor(configs=args.configs, 20 | model_path=args.model_path, 21 | use_gpu=args.use_gpu) 22 | 23 | dist = predictor.contrast(args.audio_path1, args.audio_path2) 24 | if dist > args.threshold: 25 | print(f"{args.audio_path1} 和 {args.audio_path2} 为同一个人,相似度为:{dist}") 26 | else: 27 | print(f"{args.audio_path1} 和 {args.audio_path2} 不是同一个人,相似度为:{dist}") 28 | -------------------------------------------------------------------------------- /infer_contrast_gui.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import tkinter as tk 3 | from tkinter import ttk, filedialog, messagebox 4 | import functools 5 | import argparse 6 | import threading 7 | import time 8 | from pathlib import Path 9 | 10 | from ppvector.predict import PPVectorPredictor 11 | from ppvector.utils.utils import add_arguments, print_arguments 12 | 13 | parser = argparse.ArgumentParser(description=__doc__) 14 | add_arg = functools.partial(add_arguments, argparser=parser) 15 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 16 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 17 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 18 | args = parser.parse_args() 19 | print_arguments(args=args) 20 | 21 | 22 | class VoiceContrastGUI: 23 | def __init__(self, master): 24 | self.master = master 25 | master.title("夜雨飘零声纹对比系统") 26 | master.geometry('700x670') 27 | master.resizable(True, True) 28 | master.configure(bg='#f0f0f0') 29 | 30 | # 使用ttk样式 31 | self.style = ttk.Style() 32 | self.style.theme_use('clam') 33 | 34 | # 配置样式 35 | self.style.configure('TButton', font=('微软雅黑', 10), padding=5) 36 | self.style.configure('TLabel', font=('微软雅黑', 10), background='#f0f0f0') 37 | self.style.configure('Header.TLabel', font=('微软雅黑', 14, 'bold'), background='#f0f0f0') 38 | self.style.configure('Result.TLabel', font=('微软雅黑', 16, 'bold'), foreground='#007bff', background='#f0f0f0') 39 | self.style.configure("Green.Horizontal.TProgressbar", background='#4CAF50', troughcolor='#f0f0f0', 40 | borderwidth=0, thickness=20) 41 | 42 | # 创建主框架 43 | self.main_frame = ttk.Frame(master, padding="20 20 20 20") 44 | self.main_frame.pack(fill=tk.BOTH, expand=True) 45 | 46 | # 创建标题 47 | self.title_label = ttk.Label(self.main_frame, text="声纹对比系统", style='Header.TLabel') 48 | self.title_label.grid(row=0, column=0, columnspan=4, pady=(0, 20)) 49 | 50 | # 音频选择区域 51 | self.audio_frame = ttk.LabelFrame(self.main_frame, text="音频选择", padding="10 10 10 10") 52 | self.audio_frame.grid(row=1, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 53 | 54 | # 音频1选择 55 | self.label1 = ttk.Label(self.audio_frame, text="音频文件1:") 56 | self.label1.grid(row=0, column=0, padx=10, pady=10, sticky="w") 57 | 58 | self.audio1_path = tk.StringVar() 59 | self.entry_audio1 = ttk.Entry(self.audio_frame, width=50, textvariable=self.audio1_path) 60 | self.entry_audio1.grid(row=0, column=1, padx=10, pady=10, sticky="ew") 61 | 62 | self.btn_audio1 = ttk.Button(self.audio_frame, text="选择文件", command=self.select_audio1) 63 | 
self.btn_audio1.grid(row=0, column=2, padx=10, pady=10) 64 | 65 | # 音频2选择 66 | self.label2 = ttk.Label(self.audio_frame, text="音频文件2:") 67 | self.label2.grid(row=1, column=0, padx=10, pady=10, sticky="w") 68 | 69 | self.audio2_path = tk.StringVar() 70 | self.entry_audio2 = ttk.Entry(self.audio_frame, width=50, textvariable=self.audio2_path) 71 | self.entry_audio2.grid(row=1, column=1, padx=10, pady=10, sticky="ew") 72 | 73 | self.btn_audio2 = ttk.Button(self.audio_frame, text="选择文件", command=self.select_audio2) 74 | self.btn_audio2.grid(row=1, column=2, padx=10, pady=10) 75 | 76 | # 设置列的权重 77 | self.audio_frame.columnconfigure(1, weight=1) 78 | 79 | # 参数设置区域 80 | self.settings_frame = ttk.LabelFrame(self.main_frame, text="参数设置", padding="10 10 10 10") 81 | self.settings_frame.grid(row=2, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 82 | 83 | # 判断阈值 84 | self.label3 = ttk.Label(self.settings_frame, text="对比阈值:") 85 | self.label3.grid(row=0, column=0, padx=10, pady=10, sticky="w") 86 | 87 | self.threshold = tk.StringVar(value="0.6") 88 | self.entry_threshold = ttk.Entry(self.settings_frame, width=10, textvariable=self.threshold) 89 | self.entry_threshold.grid(row=0, column=1, padx=10, pady=10, sticky="w") 90 | 91 | self.threshold_info = ttk.Label(self.settings_frame, text="(取值范围0-1,越大表示要求越严格)") 92 | self.threshold_info.grid(row=0, column=2, padx=10, pady=10, sticky="w") 93 | 94 | # 操作按钮区域 95 | self.button_frame = ttk.Frame(self.main_frame) 96 | self.button_frame.grid(row=3, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 97 | 98 | self.btn_predict = ttk.Button(self.button_frame, text="开始对比", command=self.predict_thread) 99 | self.btn_predict.pack(side=tk.LEFT, padx=10) 100 | 101 | self.btn_clear = ttk.Button(self.button_frame, text="清空", command=self.clear) 102 | self.btn_clear.pack(side=tk.LEFT, padx=10) 103 | 104 | self.btn_quit = ttk.Button(self.button_frame, text="退出", command=self.quit) 105 | self.btn_quit.pack(side=tk.RIGHT, padx=10) 106 | 107 | # 状态区域 108 | self.status_frame = ttk.LabelFrame(self.main_frame, text="状态", padding="10 10 10 10") 109 | self.status_frame.grid(row=4, column=0, columnspan=4, sticky="ew", pady=(0, 10)) 110 | 111 | # 进度条 112 | self.progress_var = tk.DoubleVar() 113 | self.progress_bar = ttk.Progressbar(self.status_frame, orient="horizontal", 114 | mode="determinate", variable=self.progress_var, 115 | style="Green.Horizontal.TProgressbar") 116 | self.progress_bar.pack(fill=tk.X, padx=5, pady=5) 117 | 118 | # 结果显示区域 119 | self.result_frame = ttk.LabelFrame(self.main_frame, text="对比结果", padding="10 10 10 10") 120 | self.result_frame.grid(row=5, column=0, columnspan=4, sticky="ew") 121 | 122 | self.result_label = ttk.Label(self.result_frame, text="请选择两个音频文件进行对比", 123 | style='Result.TLabel', anchor=tk.CENTER) 124 | self.result_label.pack(fill=tk.X, pady=10) 125 | 126 | # 结果详情 127 | self.detail_frame = ttk.Frame(self.result_frame) 128 | self.detail_frame.pack(fill=tk.X, pady=5) 129 | 130 | self.similarity_label = ttk.Label(self.detail_frame, text="相似度: ") 131 | self.similarity_label.pack(side=tk.LEFT, padx=10) 132 | 133 | self.similarity_value = ttk.Label(self.detail_frame, text="--", font=('微软雅黑', 12, 'bold')) 134 | self.similarity_value.pack(side=tk.LEFT) 135 | 136 | # 设置列的权重 137 | for i in range(4): 138 | self.main_frame.columnconfigure(i, weight=1) 139 | 140 | # 预测器 141 | self.predictor = PPVectorPredictor(configs=args.configs, model_path=args.model_path, use_gpu=args.use_gpu) 142 | self.is_predicting = False 143 | 144 | def select_audio1(self): 145 | 
filename = filedialog.askopenfilename(initialdir='./dataset', 146 | filetypes=[("音频文件", "*.wav *.mp3 *.flac *.ogg *.m4a"), 147 | ("所有文件", "*.*")]) 148 | if filename: 149 | self.audio1_path.set(filename) 150 | 151 | def select_audio2(self): 152 | filename = filedialog.askopenfilename(initialdir='./dataset', 153 | filetypes=[("音频文件", "*.wav *.mp3 *.flac *.ogg *.m4a"), 154 | ("所有文件", "*.*")]) 155 | if filename: 156 | self.audio2_path.set(filename) 157 | 158 | def predict_thread(self): 159 | """在线程中执行预测""" 160 | if self.is_predicting: 161 | messagebox.showinfo("提示", "正在处理中,请稍候...") 162 | return 163 | 164 | audio_path1 = self.audio1_path.get() 165 | audio_path2 = self.audio2_path.get() 166 | 167 | if not audio_path1 or not audio_path2: 168 | messagebox.showerror("错误", "请选择两个音频文件") 169 | return 170 | 171 | try: 172 | threshold = float(self.threshold.get()) 173 | if threshold < 0 or threshold > 1: 174 | messagebox.showerror("错误", "阈值必须在0-1之间") 175 | return 176 | except ValueError: 177 | messagebox.showerror("错误", "请输入有效的阈值") 178 | return 179 | 180 | self.is_predicting = True 181 | self.btn_predict.config(state=tk.DISABLED) 182 | self.result_label.config(text="正在处理...") 183 | self.similarity_value.config(text="--") 184 | self.update_progress_bar(0) 185 | 186 | # 启动线程进行预测 187 | threading.Thread(target=self._predict, args=(audio_path1, audio_path2, threshold)).start() 188 | 189 | def _predict(self, audio_path1, audio_path2, threshold): 190 | """执行预测""" 191 | try: 192 | # 模拟进度 193 | for i in range(1, 101): 194 | if i < 90: # 预留最后10%用于实际计算结果 195 | self.update_progress_bar(i) 196 | time.sleep(0.02) # 调整速度使进度条看起来更自然 197 | 198 | # 执行实际预测 199 | dist = self.predictor.contrast(audio_path1, audio_path2) 200 | 201 | # 完成进度 202 | self.update_progress_bar(100) 203 | 204 | # 更新UI显示结果 205 | self.similarity_value.config(text=f"{dist:.5f}") 206 | 207 | if dist > threshold: 208 | result_text = f"两段语音来自同一个人" 209 | self.result_label.config(text=result_text, foreground="#4CAF50") 210 | else: 211 | result_text = f"两段语音来自不同的人" 212 | self.result_label.config(text=result_text, foreground="#F44336") 213 | 214 | except Exception as e: 215 | messagebox.showerror("错误", f"预测失败: {str(e)}") 216 | self.result_label.config(text="预测失败,请检查音频文件格式", foreground="#F44336") 217 | 218 | finally: 219 | self.btn_predict.config(state=tk.NORMAL) 220 | self.is_predicting = False 221 | 222 | def clear(self): 223 | """清空所有输入和结果""" 224 | self.audio1_path.set("") 225 | self.audio2_path.set("") 226 | self.threshold.set("0.6") 227 | self.result_label.config(text="请选择两个音频文件进行对比", foreground="#007bff") 228 | self.similarity_value.config(text="--") 229 | self.update_progress_bar(0) 230 | 231 | def update_progress_bar(self, value): 232 | """更新进度条""" 233 | self.progress_var.set(value) 234 | self.master.update_idletasks() 235 | 236 | def quit(self): 237 | self.master.destroy() 238 | 239 | 240 | if __name__ == '__main__': 241 | root = tk.Tk() 242 | app = VoiceContrastGUI(root) 243 | root.mainloop() 244 | -------------------------------------------------------------------------------- /infer_recognition.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.predict import PPVectorPredictor 5 | from ppvector.utils.record import RecordAudio 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 
'configs/cam++.yml', '配置文件') 11 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 12 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 13 | add_arg('record_seconds', int, 3, '录音长度') 14 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 15 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 16 | args = parser.parse_args() 17 | print_arguments(args=args) 18 | 19 | # 获取识别器 20 | predictor = PPVectorPredictor(configs=args.configs, 21 | threshold=args.threshold, 22 | audio_db_path=args.audio_db_path, 23 | model_path=args.model_path, 24 | use_gpu=args.use_gpu) 25 | # 获取录音器 26 | record_audio = RecordAudio() 27 | 28 | while True: 29 | select_fun = int(input("请选择功能,0为注册音频到声纹库,1为执行声纹识别,2为删除用户:")) 30 | if select_fun == 0: 31 | input(f"按下回车键开机录音,录音{args.record_seconds}秒中:") 32 | audio_data = record_audio.record(record_seconds=args.record_seconds) 33 | name = input("请输入该音频用户的名称:") 34 | if name == '': continue 35 | predictor.register(user_name=name, audio_data=audio_data, sample_rate=record_audio.sample_rate) 36 | elif select_fun == 1: 37 | input(f"按下回车键开机录音,录音{args.record_seconds}秒中:") 38 | audio_data = record_audio.record(record_seconds=args.record_seconds) 39 | name, score = predictor.recognition(audio_data, sample_rate=record_audio.sample_rate) 40 | if name: 41 | print(f"识别说话的为:{name},得分:{score}") 42 | else: 43 | print(f"没有识别到说话人,可能是没注册。") 44 | elif select_fun == 2: 45 | name = input("请输入该音频用户的名称:") 46 | if name == '': continue 47 | predictor.remove_user(user_name=name) 48 | else: 49 | print('请正确选择功能') 50 | -------------------------------------------------------------------------------- /infer_speaker_diarization.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from ppvector.predict import PPVectorPredictor 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 11 | add_arg('audio_path', str, 'dataset/test_long.wav', '预测音频路径') 12 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 13 | add_arg('speaker_num', int, None, '说话人数量,提供说话人数量可以提高准确率') 14 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 15 | add_arg('show_plot', bool, True, '是否显示结果图像') 16 | add_arg('search_audio_db', bool, True, '是否在音频库中搜索对应的说话人') 17 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 18 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 19 | args = parser.parse_args() 20 | print_arguments(args=args) 21 | 22 | if args.search_audio_db: 23 | assert args.audio_db_path is not None, "请指定音频库的路径" 24 | 25 | # 获取识别器 26 | predictor = PPVectorPredictor(configs=args.configs, 27 | model_path=args.model_path, 28 | threshold=args.threshold, 29 | audio_db_path=args.audio_db_path, 30 | use_gpu=args.use_gpu) 31 | 32 | # 进行说话人日志识别 33 | results = predictor.speaker_diarization(args.audio_path, 34 | speaker_num=args.speaker_num, 35 | search_audio_db=args.search_audio_db) 36 | print(f"识别结果:") 37 | for result in results: 38 | print(result) 39 | 40 | # 绘制结果图像 41 | if args.show_plot: 42 | from ppvector.infer_utils.viewer import PlotSpeaker 43 | plot_speaker = PlotSpeaker(results, audio_path=args.audio_path) 44 | os.makedirs('output', exist_ok=True) 45 | plot_speaker.draw('output/speaker_diarization.png') 46 | plot_speaker.plot.show() 47 | 
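# Illustrative sketch (not a file from this repository): the same PPVectorPredictor can be
# driven programmatically instead of through the CLI scripts above. Only methods and
# arguments that already appear in infer_contrast.py and infer_speaker_diarization.py are
# used here; the file paths and the 0.6 threshold simply mirror those scripts' defaults.
from ppvector.predict import PPVectorPredictor

predictor = PPVectorPredictor(configs='configs/cam++.yml',
                              model_path='models/CAMPPlus_Fbank/best_model/',
                              threshold=0.6,
                              audio_db_path='audio_db/',
                              use_gpu=True)

# 1:1 verification: similarity between two utterances (cf. infer_contrast.py).
dist = predictor.contrast('dataset/a_1.wav', 'dataset/a_2.wav')
print(f'similarity: {dist:.5f}, same speaker: {dist > 0.6}')

# Speaker diarization on a long recording, with each segment matched against audio_db/
# (cf. infer_speaker_diarization.py); every item in results describes one speaker segment.
results = predictor.speaker_diarization('dataset/test_long.wav',
                                        speaker_num=None,
                                        search_audio_db=True)
for result in results:
    print(result)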
-------------------------------------------------------------------------------- /infer_speaker_diarization_gui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os.path 4 | import threading 5 | import tkinter as tk 6 | from tkinter import filedialog 7 | 8 | from ppvector.predict import PPVectorPredictor 9 | from ppvector.utils.utils import add_arguments, print_arguments 10 | 11 | parser = argparse.ArgumentParser(description=__doc__) 12 | add_arg = functools.partial(add_arguments, argparser=parser) 13 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 14 | add_arg('audio_path', str, 'dataset/test_long.wav', '预测音频路径') 15 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 16 | add_arg('speaker_num', int, None, '说话人数量,提供说话人数量可以提高准确率') 17 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 18 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 19 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 20 | args = parser.parse_args() 21 | print_arguments(args=args) 22 | 23 | 24 | class SpeakerDiarizationGUI: 25 | def __init__(self, window): 26 | self.window = window 27 | window.title("夜雨飘零说话人日志") 28 | self.plot_speaker = None 29 | self.show_plot = True 30 | self.search_audio_db = True 31 | # 添加组件 32 | self.label1 = tk.Label(window, text="音频路径:") 33 | self.label1.grid(row=0, column=0, padx=10, pady=10) 34 | self.entry_audio1 = tk.Entry(window, width=60) 35 | self.entry_audio1.grid(row=0, column=1, columnspan=2, padx=10, pady=10) 36 | self.btn_audio1 = tk.Button(window, text="选择", command=self.select_audio) 37 | self.btn_audio1.grid(row=0, column=3, padx=10, pady=10) 38 | self.btn_predict = tk.Button(window, text="开始识别", command=self.predict) 39 | self.btn_predict.grid(row=0, column=4, padx=10, pady=10) 40 | self.an_frame = tk.Frame(window) 41 | self.check_var = tk.BooleanVar(value=False) 42 | self.is_show_check = tk.Checkbutton(self.an_frame, text='是否显示结果图', variable=self.check_var, command=self.is_show_state) 43 | self.is_show_check.grid(row=0) 44 | self.is_show_check.select() 45 | self.an_frame.grid(row=1) 46 | self.an_frame.grid(row=2, column=1, padx=10) 47 | self.an_frame1 = tk.Frame(window) 48 | self.check_var1 = tk.BooleanVar(value=False) 49 | self.is_search_check = tk.Checkbutton(self.an_frame1, text='是否检索数据库', variable=self.check_var1, command=self.is_search_state) 50 | self.is_search_check.grid(row=0) 51 | self.is_search_check.select() 52 | self.an_frame1.grid(row=1) 53 | self.an_frame1.grid(row=2, column=2, padx=10) 54 | # 输出结果文本框 55 | self.result_label = tk.Label(self.window, text="输出结果:") 56 | self.result_label.grid(row=3, column=0, padx=10, pady=10) 57 | self.result_text = tk.Text(self.window, width=60, height=20) 58 | self.result_text.grid(row=3, column=1, columnspan=2, padx=10, pady=10) 59 | 60 | # 预测器 61 | self.predictor = PPVectorPredictor(configs=args.configs, 62 | model_path=args.model_path, 63 | threshold=args.threshold, 64 | audio_db_path=args.audio_db_path, 65 | use_gpu=args.use_gpu) 66 | 67 | def is_show_state(self): 68 | self.show_plot = self.check_var.get() 69 | 70 | def is_search_state(self): 71 | self.search_audio_db = self.check_var1.get() 72 | 73 | def select_audio(self): 74 | filename = filedialog.askopenfilename(initialdir='./dataset') 75 | self.entry_audio1.delete(0, tk.END) 76 | self.entry_audio1.insert(tk.END, filename) 77 | 78 | def predict(self): 79 | if self.plot_speaker: 80 | self.plot_speaker.plot.close() 81 | self.plot_speaker = None 82 | 
audio_path = self.entry_audio1.get() 83 | if audio_path is None or len(audio_path) == 0: return 84 | print(f'选择音频路径:{audio_path}') 85 | # 进行说话人日志识别 86 | results = self.predictor.speaker_diarization(audio_path, 87 | speaker_num=args.speaker_num, 88 | search_audio_db=self.search_audio_db) 89 | self.result_text.delete('1.0', 'end') 90 | for result in results: 91 | self.result_text.insert(tk.END, f"{result}\n") 92 | 93 | if self.show_plot: 94 | threading.Thread(target=self.show_result(results), args=(results,)).start() 95 | 96 | def show_result(self, results): 97 | from ppvector.infer_utils.viewer import PlotSpeaker 98 | self.plot_speaker = PlotSpeaker(results, audio_path=args.audio_path) 99 | os.makedirs('output', exist_ok=True) 100 | self.plot_speaker.draw('output/speaker_diarization.png') 101 | self.plot_speaker.plot.show() 102 | self.plot_speaker = None 103 | 104 | 105 | if __name__ == '__main__': 106 | root = tk.Tk() 107 | app = SpeakerDiarizationGUI(root) 108 | root.mainloop() 109 | -------------------------------------------------------------------------------- /ppvector/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.5" 2 | -------------------------------------------------------------------------------- /ppvector/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/data_utils/__init__.py -------------------------------------------------------------------------------- /ppvector/data_utils/collate_fn.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | 3 | 4 | # 对一个batch的数据处理 5 | def collate_fn(batch): 6 | # 找出音频长度最长的 7 | batch_sorted = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True) 8 | freq_size = batch_sorted[0][0].shape[1] 9 | max_freq_length = batch_sorted[0][0].shape[0] 10 | batch_size = len(batch_sorted) 11 | # 以最大的长度创建0张量 12 | features = paddle.zeros((batch_size, max_freq_length, freq_size), dtype=paddle.float32) 13 | input_lens, labels = [], [] 14 | for x in range(batch_size): 15 | tensor, label = batch[x] 16 | seq_length = tensor.shape[0] 17 | # 将数据插入都0张量中,实现了padding 18 | features[x, :seq_length, :] = tensor[:, :] 19 | labels.append(label) 20 | input_lens.append(seq_length) 21 | labels = paddle.to_tensor(labels, dtype=paddle.int64) 22 | input_lens = paddle.to_tensor(input_lens, dtype=paddle.int64) 23 | return features, labels, input_lens 24 | -------------------------------------------------------------------------------- /ppvector/data_utils/featurizer.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | from paddle import nn 3 | import paddleaudio.compliance.kaldi as Kaldi 4 | from paddle.audio.features import LogMelSpectrogram, MelSpectrogram, Spectrogram, MFCC 5 | 6 | 7 | class AudioFeaturizer(nn.Layer): 8 | """音频特征器 9 | 10 | :param feature_method: 所使用的预处理方法 11 | :type feature_method: str 12 | :param method_args: 预处理方法的参数 13 | :type method_args: dict 14 | """ 15 | 16 | def __init__(self, feature_method='MelSpectrogram', method_args={}): 17 | super().__init__() 18 | self._method_args = method_args 19 | self._feature_method = feature_method 20 | if feature_method == 'LogMelSpectrogram': 21 | self.feat_fun = LogMelSpectrogram(**method_args) 22 | elif feature_method == 'MelSpectrogram': 23 | self.feat_fun 
= MelSpectrogram(**method_args) 24 | elif feature_method == 'Spectrogram': 25 | self.feat_fun = Spectrogram(**method_args) 26 | elif feature_method == 'MFCC': 27 | self.feat_fun = MFCC(**method_args) 28 | elif feature_method == 'Fbank': 29 | self.feat_fun = KaldiFbank(**method_args) 30 | else: 31 | raise Exception(f'预处理方法 {self._feature_method} 不存在!') 32 | 33 | def forward(self, waveforms, input_lens_ratio=None): 34 | """从AudioSegment中提取音频特征 35 | 36 | :param waveforms: Audio segment to extract features from. 37 | :type waveforms: AudioSegment 38 | :param input_lens_ratio: input length ratio 39 | :type input_lens_ratio: tensor 40 | :return: Spectrogram audio feature in 2darray. 41 | :rtype: ndarray 42 | """ 43 | if len(waveforms.shape) == 1: 44 | waveforms = waveforms.unsqueeze(0) 45 | feature = self.feat_fun(waveforms) 46 | feature = feature.transpose([0, 2, 1]) 47 | # 归一化 48 | feature = feature - feature.mean(1, keepdim=True) 49 | if input_lens_ratio is not None: 50 | # 对掩码比例进行扩展 51 | input_lens = (input_lens_ratio * feature.shape[1]).astype(paddle.int32) 52 | mask_lens = input_lens.unsqueeze(1) 53 | # 生成掩码张量 54 | idxs = paddle.arange(feature.shape[1]) 55 | idxs = idxs.tile([feature.shape[0], 1]) 56 | mask = idxs < mask_lens 57 | mask = mask.unsqueeze(-1) 58 | # 对特征进行掩码操作 59 | feature = paddle.where(mask, feature, paddle.zeros_like(feature)) 60 | return feature 61 | 62 | @property 63 | def feature_dim(self): 64 | """返回特征大小 65 | 66 | :return: 特征大小 67 | :rtype: int 68 | """ 69 | if self._feature_method == 'LogMelSpectrogram': 70 | return self._method_args.get('n_mels', 128) 71 | elif self._feature_method == 'MelSpectrogram': 72 | return self._method_args.get('n_mels', 64) 73 | elif self._feature_method == 'Spectrogram': 74 | return self._method_args.get('n_fft', 512) // 2 + 1 75 | elif self._feature_method == 'MFCC': 76 | return self._method_args.get('n_mfcc', 40) 77 | elif self._feature_method == 'Fbank': 78 | return self._method_args.get('n_mels', 23) 79 | else: 80 | raise Exception('没有{}预处理方法'.format(self._feature_method)) 81 | 82 | 83 | class KaldiFbank(nn.Layer): 84 | def __init__(self, **kwargs): 85 | super(KaldiFbank, self).__init__() 86 | self.kwargs = kwargs 87 | 88 | def forward(self, waveforms): 89 | """ 90 | :param waveforms: [Batch, Length] 91 | :return: [Batch, Length, Feature] 92 | """ 93 | log_fbanks = [] 94 | for waveform in waveforms: 95 | if len(waveform.shape) == 1: 96 | waveform = waveform.unsqueeze(0) 97 | log_fbank = Kaldi.fbank(waveform, **self.kwargs) 98 | log_fbank = log_fbank.transpose((1, 0)) 99 | log_fbanks.append(log_fbank) 100 | log_fbank = paddle.stack(log_fbanks) 101 | return log_fbank 102 | -------------------------------------------------------------------------------- /ppvector/data_utils/pk_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | import paddle.distributed as dist 5 | from paddle.io import DistributedBatchSampler 6 | 7 | 8 | class PKSampler(DistributedBatchSampler): 9 | """随机取一批数据,保证每个类别的数量都是相同的。 10 | 11 | Args: 12 | dataset (Dataset): 数据的Dataset 13 | batch_size (int): batch size 14 | sample_per_id (int): 每个类别的样本数量 15 | shuffle (bool, optional): 是否随机打乱数据 16 | drop_last (bool, optional): 是否丢掉最后一个batch 17 | """ 18 | 19 | def __init__(self, 20 | dataset, 21 | batch_size, 22 | sample_per_id, 23 | shuffle=True, 24 | drop_last=True): 25 | super().__init__(dataset, batch_size, shuffle=shuffle, drop_last=drop_last) 26 | assert batch_size % 
sample_per_id == 0, f"batch_size({batch_size})必须是sample_per_id({sample_per_id})的整数倍" 27 | self.sample_per_id = sample_per_id 28 | self.label_dict = defaultdict(list) 29 | for idx, label in enumerate(self.dataset.labels): 30 | self.label_dict[label].append(idx) 31 | self.label_list = list(self.label_dict) 32 | assert len(self.label_list) * self.sample_per_id >= self.batch_size, \ 33 | f"batch_size({self.batch_size})必须大于等于label_list({len(self.label_list)})*sample_per_id({self.sample_per_id})" 34 | self.prob_list = np.array([1 / len(self.label_list)] * len(self.label_list)) 35 | diff = np.abs(sum(self.prob_list) - 1) 36 | if diff > 0.00000001: 37 | self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) 38 | 39 | def __iter__(self): 40 | if self.shuffle: 41 | rank = dist.get_rank() 42 | np.random.RandomState(rank * self.epoch).shuffle(self.label_list) 43 | np.random.RandomState(rank * self.epoch).shuffle(self.prob_list) 44 | self.epoch += 1 45 | 46 | label_per_batch = self.batch_size // self.sample_per_id 47 | for _ in range(len(self)): 48 | batch_index = [] 49 | batch_label_list = np.random.choice(self.label_list, size=label_per_batch, replace=False, p=self.prob_list) 50 | for label_i in batch_label_list: 51 | label_i_indexes = self.label_dict[label_i] 52 | batch_index.extend( 53 | np.random.choice(label_i_indexes, size=self.sample_per_id, 54 | replace=not self.sample_per_id <= len(label_i_indexes))) 55 | # 再次随机打乱 56 | if self.shuffle: 57 | np.random.shuffle(batch_index) 58 | if not self.drop_last or len(batch_index) == self.batch_size: 59 | yield batch_index 60 | -------------------------------------------------------------------------------- /ppvector/data_utils/reader.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import paddle 5 | from loguru import logger 6 | from paddle.io import Dataset 7 | from tqdm import tqdm 8 | from yeaudio.audio import AudioSegment 9 | from yeaudio.augmentation import ReverbPerturbAugmentor, SpecAugmentor 10 | from yeaudio.augmentation import SpeedPerturbAugmentor, VolumePerturbAugmentor, NoisePerturbAugmentor 11 | 12 | from ppvector.data_utils.featurizer import AudioFeaturizer 13 | 14 | 15 | # 音频数据加载器 16 | class PPVectorDataset(Dataset): 17 | def __init__(self, 18 | data_list_path, 19 | audio_featurizer: AudioFeaturizer, 20 | max_duration=3, 21 | min_duration=0.5, 22 | mode='train', 23 | sample_rate=16000, 24 | aug_conf=None, 25 | num_speakers=None, 26 | use_dB_normalization=True, 27 | target_dB=-20): 28 | """音频数据加载器 29 | 30 | Args: 31 | data_list_path: 包含音频路径和标签的数据列表文件的路径 32 | audio_featurizer: 声纹特征提取器 33 | max_duration: 最长的音频长度,大于这个长度会裁剪掉 34 | min_duration: 过滤最短的音频长度 35 | aug_conf: 用于指定音频增强的配置 36 | mode: 数据集模式。在训练模式下,数据集可能会进行一些数据增强的预处理 37 | sample_rate: 采样率 38 | num_speakers: 总说话人数量 39 | use_dB_normalization: 是否对音频进行音量归一化 40 | target_dB: 音量归一化的大小 41 | """ 42 | super(PPVectorDataset, self).__init__() 43 | assert mode in ['train', 'eval', 'extract_feature'] 44 | self.data_list_path = data_list_path 45 | self.max_duration = max_duration 46 | self.min_duration = min_duration 47 | self.mode = mode 48 | self._target_sample_rate = sample_rate 49 | self._use_dB_normalization = use_dB_normalization 50 | self._target_dB = target_dB 51 | self.num_speakers = num_speakers 52 | self.speed_augment = None 53 | self.volume_augment = None 54 | self.noise_augment = None 55 | self.reverb_augment = None 56 | self.spec_augment = None 57 | # 获取特征器 58 | self.audio_featurizer = audio_featurizer 59 | # 
获取特征裁剪的大小 60 | self.max_feature_len = self.get_crop_feature_len() 61 | # 获取数据列表 62 | with open(self.data_list_path, 'r', encoding='utf-8') as f: 63 | self.lines = f.readlines() 64 | self.labels = [np.int64(line.strip().split('\t')[1]) for line in self.lines] 65 | if mode == 'train' and aug_conf is not None: 66 | # 获取数据增强器 67 | self.get_augmentor(aug_conf) 68 | # 评估模式下,数据列表需要排序 69 | if self.mode == 'eval': 70 | self.sort_list() 71 | 72 | def __getitem__(self, idx): 73 | # 分割音频路径和标签 74 | data_path, spk_id = self.lines[idx].strip().split('\t') 75 | spk_id = int(spk_id) 76 | # 如果后缀名为.npy的文件,那么直接读取 77 | if data_path.endswith('.npy'): 78 | feature = np.load(data_path) 79 | if feature.shape[0] > self.max_feature_len: 80 | crop_start = random.randint(0, feature.shape[0] - self.max_feature_len) if self.mode == 'train' else 0 81 | feature = feature[crop_start:crop_start + self.max_feature_len, :] 82 | feature = paddle.to_tensor(feature, dtype=paddle.float32) 83 | else: 84 | # 读取音频 85 | audio_segment = AudioSegment.from_file(data_path) 86 | # 数据太短不利于训练 87 | if self.mode == 'train' or self.mode == 'extract_feature': 88 | if audio_segment.duration < self.min_duration: 89 | return self.__getitem__(idx + 1 if idx < len(self.lines) - 1 else 0) 90 | # 重采样 91 | if audio_segment.sample_rate != self._target_sample_rate: 92 | audio_segment.resample(self._target_sample_rate) 93 | # 音频增强 94 | if self.mode == 'train': 95 | audio_segment, spk_id = self.augment_audio(audio_segment, spk_id) 96 | # decibel normalization 97 | if self._use_dB_normalization: 98 | audio_segment.normalize(target_db=self._target_dB) 99 | # 裁剪需要的数据 100 | if self.mode != 'extract_feature' and audio_segment.duration > self.max_duration: 101 | audio_segment.crop(duration=self.max_duration, mode=self.mode) 102 | samples = paddle.to_tensor(audio_segment.samples, dtype=paddle.float32) 103 | feature = self.audio_featurizer(samples) 104 | feature = feature.squeeze(0) 105 | if self.mode == 'train' and self.spec_augment is not None: 106 | feature = self.spec_augment(feature.numpy()) 107 | feature = paddle.to_tensor(feature, dtype=paddle.float32) 108 | spk_id = paddle.to_tensor(int(spk_id), dtype=paddle.int64) 109 | return feature, spk_id 110 | 111 | def __len__(self): 112 | return len(self.lines) 113 | 114 | # 获取特征裁剪的大小,对应max_duration音频提取特征后的长度 115 | def get_crop_feature_len(self): 116 | samples = paddle.randn((1, self.max_duration * self._target_sample_rate)) 117 | feature = self.audio_featurizer(samples).squeeze(0) 118 | freq_len = feature.shape[0] 119 | return freq_len 120 | 121 | # 数据列表需要排序 122 | def sort_list(self): 123 | lengths = [] 124 | for line in tqdm(self.lines, desc=f"对列表[{self.data_list_path}]进行长度排序"): 125 | # 分割数据文件路径和标签 126 | data_path, _ = line.split('\t') 127 | if data_path.endswith('.npy'): 128 | feature = np.load(data_path) 129 | length = feature.shape[0] 130 | lengths.append(length) 131 | else: 132 | # 读取音频 133 | audio_segment = AudioSegment.from_file(data_path) 134 | length = audio_segment.duration 135 | lengths.append(length) 136 | # 对长度排序并获取索引 137 | sorted_indexes = np.argsort(lengths) 138 | self.lines = [self.lines[i] for i in sorted_indexes] 139 | 140 | # 获取数据增强器 141 | def get_augmentor(self, aug_conf): 142 | if aug_conf.speed is not None: 143 | self.speed_augment = SpeedPerturbAugmentor(num_speakers=self.num_speakers, **aug_conf.speed) 144 | if aug_conf.volume is not None: 145 | self.volume_augment = VolumePerturbAugmentor(**aug_conf.volume) 146 | if aug_conf.noise is not None: 147 | self.noise_augment = 
NoisePerturbAugmentor(**aug_conf.noise) 148 | if aug_conf.reverb is not None: 149 | self.reverb_augment = ReverbPerturbAugmentor(**aug_conf.reverb) 150 | if aug_conf.spec_aug is not None: 151 | self.spec_augment = SpecAugmentor(**aug_conf.spec_aug) 152 | 153 | # 音频增强 154 | def augment_audio(self, audio_segment, spk_id): 155 | if self.speed_augment is not None: 156 | audio_segment, spk_id = self.speed_augment(audio_segment, spk_id) 157 | if self.volume_augment is not None: 158 | audio_segment = self.volume_augment(audio_segment) 159 | if self.noise_augment is not None: 160 | audio_segment = self.noise_augment(audio_segment) 161 | if self.reverb_augment is not None: 162 | audio_segment = self.reverb_augment(audio_segment) 163 | return audio_segment, spk_id 164 | -------------------------------------------------------------------------------- /ppvector/infer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/infer_utils/__init__.py -------------------------------------------------------------------------------- /ppvector/infer_utils/player.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import soundcard 4 | from yeaudio.audio import AudioSegment 5 | 6 | 7 | class AudioPlayer: 8 | def __init__(self, audio_path): 9 | """音频播放器 10 | 11 | Args: 12 | audio_path (str): 音频文件路径 13 | """ 14 | self.playing = False 15 | self.to_pause = False 16 | self.pos = 0 17 | self.audio_segment = AudioSegment.from_file(audio_path) 18 | self.audio_data = self.audio_segment.to_bytes(dtype="int16") 19 | self.audio_segment = AudioSegment.from_file(audio_path) 20 | self.audio_data = self.audio_segment.to_bytes(dtype="int16") 21 | self.samples = self.audio_segment.samples 22 | self.sample_rate = self.audio_segment.sample_rate 23 | self.default_speaker = soundcard.default_speaker() 24 | self.block_size = self.sample_rate // 2 25 | 26 | def _play(self): 27 | self.to_pause = False 28 | self.playing = True 29 | with self.default_speaker.player(samplerate=self.sample_rate) as p: 30 | for i in range(int(self.pos * self.sample_rate), len(self.samples), self.block_size): 31 | if self.to_pause: break 32 | self.pos = i / self.sample_rate 33 | p.play(self.samples[i:i + self.block_size]) 34 | self.playing = False 35 | 36 | # 播放音频 37 | def play(self): 38 | if not self.playing: 39 | thread = threading.Thread(target=self._play) 40 | thread.start() 41 | 42 | # 暂停播放 43 | def pause(self): 44 | self.to_pause = True 45 | 46 | # 跳转到指定时间 47 | def seek(self, seconds=0.0): 48 | self.pos = seconds 49 | 50 | # 获取当前播放时间 51 | def current_time(self): 52 | return self.pos 53 | -------------------------------------------------------------------------------- /ppvector/infer_utils/speaker_diarization.py: -------------------------------------------------------------------------------- 1 | # This implementation is adapted from https://github.com/modelscope/modelscope 2 | import numpy as np 3 | import scipy 4 | import sklearn 5 | from sklearn.cluster import k_means 6 | from yeaudio.audio import AudioSegment 7 | 8 | 9 | class SpeakerDiarization(object): 10 | 11 | def __init__(self, seg_duration=1.5, seg_shift=0.75, sample_rate=16000, merge_threshold=0.78): 12 | """说话人日志工具 13 | 14 | Args: 15 | seg_duration (float, optional): 每个分割片段的持续时间(秒),默认为1.5秒。 16 | seg_shift (float, optional): 分割片段之间的时间间隔(秒),默认为0.75秒。 17 | 
sample_rate (int, optional): 音频采样率,默认为16000Hz。 18 | merge_threshold (float, optional): 合并片段的阈值,默认为0.78。当两个片段之间的相似度大于此阈值时,将合并这两个片段。 19 | """ 20 | self.seg_duration = seg_duration 21 | self.seg_shift = seg_shift 22 | self.sample_rate = sample_rate 23 | self.merge_threshold = merge_threshold 24 | self.spectral_cluster = SpectralCluster() 25 | 26 | def segments_audio(self, audio_segment: AudioSegment) -> list: 27 | """ 从音频段中分割出有效的语音段。 28 | 29 | Args: 30 | audio_segment (AudioSegment): 要分割的音频段对象。 31 | Returns: 32 | list: 分割出的有效语音段列表,每个元素是一个包含起始时间戳、结束时间戳和对应音频样本的列表。 33 | """ 34 | vad_segments = [] 35 | samples = audio_segment.samples 36 | self.sample_rate = audio_segment.sample_rate 37 | vad_time_list = audio_segment.vad(return_seconds=True) 38 | for t in vad_time_list: 39 | st = round(t['start'], 3) 40 | ed = round(t['end'], 3) 41 | vad_segments.append([st, ed, samples[int(st * self.sample_rate):int(ed * self.sample_rate)]]) 42 | self._check_audio_list(vad_segments) 43 | segments = self._chunk(vad_segments) 44 | return segments 45 | 46 | # 检查分割的结果数据是否符合要求 47 | def _check_audio_list(self, audio: list): 48 | audio_duration = 0 49 | for i in range(len(audio)): 50 | seg = audio[i] 51 | assert seg[1] >= seg[0], '分割的时间戳错误' 52 | assert isinstance(seg[2], np.ndarray), '数据的类型不正确' 53 | assert int(seg[1] * self.sample_rate) - int(seg[0] * self.sample_rate) == seg[2].shape[0], '时间长度和数据长度不匹配' 54 | if i > 0: 55 | assert seg[0] >= audio[i - 1][1], 'modelscope error: Wrong time stamps.' 56 | audio_duration += seg[1] - seg[0] 57 | assert audio_duration > 5, f'音频时间过短,应当大于5秒,当前长度是{audio_duration}秒' 58 | 59 | # 将音频片段继续细分割成固定长度的片段 60 | def _chunk(self, vad_segments: list) -> list: 61 | 62 | def seg_chunk(seg_data): 63 | seg_st = seg_data[0] 64 | data = seg_data[2] 65 | chunk_len = int(self.seg_duration * self.sample_rate) 66 | chunk_shift = int(self.seg_shift * self.sample_rate) 67 | last_chunk_ed = 0 68 | seg_res = [] 69 | for chunk_st in range(0, data.shape[0], chunk_shift): 70 | chunk_ed = min(chunk_st + chunk_len, data.shape[0]) 71 | if chunk_ed <= last_chunk_ed: 72 | break 73 | last_chunk_ed = chunk_ed 74 | chunk_st = max(0, chunk_ed - chunk_len) 75 | chunk_data = data[chunk_st:chunk_ed] 76 | if chunk_data.shape[0] < chunk_len: 77 | chunk_data = np.pad(chunk_data, (0, chunk_len - chunk_data.shape[0]), 'constant') 78 | seg_res.append([ 79 | chunk_st / self.sample_rate + seg_st, chunk_ed / self.sample_rate + seg_st, 80 | chunk_data 81 | ]) 82 | return seg_res 83 | 84 | segs = [] 85 | for i, s in enumerate(vad_segments): 86 | segs.extend(seg_chunk(s)) 87 | return segs 88 | 89 | def clustering(self, embeddings: np.ndarray, speaker_num=None) -> [np.ndarray, np.ndarray]: 90 | """聚类音频特征向量,返回聚类后的标签数组 91 | 92 | Args: 93 | embeddings (np.ndarray): 音频特征向量数组,形状为 (n_samples, embedding_dim) 94 | speaker_num (int): 说话人数量,提供说话人数量可以提高准确率 95 | Returns: 96 | Dict[np.ndarray, dict]: 聚类后的标签数组,形状为 (n_samples,) 97 | """ 98 | labels = self.spectral_cluster(embeddings, oracle_num=speaker_num) 99 | labels = self._correct_labels(labels) 100 | # 每个说话人特征向量平均值 101 | spk_num = labels.max() + 1 102 | spk_center = [] 103 | for i in range(spk_num): 104 | spk_emb = embeddings[labels == i].mean(0) 105 | spk_center.append(spk_emb) 106 | assert len(spk_center) > 0 107 | spk_center_embeddings = np.stack(spk_center, axis=0) 108 | labels = self._merge_by_cos(labels, spk_center, self.merge_threshold) 109 | return labels, spk_center_embeddings 110 | 111 | # 通过余弦相似度合并相似说话人 112 | @staticmethod 113 | def _merge_by_cos(labels, spk_center_emb, cos_thr): 114 
| assert 0 < cos_thr <= 1 115 | while True: 116 | spk_num = labels.max() + 1 117 | if spk_num == 1: 118 | break 119 | spk_center = [] 120 | for i in range(spk_num): 121 | spk_emb = spk_center_emb[i] 122 | spk_center.append(spk_emb) 123 | assert len(spk_center) > 0 124 | spk_center = np.stack(spk_center, axis=0) 125 | norm_spk_center = spk_center / np.linalg.norm(spk_center, axis=1, keepdims=True) 126 | affinity = np.matmul(norm_spk_center, norm_spk_center.T) 127 | affinity = np.triu(affinity, 1) 128 | spks = np.unravel_index(np.argmax(affinity), affinity.shape) 129 | if affinity[spks] < cos_thr: 130 | break 131 | for i in range(len(labels)): 132 | if labels[i] == spks[1]: 133 | labels[i] = spks[0] 134 | elif labels[i] > spks[1]: 135 | labels[i] -= 1 136 | return labels 137 | 138 | def postprocess(self, segments: list, labels: np.ndarray) -> list: 139 | """对音频分割结果进行后处理,包括标签校正、片段合并、重叠区域分配和平滑处理。 140 | 141 | Args: 142 | segments (list): 包含分割的数据列表,每个元素是一个包含起始时间、结束时间,音频数据。 143 | labels (np.ndarray): 包含每个音频片段对应说话人标签的数组。 144 | Returns: 145 | list: 包含处理后的音频片段信息的列表,包含说话人标签、起始时间和结束时间。 146 | """ 147 | assert len(segments) == len(labels) 148 | distribute_res = [] 149 | for i in range(len(segments)): 150 | distribute_res.append([segments[i][0], segments[i][1], labels[i]]) 151 | # 按时间顺序合并相同的说话人 152 | distribute_res = self._merge_seque(distribute_res) 153 | 154 | def is_overlapped(t1, t2): 155 | if t1 > t2 + 1e-4: 156 | return True 157 | return False 158 | 159 | # 分割重叠区域 160 | for i in range(1, len(distribute_res)): 161 | if is_overlapped(distribute_res[i - 1][1], distribute_res[i][0]): 162 | p = (distribute_res[i][0] + distribute_res[i - 1][1]) / 2 163 | distribute_res[i][0] = p 164 | distribute_res[i - 1][1] = p 165 | 166 | # 平滑处理 167 | distribute_res = self._smooth(distribute_res) 168 | 169 | # 将结果转换为字典形式 170 | results = [] 171 | for result in distribute_res: 172 | results.append(dict(speaker=result[2], start=round(result[0], 3), end=round(result[1], 3))) 173 | 174 | return results 175 | 176 | # 重排序标签 177 | @staticmethod 178 | def _correct_labels(labels): 179 | labels_id = 0 180 | id2id = {} 181 | new_labels = [] 182 | for i in labels: 183 | if i not in id2id: 184 | id2id[i] = labels_id 185 | labels_id += 1 186 | new_labels.append(id2id[i]) 187 | return np.array(new_labels) 188 | 189 | # 合并连续且属于同一说话人的音频片段 190 | @staticmethod 191 | def _merge_seque(distribute_res): 192 | res = [distribute_res[0]] 193 | for i in range(1, len(distribute_res)): 194 | if distribute_res[i][2] != res[-1][2] or distribute_res[i][0] > res[-1][1]: 195 | res.append(distribute_res[i]) 196 | else: 197 | res[-1][1] = distribute_res[i][1] 198 | return res 199 | 200 | # 对结果进行平滑处理,主要是处理时间长度过短的片段 201 | def _smooth(self, res, min_duration=1): 202 | for i in range(len(res)): 203 | res[i][0] = round(res[i][0], 2) 204 | res[i][1] = round(res[i][1], 2) 205 | if res[i][1] - res[i][0] < min_duration: 206 | if i == 0: 207 | res[i][2] = res[i + 1][2] 208 | elif i == len(res) - 1: 209 | res[i][2] = res[i - 1][2] 210 | elif res[i][0] - res[i - 1][1] <= res[i + 1][0] - res[i][1]: 211 | res[i][2] = res[i - 1][2] 212 | else: 213 | res[i][2] = res[i + 1][2] 214 | # 合并说话人 215 | res = self._merge_seque(res) 216 | return res 217 | 218 | 219 | class SpectralCluster: 220 | def __init__(self, min_num_spks=1, max_num_spks=15, pval=0.022): 221 | """实现了基于相似度矩阵的非归一化拉普拉斯矩阵的谱聚类方法。 222 | 223 | :param min_num_spks: 聚类的最小数量,默认为1。 224 | :type min_num_spks: int 225 | :param max_num_spks: 聚类的最大数量,默认为15。 226 | :type max_num_spks: int 227 | :param pval: 
用于相似度矩阵修剪的阈值,默认为0.022。 228 | :type pval: float 229 | """ 230 | self.min_num_spks = min_num_spks 231 | self.max_num_spks = max_num_spks 232 | self.pval = pval 233 | 234 | # 对输入数据X进行谱聚类,返回聚类标签 235 | def __call__(self, X, oracle_num=None): 236 | """ 237 | 238 | :param X: 输入数据,形状为[n_samples, n_features] 239 | :type X: np.ndarray 240 | :param oracle_num: 聚类数量,默认为None,此时将根据特征间隙自动选择聚类数量。 241 | :type oracle_num: int 242 | :return: 聚类标签,形状为[n_samples] 243 | """ 244 | sim_mat = self.get_sim_mat(X) 245 | prunned_sim_mat = self.p_pruning(sim_mat) 246 | sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T) 247 | laplacian = self.get_laplacian(sym_prund_sim_mat) 248 | emb, num_of_spk = self.get_spec_embs(laplacian, oracle_num) 249 | labels = self.cluster_embs(emb, num_of_spk) 250 | return labels 251 | 252 | # 计算输入数据X的相似度矩阵 253 | @staticmethod 254 | def get_sim_mat(X): 255 | # Cosine similarities 256 | M = sklearn.metrics.pairwise.cosine_similarity(X, X) 257 | return M 258 | 259 | # 根据阈值pval修剪相似度矩阵A 260 | def p_pruning(self, A): 261 | if A.shape[0] * self.pval < 6: 262 | pval = 6. / A.shape[0] 263 | else: 264 | pval = self.pval 265 | n_elems = int((1 - pval) * A.shape[0]) 266 | 267 | # 关联矩阵中的每一行中的前n_elems个最小值下标 268 | for i in range(A.shape[0]): 269 | low_indexes = np.argsort(A[i, :]) 270 | low_indexes = low_indexes[0:n_elems] 271 | # 用0替换较小的相似度值 272 | A[i, low_indexes] = 0 273 | return A 274 | 275 | # 计算对称相似度矩阵M的拉普拉斯矩阵 276 | @staticmethod 277 | def get_laplacian(M): 278 | M[np.diag_indices(M.shape[0])] = 0 279 | D = np.sum(np.abs(M), axis=1) 280 | D = np.diag(D) 281 | L = D - M 282 | return L 283 | 284 | # 计算拉普拉斯矩阵L的谱嵌入,并根据特征间隙或指定的oracle_num确定聚类数量 285 | def get_spec_embs(self, L, k_oracle=None): 286 | lambdas, eig_vecs = scipy.linalg.eigh(L) 287 | 288 | if k_oracle is not None: 289 | num_of_spk = k_oracle 290 | else: 291 | lambda_gap_list = self.get_eigen_gaps(lambdas[self.min_num_spks - 1:self.max_num_spks + 1]) 292 | num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks 293 | 294 | emb = eig_vecs[:, :num_of_spk] 295 | return emb, num_of_spk 296 | 297 | # 使用k-means算法对谱嵌入emb进行聚类,返回聚类标签 298 | @staticmethod 299 | def cluster_embs(emb, k): 300 | _, labels, _ = k_means(emb, k, n_init="auto") 301 | return labels 302 | 303 | # 计算特征值的间隙列表 304 | @staticmethod 305 | def get_eigen_gaps(eig_vals): 306 | eig_vals_gap_list = [] 307 | for i in range(len(eig_vals) - 1): 308 | gap = float(eig_vals[i + 1]) - float(eig_vals[i]) 309 | eig_vals_gap_list.append(gap) 310 | return eig_vals_gap_list 311 | -------------------------------------------------------------------------------- /ppvector/infer_utils/viewer.py: -------------------------------------------------------------------------------- 1 | # This implementation is adapted from https://github.com/taylorlu/Speaker-Diarization 2 | import matplotlib.pyplot as plot 3 | 4 | from ppvector.infer_utils.player import AudioPlayer 5 | 6 | 7 | class PlotSpeaker: 8 | def __init__(self, speakers_data, audio_path=None, title="speaker-diarization", gui=True, size=(14, 6)): 9 | """绘制说话人结果 10 | 11 | Args: 12 | speakers_data (list): 包含说话人信息的列表,每个元素是一个包含起始时间戳、结束时间戳和说话人的字典。 13 | audio_path (str, optional): 音频文件的路径,默认为None。如果提供,则使用AudioPlayer播放音频。 14 | title (str, optional): 图形窗口的标题,默认为"speaker-diarization"。 15 | gui (bool, optional): 是否启用图形用户界面,默认为True。 16 | size (tuple, optional): 图形窗口的大小(宽度,高度),默认为(14, 6)。 17 | """ 18 | # 检测类别名称是否包含中文,是则设置相应字体 19 | s = ''.join([str(data["speaker"]) for data in speakers_data]) 20 | s += title 21 | is_ascii = all(ord(c) < 128 for c in s) 
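# Note: the SimHei font set below is assumed to be installed; matplotlib needs a CJK-capable font to render Chinese speaker labels, and 'axes.unicode_minus' is disabled so the minus sign still renders correctly under that font.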
22 | if not is_ascii: 23 | plot.rcParams['font.sans-serif'] = ['SimHei'] 24 | plot.rcParams['axes.unicode_minus'] = False 25 | # 定义颜色 26 | self.rect_color = (0.0, 0.6, 1.0, 1.0) 27 | self.rect_selected_color = (0.75, 0.75, 0, 1.0) 28 | self.cluster_colors = [(0.0, 0.6, 1.0, 1.0), (0.0, 1.0, 0.6, 1.0), (0.6, 0.0, 1.0, 1.0), 29 | (0.6, 1.0, 0.0, 1.0), (1.0, 0.0, 0.6, 1.0), (1.0, 0.6, 0.0, 1.0)] 30 | self.gui = gui 31 | self.title = title 32 | self.fig = plot.figure(figsize=size, facecolor='white', tight_layout=True) 33 | self.plot = plot 34 | 35 | self.ax = self.fig.add_subplot(1, 1, 1) 36 | if self.gui: 37 | self.fig.canvas.mpl_connect('key_press_event', self._on_keypress) 38 | self.fig.canvas.mpl_connect('button_press_event', self._on_click) 39 | self.height = 5 40 | self.maxx = 0 41 | self.audio = None 42 | if audio_path is not None and self.gui: 43 | self.audio = AudioPlayer(audio_path) 44 | self.timer = self.fig.canvas.new_timer(interval=500) 45 | self.timer.add_callback(self._update_timeline) 46 | self.timer.start() 47 | 48 | self.timeline = self.ax.plot([0, 0], [0, 0], color='r')[-1] 49 | segment_data = dict() 50 | for data in speakers_data: 51 | start, end, speaker = data['start'], data['end'], data['speaker'] 52 | if speaker not in segment_data: 53 | segment_data[speaker] = [] 54 | segment_data[speaker].append(dict(start=start, end=end)) 55 | self.speakers_data = segment_data 56 | 57 | # 根据音频播放器中的位置更新时间轴 58 | def _update_timeline(self): 59 | if self.audio is not None and self.audio.playing: 60 | t = self.audio.current_time() 61 | self._draw_timeline(t) 62 | self.fig.canvas.draw() 63 | 64 | # 绘制时间轴 65 | def _draw_timeline(self, t): 66 | min_y, max_y = self.ax.get_ylim() 67 | self.timeline.set_data([t, t], [min_y, max_y]) 68 | self._draw_info(t) 69 | 70 | # 绘制信息 71 | @staticmethod 72 | def _draw_info(t): 73 | h = int(t) // 3600 74 | t %= 3600 75 | m = int(t) // 60 76 | s = int(t % 60) 77 | plot.xlabel(f'time: {h:02}:{m:02}:{s:02}') 78 | 79 | def draw(self, save_path=None): 80 | """绘制说话人分割结果 81 | 82 | Args: 83 | save_path (str, optional): 保存图像的路径,默认为None。如果提供,则将绘制的图像保存到指定路径。 84 | """ 85 | y = 0 86 | labels_pos = [] 87 | labels = [] 88 | for i, cluster in enumerate(self.speakers_data.keys()): 89 | labels.append(cluster) 90 | labels_pos.append(y + self.height // 2) 91 | for row in self.speakers_data[cluster]: 92 | x = row['start'] 93 | w = row['end'] - row['start'] 94 | self.maxx = max(self.maxx, row['end']) 95 | c = self.cluster_colors[i % len(self.cluster_colors)] 96 | rect = plot.Rectangle((x, y), w, self.height, color=c) 97 | self.ax.add_patch(rect) 98 | y += self.height 99 | if self.gui: 100 | plot.xlim([0, min(600, self.maxx)]) 101 | else: 102 | plot.xlim([0, self.maxx]) 103 | 104 | plot.ylim([0, y]) 105 | plot.yticks(labels_pos, labels) 106 | for _ in self.speakers_data: 107 | self.ax.plot([0, self.maxx], [y, y], linestyle=':', color='#AAAAAA') 108 | y -= self.height 109 | 110 | plot.title(self.title) 111 | if self.gui: 112 | self._draw_info(0) 113 | plot.tight_layout() 114 | if save_path is not None: 115 | plot.savefig(save_path) 116 | 117 | # 键盘点击事件处理函数 118 | def _on_keypress(self, event): 119 | if event.key == ' ' and self.audio is not None: 120 | if self.audio.playing: 121 | self.audio.pause() 122 | else: 123 | self.audio.play() 124 | self.fig.canvas.draw() 125 | 126 | # 鼠标点击事件处理函数 127 | def _on_click(self, event): 128 | if event.xdata is not None: 129 | if self.audio is not None: 130 | self.audio.pause() 131 | self.audio.seek(event.xdata) 132 | 
self._draw_timeline(event.xdata) 133 | self.fig.canvas.draw() 134 | -------------------------------------------------------------------------------- /ppvector/loss/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | 5 | from .aamloss import AAMLoss 6 | from .amloss import AMLoss 7 | from .armloss import ARMLoss 8 | from .celoss import CELoss 9 | from .sphereface2 import SphereFace2 10 | from .subcenterloss import SubCenterLoss 11 | from .tripletangularmarginloss import TripletAngularMarginLoss 12 | 13 | __all__ = ['build_loss'] 14 | 15 | 16 | def build_loss(configs): 17 | use_loss = configs.loss_conf.get('loss', 'AAMLoss') 18 | loss_args = configs.loss_conf.get('loss_args', {}) 19 | los = importlib.import_module(__name__) 20 | loss = getattr(los, use_loss)(**loss_args) 21 | logger.info(f'成功创建损失函数:{use_loss},参数为:{loss_args}') 22 | return loss 23 | -------------------------------------------------------------------------------- /ppvector/loss/aamloss.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | class AAMLoss(nn.Layer): 9 | def __init__(self, margin=0.2, scale=32, easy_margin=False, label_smoothing=0.0): 10 | """The Implementation of Additive Angular Margin (AAM) proposed 11 | in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition''' 12 | (https://arxiv.org/abs/1906.07317) 13 | 14 | Args: 15 | margin (float, optional): margin factor. Defaults to 0.3. 16 | scale (float, optional): scale factor. Defaults to 32.0. 17 | easy_margin (bool, optional): easy_margin flag. Defaults to False. 
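        Example (a minimal usage sketch, not taken from this repository: the batch size, the 10-class head and the 192-dim embeddings are placeholders, and the random logits are clipped into (-1, 1) to stand in for the cosine scores a normalized classifier would produce):
            import paddle
            from ppvector.loss import AAMLoss
            loss_fn = AAMLoss(margin=0.2, scale=32)
            logits = paddle.clip(paddle.rand([4, 10]) * 2 - 1, -0.999, 0.999)  # stand-in cosine logits
            inputs = {'features': paddle.randn([4, 192]), 'logits': logits}
            labels = paddle.randint(0, 10, [4])
            loss = loss_fn(inputs, labels)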
18 | """ 19 | super(AAMLoss, self).__init__() 20 | self.scale = scale 21 | self.easy_margin = easy_margin 22 | self.cos_m = math.cos(margin) 23 | self.sin_m = math.sin(margin) 24 | self.th = math.cos(math.pi - margin) 25 | self.mmm = 1.0 + math.cos(math.pi - margin) 26 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 27 | 28 | def forward(self, inputs, labels): 29 | """ 30 | Args: 31 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 32 | labels(paddle.Tensor): 类别标签 (batch_size) 33 | """ 34 | features, logits = inputs['features'], inputs['logits'] 35 | sine = paddle.sqrt(1.0 - paddle.pow(logits, 2)) 36 | phi = logits * self.cos_m - sine * self.sin_m 37 | if self.easy_margin: 38 | phi = paddle.where(logits > 0, phi, logits) 39 | else: 40 | phi = paddle.where(logits > self.th, phi, logits - self.mmm) 41 | 42 | one_hot = F.one_hot(labels, logits.shape[1]) 43 | output = (one_hot * phi) + ((1.0 - one_hot) * logits) 44 | output *= self.scale 45 | 46 | loss = self.criterion(output, labels) 47 | return loss 48 | 49 | def update(self, margin=0.2): 50 | self.cos_m = math.cos(margin) 51 | self.sin_m = math.sin(margin) 52 | self.th = math.cos(math.pi - margin) 53 | self.mmm = 1.0 + math.cos(math.pi - margin) 54 | -------------------------------------------------------------------------------- /ppvector/loss/amloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class AMLoss(nn.Layer): 6 | def __init__(self, margin=0.2, scale=30, label_smoothing=0.0): 7 | super(AMLoss, self).__init__() 8 | self.margin = margin 9 | self.scale = scale 10 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 11 | 12 | def forward(self, inputs, labels): 13 | """ 14 | Args: 15 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 16 | labels(paddle.Tensor): 类别标签 (batch_size) 17 | """ 18 | features, logits = inputs['features'], inputs['logits'] 19 | delt_costh = paddle.zeros(logits.shape) 20 | for i, index in enumerate(labels): 21 | delt_costh[i, index] = self.margin 22 | costh_m = logits - delt_costh 23 | predictions = self.scale * costh_m 24 | loss = self.criterion(predictions, labels) / labels.shape[0] 25 | return loss 26 | 27 | def update(self, margin=0.2): 28 | self.margin = margin 29 | -------------------------------------------------------------------------------- /ppvector/loss/armloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class ARMLoss(nn.Layer): 6 | def __init__(self, margin=0.2, scale=30, label_smoothing=0.0): 7 | super(ARMLoss, self).__init__() 8 | self.margin = margin 9 | self.scale = scale 10 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 11 | 12 | def forward(self, inputs, labels): 13 | """ 14 | Args: 15 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 16 | labels(paddle.Tensor): 类别标签 (batch_size) 17 | """ 18 | features, logits = inputs['features'], inputs['logits'] 19 | delt_costh = paddle.zeros(logits.shape) 20 | for i, index in enumerate(labels): 21 | delt_costh[i, index] = self.margin 22 | costh_m = logits - delt_costh 23 | costh_m_s = self.scale * costh_m 24 | delt_costh_m_s = paddle.zeros([logits.shape[0], 1], dtype=paddle.float32) 25 | for i, index in enumerate(labels): 26 | delt_costh_m_s[i] = 
costh_m_s[i, index] 27 | delt_costh_m_s = delt_costh_m_s.tile([1, costh_m_s.shape[1]]) 28 | costh_m_s_reduct = costh_m_s - delt_costh_m_s 29 | predictions = paddle.where(costh_m_s_reduct < 0.0, paddle.zeros_like(costh_m_s), costh_m_s) 30 | loss = self.criterion(predictions, labels) / labels.shape[0] 31 | return loss 32 | 33 | def update(self, margin=0.2): 34 | self.margin = margin 35 | 36 | -------------------------------------------------------------------------------- /ppvector/loss/celoss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class CELoss(nn.Layer): 6 | def __init__(self, label_smoothing=0.0): 7 | super(CELoss, self).__init__() 8 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 9 | 10 | def forward(self, inputs, labels): 11 | """ 12 | Args: 13 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 14 | labels(paddle.Tensor): 类别标签 (batch_size) 15 | """ 16 | features, logits = inputs['features'], inputs['logits'] 17 | loss = self.criterion(logits, labels) / labels.shape[0] 18 | return loss 19 | 20 | def update(self, margin=0.2): 21 | pass 22 | 23 | -------------------------------------------------------------------------------- /ppvector/loss/sphereface2.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | 9 | class SphereFace2(nn.Layer): 10 | def __init__(self, margin=0.2, scale=32.0, lanbuda=0.7, t=3, margin_type='C'): 11 | """Implement of sphereface2 for speaker verification: 12 | Reference: 13 | [1] Exploring Binary Classification Loss for Speaker Verification 14 | https://ieeexplore.ieee.org/abstract/document/10094954 15 | [2] Sphereface2: Binary classification is all you need for deep face recognition 16 | https://arxiv.org/pdf/2108.01513 17 | Args: 18 | scale: norm of logits feature 19 | margin: margin 20 | lanbuda: weight of positive and negative pairs 21 | t: parameter for adjust score distribution 22 | margin_type: A:cos(theta+margin) or C:cos(theta)-margin 23 | Recommend margin: 24 | training: 0.2 for C and 0.15 for A 25 | LMF: 0.3 for C and 0.25 for A 26 | """ 27 | super(SphereFace2, self).__init__() 28 | self.scale = scale 29 | self.bias = paddle.create_parameter([1, 1], dtype=paddle.float32, is_bias=True) 30 | self.t = t 31 | self.lanbuda = lanbuda 32 | self.margin_type = margin_type 33 | 34 | self.margin = margin 35 | self.cos_m = math.cos(margin) 36 | self.sin_m = math.sin(margin) 37 | self.th = math.cos(math.pi - margin) 38 | self.mmm = 1.0 + math.cos(math.pi - margin) 39 | 40 | def fun_g(self, z, t: int): 41 | gz = 2 * paddle.pow((z + 1) / 2, t) - 1 42 | return gz 43 | 44 | def forward(self, inputs, labels): 45 | """ 46 | Args: 47 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 48 | labels(paddle.Tensor): 类别标签 (batch_size) 49 | """ 50 | features, logits = inputs['features'], inputs['logits'] 51 | if self.margin_type == 'A': # arcface type 52 | sin = paddle.sqrt(1.0 - paddle.pow(logits, 2)) 53 | cos_m_theta_p = self.scale * self.fun_g( 54 | paddle.where(logits > self.th, logits * self.cos_m - sin * self.sin_m, logits - self.mmm), self.t) + \ 55 | self.bias[0][0] 56 | cos_m_theta_n = self.scale * self.fun_g(logits * self.cos_m + sin * self.sin_m, self.t) + self.bias[0][0] 57 | cos_p_theta = self.lanbuda * 
paddle.log(1 + paddle.exp(-1.0 * cos_m_theta_p)) 58 | cos_n_theta = (1 - self.lanbuda) * paddle.log(1 + paddle.exp(cos_m_theta_n)) 59 | else: 60 | # cosface type 61 | cos_m_theta_p = self.scale * (self.fun_g(logits, self.t) - self.margin) + self.bias[0][0] 62 | cos_m_theta_n = self.scale * (self.fun_g(logits, self.t) + self.margin) + self.bias[0][0] 63 | cos_p_theta = self.lanbuda * paddle.log(1 + paddle.exp(-1.0 * cos_m_theta_p)) 64 | cos_n_theta = (1 - self.lanbuda) * paddle.log(1 + paddle.exp(cos_m_theta_n)) 65 | 66 | target_mask = F.one_hot(labels, logits.shape[1]) 67 | nontarget_mask = 1 - target_mask 68 | loss = (target_mask * cos_p_theta + nontarget_mask * cos_n_theta).sum(1).mean() 69 | return loss 70 | 71 | def update(self, margin=0.2): 72 | self.margin = margin 73 | self.cos_m = math.cos(margin) 74 | self.sin_m = math.sin(margin) 75 | self.th = math.cos(math.pi - margin) 76 | self.mmm = 1.0 + math.cos(math.pi - margin) 77 | 78 | -------------------------------------------------------------------------------- /ppvector/loss/subcenterloss.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | class SubCenterLoss(nn.Layer): 9 | r"""Implement of large margin arc distance with subcenter: 10 | Reference:Sub-center ArcFace: Boosting Face Recognition byLarge-Scale Noisy 11 | Web Faces.https://ibug.doc.ic.ac.uk/media/uploads/documents/eccv_1445.pdf 12 | 13 | Args: 14 | margin (float, optional): margin factor. Defaults to 0.3. 15 | scale (float, optional): scale factor. Defaults to 32.0. 16 | easy_margin (bool, optional): easy_margin flag. Defaults to False. 17 | K: number of sub-centers, same classifier K. 
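        Example (an illustrative sketch only; it assumes 10 speakers and K=3 sub-centers, so the classifier head must emit 10 * 3 = 30 cosine logits, which the loss reshapes to (batch, 10, 3) and max-pools back to 10 classes):
            import paddle
            from ppvector.loss import SubCenterLoss
            loss_fn = SubCenterLoss(margin=0.2, scale=32, K=3)
            logits = paddle.clip(paddle.rand([4, 30]) * 2 - 1, -0.999, 0.999)  # 10 classes x 3 sub-centers
            inputs = {'features': paddle.randn([4, 192]), 'logits': logits}
            labels = paddle.randint(0, 10, [4])
            loss = loss_fn(inputs, labels)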
18 | """ 19 | 20 | def __init__(self, margin=0.2, scale=32, easy_margin=False, K=3, label_smoothing=0.0): 21 | super(SubCenterLoss, self).__init__() 22 | self.scale = scale 23 | # subcenter 24 | self.K = K 25 | self.easy_margin = easy_margin 26 | self.cos_m = math.cos(margin) 27 | self.sin_m = math.sin(margin) 28 | self.th = math.cos(math.pi - margin) 29 | self.mmm = 1.0 + math.cos(math.pi - margin) 30 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 31 | 32 | def forward(self, inputs, labels): 33 | """ 34 | Args: 35 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 36 | labels(paddle.Tensor): 类别标签 (batch_size) 37 | """ 38 | features, logits = inputs['features'], inputs['logits'] 39 | # (batch, out_dim, k) 40 | cosine = paddle.reshape(logits, (-1, logits.shape[1] // self.K, self.K)) 41 | # (batch, out_dim) 42 | cosine = paddle.max(cosine, 2) 43 | sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2)) 44 | phi = cosine * self.cos_m - sine * self.sin_m 45 | if self.easy_margin: 46 | phi = paddle.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = paddle.where(cosine > self.th, phi, cosine - self.mmm) 49 | 50 | one_hot = F.one_hot(labels, cosine.shape[1]) 51 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 52 | output *= self.scale 53 | 54 | loss = self.criterion(output, labels) 55 | return loss 56 | 57 | def update(self, margin=0.2): 58 | self.cos_m = math.cos(margin) 59 | self.sin_m = math.sin(margin) 60 | self.th = math.cos(math.pi - margin) 61 | self.mmm = 1.0 + math.cos(math.pi - margin) 62 | -------------------------------------------------------------------------------- /ppvector/loss/tripletangularmarginloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class TripletAngularMarginLoss(nn.Layer): 6 | """A more robust triplet loss with hard positive/negative mining on angular margin instead of relative distance between d(a,p) and d(a,n). 7 | 8 | Args: 9 | margin (float, optional): angular margin. Defaults to 0.5. 10 | normalize_feature (bool, optional): whether to apply L2-norm in feature before computing distance(cos-similarity). Defaults to True. 11 | add_absolute (bool, optional): whether add absolute loss within d(a,p) or d(a,n). Defaults to True. 12 | absolute_loss_weight (float, optional): weight for absolute loss. Defaults to 1.0. 13 | ap_value (float, optional): weight for d(a, p). Defaults to 0.8. 14 | an_value (float, optional): weight for d(a, n). Defaults to 0.4. 
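        Example (a minimal sketch with placeholder shapes; the hard positive/negative mining assumes every speaker appears the same number of times in the batch, which is what the project's PKSampler guarantees; here 2 speakers x 2 utterances and a 2-class logit head):
            import paddle
            from ppvector.loss import TripletAngularMarginLoss
            loss_fn = TripletAngularMarginLoss(margin=0.5)
            features = paddle.randn([4, 192])
            logits = paddle.randn([4, 2])
            labels = paddle.to_tensor([0, 0, 1, 1], dtype='int64')
            loss = loss_fn({'features': features, 'logits': logits}, labels)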
15 | """ 16 | 17 | def __init__(self, 18 | margin=0.5, 19 | normalize_feature=True, 20 | add_absolute=True, 21 | absolute_loss_weight=1.0, 22 | ap_value=0.8, 23 | an_value=0.4, 24 | label_smoothing=0.0): 25 | super(TripletAngularMarginLoss, self).__init__() 26 | self.margin = margin 27 | self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) 28 | self.normalize_feature = normalize_feature 29 | self.add_absolute = add_absolute 30 | self.ap_value = ap_value 31 | self.an_value = an_value 32 | self.absolute_loss_weight = absolute_loss_weight 33 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 34 | 35 | def forward(self, inputs, labels): 36 | """ 37 | Args: 38 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 39 | labels(paddle.Tensor): 类别标签 (batch_size) 40 | """ 41 | features, logits = inputs['features'], inputs['logits'] 42 | loss_ce = self.criterion(logits, labels) 43 | 44 | if self.normalize_feature: 45 | features = paddle.divide(features, paddle.norm(features, p=2, axis=-1, keepdim=True)) 46 | 47 | bs = features.shape[0] 48 | 49 | # compute distance(cos-similarity) 50 | dist = paddle.matmul(features, features.t()) 51 | 52 | # hard negative mining 53 | is_pos = paddle.expand(labels, (bs, bs)).equal(paddle.expand(labels, (bs, bs)).t()) 54 | is_neg = paddle.expand(labels, (bs, bs)).not_equal(paddle.expand(labels, (bs, bs)).t()) 55 | 56 | # `dist_ap` means distance(anchor, positive) 57 | # both `dist_ap` and `relative_p_inds` with shape [N, 1] 58 | d1 = paddle.masked_select(dist, is_pos) 59 | d2 = paddle.reshape(d1, (bs, -1)) 60 | dist_ap = paddle.min(d2, axis=1, keepdim=True) 61 | # `dist_an` means distance(anchor, negative) 62 | # both `dist_an` and `relative_n_inds` with shape [N, 1] 63 | dist_an = paddle.max(paddle.reshape( 64 | paddle.masked_select(dist, is_neg), (bs, -1)), axis=1, keepdim=True) 65 | # shape [N] 66 | dist_ap = paddle.squeeze(dist_ap, axis=1) 67 | dist_an = paddle.squeeze(dist_an, axis=1) 68 | 69 | # Compute ranking hinge loss 70 | y = paddle.ones_like(dist_an) 71 | loss = self.ranking_loss(dist_ap, dist_an, y) 72 | 73 | if self.add_absolute: 74 | absolut_loss_ap = self.ap_value - dist_ap 75 | absolut_loss_ap = paddle.where(absolut_loss_ap > 0, absolut_loss_ap, paddle.zeros_like(absolut_loss_ap)) 76 | 77 | absolut_loss_an = dist_an - self.an_value 78 | absolut_loss_an = paddle.where(absolut_loss_an > 0, absolut_loss_an, paddle.ones_like(absolut_loss_an)) 79 | 80 | loss = (absolut_loss_an.mean() + absolut_loss_ap.mean()) * self.absolute_loss_weight + loss.mean() 81 | loss = loss + loss_ce 82 | return loss 83 | 84 | def update(self, margin=0.5): 85 | self.ranking_loss.margin = margin 86 | -------------------------------------------------------------------------------- /ppvector/metric/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/metric/__init__.py -------------------------------------------------------------------------------- /ppvector/metric/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_fnr_fpr(scores, labels, weights=None): 5 | sorted_ndx = np.argsort(scores) 6 | thresholds = scores[sorted_ndx] 7 | labels = labels[sorted_ndx] 8 | if weights is not None: 9 | weights = weights[sorted_ndx] 10 | else: 11 | weights = 
np.ones(labels.shape, dtype='f8') 12 | 13 | tgt_wghts = weights * (labels == 1).astype('f8') 14 | imp_wghts = weights * (labels == 0).astype('f8') 15 | 16 | fnr = np.cumsum(tgt_wghts) / np.sum(tgt_wghts) 17 | fpr = 1 - np.cumsum(imp_wghts) / np.sum(imp_wghts) 18 | return fnr, fpr, thresholds 19 | 20 | 21 | def compute_eer(fnr, fpr, scores=None): 22 | diff_pm_fa = fnr - fpr 23 | x1 = np.flatnonzero(diff_pm_fa >= 0)[0] 24 | x2 = np.flatnonzero(diff_pm_fa < 0)[-1] 25 | a = (fnr[x1] - fpr[x1]) / (fpr[x2] - fpr[x1] - (fnr[x2] - fnr[x1])) 26 | 27 | if scores is not None: 28 | score_sort = np.sort(scores) 29 | return fnr[x1] + a * (fnr[x2] - fnr[x1]), score_sort[x1] 30 | 31 | return fnr[x1] + a * (fnr[x2] - fnr[x1]) 32 | 33 | 34 | def compute_dcf(fnr, fpr, p_target=0.01, c_miss=1, c_fa=1): 35 | c_det = min(c_miss * fnr * p_target + c_fa * fpr * (1 - p_target)) 36 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 37 | return c_det / c_def 38 | -------------------------------------------------------------------------------- /ppvector/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | 5 | from .campplus import CAMPPlus 6 | from .ecapa_tdnn import EcapaTdnn 7 | from .eres2net import ERes2Net, ERes2NetV2 8 | from .res2net import Res2Net 9 | from .resnet_se import ResNetSE 10 | from .tdnn import TDNN 11 | 12 | __all__ = ['build_model'] 13 | 14 | 15 | def build_model(input_size, configs): 16 | use_model = configs.model_conf.get('model', 'CAMPPlus') 17 | model_args = configs.model_conf.get('model_args', {}) 18 | mod = importlib.import_module(__name__) 19 | model = getattr(mod, use_model)(input_size=input_size, **model_args) 20 | logger.info(f'成功创建模型:{use_model},参数为:{model_args}') 21 | return model 22 | -------------------------------------------------------------------------------- /ppvector/models/campplus.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn.functional as F 5 | from paddle import nn 6 | 7 | 8 | def get_nonlinear(config_str, channels): 9 | nonlinear = nn.Sequential() 10 | for name in config_str.split('-'): 11 | if name == 'relu': 12 | nonlinear.add_sublayer('relu', nn.ReLU()) 13 | elif name == 'prelu': 14 | nonlinear.add_sublayer('prelu', nn.PReLU(channels)) 15 | elif name == 'batchnorm': 16 | nonlinear.add_sublayer('batchnorm', nn.BatchNorm1D(channels)) 17 | elif name == 'batchnorm_': 18 | nonlinear.add_sublayer('batchnorm', nn.BatchNorm1D(channels)) 19 | else: 20 | raise ValueError('Unexpected module ({}).'.format(name)) 21 | return nonlinear 22 | 23 | 24 | def statistics_pooling(x, axis=-1, keepdim=False, unbiased=True, eps=1e-2): 25 | mean = x.mean(axis=axis) 26 | std = x.std(axis=axis, unbiased=unbiased) 27 | stats = paddle.concat([mean, std], axis=-1) 28 | if keepdim: 29 | stats = stats.unsqueeze(axis=axis) 30 | return stats 31 | 32 | 33 | class StatsPool(nn.Layer): 34 | def forward(self, x): 35 | return statistics_pooling(x) 36 | 37 | 38 | class TDNNLayer(nn.Layer): 39 | def __init__(self, 40 | in_channels, 41 | out_channels, 42 | kernel_size, 43 | stride=1, 44 | padding=0, 45 | dilation=1, 46 | bias=False, 47 | config_str='batchnorm-relu'): 48 | super(TDNNLayer, self).__init__() 49 | if padding < 0: 50 | assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format( 51 | kernel_size) 52 | padding = (kernel_size - 1) // 2 * dilation 53 | self.linear 
= nn.Conv1D(in_channels, 54 | out_channels, 55 | kernel_size, 56 | stride=stride, 57 | padding=padding, 58 | dilation=dilation) 59 | self.nonlinear = get_nonlinear(config_str, out_channels) 60 | 61 | def forward(self, x): 62 | x = self.linear(x) 63 | x = self.nonlinear(x) 64 | return x 65 | 66 | 67 | class CAMLayer(nn.Layer): 68 | def __init__(self, 69 | bn_channels, 70 | out_channels, 71 | kernel_size, 72 | stride, 73 | padding, 74 | dilation, 75 | bias, 76 | reduction=2): 77 | super(CAMLayer, self).__init__() 78 | self.linear_local = nn.Conv1D(bn_channels, 79 | out_channels, 80 | kernel_size, 81 | stride=stride, 82 | padding=padding, 83 | dilation=dilation) 84 | self.linear1 = nn.Conv1D(bn_channels, bn_channels // reduction, 1) 85 | self.relu = nn.ReLU() 86 | self.linear2 = nn.Conv1D(bn_channels // reduction, out_channels, 1) 87 | self.sigmoid = nn.Sigmoid() 88 | 89 | def forward(self, x): 90 | y = self.linear_local(x) 91 | context = x.mean(-1, keepdim=True) + self.seg_pooling(x) 92 | context = self.relu(self.linear1(context)) 93 | m = self.sigmoid(self.linear2(context)) 94 | return y * m 95 | 96 | def seg_pooling(self, x, seg_len=100, stype='avg'): 97 | if stype == 'avg': 98 | seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True) 99 | elif stype == 'max': 100 | seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True) 101 | else: 102 | raise ValueError('Wrong segment pooling type.') 103 | shape = seg.shape 104 | seg = seg.unsqueeze(-1).expand((*shape, seg_len)).reshape((*shape[:-1], -1)) 105 | seg = seg[..., :x.shape[-1]] 106 | return seg 107 | 108 | 109 | class CAMDenseTDNNLayer(nn.Layer): 110 | def __init__(self, 111 | in_channels, 112 | out_channels, 113 | bn_channels, 114 | kernel_size, 115 | stride=1, 116 | dilation=1, 117 | bias=False, 118 | config_str='batchnorm-relu', 119 | memory_efficient=False): 120 | super(CAMDenseTDNNLayer, self).__init__() 121 | assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format( 122 | kernel_size) 123 | padding = (kernel_size - 1) // 2 * dilation 124 | self.memory_efficient = memory_efficient 125 | self.nonlinear1 = get_nonlinear(config_str, in_channels) 126 | self.linear1 = nn.Conv1D(in_channels, bn_channels, 1) 127 | self.nonlinear2 = get_nonlinear(config_str, bn_channels) 128 | self.cam_layer = CAMLayer(bn_channels, 129 | out_channels, 130 | kernel_size, 131 | stride=stride, 132 | padding=padding, 133 | dilation=dilation, 134 | bias=bias) 135 | 136 | def bn_function(self, x): 137 | return self.linear1(self.nonlinear1(x)) 138 | 139 | def forward(self, x): 140 | x = self.bn_function(x) 141 | x = self.cam_layer(self.nonlinear2(x)) 142 | return x 143 | 144 | 145 | class CAMDenseTDNNBlock(nn.LayerList): 146 | def __init__(self, 147 | num_layers, 148 | in_channels, 149 | out_channels, 150 | bn_channels, 151 | kernel_size, 152 | stride=1, 153 | dilation=1, 154 | bias=False, 155 | config_str='batchnorm-relu', 156 | memory_efficient=False): 157 | super(CAMDenseTDNNBlock, self).__init__() 158 | for i in range(num_layers): 159 | layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels, 160 | out_channels=out_channels, 161 | bn_channels=bn_channels, 162 | kernel_size=kernel_size, 163 | stride=stride, 164 | dilation=dilation, 165 | bias=bias, 166 | config_str=config_str, 167 | memory_efficient=memory_efficient) 168 | self.add_sublayer('tdnnd%d' % (i + 1), layer) 169 | 170 | def forward(self, x): 171 | for layer in self: 172 | x = paddle.concat([x, layer(x)], axis=1) 173 | 
return x 174 | 175 | 176 | class TransitLayer(nn.Layer): 177 | def __init__(self, 178 | in_channels, 179 | out_channels, 180 | bias=True, 181 | config_str='batchnorm-relu'): 182 | super(TransitLayer, self).__init__() 183 | self.nonlinear = get_nonlinear(config_str, in_channels) 184 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 185 | 186 | def forward(self, x): 187 | x = self.nonlinear(x) 188 | x = self.linear(x) 189 | return x 190 | 191 | 192 | class DenseLayer(nn.Layer): 193 | def __init__(self, 194 | in_channels, 195 | out_channels, 196 | bias=False, 197 | config_str='batchnorm-relu'): 198 | super(DenseLayer, self).__init__() 199 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 200 | self.nonlinear = get_nonlinear(config_str, out_channels) 201 | 202 | def forward(self, x): 203 | if len(x.shape) == 2: 204 | x = self.linear(x.unsqueeze(axis=-1)).squeeze(axis=-1) 205 | else: 206 | x = self.linear(x) 207 | x = self.nonlinear(x) 208 | return x 209 | 210 | 211 | class BasicResBlock(nn.Layer): 212 | expansion = 1 213 | 214 | def __init__(self, in_planes, planes, stride=1): 215 | super(BasicResBlock, self).__init__() 216 | self.conv1 = nn.Conv2D(in_planes, 217 | planes, 218 | kernel_size=3, 219 | stride=(stride, 1), 220 | padding=1) 221 | self.bn1 = nn.BatchNorm2D(planes) 222 | self.conv2 = nn.Conv2D(planes, 223 | planes, 224 | kernel_size=3, 225 | stride=1, 226 | padding=1) 227 | self.bn2 = nn.BatchNorm2D(planes) 228 | 229 | self.shortcut = nn.Sequential() 230 | if stride != 1 or in_planes != self.expansion * planes: 231 | self.shortcut = nn.Sequential( 232 | nn.Conv2D(in_planes, 233 | self.expansion * planes, 234 | kernel_size=1, 235 | stride=(stride, 1)), 236 | nn.BatchNorm2D(self.expansion * planes)) 237 | 238 | def forward(self, x): 239 | out = F.relu(self.bn1(self.conv1(x))) 240 | out = self.bn2(self.conv2(out)) 241 | out += self.shortcut(x) 242 | out = F.relu(out) 243 | return out 244 | 245 | 246 | class FCM(nn.Layer): 247 | def __init__(self, 248 | block=BasicResBlock, 249 | num_blocks=[2, 2], 250 | m_channels=32, 251 | feat_dim=80): 252 | super(FCM, self).__init__() 253 | self.in_planes = m_channels 254 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 255 | self.bn1 = nn.BatchNorm2D(m_channels) 256 | 257 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 258 | self.layer2 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 259 | 260 | self.conv2 = nn.Conv2D(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1) 261 | self.bn2 = nn.BatchNorm2D(m_channels) 262 | self.out_channels = m_channels * (math.ceil(feat_dim / 8)) 263 | 264 | def _make_layer(self, block, planes, num_blocks, stride): 265 | strides = [stride] + [1] * (num_blocks - 1) 266 | layers = [] 267 | for stride in strides: 268 | layers.append(block(self.in_planes, planes, stride)) 269 | self.in_planes = planes * block.expansion 270 | return nn.Sequential(*layers) 271 | 272 | def forward(self, x): 273 | x = x.unsqueeze(1) 274 | out = F.relu(self.bn1(self.conv1(x))) 275 | out = self.layer1(out) 276 | out = self.layer2(out) 277 | out = F.relu(self.bn2(self.conv2(out))) 278 | 279 | shape = out.shape 280 | out = out.reshape((shape[0], shape[1] * shape[2], shape[3])) 281 | return out 282 | 283 | 284 | class CAMPPlus(nn.Layer): 285 | def __init__(self, 286 | input_size, 287 | embd_dim=512, 288 | growth_rate=32, 289 | bn_size=4, 290 | init_channels=128, 291 | config_str='batchnorm-relu', 292 | memory_efficient=True): 293 | super(CAMPPlus, 
self).__init__() 294 | 295 | self.head = FCM(feat_dim=input_size) 296 | channels = self.head.out_channels 297 | self.embd_dim = embd_dim 298 | 299 | self.xvector = nn.Sequential(('tdnn', TDNNLayer(channels, 300 | init_channels, 301 | 5, 302 | stride=2, 303 | dilation=1, 304 | padding=-1, 305 | config_str=config_str))) 306 | channels = init_channels 307 | for i, (num_layers, kernel_size, 308 | dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 2))): 309 | block = CAMDenseTDNNBlock(num_layers=num_layers, 310 | in_channels=channels, 311 | out_channels=growth_rate, 312 | bn_channels=bn_size * growth_rate, 313 | kernel_size=kernel_size, 314 | dilation=dilation, 315 | config_str=config_str, 316 | memory_efficient=memory_efficient) 317 | self.xvector.add_sublayer('block%d' % (i + 1), block) 318 | channels = channels + num_layers * growth_rate 319 | self.xvector.add_sublayer('transit%d' % (i + 1), 320 | TransitLayer(channels, 321 | channels // 2, 322 | bias=False, 323 | config_str=config_str)) 324 | channels //= 2 325 | 326 | self.xvector.add_sublayer('out_nonlinear', get_nonlinear(config_str, channels)) 327 | 328 | self.xvector.add_sublayer('stats', StatsPool()) 329 | self.xvector.add_sublayer('dense', DenseLayer(channels * 2, embd_dim, config_str='batchnorm_')) 330 | 331 | def forward(self, x): 332 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 333 | x = self.head(x) 334 | x = self.xvector(x) 335 | return x 336 | -------------------------------------------------------------------------------- /ppvector/models/ecapa_tdnn.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | from ppvector.models.pooling import AttentiveStatisticsPooling, SelfAttentivePooling 5 | from ppvector.models.pooling import TemporalAveragePooling, TemporalStatisticsPooling 6 | from ppvector.models.utils import BatchNorm1d, Conv1d, TDNNBlock, length_to_mask 7 | 8 | __all__ = ['EcapaTdnn'] 9 | 10 | 11 | class Res2NetBlock(nn.Layer): 12 | def __init__(self, in_channels, out_channels, scale=8, dilation=1): 13 | """Implementation of Res2Net Block with dilation 14 | The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture", 15 | whose url is https://arxiv.org/abs/1904.01169 16 | Args: 17 | in_channels (int): input channels or input dimensions 18 | out_channels (int): output channels or output dimensions 19 | scale (int, optional): scale in res2net bolck. Defaults to 8. 20 | dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. 
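        Example (a shape-only sketch; the 64 channels, scale=8 and 100 frames are arbitrary, and the output is expected to keep the (batch, channels, time) shape assuming the TDNN sub-blocks use same-padding convolutions as in this project's Conv1d wrapper):
            import paddle
            from ppvector.models.ecapa_tdnn import Res2NetBlock
            block = Res2NetBlock(64, 64, scale=8, dilation=1)
            x = paddle.randn([2, 64, 100])  # (batch, channels, time)
            y = block(x)                    # expected to keep the same shape as x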
21 | """ 22 | super().__init__() 23 | assert in_channels % scale == 0 24 | assert out_channels % scale == 0 25 | 26 | in_channel = in_channels // scale 27 | hidden_channel = out_channels // scale 28 | 29 | self.blocks = nn.LayerList([ 30 | TDNNBlock( 31 | in_channel, hidden_channel, kernel_size=3, dilation=dilation) 32 | for i in range(scale - 1) 33 | ]) 34 | self.scale = scale 35 | 36 | def forward(self, x): 37 | y = [] 38 | for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): 39 | if i == 0: 40 | y_i = x_i 41 | elif i == 1: 42 | y_i = self.blocks[i - 1](x_i) 43 | else: 44 | y_i = self.blocks[i - 1](x_i + y_i) 45 | y.append(y_i) 46 | y = paddle.concat(y, axis=1) 47 | return y 48 | 49 | 50 | class SEBlock(nn.Layer): 51 | def __init__(self, in_channels, se_channels, out_channels): 52 | """Implementation of SEBlock 53 | The paper is refered as "Squeeze-and-Excitation Networks" 54 | whose url is https://arxiv.org/abs/1709.01507 55 | Args: 56 | in_channels (int): input channels or input data dimensions 57 | se_channels (_type_): _description_ 58 | out_channels (int): output channels or output data dimensions 59 | """ 60 | super().__init__() 61 | 62 | self.conv1 = Conv1d( 63 | in_channels=in_channels, out_channels=se_channels, kernel_size=1) 64 | self.relu = paddle.nn.ReLU() 65 | self.conv2 = Conv1d( 66 | in_channels=se_channels, out_channels=out_channels, kernel_size=1) 67 | self.sigmoid = paddle.nn.Sigmoid() 68 | 69 | def forward(self, x, lengths=None): 70 | L = x.shape[-1] 71 | if lengths is not None: 72 | mask = length_to_mask(lengths * L, max_len=L) 73 | mask = mask.unsqueeze(1) 74 | total = mask.sum(axis=2, keepdim=True) 75 | s = (x * mask).sum(axis=2, keepdim=True) / total 76 | else: 77 | s = x.mean(axis=2, keepdim=True) 78 | 79 | s = self.relu(self.conv1(s)) 80 | s = self.sigmoid(self.conv2(s)) 81 | 82 | return s * x 83 | 84 | 85 | class SERes2NetBlock(nn.Layer): 86 | def __init__( 87 | self, 88 | in_channels, 89 | out_channels, 90 | res2net_scale=8, 91 | se_channels=128, 92 | kernel_size=1, 93 | dilation=1, 94 | activation=nn.ReLU, ): 95 | """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model 96 | The paper is refered "Squeeze-and-Excitation Networks" 97 | whose url is: https://arxiv.org/pdf/1709.01507.pdf 98 | Args: 99 | in_channels (int): input channels or input data dimensions 100 | out_channels (int): output channels or output data dimensions 101 | res2net_scale (int, optional): scale in the res2net block. Defaults to 8. 102 | se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128. 103 | kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1. 104 | dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. 105 | activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 
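        Example (an illustrative sketch, not part of the original source; the shapes and
        the relative-length values are arbitrary placeholders)::

            import paddle
            from ppvector.models.ecapa_tdnn import SERes2NetBlock

            block = SERes2NetBlock(in_channels=512, out_channels=512,
                                   res2net_scale=8, se_channels=128,
                                   kernel_size=3, dilation=2)
            x = paddle.randn([2, 512, 150])          # (batch, channels, frames)
            lengths = paddle.to_tensor([1.0, 0.8])   # relative utterance lengths used by the SE mask
            y = block(x, lengths=lengths)            # (2, 512, 150), with the residual shortcut added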
106 | """ 107 | super().__init__() 108 | self.out_channels = out_channels 109 | self.tdnn1 = TDNNBlock( 110 | in_channels, 111 | out_channels, 112 | kernel_size=1, 113 | dilation=1, 114 | activation=activation, ) 115 | self.res2net_block = Res2NetBlock(out_channels, out_channels, 116 | res2net_scale, dilation) 117 | self.tdnn2 = TDNNBlock( 118 | out_channels, 119 | out_channels, 120 | kernel_size=1, 121 | dilation=1, 122 | activation=activation, ) 123 | self.se_block = SEBlock(out_channels, se_channels, out_channels) 124 | 125 | self.shortcut = None 126 | if in_channels != out_channels: 127 | self.shortcut = Conv1d( 128 | in_channels=in_channels, 129 | out_channels=out_channels, 130 | kernel_size=1, ) 131 | 132 | def forward(self, x, lengths=None): 133 | residual = x 134 | if self.shortcut: 135 | residual = self.shortcut(x) 136 | 137 | x = self.tdnn1(x) 138 | x = self.res2net_block(x) 139 | x = self.tdnn2(x) 140 | x = self.se_block(x, lengths) 141 | 142 | return x + residual 143 | 144 | 145 | class EcapaTdnn(nn.Layer): 146 | def __init__( 147 | self, 148 | input_size, 149 | embd_dim=192, 150 | pooling_type="ASP", 151 | activation=nn.ReLU, 152 | channels=[512, 512, 512, 512, 1536], 153 | kernel_sizes=[5, 3, 3, 3, 1], 154 | dilations=[1, 2, 3, 4, 1], 155 | attention_channels=128, 156 | res2net_scale=8, 157 | se_channels=128, 158 | global_context=True, ): 159 | """Implementation of ECAPA-TDNN backbone model network 160 | The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" 161 | whose url is: https://arxiv.org/abs/2005.07143 162 | Args: 163 | input_size (_type_): input fature dimension 164 | embd_dim (int, optional): speaker embedding size. Defaults to 192. 165 | activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 166 | channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536]. 167 | kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1]. 168 | dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1]. 169 | attention_channels (int, optional): attention dimensions. Defaults to 128. 170 | res2net_scale (int, optional): scale value in res2net. Defaults to 8. 171 | se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128. 172 | global_context (bool, optional): global context flag. Defaults to True. 
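        Example (an illustrative end-to-end sketch, not part of the original source; the
        80-dim features, batch size and frame count are assumed placeholder values)::

            import paddle
            from ppvector.models.ecapa_tdnn import EcapaTdnn

            model = EcapaTdnn(input_size=80, embd_dim=192, pooling_type="ASP")
            feats = paddle.randn([4, 200, 80])     # (batch, time, freq) acoustic features
            lengths = paddle.ones([4])             # relative lengths, 1.0 = full utterance
            emb = model(feats, lengths=lengths)    # speaker embeddings, last dimension = embd_dim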
173 | """ 174 | super().__init__() 175 | assert len(channels) == len(kernel_sizes) 176 | assert len(channels) == len(dilations) 177 | self.input_size = input_size 178 | self.channels = channels 179 | self.blocks = nn.LayerList() 180 | self.embd_dim = embd_dim 181 | 182 | # The initial TDNN layer 183 | self.blocks.append( 184 | TDNNBlock( 185 | input_size, 186 | channels[0], 187 | kernel_sizes[0], 188 | dilations[0], 189 | activation, )) 190 | 191 | # SE-Res2Net layers 192 | for i in range(1, len(channels) - 1): 193 | self.blocks.append( 194 | SERes2NetBlock( 195 | channels[i - 1], 196 | channels[i], 197 | res2net_scale=res2net_scale, 198 | se_channels=se_channels, 199 | kernel_size=kernel_sizes[i], 200 | dilation=dilations[i], 201 | activation=activation, )) 202 | 203 | # Multi-layer feature aggregation 204 | self.mfa = TDNNBlock( 205 | channels[-1], 206 | channels[-1], 207 | kernel_sizes[-1], 208 | dilations[-1], 209 | activation, ) 210 | 211 | cat_channels = channels[-1] 212 | if pooling_type == "ASP": 213 | self.asp = AttentiveStatisticsPooling(channels[-1], 214 | attention_channels=attention_channels, 215 | global_context=global_context) 216 | self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) 217 | # Final linear transformation 218 | self.fc = Conv1d(in_channels=channels[-1] * 2, 219 | out_channels=self.embd_dim, 220 | kernel_size=1) 221 | elif pooling_type == "SAP": 222 | self.asp = SelfAttentivePooling(cat_channels, 128) 223 | self.asp_bn = nn.BatchNorm1D(cat_channels) 224 | # Final linear transformation 225 | self.fc = Conv1d(in_channels=cat_channels, 226 | out_channels=self.embd_dim, 227 | kernel_size=1) 228 | elif pooling_type == "TAP": 229 | self.asp = TemporalAveragePooling() 230 | self.asp_bn = nn.BatchNorm1D(cat_channels) 231 | # Final linear transformation 232 | self.fc = Conv1d(in_channels=cat_channels, 233 | out_channels=self.embd_dim, 234 | kernel_size=1) 235 | elif pooling_type == "TSP": 236 | self.asp = TemporalStatisticsPooling() 237 | self.asp_bn = nn.BatchNorm1D(cat_channels * 2) 238 | # Final linear transformation 239 | self.fc = Conv1d(in_channels=cat_channels * 2, 240 | out_channels=self.embd_dim, 241 | kernel_size=1) 242 | else: 243 | raise Exception(f'没有{pooling_type}池化层!') 244 | 245 | def forward(self, x, lengths=None): 246 | """ 247 | Compute embeddings. 248 | 249 | Args: 250 | x (paddle.Tensor): Input data with shape (N, time, freq). 251 | lengths (paddle.Tensor, optional): Length proportions of batch length with shape (N). Defaults to None. 
252 | 253 | Returns: 254 | paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) 255 | """ 256 | x = x.transpose([0, 2, 1]) 257 | xl = [] 258 | for layer in self.blocks: 259 | try: 260 | x = layer(x, lengths=lengths) 261 | except TypeError: 262 | x = layer(x) 263 | xl.append(x) 264 | 265 | # Multi-layer feature aggregation 266 | x = paddle.concat(xl[1:], axis=1) 267 | x = self.mfa(x) 268 | 269 | # Attentive Statistical Pooling 270 | x = self.asp(x, lengths=lengths) 271 | x = self.asp_bn(x) 272 | x = x.unsqueeze(2) 273 | # Final linear transformation 274 | x = self.fc(x).squeeze(-1) # (N, emb_size, 1) -> (N, emb_size) 275 | 276 | return x 277 | -------------------------------------------------------------------------------- /ppvector/models/eres2net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | from ppvector.models.pooling import TemporalStatsPool 8 | 9 | __all__ = ['ERes2Net', 'ERes2NetV2'] 10 | 11 | 12 | class ReLU(nn.Hardtanh): 13 | 14 | def __init__(self, inplace=False): 15 | super(ReLU, self).__init__(0, 20, inplace) 16 | 17 | def __repr__(self): 18 | inplace_str = 'inplace' if self.inplace else '' 19 | return self.__class__.__name__ + ' (' + inplace_str + ')' 20 | 21 | 22 | def conv1x1(in_planes, out_planes, stride=1): 23 | "1x1 convolution without padding" 24 | return nn.Conv2D(in_planes, out_planes, kernel_size=1, stride=stride, padding=0) 25 | 26 | 27 | def conv3x3(in_planes, out_planes, stride=1): 28 | "3x3 convolution with padding" 29 | return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride, padding=1) 30 | 31 | 32 | class AFF(nn.Layer): 33 | 34 | def __init__(self, channels=64, r=4): 35 | super(AFF, self).__init__() 36 | inter_channels = int(channels // r) 37 | 38 | self.local_att = nn.Sequential( 39 | nn.Conv2D(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0), 40 | nn.BatchNorm2D(inter_channels), 41 | nn.Silu(), 42 | nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), 43 | nn.BatchNorm2D(channels), 44 | ) 45 | 46 | def forward(self, x, ds_y): 47 | xa = paddle.concat((x, ds_y), axis=1) 48 | x_att = self.local_att(xa) 49 | x_att = 1.0 + paddle.tanh(x_att) 50 | xo = paddle.multiply(x, x_att) + paddle.multiply(ds_y, 2.0 - x_att) 51 | 52 | return xo 53 | 54 | 55 | class BasicBlockERes2Net(nn.Layer): 56 | 57 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2): 58 | super(BasicBlockERes2Net, self).__init__() 59 | self.expansion = expansion 60 | width = int(math.floor(planes * (base_width / 64.0))) 61 | self.conv1 = conv1x1(in_planes, width * scale, stride) 62 | self.bn1 = nn.BatchNorm2D(width * scale) 63 | self.nums = scale 64 | 65 | convs = [] 66 | bns = [] 67 | for i in range(self.nums): 68 | convs.append(conv3x3(width, width)) 69 | bns.append(nn.BatchNorm2D(width)) 70 | self.convs = nn.LayerList(convs) 71 | self.bns = nn.LayerList(bns) 72 | self.relu = ReLU(inplace=True) 73 | 74 | self.conv3 = conv1x1(width * scale, planes * self.expansion) 75 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 76 | self.shortcut = nn.Sequential() 77 | if stride != 1 or in_planes != self.expansion * planes: 78 | self.shortcut = nn.Sequential( 79 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 80 | nn.BatchNorm2D(self.expansion * planes)) 81 | self.stride = stride 82 | self.width = width 83 | self.scale = scale 84 | 85 | def 
forward(self, x): 86 | out = self.conv1(x) 87 | out = self.bn1(out) 88 | out = self.relu(out) 89 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 90 | for i in range(self.nums): 91 | if i == 0: 92 | sp = spx[i] 93 | else: 94 | sp = sp + spx[i] 95 | sp = self.convs[i](sp) 96 | sp = self.relu(self.bns[i](sp)) 97 | if i == 0: 98 | out = sp 99 | else: 100 | out = paddle.concat((out, sp), 1) 101 | out = self.conv3(out) 102 | out = self.bn3(out) 103 | 104 | residual = self.shortcut(x) 105 | out += residual 106 | out = self.relu(out) 107 | 108 | return out 109 | 110 | 111 | class BasicBlockERes2Net_diff_AFF(nn.Layer): 112 | 113 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2): 114 | super(BasicBlockERes2Net_diff_AFF, self).__init__() 115 | self.expansion = expansion 116 | width = int(math.floor(planes * (base_width / 64.0))) 117 | self.conv1 = conv1x1(in_planes, width * scale, stride) 118 | self.bn1 = nn.BatchNorm2D(width * scale) 119 | 120 | self.nums = scale 121 | 122 | convs = [] 123 | fuse_models = [] 124 | bns = [] 125 | for i in range(self.nums): 126 | convs.append(conv3x3(width, width)) 127 | bns.append(nn.BatchNorm2D(width)) 128 | for j in range(self.nums - 1): 129 | fuse_models.append(AFF(channels=width)) 130 | 131 | self.convs = nn.LayerList(convs) 132 | self.bns = nn.LayerList(bns) 133 | self.fuse_models = nn.LayerList(fuse_models) 134 | self.relu = ReLU(inplace=True) 135 | 136 | self.conv3 = conv1x1(width * scale, planes * self.expansion) 137 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 138 | self.shortcut = nn.Sequential() 139 | if stride != 1 or in_planes != self.expansion * planes: 140 | self.shortcut = nn.Sequential( 141 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 142 | nn.BatchNorm2D(self.expansion * planes)) 143 | self.stride = stride 144 | self.width = width 145 | self.scale = scale 146 | 147 | def forward(self, x): 148 | out = self.conv1(x) 149 | out = self.bn1(out) 150 | out = self.relu(out) 151 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 152 | for i in range(self.nums): 153 | if i == 0: 154 | sp = spx[i] 155 | else: 156 | sp = self.fuse_models[i - 1](sp, spx[i]) 157 | sp = self.convs[i](sp) 158 | sp = self.relu(self.bns[i](sp)) 159 | if i == 0: 160 | out = sp 161 | else: 162 | out = paddle.concat((out, sp), 1) 163 | out = self.conv3(out) 164 | out = self.bn3(out) 165 | 166 | residual = self.shortcut(x) 167 | out += residual 168 | out = self.relu(out) 169 | 170 | return out 171 | 172 | 173 | class ERes2Net(nn.Layer): 174 | def __init__(self, 175 | input_size, 176 | block=BasicBlockERes2Net, 177 | block_fuse=BasicBlockERes2Net_diff_AFF, 178 | num_blocks=[3, 4, 6, 3], 179 | m_channels=32, 180 | mul_channel=1, 181 | expansion=2, 182 | base_width=32, 183 | scale=2, 184 | embd_dim=192, 185 | pooling_type='TSTP', 186 | two_emb_layer=False): 187 | super(ERes2Net, self).__init__() 188 | self.in_planes = m_channels 189 | self.expansion = expansion 190 | self.feat_dim = input_size 191 | self.embd_dim = embd_dim 192 | self.stats_dim = int(input_size / 8) * m_channels * 8 193 | self.two_emb_layer = two_emb_layer 194 | 195 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 196 | self.bn1 = nn.BatchNorm2D(m_channels) 197 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1, 198 | base_width=base_width, scale=scale) 199 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2, 200 | base_width=base_width, 
scale=scale) 201 | self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2, 202 | base_width=base_width, scale=scale) 203 | self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2, 204 | base_width=base_width, scale=scale) 205 | 206 | # Downsampling module for each layer 207 | self.layer1_downsample = nn.Conv2D(m_channels * 2 * mul_channel, m_channels * 4 * mul_channel, kernel_size=3, 208 | padding=1, stride=2) 209 | self.layer2_downsample = nn.Conv2D(m_channels * 4 * mul_channel, m_channels * 8 * mul_channel, kernel_size=3, 210 | padding=1, stride=2) 211 | self.layer3_downsample = nn.Conv2D(m_channels * 8 * mul_channel, m_channels * 16 * mul_channel, kernel_size=3, 212 | padding=1, stride=2) 213 | self.fuse_mode12 = AFF(channels=m_channels * 4 * mul_channel) 214 | self.fuse_mode123 = AFF(channels=m_channels * 8 * mul_channel) 215 | self.fuse_mode1234 = AFF(channels=m_channels * 16 * mul_channel) 216 | 217 | self.n_stats = 1 if pooling_type == 'TAP' else 2 218 | if pooling_type == "TSTP": 219 | self.pooling = TemporalStatsPool() 220 | else: 221 | raise Exception(f'没有{pooling_type}池化层!') 222 | 223 | self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim) 224 | if self.two_emb_layer: 225 | self.seg_bn_1 = nn.BatchNorm1D(embd_dim) 226 | self.seg_2 = nn.Linear(embd_dim, embd_dim) 227 | else: 228 | self.seg_bn_1 = nn.Identity() 229 | self.seg_2 = nn.Identity() 230 | 231 | def _make_layer(self, block, planes, num_blocks, stride, base_width, scale): 232 | strides = [stride] + [1] * (num_blocks - 1) 233 | layers = [] 234 | for stride in strides: 235 | layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale)) 236 | self.in_planes = planes * self.expansion 237 | return nn.Sequential(*layers) 238 | 239 | def forward(self, x): 240 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 241 | 242 | x = x.unsqueeze_(1) 243 | out = F.relu(self.bn1(self.conv1(x))) 244 | out1 = self.layer1(out) 245 | out2 = self.layer2(out1) 246 | out1_downsample = self.layer1_downsample(out1) 247 | fuse_out12 = self.fuse_mode12(out2, out1_downsample) 248 | out3 = self.layer3(out2) 249 | fuse_out12_downsample = self.layer2_downsample(fuse_out12) 250 | fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample) 251 | out4 = self.layer4(out3) 252 | fuse_out123_downsample = self.layer3_downsample(fuse_out123) 253 | fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample) 254 | stats = self.pooling(fuse_out1234) 255 | 256 | embed_a = self.seg_1(stats) 257 | if self.two_emb_layer: 258 | out = F.relu(embed_a) 259 | out = self.seg_bn_1(out) 260 | embed_b = self.seg_2(out) 261 | return embed_b 262 | else: 263 | return embed_a 264 | 265 | 266 | class BasicBlockERes2NetV2(nn.Layer): 267 | 268 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2): 269 | super(BasicBlockERes2NetV2, self).__init__() 270 | self.expansion = expansion 271 | width = int(math.floor(planes * (base_width / 64.0))) 272 | self.conv1 = nn.Conv2D(in_planes, width * scale, kernel_size=1, stride=stride) 273 | self.bn1 = nn.BatchNorm2D(width * scale) 274 | self.nums = scale 275 | 276 | convs = [] 277 | bns = [] 278 | for i in range(self.nums): 279 | convs.append(nn.Conv2D(width, width, kernel_size=3, padding=1)) 280 | bns.append(nn.BatchNorm2D(width)) 281 | self.convs = nn.LayerList(convs) 282 | self.bns = nn.LayerList(bns) 283 | self.relu = ReLU(inplace=True) 284 | 285 | self.conv3 = nn.Conv2D(width * scale, planes * 
self.expansion, kernel_size=1) 286 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 287 | self.shortcut = nn.Sequential() 288 | if stride != 1 or in_planes != self.expansion * planes: 289 | self.shortcut = nn.Sequential( 290 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 291 | nn.BatchNorm2D(self.expansion * planes)) 292 | self.stride = stride 293 | self.width = width 294 | self.scale = scale 295 | 296 | def forward(self, x): 297 | out = self.conv1(x) 298 | out = self.bn1(out) 299 | out = self.relu(out) 300 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 301 | for i in range(self.nums): 302 | if i == 0: 303 | sp = spx[i] 304 | else: 305 | sp = sp + spx[i] 306 | sp = self.convs[i](sp) 307 | sp = self.relu(self.bns[i](sp)) 308 | if i == 0: 309 | out = sp 310 | else: 311 | out = paddle.concat((out, sp), 1) 312 | out = self.conv3(out) 313 | out = self.bn3(out) 314 | 315 | residual = self.shortcut(x) 316 | out += residual 317 | out = self.relu(out) 318 | 319 | return out 320 | 321 | 322 | class BasicBlockERes2NetV2_AFF(nn.Layer): 323 | 324 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2): 325 | super(BasicBlockERes2NetV2_AFF, self).__init__() 326 | self.expansion = expansion 327 | width = int(math.floor(planes * (base_width / 64.0))) 328 | self.conv1 = nn.Conv2D(in_planes, width * scale, kernel_size=1, stride=stride) 329 | self.bn1 = nn.BatchNorm2D(width * scale) 330 | self.nums = scale 331 | 332 | convs = [] 333 | fuse_models = [] 334 | bns = [] 335 | for i in range(self.nums): 336 | convs.append(nn.Conv2D(width, width, kernel_size=3, padding=1)) 337 | bns.append(nn.BatchNorm2D(width)) 338 | for j in range(self.nums - 1): 339 | fuse_models.append(AFF(channels=width, r=4)) 340 | 341 | self.convs = nn.LayerList(convs) 342 | self.bns = nn.LayerList(bns) 343 | self.fuse_models = nn.LayerList(fuse_models) 344 | self.relu = ReLU(inplace=True) 345 | 346 | self.conv3 = nn.Conv2D(width * scale, planes * self.expansion, kernel_size=1) 347 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 348 | self.shortcut = nn.Sequential() 349 | if stride != 1 or in_planes != self.expansion * planes: 350 | self.shortcut = nn.Sequential( 351 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 352 | nn.BatchNorm2D(self.expansion * planes)) 353 | self.stride = stride 354 | self.width = width 355 | self.scale = scale 356 | 357 | def forward(self, x): 358 | out = self.conv1(x) 359 | out = self.bn1(out) 360 | out = self.relu(out) 361 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 362 | for i in range(self.nums): 363 | if i == 0: 364 | sp = spx[i] 365 | else: 366 | sp = self.fuse_models[i - 1](sp, spx[i]) 367 | sp = self.convs[i](sp) 368 | sp = self.relu(self.bns[i](sp)) 369 | if i == 0: 370 | out = sp 371 | else: 372 | out = paddle.concat((out, sp), 1) 373 | out = self.conv3(out) 374 | out = self.bn3(out) 375 | 376 | residual = self.shortcut(x) 377 | out += residual 378 | out = self.relu(out) 379 | 380 | return out 381 | 382 | 383 | class ERes2NetV2(nn.Layer): 384 | def __init__(self, 385 | input_size, 386 | block=BasicBlockERes2NetV2, 387 | block_fuse=BasicBlockERes2NetV2_AFF, 388 | num_blocks=[3, 4, 6, 3], 389 | m_channels=32, 390 | expansion=2, 391 | base_width=26, 392 | scale=2, 393 | embd_dim=192, 394 | pooling_type='TSTP', 395 | two_emb_layer=False): 396 | super(ERes2NetV2, self).__init__() 397 | self.in_planes = m_channels 398 | self.expansion = expansion 399 | self.embd_dim = embd_dim 400 
| self.stats_dim = int(input_size / 8) * m_channels * 8 401 | self.two_emb_layer = two_emb_layer 402 | 403 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 404 | self.bn1 = nn.BatchNorm2D(m_channels) 405 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1, 406 | base_width=base_width, scale=scale) 407 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2, 408 | base_width=base_width, scale=scale) 409 | self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2, 410 | base_width=base_width, scale=scale) 411 | self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2, 412 | base_width=base_width, scale=scale) 413 | 414 | # Downsampling module 415 | self.layer3_ds = nn.Conv2D(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2) 416 | 417 | # Bottom-up fusion module 418 | self.fuse34 = AFF(channels=m_channels * 16, r=4) 419 | 420 | self.n_stats = 1 if pooling_type == 'TAP' else 2 421 | if pooling_type == "TSTP": 422 | self.pooling = TemporalStatsPool() 423 | else: 424 | raise Exception(f'没有{pooling_type}池化层!') 425 | 426 | self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim) 427 | if self.two_emb_layer: 428 | self.seg_bn_1 = nn.BatchNorm1D(embd_dim) 429 | self.seg_2 = nn.Linear(embd_dim, embd_dim) 430 | else: 431 | self.seg_bn_1 = nn.Identity() 432 | self.seg_2 = nn.Identity() 433 | 434 | def _make_layer(self, block, planes, num_blocks, stride, base_width, scale): 435 | strides = [stride] + [1] * (num_blocks - 1) 436 | layers = [] 437 | for stride in strides: 438 | layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale)) 439 | self.in_planes = planes * self.expansion 440 | return nn.Sequential(*layers) 441 | 442 | def forward(self, x): 443 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 444 | 445 | x = x.unsqueeze_(1) 446 | out = F.relu(self.bn1(self.conv1(x))) 447 | out1 = self.layer1(out) 448 | out2 = self.layer2(out1) 449 | out3 = self.layer3(out2) 450 | out4 = self.layer4(out3) 451 | out3_ds = self.layer3_ds(out3) 452 | fuse_out34 = self.fuse34(out4, out3_ds) 453 | stats = self.pooling(fuse_out34) 454 | 455 | embed_a = self.seg_1(stats) 456 | if self.two_emb_layer: 457 | out = F.relu(embed_a) 458 | out = self.seg_bn_1(out) 459 | embed_b = self.seg_2(out) 460 | return embed_b 461 | else: 462 | return embed_a 463 | -------------------------------------------------------------------------------- /ppvector/models/fc.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | import paddle.nn.functional as F 4 | 5 | 6 | class SpeakerIdentification(nn.Layer): 7 | def __init__(self, 8 | input_dim, 9 | num_speakers, 10 | classifier_type='Cosine', 11 | K=1, 12 | num_blocks=0, 13 | inter_dim=512): 14 | """The speaker identification model, which includes the speaker backbone network 15 | and the a linear transform to speaker class num in training 16 | 17 | Args: 18 | input_dim (nn.Module, class): embedding model output dim. 19 | num_speakers (_type_): the speaker class num in the training dataset 20 | classifier_type (str, optional): type of output layer to uses. 21 | num_blocks (int, optional): the linear layer transform between the embedding and the final linear layer. Defaults to 0. 22 | inter_dim (int, optional): the output dimension of dense layer. Defaults to 512. 
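        Example (an illustrative sketch, not part of the original source; the embedding
        dimension and speaker count are arbitrary placeholder values)::

            import paddle
            from ppvector.models.fc import SpeakerIdentification

            head = SpeakerIdentification(input_dim=192, num_speakers=1000)  # default Cosine classifier, num_blocks=0
            embeddings = paddle.randn([4, 192])     # (batch, embedding_dim) from the backbone
            out = head(embeddings)
            # out["logits"] has shape (4, 1000); out["features"] is the input embedding passed through unchanged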
23 | """ 24 | super(SpeakerIdentification, self).__init__() 25 | self.classifier_type = classifier_type 26 | self.blocks = nn.LayerList() 27 | 28 | for index in range(num_blocks): 29 | self.blocks.append(DenseLayer(input_dim, inter_dim, config_str='batchnorm')) 30 | input_dim = inter_dim 31 | 32 | if self.classifier_type == 'Cosine': 33 | self.weight = paddle.create_parameter(shape=[input_dim, num_speakers * K], 34 | dtype='float32', 35 | attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), ) 36 | elif self.classifier_type == 'Linear': 37 | self.output = nn.Linear(input_dim, num_speakers) 38 | else: 39 | raise ValueError(f'不支持该输出层:{self.classifier_type}') 40 | 41 | def forward(self, features): 42 | # x: [B, dim] 43 | x = features 44 | for layer in self.blocks: 45 | x = layer(x) 46 | 47 | # normalized 48 | if self.classifier_type == 'Cosine': 49 | logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0)) 50 | else: 51 | logits = self.output(x) 52 | 53 | return {"features": features, "logits": logits} 54 | 55 | 56 | class DenseLayer(nn.Layer): 57 | def __init__(self, 58 | in_channels, 59 | out_channels, 60 | config_str='batchnorm-relu'): 61 | super(DenseLayer, self).__init__() 62 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 63 | self.nonlinear = get_nonlinear(config_str, out_channels) 64 | 65 | def forward(self, x): 66 | if len(x.shape) == 2: 67 | x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1) 68 | else: 69 | x = self.linear(x) 70 | x = self.nonlinear(x) 71 | return x 72 | 73 | 74 | def get_nonlinear(config_str, channels): 75 | nonlinear = nn.Sequential() 76 | for name in config_str.split('-'): 77 | if name == 'relu': 78 | nonlinear.add_module('relu', nn.ReLU()) 79 | elif name == 'prelu': 80 | nonlinear.add_module('prelu', nn.PReLU(channels)) 81 | elif name == 'batchnorm': 82 | nonlinear.add_module('batchnorm', nn.BatchNorm1D(channels)) 83 | elif name == 'batchnorm_': 84 | nonlinear.add_module('batchnorm', nn.BatchNorm1D(channels)) 85 | else: 86 | raise ValueError('Unexpected module ({}).'.format(name)) 87 | return nonlinear 88 | -------------------------------------------------------------------------------- /ppvector/models/pooling.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | import paddle.nn.functional as F 4 | 5 | from ppvector.models.utils import length_to_mask, Conv1d, TDNNBlock 6 | 7 | 8 | class TemporalAveragePooling(nn.Layer): 9 | def __init__(self): 10 | """TAP 11 | Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification 12 | Link: https://arxiv.org/pdf/1903.12058.pdf 13 | """ 14 | super(TemporalAveragePooling, self).__init__() 15 | 16 | def forward(self, x, lengths=None): 17 | """Computes Temporal Average Pooling Module 18 | Args: 19 | x (torch.Tensor): Input tensor (#batch, channels, frames). 20 | Returns: 21 | torch.Tensor: Output tensor (#batch, channels) 22 | """ 23 | x = paddle.mean(x, axis=2) 24 | x = x.unsqueeze(2) 25 | return x 26 | 27 | 28 | class TemporalStatisticsPooling(nn.Layer): 29 | def __init__(self): 30 | """TSP 31 | Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition 32 | Link: http://www.danielpovey.com/files/2018_icassp_xvectors.pdf 33 | """ 34 | super(TemporalStatisticsPooling, self).__init__() 35 | 36 | def forward(self, x, lengths=None): 37 | """Computes Temporal Statistics Pooling Module 38 | Args: 39 | x (torch.Tensor): Input tensor (#batch, channels, frames). 
40 | Returns: 41 | torch.Tensor: Output tensor (#batch, channels*2) 42 | """ 43 | mean = paddle.mean(x, axis=2) 44 | var = paddle.var(x, axis=2) 45 | x = paddle.concat((mean, var), axis=1) 46 | x = x.unsqueeze(2) 47 | return x 48 | 49 | 50 | class SelfAttentivePooling(nn.Layer): 51 | """SAP""" 52 | 53 | def __init__(self, in_dim, bottleneck_dim=128): 54 | # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs. 55 | # attention dim = 128 56 | super(SelfAttentivePooling, self).__init__() 57 | self.linear1 = nn.Conv1D(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper 58 | self.linear2 = nn.Conv1D(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper 59 | 60 | def forward(self, x, lengths=None): 61 | # DON'T use ReLU here! In experiments, I find ReLU hard to converge. 62 | alpha = paddle.tanh(self.linear1(x)) 63 | alpha = paddle.nn.functional.softmax(self.linear2(alpha), axis=2) 64 | mean = paddle.sum(alpha * x, axis=2) 65 | mean = mean.unsqueeze(2) 66 | return mean 67 | 68 | 69 | class AttentiveStatisticsPooling(nn.Layer): 70 | """TSP""" 71 | 72 | def __init__(self, channels, attention_channels=128, global_context=True): 73 | super().__init__() 74 | self.eps = 1e-12 75 | self.global_context = global_context 76 | if global_context: 77 | self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) 78 | else: 79 | self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) 80 | self.tanh = nn.Tanh() 81 | self.conv = Conv1d( 82 | in_channels=attention_channels, 83 | out_channels=channels, 84 | kernel_size=1) 85 | 86 | def forward(self, x, lengths=None): 87 | C, L = x.shape[1], x.shape[2] # KP: (N, C, L) 88 | 89 | def _compute_statistics(x, m, axis=2, eps=self.eps): 90 | mean = (m * x).sum(axis) 91 | std = paddle.sqrt((m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) 92 | return mean, std 93 | 94 | if lengths is None: 95 | lengths = paddle.ones([x.shape[0]]) 96 | 97 | # Make binary mask of shape [N, 1, L] 98 | mask = length_to_mask(lengths * L, max_len=L) 99 | mask = mask.unsqueeze(1) 100 | 101 | # 通过允许自我注意观察话语的全局属性,扩展汇集层的时间上下文。 102 | if self.global_context: 103 | total = mask.sum(axis=2, keepdim=True).astype('float32') 104 | mean, std = _compute_statistics(x, mask / total) 105 | mean = mean.unsqueeze(2).tile((1, 1, L)) 106 | std = std.unsqueeze(2).tile((1, 1, L)) 107 | attn = paddle.concat([x, mean, std], axis=1) 108 | else: 109 | attn = x 110 | 111 | # Apply layers 112 | attn = self.conv(self.tanh(self.tdnn(attn))) 113 | 114 | # Filter out zero-paddings 115 | attn = paddle.where( 116 | mask.tile((1, C, 1)) == 0, 117 | paddle.ones_like(attn) * float("-inf"), attn) 118 | 119 | attn = F.softmax(attn, axis=2) 120 | mean, std = _compute_statistics(x, attn) 121 | 122 | # Append mean and std of the batch 123 | pooled_stats = paddle.concat((mean, std), axis=1) 124 | 125 | return pooled_stats 126 | 127 | 128 | class TemporalStatsPool(nn.Layer): 129 | """TSTP 130 | Temporal statistics pooling, concatenate mean and std, which is used in 131 | x-vector 132 | Comment: simple concatenation can not make full use of both statistics 133 | """ 134 | 135 | def __init__(self): 136 | super(TemporalStatsPool, self).__init__() 137 | 138 | def forward(self, x, lengths=None): 139 | # The last dimension is the temporal axis 140 | pooling_mean = x.mean(axis=-1) 141 | pooling_std = paddle.sqrt(paddle.var(x, axis=-1) + 1e-8) 142 | pooling_mean = pooling_mean.flatten(start_axis=1) 143 | pooling_std = pooling_std.flatten(start_axis=1) 144 | 145 | 
stats = paddle.concat((pooling_mean, pooling_std), 1) 146 | return stats 147 | -------------------------------------------------------------------------------- /ppvector/models/res2net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | 6 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 7 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 8 | from ppvector.models.utils import BatchNorm1d 9 | 10 | 11 | class Bottle2neck(nn.Layer): 12 | expansion = 4 13 | 14 | def __init__(self, inplanes, planes, stride=1, downsample=None, baseWidth=26, scale=4, stype='normal'): 15 | """ Constructor 16 | Args: 17 | inplanes: input channel dimensionality 18 | planes: output channel dimensionality 19 | stride: conv stride. Replaces pooling layer. 20 | downsample: None when stride = 1 21 | baseWidth: basic width of conv3x3 22 | scale: number of scale. 23 | type: 'normal': normal set. 'stage': first block of a new stage. 24 | """ 25 | super(Bottle2neck, self).__init__() 26 | 27 | width = int(math.floor(planes * (baseWidth / 64.0))) 28 | self.conv1 = nn.Conv2D(inplanes, width * scale, kernel_size=1) 29 | self.bn1 = nn.BatchNorm2D(width * scale) 30 | 31 | if scale == 1: 32 | self.nums = 1 33 | else: 34 | self.nums = scale - 1 35 | if stype == 'stage': 36 | self.pool = nn.AvgPool2D(kernel_size=3, stride=stride, padding=1) 37 | convs = [] 38 | bns = [] 39 | for i in range(self.nums): 40 | convs.append(nn.Conv2D(width, width, kernel_size=3, stride=stride, padding=1)) 41 | bns.append(nn.BatchNorm2D(width)) 42 | self.convs = nn.LayerList(convs) 43 | self.bns = nn.LayerList(bns) 44 | 45 | self.conv3 = nn.Conv2D(width * scale, planes * self.expansion, kernel_size=1) 46 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 47 | 48 | self.relu = nn.ReLU() 49 | self.downsample = downsample 50 | self.stype = stype 51 | self.scale = scale 52 | self.width = width 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | spx = paddle.split(out, self.scale, 1) 62 | for i in range(self.nums): 63 | if i == 0 or self.stype == 'stage': 64 | sp = spx[i] 65 | else: 66 | sp = sp + spx[i] 67 | sp = self.convs[i](sp) 68 | sp = self.relu(self.bns[i](sp)) 69 | if i == 0: 70 | out = sp 71 | else: 72 | out = paddle.concat((out, sp), 1) 73 | if self.scale != 1 and self.stype == 'normal': 74 | out = paddle.concat((out, spx[self.nums]), 1) 75 | elif self.scale != 1 and self.stype == 'stage': 76 | out = paddle.concat((out, self.pool(spx[self.nums])), 1) 77 | 78 | out = self.conv3(out) 79 | out = self.bn3(out) 80 | 81 | if self.downsample is not None: 82 | residual = self.downsample(x) 83 | 84 | out += residual 85 | out = self.relu(out) 86 | 87 | return out 88 | 89 | 90 | class Res2Net(nn.Layer): 91 | 92 | def __init__(self, input_size, m_channels=32, layers=[3, 4, 6, 3], base_width=32, scale=2, embd_dim=192, 93 | pooling_type="ASP"): 94 | super(Res2Net, self).__init__() 95 | self.inplanes = m_channels 96 | self.base_width = base_width 97 | self.scale = scale 98 | self.embd_dim = embd_dim 99 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=7, stride=3, padding=1) 100 | self.bn1 = nn.BatchNorm2D(m_channels) 101 | self.relu = nn.ReLU() 102 | self.max_pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) 103 | self.layer1 = self._make_layer(Bottle2neck, m_channels, layers[0]) 104 | 
self.layer2 = self._make_layer(Bottle2neck, m_channels*2, layers[1], stride=2) 105 | self.layer3 = self._make_layer(Bottle2neck, m_channels * 4, layers[2], stride=2) 106 | self.layer4 = self._make_layer(Bottle2neck, m_channels * 8, layers[3], stride=2) 107 | 108 | if input_size < 96: 109 | cat_channels = m_channels * 8 * Bottle2neck.expansion * (input_size // self.base_width) 110 | else: 111 | cat_channels = m_channels * 8 * Bottle2neck.expansion * ( 112 | input_size // self.base_width - int(math.sqrt(input_size / 64))) 113 | if pooling_type == "ASP": 114 | self.pooling = AttentiveStatisticsPooling(cat_channels, attention_channels=128) 115 | self.bn2 = BatchNorm1d(cat_channels * 2) 116 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 117 | self.bn3 = BatchNorm1d(embd_dim) 118 | elif pooling_type == "SAP": 119 | self.pooling = SelfAttentivePooling(cat_channels, 128) 120 | self.bn2 = BatchNorm1d(cat_channels) 121 | self.linear = nn.Linear(cat_channels, embd_dim) 122 | self.bn3 = BatchNorm1d(embd_dim) 123 | elif pooling_type == "TAP": 124 | self.pooling = TemporalAveragePooling() 125 | self.bn2 = BatchNorm1d(cat_channels) 126 | self.linear = nn.Linear(cat_channels, embd_dim) 127 | self.bn3 = BatchNorm1d(embd_dim) 128 | elif pooling_type == "TSP": 129 | self.pooling = TemporalStatisticsPooling() 130 | self.bn2 = BatchNorm1d(cat_channels * 2) 131 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 132 | self.bn3 = BatchNorm1d(embd_dim) 133 | else: 134 | raise Exception(f'没有{pooling_type}池化层!') 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2D(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride), 141 | nn.BatchNorm2D(planes * block.expansion), 142 | ) 143 | 144 | layers = [block(self.inplanes, planes, stride, downsample=downsample, 145 | stype='stage', baseWidth=self.base_width, scale=self.scale)] 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes, baseWidth=self.base_width, scale=self.scale)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = x.transpose([0, 2, 1]) 154 | x = x.unsqueeze(1) 155 | x = self.conv1(x) 156 | x = self.bn1(x) 157 | x = self.relu(x) 158 | x = self.max_pool(x) 159 | 160 | x = self.layer1(x) 161 | x = self.layer2(x) 162 | x = self.layer3(x) 163 | x = self.layer4(x) 164 | 165 | x = x.reshape([x.shape[0], -1, x.shape[-1]]) 166 | x = self.pooling(x) 167 | x = self.bn2(x) 168 | x = self.linear(x) 169 | x = self.bn3(x) 170 | 171 | return x 172 | -------------------------------------------------------------------------------- /ppvector/models/resnet_se.py: -------------------------------------------------------------------------------- 1 | import paddle.nn as nn 2 | 3 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 4 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 5 | from ppvector.models.utils import BatchNorm1d 6 | 7 | 8 | class SEBottleneck(nn.Layer): 9 | expansion = 2 10 | 11 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 12 | super(SEBottleneck, self).__init__() 13 | self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1) 14 | self.bn1 = nn.BatchNorm2D(planes) 15 | self.conv2 = nn.Conv2D(planes, planes, kernel_size=3, stride=stride, padding=1) 16 | self.bn2 = 
nn.BatchNorm2D(planes) 17 | self.conv3 = nn.Conv2D(planes, planes * self.expansion, kernel_size=1) 18 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 19 | self.relu = nn.ReLU() 20 | self.se = SELayer(planes * self.expansion, reduction) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv3(out) 36 | out = self.bn3(out) 37 | out = self.se(out) 38 | 39 | if self.downsample is not None: 40 | residual = self.downsample(x) 41 | 42 | out += residual 43 | out = self.relu(out) 44 | 45 | return out 46 | 47 | 48 | class SELayer(nn.Layer): 49 | def __init__(self, channel, reduction=8): 50 | super(SELayer, self).__init__() 51 | self.avg_pool = nn.AdaptiveAvgPool2D(1) 52 | self.fc = nn.Sequential( 53 | nn.Linear(channel, channel // reduction), 54 | nn.ReLU(), 55 | nn.Linear(channel // reduction, channel), 56 | nn.Sigmoid() 57 | ) 58 | 59 | def forward(self, x): 60 | b, c, _, _ = x.shape 61 | y = self.avg_pool(x).reshape([b, c]) 62 | y = self.fc(y).reshape([b, c, 1, 1]) 63 | return x * y 64 | 65 | 66 | class ResNetSE(nn.Layer): 67 | def __init__(self, input_size, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], embd_dim=192, 68 | pooling_type="ASP"): 69 | super(ResNetSE, self).__init__() 70 | self.inplanes = num_filters[0] 71 | self.embd_dim = embd_dim 72 | self.conv1 = nn.Conv2D(1, num_filters[0], kernel_size=3, stride=(1, 1), padding=1) 73 | self.bn1 = nn.BatchNorm2D(num_filters[0]) 74 | self.relu = nn.ReLU() 75 | 76 | self.layer1 = self._make_layer(SEBottleneck, num_filters[0], layers[0]) 77 | self.layer2 = self._make_layer(SEBottleneck, num_filters[1], layers[1], stride=(2, 2)) 78 | self.layer3 = self._make_layer(SEBottleneck, num_filters[2], layers[2], stride=(2, 2)) 79 | self.layer4 = self._make_layer(SEBottleneck, num_filters[3], layers[3], stride=(2, 2)) 80 | 81 | cat_channels = num_filters[3] * SEBottleneck.expansion * (input_size // 8) 82 | if pooling_type == "ASP": 83 | self.pooling = AttentiveStatisticsPooling(cat_channels, attention_channels=128) 84 | self.bn2 = BatchNorm1d(cat_channels * 2) 85 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 86 | self.bn3 = BatchNorm1d(embd_dim) 87 | elif pooling_type == "SAP": 88 | self.pooling = SelfAttentivePooling(cat_channels, 128) 89 | self.bn2 = BatchNorm1d(cat_channels) 90 | self.linear = nn.Linear(cat_channels, embd_dim) 91 | self.bn3 = BatchNorm1d(embd_dim) 92 | elif pooling_type == "TAP": 93 | self.pooling = TemporalAveragePooling() 94 | self.bn2 = BatchNorm1d(cat_channels) 95 | self.linear = nn.Linear(cat_channels, embd_dim) 96 | self.bn3 = BatchNorm1d(embd_dim) 97 | elif pooling_type == "TSP": 98 | self.pooling = TemporalStatisticsPooling() 99 | self.bn2 = BatchNorm1d(cat_channels * 2) 100 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 101 | self.bn3 = BatchNorm1d(embd_dim) 102 | else: 103 | raise Exception(f'没有{pooling_type}池化层!') 104 | 105 | def _make_layer(self, block, planes, blocks, stride=1): 106 | downsample = None 107 | if stride != 1 or self.inplanes != planes * block.expansion: 108 | downsample = nn.Sequential( 109 | nn.Conv2D(self.inplanes, planes * block.expansion, 110 | kernel_size=1, stride=stride), 111 | nn.BatchNorm2D(planes * block.expansion), 112 | ) 113 | 114 | layers = [block(self.inplanes, planes, stride, downsample)] 115 | self.inplanes = planes * 
block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = x.transpose([0, 2, 1]) 123 | x = x.unsqueeze(1) 124 | x = self.conv1(x) 125 | x = self.bn1(x) 126 | x = self.relu(x) 127 | 128 | x = self.layer1(x) 129 | x = self.layer2(x) 130 | x = self.layer3(x) 131 | x = self.layer4(x) 132 | 133 | x = x.reshape([x.shape[0], -1, x.shape[-1]]) 134 | 135 | x = self.pooling(x) 136 | x = self.bn2(x) 137 | x = self.linear(x) 138 | x = self.bn3(x) 139 | return x 140 | -------------------------------------------------------------------------------- /ppvector/models/tdnn.py: -------------------------------------------------------------------------------- 1 | import paddle.nn as nn 2 | import paddle.nn.functional as F 3 | 4 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 5 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 6 | from ppvector.models.utils import BatchNorm1d 7 | 8 | 9 | class TDNN(nn.Layer): 10 | def __init__(self, input_size, channels=512, embd_dim=192, pooling_type="ASP"): 11 | super(TDNN, self).__init__() 12 | self.embd_dim = embd_dim 13 | self.td_layer1 = nn.Conv1D(in_channels=input_size, out_channels=channels, dilation=1, kernel_size=5, stride=1) 14 | self.bn1 = nn.BatchNorm1D(channels) 15 | self.td_layer2 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=2, kernel_size=3, stride=1) 16 | self.bn2 = nn.BatchNorm1D(channels) 17 | self.td_layer3 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=3, kernel_size=3, stride=1) 18 | self.bn3 = nn.BatchNorm1D(channels) 19 | self.td_layer4 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1) 20 | self.bn4 = nn.BatchNorm1D(channels) 21 | self.td_layer5 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1) 22 | 23 | if pooling_type == "ASP": 24 | self.pooling = AttentiveStatisticsPooling(channels, attention_channels=128) 25 | self.bn5 = BatchNorm1d(channels * 2) 26 | self.linear = nn.Linear(channels * 2, embd_dim) 27 | self.bn6 = BatchNorm1d(embd_dim) 28 | elif pooling_type == "SAP": 29 | self.pooling = SelfAttentivePooling(channels, 128) 30 | self.bn5 = BatchNorm1d(channels) 31 | self.linear = nn.Linear(channels, embd_dim) 32 | self.bn6 = BatchNorm1d(embd_dim) 33 | elif pooling_type == "TAP": 34 | self.pooling = TemporalAveragePooling() 35 | self.bn5 = BatchNorm1d(channels) 36 | self.linear = nn.Linear(channels, embd_dim) 37 | self.bn6 = BatchNorm1d(embd_dim) 38 | elif pooling_type == "TSP": 39 | self.pooling = TemporalStatisticsPooling() 40 | self.bn5 = BatchNorm1d(channels * 2) 41 | self.linear = nn.Linear(channels * 2, embd_dim) 42 | self.bn6 = BatchNorm1d(embd_dim) 43 | else: 44 | raise Exception(f'没有{pooling_type}池化层!') 45 | 46 | def forward(self, x): 47 | """ 48 | Compute embeddings. 49 | 50 | Args: 51 | x (paddle.Tensor): Input data with shape (N, time, freq). 
52 | 53 | Returns: 54 | paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) 55 | """ 56 | x = x.transpose([0, 2, 1]) 57 | x = F.relu(self.td_layer1(x)) 58 | x = self.bn1(x) 59 | x = F.relu(self.td_layer2(x)) 60 | x = self.bn2(x) 61 | x = F.relu(self.td_layer3(x)) 62 | x = self.bn3(x) 63 | x = F.relu(self.td_layer4(x)) 64 | x = self.bn4(x) 65 | x = F.relu(self.td_layer5(x)) 66 | out = self.bn5(self.pooling(x)) 67 | out = self.bn6(self.linear(out)) 68 | return out 69 | -------------------------------------------------------------------------------- /ppvector/models/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | def length_to_mask(length, max_len=None, dtype=None): 9 | assert len(length.shape) == 1 10 | 11 | if max_len is None: 12 | max_len = length.max().astype('int').item() # using arange to generate mask 13 | mask = paddle.arange(max_len, dtype=length.dtype).expand((len(length), max_len)) < length.unsqueeze(1) 14 | 15 | if dtype is None: 16 | dtype = length.dtype 17 | 18 | mask = paddle.to_tensor(mask, dtype=dtype) 19 | return mask 20 | 21 | 22 | class Conv1d(nn.Layer): 23 | def __init__( 24 | self, 25 | in_channels, 26 | out_channels, 27 | kernel_size, 28 | stride=1, 29 | padding="same", 30 | dilation=1, 31 | groups=1, 32 | bias=True, 33 | padding_mode="reflect", ): 34 | """_summary_ 35 | 36 | Args: 37 | in_channels (int): intput channel or input data dimensions 38 | out_channels (int): output channel or output data dimensions 39 | kernel_size (int): kernel size of 1-d convolution 40 | stride (int, optional): strid in 1-d convolution . Defaults to 1. 41 | padding (str, optional): padding value. Defaults to "same". 42 | dilation (int, optional): dilation in 1-d convolution. Defaults to 1. 43 | groups (int, optional): groups in 1-d convolution. Defaults to 1. 44 | bias (bool, optional): bias in 1-d convolution . Defaults to True. 45 | padding_mode (str, optional): padding mode. Defaults to "reflect". 46 | """ 47 | super().__init__() 48 | 49 | self.kernel_size = kernel_size 50 | self.stride = stride 51 | self.dilation = dilation 52 | self.padding = padding 53 | self.padding_mode = padding_mode 54 | 55 | self.conv = nn.Conv1D( 56 | in_channels, 57 | out_channels, 58 | self.kernel_size, 59 | stride=self.stride, 60 | padding=0, 61 | dilation=self.dilation, 62 | groups=groups, 63 | bias_attr=bias, ) 64 | 65 | def forward(self, x): 66 | if self.padding == "same": 67 | x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) 68 | else: 69 | raise ValueError(f"Padding must be 'same'. 
Got {self.padding}") 70 | 71 | return self.conv(x) 72 | 73 | def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): 74 | L_in = x.shape[-1] # Detecting input shape 75 | padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding 76 | x = F.pad(x, padding, mode=self.padding_mode, data_format="NCL") # Applying padding 77 | return x 78 | 79 | def _get_padding_elem(self, 80 | L_in: int, 81 | stride: int, 82 | kernel_size: int, 83 | dilation: int): 84 | if stride > 1: 85 | n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) 86 | L_out = stride * (n_steps - 1) + kernel_size * dilation 87 | padding = [kernel_size // 2, kernel_size // 2] 88 | else: 89 | L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 90 | 91 | padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] 92 | 93 | return padding 94 | 95 | 96 | class BatchNorm1d(nn.Layer): 97 | def __init__( 98 | self, 99 | input_size, 100 | eps=1e-05, 101 | momentum=0.9, 102 | weight_attr=None, 103 | bias_attr=None, 104 | data_format='NCL', 105 | use_global_stats=None, ): 106 | super().__init__() 107 | 108 | self.norm = nn.BatchNorm1D( 109 | input_size, 110 | epsilon=eps, 111 | momentum=momentum, 112 | weight_attr=weight_attr, 113 | bias_attr=bias_attr, 114 | data_format=data_format, 115 | use_global_stats=use_global_stats, ) 116 | 117 | def forward(self, x): 118 | x_n = self.norm(x) 119 | return x_n 120 | 121 | 122 | class TDNNBlock(nn.Layer): 123 | def __init__( 124 | self, 125 | in_channels, 126 | out_channels, 127 | kernel_size, 128 | dilation, 129 | activation=nn.ReLU, ): 130 | """Implementation of TDNN network 131 | 132 | Args: 133 | in_channels (int): input channels or input embedding dimensions 134 | out_channels (int): output channels or output embedding dimensions 135 | kernel_size (int): the kernel size of the TDNN network block 136 | dilation (int): the dilation of the TDNN network block 137 | activation (paddle class, optional): the activation layers. Defaults to nn.ReLU. 
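        Example (an illustrative sketch, not part of the original source; the channel
        counts and frame count are arbitrary placeholder values)::

            import paddle
            from ppvector.models.utils import TDNNBlock

            block = TDNNBlock(in_channels=80, out_channels=512, kernel_size=5, dilation=1)
            x = paddle.randn([2, 80, 200])   # (batch, channels, frames)
            y = block(x)                     # (2, 512, 200): "same" padding keeps the frame count unchanged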
138 | """ 139 | super().__init__() 140 | self.conv = Conv1d(in_channels=in_channels, 141 | out_channels=out_channels, 142 | kernel_size=kernel_size, 143 | dilation=dilation, ) 144 | self.activation = activation() 145 | self.norm = BatchNorm1d(input_size=out_channels) 146 | 147 | def forward(self, x): 148 | return self.norm(self.activation(self.conv(x))) 149 | -------------------------------------------------------------------------------- /ppvector/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | from paddle.optimizer import * 5 | from .scheduler import cosine_decay_with_warmup as WarmupCosineSchedulerLR 6 | from paddle.optimizer.lr import * 7 | 8 | 9 | __all__ = ['build_optimizer', 'build_lr_scheduler'] 10 | 11 | 12 | def build_optimizer(parameters, learning_rate, configs): 13 | use_optimizer = configs.optimizer_conf.get('optimizer', 'Adam') 14 | optimizer_args = configs.optimizer_conf.get('optimizer_args', {}) 15 | optim = importlib.import_module(__name__) 16 | optimizer = getattr(optim, use_optimizer)(parameters=parameters, learning_rate=learning_rate, **optimizer_args) 17 | logger.info(f'成功创建优化方法:{use_optimizer},参数为:{optimizer_args}') 18 | return optimizer 19 | 20 | 21 | def build_lr_scheduler(step_per_epoch, configs): 22 | use_scheduler = configs.optimizer_conf.get('scheduler', 'WarmupCosineSchedulerLR') 23 | scheduler_args = configs.optimizer_conf.get('scheduler_args', {}) 24 | if configs.optimizer_conf.scheduler == 'CosineAnnealingDecay' and 'T_max' not in scheduler_args: 25 | scheduler_args.T_max = int(configs.train_conf.max_epoch * 1.2) * step_per_epoch 26 | if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'fix_epoch' not in scheduler_args: 27 | scheduler_args.fix_epoch = configs.train_conf.max_epoch 28 | if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'step_per_epoch' not in scheduler_args: 29 | scheduler_args.step_per_epoch = step_per_epoch 30 | optim = importlib.import_module(__name__) 31 | scheduler = getattr(optim, use_scheduler)(**scheduler_args) 32 | logger.info(f'成功创建学习率衰减:{use_scheduler},参数为:{scheduler_args}') 33 | return scheduler 34 | -------------------------------------------------------------------------------- /ppvector/optimizer/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | 5 | 6 | def cosine_decay_with_warmup(learning_rate, step_per_epoch, fix_epoch=1000, warmup_epoch=5, min_lr=0.0): 7 | """ 8 | :param learning_rate: 学习率 9 | :param step_per_epoch: 每个epoch的步数 10 | :param fix_epoch: 最大epoch数 11 | :param warmup_epoch: 预热步数 12 | :param min_lr: 最小学习率 13 | :return: 14 | """ 15 | # 预热步数 16 | boundary = [] 17 | value = [] 18 | warmup_steps = warmup_epoch * step_per_epoch 19 | # 初始化预热步数 20 | for i in range(warmup_steps + 1): 21 | if warmup_steps > 0: 22 | alpha = i / warmup_steps 23 | lr = learning_rate * alpha 24 | value.append(lr) 25 | if i > 0: 26 | boundary.append(i) 27 | 28 | max_iters = fix_epoch * int(step_per_epoch) 29 | warmup_iters = len(boundary) 30 | # 初始化最大步数 31 | for i in range(int(boundary[-1]), max_iters): 32 | boundary.append(i) 33 | # 如果当前步数小于最大步数,则将当前步数设置为最小学习率 34 | if i < max_iters: 35 | decayed_lr = min_lr + (learning_rate - min_lr) * 0.5 * (math.cos( 36 | (i - warmup_iters) * math.pi / (max_iters - warmup_iters)) + 1) 37 | value.append(decayed_lr) 38 | else: 39 | value.append(min_lr) 40 | return 
paddle.optimizer.lr.PiecewiseDecay(boundary, value) 41 | 42 | 43 | class MarginScheduler: 44 | def __init__( 45 | self, 46 | criterion, 47 | increase_start_epoch, 48 | fix_epoch, 49 | step_per_epoch, 50 | initial_margin=0.0, 51 | final_margin=0.3, 52 | increase_type='exp', 53 | ): 54 | assert hasattr(criterion, 'update'), "Loss function not has 'update()' attributes." 55 | self.criterion = criterion 56 | self.increase_start_step = increase_start_epoch * step_per_epoch 57 | self.fix_step = fix_epoch * step_per_epoch 58 | self.initial_margin = initial_margin 59 | self.final_margin = final_margin 60 | self.increase_type = increase_type 61 | self.margin = initial_margin 62 | 63 | self.current_step = 0 64 | self.increase_step = self.fix_step - self.increase_start_step 65 | 66 | self.init_margin() 67 | 68 | def init_margin(self): 69 | self.criterion.update(margin=self.initial_margin) 70 | 71 | def step(self, current_step=None): 72 | if current_step is not None: 73 | self.current_step = current_step 74 | 75 | self.margin = self.iter_margin() 76 | self.criterion.update(margin=self.margin) 77 | self.current_step += 1 78 | 79 | def iter_margin(self): 80 | if self.current_step < self.increase_start_step: 81 | return self.initial_margin 82 | 83 | if self.current_step >= self.fix_step: 84 | return self.final_margin 85 | 86 | a = 1.0 87 | b = 1e-3 88 | 89 | current_step = self.current_step - self.increase_start_step 90 | if self.increase_type == 'exp': 91 | # exponentially increase the margin 92 | ratio = 1.0 - math.exp( 93 | (current_step / self.increase_step) * 94 | math.log(b / (a + 1e-6))) * a 95 | else: 96 | # linearly increase the margin 97 | ratio = 1.0 * current_step / self.increase_step 98 | return self.initial_margin + (self.final_margin - 99 | self.initial_margin) * ratio 100 | 101 | def get_margin(self): 102 | return self.margin 103 | -------------------------------------------------------------------------------- /ppvector/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/utils/__init__.py -------------------------------------------------------------------------------- /ppvector/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | import paddle 6 | 7 | from loguru import logger 8 | from ppvector import __version__ 9 | 10 | 11 | def load_pretrained(model, pretrained_model): 12 | """加载预训练模型 13 | 14 | :param model: 使用的模型 15 | :param pretrained_model: 预训练模型路径 16 | """ 17 | # 加载预训练模型 18 | if pretrained_model is None: return model 19 | if os.path.isdir(pretrained_model): 20 | pretrained_model = os.path.join(pretrained_model, 'model.pdparams') 21 | assert os.path.exists(pretrained_model), f"{pretrained_model} 模型不存在!" 22 | model_dict = model.state_dict() 23 | model_state_dict = paddle.load(pretrained_model) 24 | # 过滤不存在的参数 25 | for name, weight in model_dict.items(): 26 | if name in model_state_dict.keys(): 27 | if list(weight.shape) != list(model_state_dict[name].shape): 28 | logger.warning('{} not used, shape {} unmatched with {} in model.'. 
29 | format(name, list(model_state_dict[name].shape), list(weight.shape))) 30 | model_state_dict.pop(name, None) 31 | else: 32 | logger.warning('Lack weight: {}'.format(name)) 33 | # 加载权重 34 | missing_keys, unexpected_keys = model.set_state_dict(model_state_dict) 35 | if len(unexpected_keys) > 0: 36 | logger.warning('Unexpected key(s) in state_dict: {}. ' 37 | .format(', '.join('"{}"'.format(k) for k in unexpected_keys))) 38 | if len(missing_keys) > 0: 39 | logger.warning('Missing key(s) in state_dict: {}. ' 40 | .format(', '.join('"{}"'.format(k) for k in missing_keys))) 41 | logger.info('成功加载预训练模型:{}'.format(pretrained_model)) 42 | return model 43 | 44 | 45 | def load_checkpoint(configs, model, optimizer, amp_scaler, scheduler,margin_scheduler, 46 | step_epoch, save_model_path, resume_model): 47 | """加载模型 48 | 49 | :param configs: 配置信息 50 | :param model: 使用的模型 51 | :param optimizer: 使用的优化方法 52 | :param amp_scaler: 使用的自动混合精度 53 | :param scheduler: 使用的学习率调整策略 54 | :param margin_scheduler: margin调整策略 55 | :param step_epoch: 每个epoch的step数量 56 | :param save_model_path: 模型保存路径 57 | :param resume_model: 恢复训练的模型路径 58 | """ 59 | last_epoch1 = 0 60 | best_eer1 = 1 61 | 62 | def load_model(model_path): 63 | assert os.path.exists(os.path.join(model_path, 'model.pdparams')), "模型参数文件不存在!" 64 | assert os.path.exists(os.path.join(model_path, 'optimizer.pdopt')), "优化方法参数文件不存在!" 65 | state_dict = paddle.load(os.path.join(model_path, 'model.pdparams')) 66 | missing_keys, unexpected_keys = model.set_state_dict(state_dict) 67 | assert len(missing_keys) == len(unexpected_keys) == 0, "模型参数加载失败,参数权重不匹配,请可以考虑当做预训练模型!" 68 | optimizer.set_state_dict(paddle.load(os.path.join(model_path, 'optimizer.pdopt'))) 69 | # 自动混合精度参数 70 | if amp_scaler is not None and os.path.exists(os.path.join(model_path, 'scaler.pdparams')): 71 | amp_scaler.set_state_dict(paddle.load(os.path.join(model_path, 'scaler.pdparams'))) 72 | with open(os.path.join(model_path, 'model.state'), 'r', encoding='utf-8') as f: 73 | json_data = json.load(f) 74 | last_epoch = json_data['last_epoch'] 75 | best_eer = 1 76 | if 'eer' in json_data.keys(): 77 | best_eer = json_data['eer'] 78 | logger.info('成功恢复模型参数和优化方法参数:{}'.format(model_path)) 79 | optimizer.step() 80 | [scheduler.step() for _ in range(last_epoch * step_epoch)] 81 | if margin_scheduler is not None: 82 | margin_scheduler.step(current_step=last_epoch * step_epoch) 83 | return last_epoch, best_eer 84 | 85 | # 获取最后一个保存的模型 86 | save_feature_method = configs.preprocess_conf.feature_method 87 | last_model_dir = os.path.join(save_model_path, 88 | f'{configs.model_conf.model}_{save_feature_method}', 89 | 'last_model') 90 | if resume_model is not None or (os.path.exists(os.path.join(last_model_dir, 'model.pdparams')) 91 | and os.path.exists(os.path.join(last_model_dir, 'optimizer.pdopt'))): 92 | if resume_model is not None: 93 | last_epoch1, best_eer1 = load_model(resume_model) 94 | else: 95 | try: 96 | # 自动获取最新保存的模型 97 | last_epoch1, best_eer1 = load_model(last_model_dir) 98 | except Exception as e: 99 | logger.warning(f'尝试自动恢复最新模型失败,错误信息:{e}') 100 | return model, optimizer, amp_scaler, scheduler, margin_scheduler, last_epoch1, best_eer1 101 | 102 | 103 | # 保存模型 104 | def save_checkpoint(configs, model, optimizer, amp_scaler, margin_scheduler, save_model_path, epoch_id, 105 | eer=None, min_dcf=None, threshold=None, best_model=False): 106 | """保存模型 107 | 108 | :param configs: 配置信息 109 | :param model: 使用的模型 110 | :param optimizer: 使用的优化方法 111 | :param amp_scaler: 使用的自动混合精度 112 | :param margin_scheduler: 
margin调整策略 113 | :param save_model_path: 模型保存路径 114 | :param epoch_id: 当前epoch 115 | :param eer: 当前eer 116 | :param min_dcf: 当前min_dcf 117 | :param threshold: 当前threshold 118 | :param best_model: 是否为最佳模型 119 | """ 120 | # 保存模型的路径 121 | save_feature_method = configs.preprocess_conf.feature_method 122 | if best_model: 123 | model_path = os.path.join(save_model_path, 124 | f'{configs.model_conf.model}_{save_feature_method}', 'best_model') 125 | else: 126 | model_path = os.path.join(save_model_path, 127 | f'{configs.model_conf.model}_{save_feature_method}', 'epoch_{}'.format(epoch_id)) 128 | if os.path.exists(model_path): 129 | shutil.rmtree(model_path) 130 | os.makedirs(model_path, exist_ok=True) 131 | # 保存模型参数 132 | paddle.save(optimizer.state_dict(), os.path.join(model_path, 'optimizer.pdopt')) 133 | paddle.save(model.state_dict(), os.path.join(model_path, 'model.pdparams')) 134 | # 自动混合精度参数 135 | if amp_scaler is not None: 136 | paddle.save(amp_scaler.state_dict(), os.path.join(model_path, 'scaler.pdparams')) 137 | with open(os.path.join(model_path, 'model.state'), 'w', encoding='utf-8') as f: 138 | use_loss = configs.loss_conf.get('use_loss', 'AAMLoss') 139 | data = {"last_epoch": epoch_id, "version": __version__, "model_conf.model": configs.model_conf.model, 140 | "feature_method": save_feature_method, "loss": use_loss} 141 | if eer is not None: 142 | data['threshold'] = threshold 143 | data['eer'] = eer 144 | data['min_dcf'] = min_dcf 145 | if margin_scheduler: 146 | data['margin'] = margin_scheduler.get_margin() 147 | f.write(json.dumps(data, indent=4, ensure_ascii=False)) 148 | if not best_model: 149 | last_model_path = os.path.join(save_model_path, 150 | f'{configs.model_conf.model}_{save_feature_method}', 'last_model') 151 | shutil.rmtree(last_model_path, ignore_errors=True) 152 | shutil.copytree(model_path, last_model_path) 153 | # 删除旧的模型 154 | old_model_path = os.path.join(save_model_path, 155 | f'{configs.model_conf.model}_{save_feature_method}', 156 | 'epoch_{}'.format(epoch_id - 3)) 157 | if os.path.exists(old_model_path): 158 | shutil.rmtree(old_model_path) 159 | logger.info('已保存模型:{}'.format(model_path)) 160 | -------------------------------------------------------------------------------- /ppvector/utils/record.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import soundcard 5 | import soundfile 6 | 7 | 8 | class RecordAudio: 9 | def __init__(self, channels=1, sample_rate=16000): 10 | # 录音参数 11 | self.channels = channels 12 | self.sample_rate = sample_rate 13 | 14 | # 获取麦克风 15 | self.default_mic = soundcard.default_microphone() 16 | 17 | def record(self, record_seconds=3, save_path=None): 18 | """录音 19 | 20 | :param record_seconds: 录音时间,默认3秒 21 | :param save_path: 录音保存的路径,后缀名为wav 22 | :return: 音频的numpy数据 23 | """ 24 | print("开始录音......") 25 | num_frames = int(record_seconds * self.sample_rate) 26 | start_time = time.time() 27 | data = self.default_mic.record(samplerate=self.sample_rate, numframes=num_frames, channels=self.channels) 28 | if int(time.time() - start_time) < record_seconds: 29 | raise Exception('录音错误,请检查录音设备,或者卸载soundcard,使用命令重新安装:' 30 | 'pip install git+https://github.com/bastibe/SoundCard.git') 31 | audio_data = data.squeeze() 32 | print("录音已结束!") 33 | if save_path is not None: 34 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 35 | soundfile.write(save_path, data=data, samplerate=self.sample_rate) 36 | return audio_data 37 |
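A minimal usage sketch for the `RecordAudio` helper above (not part of the repository source; it assumes `ppvector` and its dependencies are installed, a default microphone is available, and the save path is only an example):

```python
from ppvector.utils.record import RecordAudio

# Same defaults as RecordAudio above: mono audio at 16 kHz
recorder = RecordAudio(channels=1, sample_rate=16000)
# Record 3 seconds and optionally save it as a wav file (path is illustrative)
audio = recorder.record(record_seconds=3, save_path='dataset/record.wav')
print(audio.shape)  # roughly (record_seconds * sample_rate,)
```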
-------------------------------------------------------------------------------- /ppvector/utils/utils.py: -------------------------------------------------------------------------------- 1 | import distutils.util 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | from loguru import logger 6 | 7 | 8 | def print_arguments(args=None, configs=None, title=None): 9 | if args: 10 | logger.info("----------- 额外配置参数 -----------") 11 | for arg, value in sorted(vars(args).items()): 12 | logger.info("%s: %s" % (arg, value)) 13 | logger.info("------------------------------------------------") 14 | if configs: 15 | title = title if title else "配置文件参数" 16 | logger.info(f"----------- {title} -----------") 17 | for arg, value in sorted(configs.items()): 18 | if isinstance(value, dict): 19 | logger.info(f"{arg}:") 20 | for a, v in sorted(value.items()): 21 | if isinstance(v, dict): 22 | logger.info(f"\t{a}:") 23 | for a1, v1 in sorted(v.items()): 24 | logger.info("\t\t%s: %s" % (a1, v1)) 25 | else: 26 | logger.info("\t%s: %s" % (a, v)) 27 | else: 28 | logger.info("%s: %s" % (arg, value)) 29 | logger.info("------------------------------------------------") 30 | 31 | 32 | def add_arguments(argname, type, default, help, argparser, **kwargs): 33 | type = distutils.util.strtobool if type == bool else type 34 | argparser.add_argument("--" + argname, 35 | default=default, 36 | type=type, 37 | help=help + ' 默认: %(default)s.', 38 | **kwargs) 39 | 40 | 41 | class Dict(dict): 42 | __setattr__ = dict.__setitem__ 43 | __getattr__ = dict.__getitem__ 44 | 45 | 46 | def dict_to_object(dict_obj): 47 | if not isinstance(dict_obj, dict): 48 | return dict_obj 49 | inst = Dict() 50 | for k, v in dict_obj.items(): 51 | inst[k] = dict_to_object(v) 52 | return inst 53 | 54 | 55 | # 根据对角余弦值计算准确率和最优的阈值 56 | def cal_accuracy_threshold(y_score, y_true): 57 | y_score = np.asarray(y_score) 58 | y_true = np.asarray(y_true) 59 | best_accuracy = 0 60 | best_threshold = 0 61 | for i in tqdm(range(0, 100)): 62 | threshold = i * 0.01 63 | y_test = (y_score >= threshold) 64 | acc = np.mean((y_test == y_true).astype(int)) 65 | if acc > best_accuracy: 66 | best_accuracy = acc 67 | best_threshold = threshold 68 | 69 | return best_accuracy, best_threshold 70 | 71 | 72 | # 根据对角余弦值计算准确率 73 | def cal_accuracy(y_score, y_true, threshold=0.5): 74 | y_score = np.asarray(y_score) 75 | y_true = np.asarray(y_true) 76 | y_test = (y_score >= threshold) 77 | accuracy = np.mean((y_test == y_true).astype(int)) 78 | return accuracy 79 | 80 | 81 | # 计算对角余弦值 82 | def cosin_metric(x1, x2): 83 | return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)) 84 | 85 | 86 | # 根据a的类型,将b转换为相应的类型 87 | def convert_string_based_on_type(a, b): 88 | if isinstance(a, bool):  # 注意:bool是int的子类,必须先于int判断 89 | b = b.lower() == 'true' 90 | elif isinstance(a, int): 91 | try: 92 | b = int(b) 93 | except ValueError: 94 | logger.error("无法将字符串转换为整数") 95 | elif isinstance(a, float): 96 | try: 97 | b = float(b) 98 | except ValueError: 99 | logger.error("无法将字符串转换为浮点数") 100 | elif isinstance(a, str): 101 | return b 102 | else: 103 | try: 104 | b = eval(b) 105 | except Exception as e: 106 | logger.exception("无法将字符串转换为其他类型,将忽略该参数类型转换") 107 | return b 108 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.2 2 | tqdm>=4.59.0 3 | visualdl==2.5.3 4 | resampy>=0.2.2 5 | soundfile>=0.12.1 6 | soundcard>=0.4.2 7 | pyyaml>=5.4.1 8 | paddleaudio>=1.0.1 9 |
scikit-learn>=1.5.2 10 | pydub>=0.25.1 11 | loguru>=0.7.2 12 | yeaudio>=0.0.7 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from setuptools import setup, find_packages 4 | 5 | import ppvector 6 | 7 | VERSION = ppvector.__version__ 8 | 9 | 10 | # 复制配置文件到项目目录下 11 | shutil.rmtree('./ppvector/configs/', ignore_errors=True) 12 | shutil.copytree('./configs/', './ppvector/configs/') 13 | 14 | 15 | def readme(): 16 | with open('README.md', encoding='utf-8') as f: 17 | content = f.read() 18 | return content 19 | 20 | 21 | def parse_requirements(): 22 | with open('./requirements.txt', encoding="utf-8") as f: 23 | requirements = f.readlines() 24 | return requirements 25 | 26 | 27 | if __name__ == "__main__": 28 | setup( 29 | name='ppvector', 30 | packages=find_packages(), 31 | package_data={'': ['configs/*']}, 32 | author='yeyupiaoling', 33 | version=VERSION, 34 | install_requires=parse_requirements(), 35 | description='Voice Print Recognition toolkit on PaddlePaddle', 36 | long_description=readme(), 37 | long_description_content_type='text/markdown', 38 | url='https://github.com/yeyupiaoling/VoiceprintRecognition_PaddlePaddle', 39 | download_url='https://github.com/yeyupiaoling/VoiceprintRecognition_PaddlePaddle.git', 40 | keywords=['Voice', 'paddle'], 41 | classifiers=[ 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: Apache Software License', 44 | 'Operating System :: OS Independent', 45 | 'Natural Language :: Chinese (Simplified)', 46 | 'Programming Language :: Python :: 3', 47 | 'Programming Language :: Python :: 3.5', 48 | 'Programming Language :: Python :: 3.6', 49 | 'Programming Language :: Python :: 3.7', 50 | 'Programming Language :: Python :: 3.8', 51 | 'Programming Language :: Python :: 3.9', 'Topic :: Utilities' 52 | ], 53 | license='Apache License 2.0', 54 | ext_modules=[]) 55 | shutil.rmtree('./ppvector/configs/', ignore_errors=True) 56 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/README.md: -------------------------------------------------------------------------------- 1 | # 说话人日志效果评估 2 | 3 | 1. 安装依赖库 4 | 5 | ```shell 6 | pip install pyannote.audio[separation]==3.3.0 7 | ``` 8 | 9 | 2. 下载[AIShell-4](https://us.openslr.org/resources/111)的测试数据并解压到当前目录的`dataset`下。 10 | 3. 执行`create_aishell4_test_rttm.py`,创建数据类别和rttm文件。 11 | 4. 执行`infer_data.py`预测数据。 12 | 5. 
执行`compute_metrics.py`获取评估结果。 13 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/compute_metrics.py: -------------------------------------------------------------------------------- 1 | from pyannote.database.util import load_rttm 2 | from pyannote.metrics.diarization import DiarizationErrorRate 3 | 4 | metric = DiarizationErrorRate() 5 | 6 | 7 | false_alarms, confusions, missed_detections, error_rates = [], [], [], [] 8 | references = load_rttm('dataset/references.rttm') 9 | hypotheses = load_rttm('dataset/hypotheses.rttm') 10 | for uri, reference in references.items(): 11 | hypothesis = hypotheses[uri] 12 | result = metric(reference, hypothesis, detailed=True) 13 | print(uri, ":", result) 14 | false_alarms.append(result["false alarm"]) 15 | confusions.append(result["confusion"]) 16 | missed_detections.append(result["missed detection"]) 17 | error_rates.append(result["diarization error rate"]) 18 | print("False alarm:", round(sum(false_alarms) / len(false_alarms), 5)) 19 | print("Confusion:", round(sum(confusions) / len(confusions), 5)) 20 | print("Missed detection:", round(sum(missed_detections) / len(missed_detections), 5)) 21 | print("Diarization error rate:", round(sum(error_rates) / len(error_rates), 5)) 22 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/create_aishell4_test_rttm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import soundfile 4 | from pyannote.database.util import load_rttm 5 | from tqdm import tqdm 6 | from yeaudio.audio import AudioSegment 7 | 8 | 9 | def create_rttm(annotation_dir, output_path): 10 | with open(output_path, 'w', encoding='utf-8') as f_w: 11 | for file in os.listdir(annotation_dir): 12 | if not file.endswith(".rttm"): continue 13 | with open(os.path.join(annotation_dir, file), 'r', encoding='utf-8') as f_r: 14 | lines = f_r.readlines() 15 | for line in lines: 16 | f_w.write(line) 17 | 18 | 19 | def create_audio_path_list(audio_dir, list_path): 20 | with open(list_path, 'w', encoding='utf-8') as f_w: 21 | for file in os.listdir(audio_dir): 22 | if not file.endswith(".flac"): continue 23 | file_path = os.path.join(audio_dir, file).replace('\\', '/') 24 | name = file.split('.')[0] 25 | f_w.write(f'{file_path}\t{name}\n') 26 | 27 | 28 | def create_audio_db(data_list_path, rttm_path, output_dir): 29 | annotations = load_rttm(rttm_path) 30 | with open(data_list_path, 'r') as f_r: 31 | for line in tqdm(f_r.readlines(), desc='裁剪说话人音频'): 32 | audio_path, name = line.strip().split('\t') 33 | audio_segment = AudioSegment.from_file(audio_path) 34 | sample_rate = audio_segment.sample_rate 35 | audio = audio_segment.samples 36 | annotation = annotations[name] 37 | for segment, track, label in annotation.itertracks(yield_label=True): 38 | if segment.end - segment.start < 0.3: continue 39 | save_path = os.path.join(output_dir, name, label, f'{track}.wav') 40 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 41 | audio_sub = audio[int(segment.start * sample_rate):int(segment.end * sample_rate)] 42 | soundfile.write(save_path, audio_sub, sample_rate) 43 | 44 | 45 | if __name__ == '__main__': 46 | create_rttm(annotation_dir='dataset/test/TextGrid', output_path='dataset/references.rttm') 47 | create_audio_path_list(audio_dir='dataset/test/wav', list_path='dataset/data_list.txt') 48 | create_audio_db(data_list_path='dataset/data_list.txt', 
rttm_path='dataset/references.rttm', 49 | output_dir='dataset/audio_db/') 50 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/infer_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from pyannote.core import Annotation 6 | from pyannote.core import Segment 7 | from tqdm import tqdm 8 | 9 | from ppvector.predict import PPVectorPredictor 10 | from ppvector.utils.utils import add_arguments, print_arguments 11 | 12 | parser = argparse.ArgumentParser(description=__doc__) 13 | add_arg = functools.partial(add_arguments, argparser=parser) 14 | add_arg('configs', str, '../../configs/cam++.yml', '配置文件') 15 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 16 | add_arg('data_list_path', str, 'dataset/data_list.txt', '要预测的音频路径列表') 17 | add_arg('result_path', str, 'dataset/hypotheses.rttm', '预测结果') 18 | add_arg('audio_db_path', str, 'dataset/audio_db/', '测试数据的音频库的路径') 19 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 20 | add_arg('model_path', str, '../../models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 21 | args = parser.parse_args() 22 | print_arguments(args=args) 23 | 24 | 25 | # 进行说话人日志识别 26 | with open(args.data_list_path, 'r') as f_r, open(args.result_path, 'w', encoding='utf-8') as f_w: 27 | for line in tqdm(f_r.readlines()): 28 | audio_path, name = line.strip().split('\t') 29 | # 每条音频说话人的数据库 30 | audio_db_path = os.path.join(args.audio_db_path, name) 31 | # 获取识别器 32 | predictor = PPVectorPredictor(configs=args.configs, 33 | model_path=args.model_path, 34 | threshold=args.threshold, 35 | audio_db_path=audio_db_path, 36 | use_gpu=args.use_gpu) 37 | 38 | results = predictor.speaker_diarization(audio_path, search_audio_db=True) 39 | 40 | annotation = Annotation(uri=name) 41 | for i, result in enumerate(results): 42 | annotation[Segment(result['start'], result['end']), i] = str(result['speaker']) 43 | f_w.write(annotation.to_rttm()) 44 | os.remove(os.path.join(args.audio_db_path, name, "audio_indexes.bin")) 45 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.trainer import PPVectorTrainer 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('data_augment_configs', str, 'configs/augmentation.yml', '数据增强配置文件') 11 | add_arg("use_gpu", bool, True, '是否使用GPU训练') 12 | add_arg("do_eval", bool, True, '训练时是否评估模型') 13 | add_arg('save_model_path', str, 'models/', '模型保存的路径') 14 | add_arg('log_dir', str, 'log/', '保存VisualDL日志文件的路径') 15 | add_arg('resume_model', str, None, '恢复训练,当为None则不使用预训练模型') 16 | add_arg('pretrained_model', str, None, '预训练模型的路径,当为None则不使用预训练模型') 17 | add_arg('overwrites', str, None, '覆盖配置文件中的参数,比如"train_conf.max_epoch=100",多个用逗号隔开') 18 | args = parser.parse_args() 19 | print_arguments(args=args) 20 | 21 | # 获取训练器 22 | trainer = PPVectorTrainer(configs=args.configs, 23 | use_gpu=args.use_gpu, 24 | data_augment_configs=args.data_augment_configs, 25 | overwrites=args.overwrites) 26 | 27 | trainer.train(save_model_path=args.save_model_path, 28 | log_dir=args.log_dir, 29 | resume_model=args.resume_model, 30 | 
pretrained_model=args.pretrained_model, 31 | do_eval=args.do_eval) 32 | --------------------------------------------------------------------------------
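To close, a minimal programmatic sketch of the same training flow that `train.py` wires up via argparse (not part of the repository source; it assumes the default `configs/cam++.yml` and `configs/augmentation.yml` exist and that the data lists referenced by the config have already been prepared):

```python
from ppvector.trainer import PPVectorTrainer

# Build the trainer from a config file, mirroring the defaults used in train.py
trainer = PPVectorTrainer(configs='configs/cam++.yml',
                          use_gpu=True,
                          data_augment_configs='configs/augmentation.yml',
                          overwrites='train_conf.max_epoch=100')  # optional, same syntax as --overwrites
# Train, evaluate after each epoch, and save checkpoints under models/
trainer.train(save_model_path='models/',
              log_dir='log/',
              resume_model=None,
              pretrained_model=None,
              do_eval=True)
```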