├── .gitignore ├── LICENSE ├── README.md ├── audio_db ├── 李达康 │ └── 0.wav └── 沙瑞金 │ └── 0.wav ├── configs ├── augmentation.yml ├── cam++.yml ├── ecapa_tdnn.yml ├── eres2net.yml ├── res2net.yml ├── resnet_se.yml └── tdnn.yml ├── create_data.py ├── dataset ├── a_1.wav ├── a_2.wav ├── b_1.wav ├── b_2.wav └── test_long.wav ├── docs └── images │ ├── contrast.jpg │ ├── log.jpg │ ├── recognition.jpg │ ├── speaker_diarization.jpg │ └── speaker_diarization_gui.png ├── eval.py ├── extract_features.py ├── infer_contrast.py ├── infer_contrast_gui.py ├── infer_recognition.py ├── infer_recognition_gui.py ├── infer_speaker_diarization.py ├── infer_speaker_diarization_gui.py ├── ppvector ├── __init__.py ├── data_utils │ ├── __init__.py │ ├── collate_fn.py │ ├── featurizer.py │ ├── pk_sampler.py │ └── reader.py ├── infer_utils │ ├── __init__.py │ ├── player.py │ ├── speaker_diarization.py │ └── viewer.py ├── loss │ ├── __init__.py │ ├── aamloss.py │ ├── amloss.py │ ├── armloss.py │ ├── celoss.py │ ├── sphereface2.py │ ├── subcenterloss.py │ └── tripletangularmarginloss.py ├── metric │ ├── __init__.py │ └── metrics.py ├── models │ ├── __init__.py │ ├── campplus.py │ ├── ecapa_tdnn.py │ ├── eres2net.py │ ├── fc.py │ ├── pooling.py │ ├── res2net.py │ ├── resnet_se.py │ ├── tdnn.py │ └── utils.py ├── optimizer │ ├── __init__.py │ └── scheduler.py ├── predict.py ├── trainer.py └── utils │ ├── __init__.py │ ├── checkpoint.py │ ├── record.py │ └── utils.py ├── requirements.txt ├── setup.py ├── tools └── eval_speaker_diarization │ ├── README.md │ ├── compute_metrics.py │ ├── create_aishell4_test_rttm.py │ └── infer_data.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | build/ 4 | dist/ 5 | ppvector.egg-info/ 6 | log/ 7 | models/ 8 | test*.py 9 | dataset/ 10 | audio_db/ 11 | output/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /audio_db/李达康/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/audio_db/李达康/0.wav -------------------------------------------------------------------------------- /audio_db/沙瑞金/0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/audio_db/沙瑞金/0.wav -------------------------------------------------------------------------------- /configs/augmentation.yml: -------------------------------------------------------------------------------- 1 | # 语速增强 2 | speed: 3 | # 增强概率 4 | prob: 1.0 5 | # 使用语速增强是否分类大小翻三倍 6 | speed_perturb_3_class: False 7 | 8 | # 音量增强 9 | volume: 10 | # 增强概率 11 | prob: 0.0 12 | # 最小增益 13 | min_gain_dBFS: -15 14 | # 最大增益 15 | max_gain_dBFS: 15 16 | 17 | # 噪声增强 18 | noise: 19 | # 增强概率 20 | prob: 0.5 21 | # 噪声增强的噪声文件夹 22 | noise_dir: 'dataset/noise' 23 | # 针对噪声的最小音量增益 24 | min_snr_dB: 10 25 | # 针对噪声的最大音量增益 26 | max_snr_dB: 50 27 | 28 | # 混响增强 29 | reverb: 30 | # 增强概率 31 | prob: 0.5 32 | # 混响增强的混响文件夹 33 | reverb_dir: 'dataset/reverb' 34 | 35 | # Spec增强 36 | spec_aug: 37 | # 增强概率 38 | prob: 0.5 39 | # 频域掩蔽的比例 40 | freq_mask_ratio: 0.1 41 | # 频域掩蔽次数 42 | n_freq_masks: 1 43 | # 频域掩蔽的比例 44 | time_mask_ratio: 0.05 45 | # 频域掩蔽次数 46 | n_time_masks: 1 47 | # 最大时间扭曲 48 | max_time_warp: 0 49 | -------------------------------------------------------------------------------- /configs/cam++.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'CAMPPlus' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 分类器参数 57 | classifier: 58 | # 分类器类型,支持Cosine、Linear 59 | classifier_type: 'Cosine' 60 | # 说话人数量,即分类大小 61 | num_speakers: 2796 62 | num_blocks: 0 63 | 64 | loss_conf: 65 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 66 | loss: 'AAMLoss' 67 | # 损失函数参数 68 | loss_args: 69 | margin: 0.2 70 | scale: 32 71 | easy_margin: False 72 | label_smoothing: 0.0 73 | # 是否使用损失函数margin调度器 74 | use_margin_scheduler: True 75 | # margin调度器参数 76 | margin_scheduler_args: 77 | 
initial_margin: 0.0 78 | final_margin: 0.3 79 | 80 | optimizer_conf: 81 | # 优化方法 82 | optimizer: 'Adam' 83 | # 优化方法参数 84 | optimizer_args: 85 | weight_decay: !!float 1e-6 86 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 87 | scheduler: 'WarmupCosineSchedulerLR' 88 | # 学习率衰减函数参数 89 | scheduler_args: 90 | # 学习率的大小 91 | learning_rate: 0.001 92 | min_lr: !!float 1e-5 93 | warmup_epoch: 5 94 | 95 | train_conf: 96 | # 是否开启自动混合精度 97 | enable_amp: False 98 | # 训练的轮数 99 | max_epoch: 60 100 | log_interval: 10 101 | -------------------------------------------------------------------------------- /configs/ecapa_tdnn.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'EcapaTdnn' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | channels: [512, 512, 512, 512, 1536] 59 | # 分类器参数 60 | classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /configs/eres2net.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 
训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型,支持ERes2Net、ERes2NetV2 52 | model: 'ERes2Net' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | m_channels: 32 57 | # 分类器参数 58 | classifier: 59 | # 分类器类型,支持Cosine、Linear 60 | classifier_type: 'Cosine' 61 | # 说话人数量,即分类大小 62 | num_speakers: 2796 63 | num_blocks: 0 64 | loss_conf: 65 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 66 | loss: 'AAMLoss' 67 | # 损失函数参数 68 | loss_args: 69 | margin: 0.2 70 | scale: 32 71 | easy_margin: False 72 | label_smoothing: 0.0 73 | # 是否使用损失函数margin调度器 74 | use_margin_scheduler: True 75 | # margin调度器参数 76 | margin_scheduler_args: 77 | initial_margin: 0.0 78 | final_margin: 0.3 79 | 80 | optimizer_conf: 81 | # 优化方法 82 | optimizer: 'Adam' 83 | # 优化方法参数 84 | optimizer_args: 85 | weight_decay: !!float 1e-6 86 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 87 | scheduler: 'WarmupCosineSchedulerLR' 88 | # 学习率衰减函数参数 89 | scheduler_args: 90 | # 学习率的大小 91 | learning_rate: 0.001 92 | min_lr: !!float 1e-5 93 | warmup_epoch: 5 94 | 95 | train_conf: 96 | # 是否开启自动混合精度 97 | enable_amp: False 98 | # 训练的轮数 99 | max_epoch: 60 100 | log_interval: 10 101 | -------------------------------------------------------------------------------- /configs/res2net.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'Res2Net' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | m_channels: 32 59 | # 分类器参数 60 | 
classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /configs/resnet_se.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'ResNetSE' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 57 | pooling_type: 'ASP' 58 | # 分类器参数 59 | classifier: 60 | # 分类器类型,支持Cosine、Linear 61 | classifier_type: 'Cosine' 62 | # 说话人数量,即分类大小 63 | num_speakers: 2796 64 | num_blocks: 0 65 | 66 | loss_conf: 67 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 68 | loss: 'AAMLoss' 69 | # 损失函数参数 70 | loss_args: 71 | margin: 0.2 72 | scale: 32 73 | easy_margin: False 74 | label_smoothing: 0.0 75 | # 是否使用损失函数margin调度器 76 | use_margin_scheduler: True 77 | # margin调度器参数 78 | margin_scheduler_args: 79 | initial_margin: 0.0 80 | final_margin: 0.3 81 | 82 | optimizer_conf: 83 | # 优化方法 84 | optimizer: 'Adam' 85 | # 优化方法参数 86 | optimizer_args: 87 | weight_decay: !!float 1e-6 88 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 89 | scheduler: 'WarmupCosineSchedulerLR' 90 | # 学习率衰减函数参数 91 | scheduler_args: 92 | # 学习率的大小 93 | learning_rate: 0.001 94 | min_lr: !!float 1e-5 95 | warmup_epoch: 5 96 | 97 | train_conf: 98 | # 是否开启自动混合精度 99 | enable_amp: False 100 | # 训练的轮数 101 | max_epoch: 60 102 | log_interval: 10 103 | 
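# Illustrative sketch (not part of any config file above): every YAML config in this
# directory is consumed the same way — it is handed to PPVectorTrainer, and individual
# keys can be overridden at run time through the `overwrites` string instead of editing
# the file (eval.py further down exposes exactly this mechanism, with the documented
# example "train_conf.max_epoch=100"). The extra log_interval override below follows the
# same pattern but is an assumption; model and image paths mirror the eval.py defaults.
from ppvector.trainer import PPVectorTrainer

trainer = PPVectorTrainer(configs='configs/cam++.yml',
                          use_gpu=True,
                          overwrites='train_conf.max_epoch=100,train_conf.log_interval=20')
# Evaluate a trained model with the (possibly overridden) configuration, as eval.py does.
eer, min_dcf, threshold = trainer.evaluate(resume_model='models/CAMPPlus_Fbank/best_model/',
                                           save_image_path='output/images/')
print(f'threshold: {threshold:.2f}, EER: {eer:.5f}, MinDCF: {min_dcf:.5f}')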
-------------------------------------------------------------------------------- /configs/tdnn.yml: -------------------------------------------------------------------------------- 1 | # 数据集参数 2 | dataset_conf: 3 | dataset: 4 | # 过滤最短的音频长度 5 | min_duration: 0.3 6 | # 最长的音频长度,大于这个长度会裁剪掉 7 | max_duration: 3 8 | # 音频的采样率 9 | sample_rate: 16000 10 | # 是否对音频进行音量归一化 11 | use_dB_normalization: True 12 | # 对音频进行音量归一化的音量分贝值 13 | target_dB: -20 14 | sampler: 15 | # 训练的批量大小 16 | batch_size: 64 17 | # 是否打乱数据 18 | shuffle: True 19 | # 是否丢弃最后一个样本 20 | drop_last: True 21 | dataLoader: 22 | # 读取数据的线程数量 23 | num_workers: 8 24 | # 评估的数据要特殊处理 25 | eval_conf: 26 | # 评估的批量大小 27 | batch_size: 8 28 | # 最长的音频长度 29 | max_duration: 20 30 | # 训练数据的数据列表路径 31 | train_list: 'dataset/train_list.txt' 32 | # 评估注册的数据列表路径 33 | enroll_list: 'dataset/cn-celeb-test/enroll_list.txt' 34 | # 评估检验的数据列表路径 35 | trials_list: 'dataset/cn-celeb-test/trials_list.txt' 36 | # 是否使用PKSampler,该Sampler可以保证每个说话人都有sample_per_id个样本 37 | is_use_pksampler: False 38 | # 使用PKSampler时设置样本数量 39 | sample_per_id: 4 40 | 41 | # 数据预处理参数 42 | preprocess_conf: 43 | # 音频预处理方法,支持:LogMelSpectrogram、MelSpectrogram、Spectrogram、MFCC、Fbank 44 | feature_method: 'Fbank' 45 | # 设置API参数,更参数查看对应API,不清楚的可以直接删除该部分,直接使用默认值 46 | method_args: 47 | sr: 16000 48 | n_mels: 80 49 | 50 | model_conf: 51 | # 所使用的模型 52 | model: 'TDNN' 53 | # 模型参数 54 | model_args: 55 | embd_dim: 192 56 | channels: 512 57 | # 所使用的池化层,支持ASP、SAP、TSP、TAP 58 | pooling_type: 'ASP' 59 | # 分类器参数 60 | classifier: 61 | # 分类器类型,支持Cosine、Linear 62 | classifier_type: 'Cosine' 63 | # 说话人数量,即分类大小 64 | num_speakers: 2796 65 | num_blocks: 0 66 | 67 | loss_conf: 68 | # 所使用的损失函数,支持AAMLoss、SphereFace2、AMLoss、ARMLoss、CELoss、SubCenterLoss、TripletAngularMarginLoss 69 | loss: 'AAMLoss' 70 | # 损失函数参数 71 | loss_args: 72 | margin: 0.2 73 | scale: 32 74 | easy_margin: False 75 | label_smoothing: 0.0 76 | # 是否使用损失函数margin调度器 77 | use_margin_scheduler: True 78 | # margin调度器参数 79 | margin_scheduler_args: 80 | initial_margin: 0.0 81 | final_margin: 0.3 82 | 83 | optimizer_conf: 84 | # 优化方法 85 | optimizer: 'Adam' 86 | # 优化方法参数 87 | optimizer_args: 88 | weight_decay: !!float 1e-6 89 | # 学习率衰减函数,支持PaddlePaddle支持的和项目提供的WarmupCosineSchedulerLR 90 | scheduler: 'WarmupCosineSchedulerLR' 91 | # 学习率衰减函数参数 92 | scheduler_args: 93 | # 学习率的大小 94 | learning_rate: 0.001 95 | min_lr: !!float 1e-5 96 | warmup_epoch: 5 97 | 98 | train_conf: 99 | # 是否开启自动混合精度 100 | enable_amp: False 101 | # 训练的轮数 102 | max_epoch: 60 103 | log_interval: 10 104 | -------------------------------------------------------------------------------- /create_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # 制作CN-Celeb数据集列表 5 | # 下载地址:https://openslr.trmal.net/resources/82/cn-celeb_v2.tar.gz 6 | # 下载并解压到dataset目录,解压命令:tar -zxvf cn-celeb_v2.tar.gz 7 | def create_cn_celeb(list_path, data_path='dataset/'): 8 | f_train = open(list_path, 'w', encoding='utf-8') 9 | data_dir = os.path.join(data_path, 'CN-Celeb_flac/data/') 10 | dirs = sorted(os.listdir(data_dir)) 11 | for label, d in enumerate(dirs): 12 | # 跳过测试集 13 | if label >= 800:continue 14 | for file in os.listdir(os.path.join(data_dir, d)): 15 | sound_path = os.path.join(data_dir, d, file).replace('\\', '/') 16 | f_train.write(f'{sound_path}\t{label}\n') 17 | f_train.close() 18 | 19 | 20 | # 制作CN-Celeb2数据集列表,如果是Windows,请跳过这个数据集 21 | # 下载分包1地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzaa 22 | # 
下载分包2地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzab 23 | # 下载分包3地址:https://openslr.trmal.net/resources/82/cn-celeb2_v2.tar.gzac 24 | # 下载并解压到dataset目录,合并压缩包命令:cat cn-celeb2_v2.tar.gza* > cn-celeb2_v2.tar.gz,解压命令:tar -zxvf cn-celeb2_v2.tar.gz 25 | def create_cn_celeb2(list_path, data_path='dataset/'): 26 | data_dir = os.path.join(data_path, 'CN-Celeb2_flac/data/') 27 | if not os.path.exists(data_dir): 28 | print('CN-Celeb2数据集不存在,请先下载并解压到dataset目录,目前忽略,你也可继续正常训练') 29 | return 30 | f_train = open(list_path, 'a', encoding='utf-8') 31 | dirs = sorted(os.listdir(data_dir)) 32 | last_label = 800 33 | for label, d in enumerate(dirs): 34 | for file in os.listdir(os.path.join(data_dir, d)): 35 | sound_path = os.path.join(data_dir, d, file).replace('\\', '/') 36 | f_train.write(f'{sound_path}\t{label + last_label}\n') 37 | f_train.close() 38 | 39 | 40 | if __name__ == '__main__': 41 | create_cn_celeb(list_path='dataset/train_list.txt', data_path='dataset') 42 | create_cn_celeb2(list_path='dataset/train_list.txt', data_path='dataset') 43 | -------------------------------------------------------------------------------- /dataset/a_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/a_1.wav -------------------------------------------------------------------------------- /dataset/a_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/a_2.wav -------------------------------------------------------------------------------- /dataset/b_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/b_1.wav -------------------------------------------------------------------------------- /dataset/b_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/b_2.wav -------------------------------------------------------------------------------- /dataset/test_long.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/dataset/test_long.wav -------------------------------------------------------------------------------- /docs/images/contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/contrast.jpg -------------------------------------------------------------------------------- /docs/images/log.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/log.jpg -------------------------------------------------------------------------------- /docs/images/recognition.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/recognition.jpg -------------------------------------------------------------------------------- /docs/images/speaker_diarization.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/speaker_diarization.jpg -------------------------------------------------------------------------------- /docs/images/speaker_diarization_gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/docs/images/speaker_diarization_gui.png -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import time 4 | 5 | from ppvector.trainer import PPVectorTrainer 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 'configs/cam++.yml', "配置文件") 11 | add_arg("use_gpu", bool, True, "是否使用GPU评估模型") 12 | add_arg('save_image_path', str, 'output/images/', "保存结果图的路径") 13 | add_arg('resume_model', str, 'models/CAMPPlus_Fbank/best_model/', "模型的路径") 14 | add_arg('overwrites', str, None, '覆盖配置文件中的参数,比如"train_conf.max_epoch=100",多个用逗号隔开') 15 | args = parser.parse_args() 16 | print_arguments(args=args) 17 | 18 | # 获取训练器 19 | trainer = PPVectorTrainer(configs=args.configs, use_gpu=args.use_gpu, overwrites=args.overwrites) 20 | 21 | # 开始评估 22 | start = time.time() 23 | eer, min_dcf, threshold = trainer.evaluate(resume_model=args.resume_model, save_image_path=args.save_image_path) 24 | end = time.time() 25 | print('评估消耗时间:{}s,threshold:{:.2f},EER: {:.5f}, MinDCF: {:.5f}' 26 | .format(int(end - start), threshold, eer, min_dcf)) 27 | -------------------------------------------------------------------------------- /extract_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.trainer import PPVectorTrainer 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('save_dir', str, 'dataset/features', '保存特征的路径') 11 | add_arg('max_duration', int, 100, '提取特征的最大时长,单位秒') 12 | args = parser.parse_args() 13 | print_arguments(args=args) 14 | 15 | # 获取训练器 16 | trainer = PPVectorTrainer(configs=args.configs) 17 | 18 | # 提取特征保存文件 19 | trainer.extract_features(save_dir=args.save_dir, max_duration=args.max_duration) 20 | -------------------------------------------------------------------------------- /infer_contrast.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.predict import PPVectorPredictor 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, 
argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 11 | add_arg('audio_path1', str, 'dataset/a_1.wav', '预测第一个音频') 12 | add_arg('audio_path2', str, 'dataset/b_2.wav', '预测第二个音频') 13 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 14 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 15 | args = parser.parse_args() 16 | print_arguments(args=args) 17 | 18 | # 获取识别器 19 | predictor = PPVectorPredictor(configs=args.configs, 20 | model_path=args.model_path, 21 | use_gpu=args.use_gpu) 22 | 23 | dist = predictor.contrast(args.audio_path1, args.audio_path2) 24 | if dist > args.threshold: 25 | print(f"{args.audio_path1} 和 {args.audio_path2} 为同一个人,相似度为:{dist}") 26 | else: 27 | print(f"{args.audio_path1} 和 {args.audio_path2} 不是同一个人,相似度为:{dist}") 28 | -------------------------------------------------------------------------------- /infer_contrast_gui.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import tkinter as tk 3 | from tkinter import ttk, filedialog, messagebox 4 | import functools 5 | import argparse 6 | import threading 7 | import time 8 | from pathlib import Path 9 | 10 | from ppvector.predict import PPVectorPredictor 11 | from ppvector.utils.utils import add_arguments, print_arguments 12 | 13 | parser = argparse.ArgumentParser(description=__doc__) 14 | add_arg = functools.partial(add_arguments, argparser=parser) 15 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 16 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 17 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 18 | args = parser.parse_args() 19 | print_arguments(args=args) 20 | 21 | 22 | class VoiceContrastGUI: 23 | def __init__(self, master): 24 | self.master = master 25 | master.title("夜雨飘零声纹对比系统") 26 | master.geometry('700x670') 27 | master.resizable(True, True) 28 | master.configure(bg='#f0f0f0') 29 | 30 | # 使用ttk样式 31 | self.style = ttk.Style() 32 | self.style.theme_use('clam') 33 | 34 | # 配置样式 35 | self.style.configure('TButton', font=('微软雅黑', 10), padding=5) 36 | self.style.configure('TLabel', font=('微软雅黑', 10), background='#f0f0f0') 37 | self.style.configure('Header.TLabel', font=('微软雅黑', 14, 'bold'), background='#f0f0f0') 38 | self.style.configure('Result.TLabel', font=('微软雅黑', 16, 'bold'), foreground='#007bff', background='#f0f0f0') 39 | self.style.configure("Green.Horizontal.TProgressbar", background='#4CAF50', troughcolor='#f0f0f0', 40 | borderwidth=0, thickness=20) 41 | 42 | # 创建主框架 43 | self.main_frame = ttk.Frame(master, padding="20 20 20 20") 44 | self.main_frame.pack(fill=tk.BOTH, expand=True) 45 | 46 | # 创建标题 47 | self.title_label = ttk.Label(self.main_frame, text="声纹对比系统", style='Header.TLabel') 48 | self.title_label.grid(row=0, column=0, columnspan=4, pady=(0, 20)) 49 | 50 | # 音频选择区域 51 | self.audio_frame = ttk.LabelFrame(self.main_frame, text="音频选择", padding="10 10 10 10") 52 | self.audio_frame.grid(row=1, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 53 | 54 | # 音频1选择 55 | self.label1 = ttk.Label(self.audio_frame, text="音频文件1:") 56 | self.label1.grid(row=0, column=0, padx=10, pady=10, sticky="w") 57 | 58 | self.audio1_path = tk.StringVar() 59 | self.entry_audio1 = ttk.Entry(self.audio_frame, width=50, textvariable=self.audio1_path) 60 | self.entry_audio1.grid(row=0, column=1, padx=10, pady=10, sticky="ew") 61 | 62 | self.btn_audio1 = ttk.Button(self.audio_frame, text="选择文件", command=self.select_audio1) 63 | 
self.btn_audio1.grid(row=0, column=2, padx=10, pady=10) 64 | 65 | # 音频2选择 66 | self.label2 = ttk.Label(self.audio_frame, text="音频文件2:") 67 | self.label2.grid(row=1, column=0, padx=10, pady=10, sticky="w") 68 | 69 | self.audio2_path = tk.StringVar() 70 | self.entry_audio2 = ttk.Entry(self.audio_frame, width=50, textvariable=self.audio2_path) 71 | self.entry_audio2.grid(row=1, column=1, padx=10, pady=10, sticky="ew") 72 | 73 | self.btn_audio2 = ttk.Button(self.audio_frame, text="选择文件", command=self.select_audio2) 74 | self.btn_audio2.grid(row=1, column=2, padx=10, pady=10) 75 | 76 | # 设置列的权重 77 | self.audio_frame.columnconfigure(1, weight=1) 78 | 79 | # 参数设置区域 80 | self.settings_frame = ttk.LabelFrame(self.main_frame, text="参数设置", padding="10 10 10 10") 81 | self.settings_frame.grid(row=2, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 82 | 83 | # 判断阈值 84 | self.label3 = ttk.Label(self.settings_frame, text="对比阈值:") 85 | self.label3.grid(row=0, column=0, padx=10, pady=10, sticky="w") 86 | 87 | self.threshold = tk.StringVar(value="0.6") 88 | self.entry_threshold = ttk.Entry(self.settings_frame, width=10, textvariable=self.threshold) 89 | self.entry_threshold.grid(row=0, column=1, padx=10, pady=10, sticky="w") 90 | 91 | self.threshold_info = ttk.Label(self.settings_frame, text="(取值范围0-1,越大表示要求越严格)") 92 | self.threshold_info.grid(row=0, column=2, padx=10, pady=10, sticky="w") 93 | 94 | # 操作按钮区域 95 | self.button_frame = ttk.Frame(self.main_frame) 96 | self.button_frame.grid(row=3, column=0, columnspan=4, sticky="ew", pady=(0, 20)) 97 | 98 | self.btn_predict = ttk.Button(self.button_frame, text="开始对比", command=self.predict_thread) 99 | self.btn_predict.pack(side=tk.LEFT, padx=10) 100 | 101 | self.btn_clear = ttk.Button(self.button_frame, text="清空", command=self.clear) 102 | self.btn_clear.pack(side=tk.LEFT, padx=10) 103 | 104 | self.btn_quit = ttk.Button(self.button_frame, text="退出", command=self.quit) 105 | self.btn_quit.pack(side=tk.RIGHT, padx=10) 106 | 107 | # 状态区域 108 | self.status_frame = ttk.LabelFrame(self.main_frame, text="状态", padding="10 10 10 10") 109 | self.status_frame.grid(row=4, column=0, columnspan=4, sticky="ew", pady=(0, 10)) 110 | 111 | # 进度条 112 | self.progress_var = tk.DoubleVar() 113 | self.progress_bar = ttk.Progressbar(self.status_frame, orient="horizontal", 114 | mode="determinate", variable=self.progress_var, 115 | style="Green.Horizontal.TProgressbar") 116 | self.progress_bar.pack(fill=tk.X, padx=5, pady=5) 117 | 118 | # 结果显示区域 119 | self.result_frame = ttk.LabelFrame(self.main_frame, text="对比结果", padding="10 10 10 10") 120 | self.result_frame.grid(row=5, column=0, columnspan=4, sticky="ew") 121 | 122 | self.result_label = ttk.Label(self.result_frame, text="请选择两个音频文件进行对比", 123 | style='Result.TLabel', anchor=tk.CENTER) 124 | self.result_label.pack(fill=tk.X, pady=10) 125 | 126 | # 结果详情 127 | self.detail_frame = ttk.Frame(self.result_frame) 128 | self.detail_frame.pack(fill=tk.X, pady=5) 129 | 130 | self.similarity_label = ttk.Label(self.detail_frame, text="相似度: ") 131 | self.similarity_label.pack(side=tk.LEFT, padx=10) 132 | 133 | self.similarity_value = ttk.Label(self.detail_frame, text="--", font=('微软雅黑', 12, 'bold')) 134 | self.similarity_value.pack(side=tk.LEFT) 135 | 136 | # 设置列的权重 137 | for i in range(4): 138 | self.main_frame.columnconfigure(i, weight=1) 139 | 140 | # 预测器 141 | self.predictor = PPVectorPredictor(configs=args.configs, model_path=args.model_path, use_gpu=args.use_gpu) 142 | self.is_predicting = False 143 | 144 | def select_audio1(self): 145 | 
filename = filedialog.askopenfilename(initialdir='./dataset', 146 | filetypes=[("音频文件", "*.wav *.mp3 *.flac *.ogg *.m4a"), 147 | ("所有文件", "*.*")]) 148 | if filename: 149 | self.audio1_path.set(filename) 150 | 151 | def select_audio2(self): 152 | filename = filedialog.askopenfilename(initialdir='./dataset', 153 | filetypes=[("音频文件", "*.wav *.mp3 *.flac *.ogg *.m4a"), 154 | ("所有文件", "*.*")]) 155 | if filename: 156 | self.audio2_path.set(filename) 157 | 158 | def predict_thread(self): 159 | """在线程中执行预测""" 160 | if self.is_predicting: 161 | messagebox.showinfo("提示", "正在处理中,请稍候...") 162 | return 163 | 164 | audio_path1 = self.audio1_path.get() 165 | audio_path2 = self.audio2_path.get() 166 | 167 | if not audio_path1 or not audio_path2: 168 | messagebox.showerror("错误", "请选择两个音频文件") 169 | return 170 | 171 | try: 172 | threshold = float(self.threshold.get()) 173 | if threshold < 0 or threshold > 1: 174 | messagebox.showerror("错误", "阈值必须在0-1之间") 175 | return 176 | except ValueError: 177 | messagebox.showerror("错误", "请输入有效的阈值") 178 | return 179 | 180 | self.is_predicting = True 181 | self.btn_predict.config(state=tk.DISABLED) 182 | self.result_label.config(text="正在处理...") 183 | self.similarity_value.config(text="--") 184 | self.update_progress_bar(0) 185 | 186 | # 启动线程进行预测 187 | threading.Thread(target=self._predict, args=(audio_path1, audio_path2, threshold)).start() 188 | 189 | def _predict(self, audio_path1, audio_path2, threshold): 190 | """执行预测""" 191 | try: 192 | # 模拟进度 193 | for i in range(1, 101): 194 | if i < 90: # 预留最后10%用于实际计算结果 195 | self.update_progress_bar(i) 196 | time.sleep(0.02) # 调整速度使进度条看起来更自然 197 | 198 | # 执行实际预测 199 | dist = self.predictor.contrast(audio_path1, audio_path2) 200 | 201 | # 完成进度 202 | self.update_progress_bar(100) 203 | 204 | # 更新UI显示结果 205 | self.similarity_value.config(text=f"{dist:.5f}") 206 | 207 | if dist > threshold: 208 | result_text = f"两段语音来自同一个人" 209 | self.result_label.config(text=result_text, foreground="#4CAF50") 210 | else: 211 | result_text = f"两段语音来自不同的人" 212 | self.result_label.config(text=result_text, foreground="#F44336") 213 | 214 | except Exception as e: 215 | messagebox.showerror("错误", f"预测失败: {str(e)}") 216 | self.result_label.config(text="预测失败,请检查音频文件格式", foreground="#F44336") 217 | 218 | finally: 219 | self.btn_predict.config(state=tk.NORMAL) 220 | self.is_predicting = False 221 | 222 | def clear(self): 223 | """清空所有输入和结果""" 224 | self.audio1_path.set("") 225 | self.audio2_path.set("") 226 | self.threshold.set("0.6") 227 | self.result_label.config(text="请选择两个音频文件进行对比", foreground="#007bff") 228 | self.similarity_value.config(text="--") 229 | self.update_progress_bar(0) 230 | 231 | def update_progress_bar(self, value): 232 | """更新进度条""" 233 | self.progress_var.set(value) 234 | self.master.update_idletasks() 235 | 236 | def quit(self): 237 | self.master.destroy() 238 | 239 | 240 | if __name__ == '__main__': 241 | root = tk.Tk() 242 | app = VoiceContrastGUI(root) 243 | root.mainloop() 244 | -------------------------------------------------------------------------------- /infer_recognition.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.predict import PPVectorPredictor 5 | from ppvector.utils.record import RecordAudio 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 
'configs/cam++.yml', '配置文件') 11 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 12 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 13 | add_arg('record_seconds', int, 3, '录音长度') 14 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 15 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 16 | args = parser.parse_args() 17 | print_arguments(args=args) 18 | 19 | # 获取识别器 20 | predictor = PPVectorPredictor(configs=args.configs, 21 | threshold=args.threshold, 22 | audio_db_path=args.audio_db_path, 23 | model_path=args.model_path, 24 | use_gpu=args.use_gpu) 25 | # 获取录音器 26 | record_audio = RecordAudio() 27 | 28 | while True: 29 | select_fun = int(input("请选择功能,0为注册音频到声纹库,1为执行声纹识别,2为删除用户:")) 30 | if select_fun == 0: 31 | input(f"按下回车键开机录音,录音{args.record_seconds}秒中:") 32 | audio_data = record_audio.record(record_seconds=args.record_seconds) 33 | name = input("请输入该音频用户的名称:") 34 | if name == '': continue 35 | predictor.register(user_name=name, audio_data=audio_data, sample_rate=record_audio.sample_rate) 36 | elif select_fun == 1: 37 | input(f"按下回车键开机录音,录音{args.record_seconds}秒中:") 38 | audio_data = record_audio.record(record_seconds=args.record_seconds) 39 | name, score = predictor.recognition(audio_data, sample_rate=record_audio.sample_rate) 40 | if name: 41 | print(f"识别说话的为:{name},得分:{score}") 42 | else: 43 | print(f"没有识别到说话人,可能是没注册。") 44 | elif select_fun == 2: 45 | name = input("请输入该音频用户的名称:") 46 | if name == '': continue 47 | predictor.remove_user(user_name=name) 48 | else: 49 | print('请正确选择功能') 50 | -------------------------------------------------------------------------------- /infer_speaker_diarization.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from ppvector.predict import PPVectorPredictor 6 | from ppvector.utils.utils import add_arguments, print_arguments 7 | 8 | parser = argparse.ArgumentParser(description=__doc__) 9 | add_arg = functools.partial(add_arguments, argparser=parser) 10 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 11 | add_arg('audio_path', str, 'dataset/test_long.wav', '预测音频路径') 12 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 13 | add_arg('speaker_num', int, None, '说话人数量,提供说话人数量可以提高准确率') 14 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 15 | add_arg('show_plot', bool, True, '是否显示结果图像') 16 | add_arg('search_audio_db', bool, True, '是否在音频库中搜索对应的说话人') 17 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 18 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 19 | args = parser.parse_args() 20 | print_arguments(args=args) 21 | 22 | if args.search_audio_db: 23 | assert args.audio_db_path is not None, "请指定音频库的路径" 24 | 25 | # 获取识别器 26 | predictor = PPVectorPredictor(configs=args.configs, 27 | model_path=args.model_path, 28 | threshold=args.threshold, 29 | audio_db_path=args.audio_db_path, 30 | use_gpu=args.use_gpu) 31 | 32 | # 进行说话人日志识别 33 | results = predictor.speaker_diarization(args.audio_path, 34 | speaker_num=args.speaker_num, 35 | search_audio_db=args.search_audio_db) 36 | print(f"识别结果:") 37 | for result in results: 38 | print(result) 39 | 40 | # 绘制结果图像 41 | if args.show_plot: 42 | from ppvector.infer_utils.viewer import PlotSpeaker 43 | plot_speaker = PlotSpeaker(results, audio_path=args.audio_path) 44 | os.makedirs('output', exist_ok=True) 45 | plot_speaker.draw('output/speaker_diarization.png') 46 | plot_speaker.plot.show() 47 | 
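# Illustrative sketch (not a file from this repository): the same PPVectorPredictor can be
# driven programmatically instead of through the CLI scripts above. Only methods and
# arguments that already appear in infer_contrast.py and infer_speaker_diarization.py are
# used here; the file paths and the 0.6 threshold simply mirror those scripts' defaults.
from ppvector.predict import PPVectorPredictor

predictor = PPVectorPredictor(configs='configs/cam++.yml',
                              model_path='models/CAMPPlus_Fbank/best_model/',
                              threshold=0.6,
                              audio_db_path='audio_db/',
                              use_gpu=True)

# 1:1 verification: similarity between two utterances (cf. infer_contrast.py).
dist = predictor.contrast('dataset/a_1.wav', 'dataset/a_2.wav')
print(f'similarity: {dist:.5f}, same speaker: {dist > 0.6}')

# Speaker diarization on a long recording, with each segment matched against audio_db/
# (cf. infer_speaker_diarization.py); every item in results describes one speaker segment.
results = predictor.speaker_diarization('dataset/test_long.wav',
                                        speaker_num=None,
                                        search_audio_db=True)
for result in results:
    print(result)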
-------------------------------------------------------------------------------- /infer_speaker_diarization_gui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os.path 4 | import threading 5 | import tkinter as tk 6 | from tkinter import filedialog 7 | 8 | from ppvector.predict import PPVectorPredictor 9 | from ppvector.utils.utils import add_arguments, print_arguments 10 | 11 | parser = argparse.ArgumentParser(description=__doc__) 12 | add_arg = functools.partial(add_arguments, argparser=parser) 13 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 14 | add_arg('audio_path', str, 'dataset/test_long.wav', '预测音频路径') 15 | add_arg('audio_db_path', str, 'audio_db/', '音频库的路径') 16 | add_arg('speaker_num', int, None, '说话人数量,提供说话人数量可以提高准确率') 17 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 18 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 19 | add_arg('model_path', str, 'models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 20 | args = parser.parse_args() 21 | print_arguments(args=args) 22 | 23 | 24 | class SpeakerDiarizationGUI: 25 | def __init__(self, window): 26 | self.window = window 27 | window.title("夜雨飘零说话人日志") 28 | self.plot_speaker = None 29 | self.show_plot = True 30 | self.search_audio_db = True 31 | # 添加组件 32 | self.label1 = tk.Label(window, text="音频路径:") 33 | self.label1.grid(row=0, column=0, padx=10, pady=10) 34 | self.entry_audio1 = tk.Entry(window, width=60) 35 | self.entry_audio1.grid(row=0, column=1, columnspan=2, padx=10, pady=10) 36 | self.btn_audio1 = tk.Button(window, text="选择", command=self.select_audio) 37 | self.btn_audio1.grid(row=0, column=3, padx=10, pady=10) 38 | self.btn_predict = tk.Button(window, text="开始识别", command=self.predict) 39 | self.btn_predict.grid(row=0, column=4, padx=10, pady=10) 40 | self.an_frame = tk.Frame(window) 41 | self.check_var = tk.BooleanVar(value=False) 42 | self.is_show_check = tk.Checkbutton(self.an_frame, text='是否显示结果图', variable=self.check_var, command=self.is_show_state) 43 | self.is_show_check.grid(row=0) 44 | self.is_show_check.select() 45 | self.an_frame.grid(row=1) 46 | self.an_frame.grid(row=2, column=1, padx=10) 47 | self.an_frame1 = tk.Frame(window) 48 | self.check_var1 = tk.BooleanVar(value=False) 49 | self.is_search_check = tk.Checkbutton(self.an_frame1, text='是否检索数据库', variable=self.check_var1, command=self.is_search_state) 50 | self.is_search_check.grid(row=0) 51 | self.is_search_check.select() 52 | self.an_frame1.grid(row=1) 53 | self.an_frame1.grid(row=2, column=2, padx=10) 54 | # 输出结果文本框 55 | self.result_label = tk.Label(self.window, text="输出结果:") 56 | self.result_label.grid(row=3, column=0, padx=10, pady=10) 57 | self.result_text = tk.Text(self.window, width=60, height=20) 58 | self.result_text.grid(row=3, column=1, columnspan=2, padx=10, pady=10) 59 | 60 | # 预测器 61 | self.predictor = PPVectorPredictor(configs=args.configs, 62 | model_path=args.model_path, 63 | threshold=args.threshold, 64 | audio_db_path=args.audio_db_path, 65 | use_gpu=args.use_gpu) 66 | 67 | def is_show_state(self): 68 | self.show_plot = self.check_var.get() 69 | 70 | def is_search_state(self): 71 | self.search_audio_db = self.check_var1.get() 72 | 73 | def select_audio(self): 74 | filename = filedialog.askopenfilename(initialdir='./dataset') 75 | self.entry_audio1.delete(0, tk.END) 76 | self.entry_audio1.insert(tk.END, filename) 77 | 78 | def predict(self): 79 | if self.plot_speaker: 80 | self.plot_speaker.plot.close() 81 | self.plot_speaker = None 82 | 
audio_path = self.entry_audio1.get() 83 | if audio_path is None or len(audio_path) == 0: return 84 | print(f'选择音频路径:{audio_path}') 85 | # 进行说话人日志识别 86 | results = self.predictor.speaker_diarization(audio_path, 87 | speaker_num=args.speaker_num, 88 | search_audio_db=self.search_audio_db) 89 | self.result_text.delete('1.0', 'end') 90 | for result in results: 91 | self.result_text.insert(tk.END, f"{result}\n") 92 | 93 | if self.show_plot: 94 | threading.Thread(target=self.show_result(results), args=(results,)).start() 95 | 96 | def show_result(self, results): 97 | from ppvector.infer_utils.viewer import PlotSpeaker 98 | self.plot_speaker = PlotSpeaker(results, audio_path=args.audio_path) 99 | os.makedirs('output', exist_ok=True) 100 | self.plot_speaker.draw('output/speaker_diarization.png') 101 | self.plot_speaker.plot.show() 102 | self.plot_speaker = None 103 | 104 | 105 | if __name__ == '__main__': 106 | root = tk.Tk() 107 | app = SpeakerDiarizationGUI(root) 108 | root.mainloop() 109 | -------------------------------------------------------------------------------- /ppvector/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.5" 2 | -------------------------------------------------------------------------------- /ppvector/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/data_utils/__init__.py -------------------------------------------------------------------------------- /ppvector/data_utils/collate_fn.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | 3 | 4 | # 对一个batch的数据处理 5 | def collate_fn(batch): 6 | # 找出音频长度最长的 7 | batch_sorted = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True) 8 | freq_size = batch_sorted[0][0].shape[1] 9 | max_freq_length = batch_sorted[0][0].shape[0] 10 | batch_size = len(batch_sorted) 11 | # 以最大的长度创建0张量 12 | features = paddle.zeros((batch_size, max_freq_length, freq_size), dtype=paddle.float32) 13 | input_lens, labels = [], [] 14 | for x in range(batch_size): 15 | tensor, label = batch[x] 16 | seq_length = tensor.shape[0] 17 | # 将数据插入都0张量中,实现了padding 18 | features[x, :seq_length, :] = tensor[:, :] 19 | labels.append(label) 20 | input_lens.append(seq_length) 21 | labels = paddle.to_tensor(labels, dtype=paddle.int64) 22 | input_lens = paddle.to_tensor(input_lens, dtype=paddle.int64) 23 | return features, labels, input_lens 24 | -------------------------------------------------------------------------------- /ppvector/data_utils/featurizer.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | from paddle import nn 3 | import paddleaudio.compliance.kaldi as Kaldi 4 | from paddle.audio.features import LogMelSpectrogram, MelSpectrogram, Spectrogram, MFCC 5 | 6 | 7 | class AudioFeaturizer(nn.Layer): 8 | """音频特征器 9 | 10 | :param feature_method: 所使用的预处理方法 11 | :type feature_method: str 12 | :param method_args: 预处理方法的参数 13 | :type method_args: dict 14 | """ 15 | 16 | def __init__(self, feature_method='MelSpectrogram', method_args={}): 17 | super().__init__() 18 | self._method_args = method_args 19 | self._feature_method = feature_method 20 | if feature_method == 'LogMelSpectrogram': 21 | self.feat_fun = LogMelSpectrogram(**method_args) 22 | elif feature_method == 'MelSpectrogram': 23 | self.feat_fun 
= MelSpectrogram(**method_args) 24 | elif feature_method == 'Spectrogram': 25 | self.feat_fun = Spectrogram(**method_args) 26 | elif feature_method == 'MFCC': 27 | self.feat_fun = MFCC(**method_args) 28 | elif feature_method == 'Fbank': 29 | self.feat_fun = KaldiFbank(**method_args) 30 | else: 31 | raise Exception(f'预处理方法 {self._feature_method} 不存在!') 32 | 33 | def forward(self, waveforms, input_lens_ratio=None): 34 | """从AudioSegment中提取音频特征 35 | 36 | :param waveforms: Audio segment to extract features from. 37 | :type waveforms: AudioSegment 38 | :param input_lens_ratio: input length ratio 39 | :type input_lens_ratio: tensor 40 | :return: Spectrogram audio feature in 2darray. 41 | :rtype: ndarray 42 | """ 43 | if len(waveforms.shape) == 1: 44 | waveforms = waveforms.unsqueeze(0) 45 | feature = self.feat_fun(waveforms) 46 | feature = feature.transpose([0, 2, 1]) 47 | # 归一化 48 | feature = feature - feature.mean(1, keepdim=True) 49 | if input_lens_ratio is not None: 50 | # 对掩码比例进行扩展 51 | input_lens = (input_lens_ratio * feature.shape[1]).astype(paddle.int32) 52 | mask_lens = input_lens.unsqueeze(1) 53 | # 生成掩码张量 54 | idxs = paddle.arange(feature.shape[1]) 55 | idxs = idxs.tile([feature.shape[0], 1]) 56 | mask = idxs < mask_lens 57 | mask = mask.unsqueeze(-1) 58 | # 对特征进行掩码操作 59 | feature = paddle.where(mask, feature, paddle.zeros_like(feature)) 60 | return feature 61 | 62 | @property 63 | def feature_dim(self): 64 | """返回特征大小 65 | 66 | :return: 特征大小 67 | :rtype: int 68 | """ 69 | if self._feature_method == 'LogMelSpectrogram': 70 | return self._method_args.get('n_mels', 128) 71 | elif self._feature_method == 'MelSpectrogram': 72 | return self._method_args.get('n_mels', 64) 73 | elif self._feature_method == 'Spectrogram': 74 | return self._method_args.get('n_fft', 512) // 2 + 1 75 | elif self._feature_method == 'MFCC': 76 | return self._method_args.get('n_mfcc', 40) 77 | elif self._feature_method == 'Fbank': 78 | return self._method_args.get('n_mels', 23) 79 | else: 80 | raise Exception('没有{}预处理方法'.format(self._feature_method)) 81 | 82 | 83 | class KaldiFbank(nn.Layer): 84 | def __init__(self, **kwargs): 85 | super(KaldiFbank, self).__init__() 86 | self.kwargs = kwargs 87 | 88 | def forward(self, waveforms): 89 | """ 90 | :param waveforms: [Batch, Length] 91 | :return: [Batch, Length, Feature] 92 | """ 93 | log_fbanks = [] 94 | for waveform in waveforms: 95 | if len(waveform.shape) == 1: 96 | waveform = waveform.unsqueeze(0) 97 | log_fbank = Kaldi.fbank(waveform, **self.kwargs) 98 | log_fbank = log_fbank.transpose((1, 0)) 99 | log_fbanks.append(log_fbank) 100 | log_fbank = paddle.stack(log_fbanks) 101 | return log_fbank 102 | -------------------------------------------------------------------------------- /ppvector/data_utils/pk_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | import paddle.distributed as dist 5 | from paddle.io import DistributedBatchSampler 6 | 7 | 8 | class PKSampler(DistributedBatchSampler): 9 | """随机取一批数据,保证每个类别的数量都是相同的。 10 | 11 | Args: 12 | dataset (Dataset): 数据的Dataset 13 | batch_size (int): batch size 14 | sample_per_id (int): 每个类别的样本数量 15 | shuffle (bool, optional): 是否随机打乱数据 16 | drop_last (bool, optional): 是否丢掉最后一个batch 17 | """ 18 | 19 | def __init__(self, 20 | dataset, 21 | batch_size, 22 | sample_per_id, 23 | shuffle=True, 24 | drop_last=True): 25 | super().__init__(dataset, batch_size, shuffle=shuffle, drop_last=drop_last) 26 | assert batch_size % 
sample_per_id == 0, f"batch_size({batch_size})必须是sample_per_id({sample_per_id})的整数倍" 27 | self.sample_per_id = sample_per_id 28 | self.label_dict = defaultdict(list) 29 | for idx, label in enumerate(self.dataset.labels): 30 | self.label_dict[label].append(idx) 31 | self.label_list = list(self.label_dict) 32 | assert len(self.label_list) * self.sample_per_id >= self.batch_size, \ 33 | f"batch_size({self.batch_size})必须大于等于label_list({len(self.label_list)})*sample_per_id({self.sample_per_id})" 34 | self.prob_list = np.array([1 / len(self.label_list)] * len(self.label_list)) 35 | diff = np.abs(sum(self.prob_list) - 1) 36 | if diff > 0.00000001: 37 | self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) 38 | 39 | def __iter__(self): 40 | if self.shuffle: 41 | rank = dist.get_rank() 42 | np.random.RandomState(rank * self.epoch).shuffle(self.label_list) 43 | np.random.RandomState(rank * self.epoch).shuffle(self.prob_list) 44 | self.epoch += 1 45 | 46 | label_per_batch = self.batch_size // self.sample_per_id 47 | for _ in range(len(self)): 48 | batch_index = [] 49 | batch_label_list = np.random.choice(self.label_list, size=label_per_batch, replace=False, p=self.prob_list) 50 | for label_i in batch_label_list: 51 | label_i_indexes = self.label_dict[label_i] 52 | batch_index.extend( 53 | np.random.choice(label_i_indexes, size=self.sample_per_id, 54 | replace=not self.sample_per_id <= len(label_i_indexes))) 55 | # 再次随机打乱 56 | if self.shuffle: 57 | np.random.shuffle(batch_index) 58 | if not self.drop_last or len(batch_index) == self.batch_size: 59 | yield batch_index 60 | -------------------------------------------------------------------------------- /ppvector/data_utils/reader.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import paddle 5 | from loguru import logger 6 | from paddle.io import Dataset 7 | from tqdm import tqdm 8 | from yeaudio.audio import AudioSegment 9 | from yeaudio.augmentation import ReverbPerturbAugmentor, SpecAugmentor 10 | from yeaudio.augmentation import SpeedPerturbAugmentor, VolumePerturbAugmentor, NoisePerturbAugmentor 11 | 12 | from ppvector.data_utils.featurizer import AudioFeaturizer 13 | 14 | 15 | # 音频数据加载器 16 | class PPVectorDataset(Dataset): 17 | def __init__(self, 18 | data_list_path, 19 | audio_featurizer: AudioFeaturizer, 20 | max_duration=3, 21 | min_duration=0.5, 22 | mode='train', 23 | sample_rate=16000, 24 | aug_conf=None, 25 | num_speakers=None, 26 | use_dB_normalization=True, 27 | target_dB=-20): 28 | """音频数据加载器 29 | 30 | Args: 31 | data_list_path: 包含音频路径和标签的数据列表文件的路径 32 | audio_featurizer: 声纹特征提取器 33 | max_duration: 最长的音频长度,大于这个长度会裁剪掉 34 | min_duration: 过滤最短的音频长度 35 | aug_conf: 用于指定音频增强的配置 36 | mode: 数据集模式。在训练模式下,数据集可能会进行一些数据增强的预处理 37 | sample_rate: 采样率 38 | num_speakers: 总说话人数量 39 | use_dB_normalization: 是否对音频进行音量归一化 40 | target_dB: 音量归一化的大小 41 | """ 42 | super(PPVectorDataset, self).__init__() 43 | assert mode in ['train', 'eval', 'extract_feature'] 44 | self.data_list_path = data_list_path 45 | self.max_duration = max_duration 46 | self.min_duration = min_duration 47 | self.mode = mode 48 | self._target_sample_rate = sample_rate 49 | self._use_dB_normalization = use_dB_normalization 50 | self._target_dB = target_dB 51 | self.num_speakers = num_speakers 52 | self.speed_augment = None 53 | self.volume_augment = None 54 | self.noise_augment = None 55 | self.reverb_augment = None 56 | self.spec_augment = None 57 | # 获取特征器 58 | self.audio_featurizer = audio_featurizer 59 | # 
获取特征裁剪的大小 60 | self.max_feature_len = self.get_crop_feature_len() 61 | # 获取数据列表 62 | with open(self.data_list_path, 'r', encoding='utf-8') as f: 63 | self.lines = f.readlines() 64 | self.labels = [np.int64(line.strip().split('\t')[1]) for line in self.lines] 65 | if mode == 'train' and aug_conf is not None: 66 | # 获取数据增强器 67 | self.get_augmentor(aug_conf) 68 | # 评估模式下,数据列表需要排序 69 | if self.mode == 'eval': 70 | self.sort_list() 71 | 72 | def __getitem__(self, idx): 73 | # 分割音频路径和标签 74 | data_path, spk_id = self.lines[idx].strip().split('\t') 75 | spk_id = int(spk_id) 76 | # 如果后缀名为.npy的文件,那么直接读取 77 | if data_path.endswith('.npy'): 78 | feature = np.load(data_path) 79 | if feature.shape[0] > self.max_feature_len: 80 | crop_start = random.randint(0, feature.shape[0] - self.max_feature_len) if self.mode == 'train' else 0 81 | feature = feature[crop_start:crop_start + self.max_feature_len, :] 82 | feature = paddle.to_tensor(feature, dtype=paddle.float32) 83 | else: 84 | # 读取音频 85 | audio_segment = AudioSegment.from_file(data_path) 86 | # 数据太短不利于训练 87 | if self.mode == 'train' or self.mode == 'extract_feature': 88 | if audio_segment.duration < self.min_duration: 89 | return self.__getitem__(idx + 1 if idx < len(self.lines) - 1 else 0) 90 | # 重采样 91 | if audio_segment.sample_rate != self._target_sample_rate: 92 | audio_segment.resample(self._target_sample_rate) 93 | # 音频增强 94 | if self.mode == 'train': 95 | audio_segment, spk_id = self.augment_audio(audio_segment, spk_id) 96 | # decibel normalization 97 | if self._use_dB_normalization: 98 | audio_segment.normalize(target_db=self._target_dB) 99 | # 裁剪需要的数据 100 | if self.mode != 'extract_feature' and audio_segment.duration > self.max_duration: 101 | audio_segment.crop(duration=self.max_duration, mode=self.mode) 102 | samples = paddle.to_tensor(audio_segment.samples, dtype=paddle.float32) 103 | feature = self.audio_featurizer(samples) 104 | feature = feature.squeeze(0) 105 | if self.mode == 'train' and self.spec_augment is not None: 106 | feature = self.spec_augment(feature.numpy()) 107 | feature = paddle.to_tensor(feature, dtype=paddle.float32) 108 | spk_id = paddle.to_tensor(int(spk_id), dtype=paddle.int64) 109 | return feature, spk_id 110 | 111 | def __len__(self): 112 | return len(self.lines) 113 | 114 | # 获取特征裁剪的大小,对应max_duration音频提取特征后的长度 115 | def get_crop_feature_len(self): 116 | samples = paddle.randn((1, self.max_duration * self._target_sample_rate)) 117 | feature = self.audio_featurizer(samples).squeeze(0) 118 | freq_len = feature.shape[0] 119 | return freq_len 120 | 121 | # 数据列表需要排序 122 | def sort_list(self): 123 | lengths = [] 124 | for line in tqdm(self.lines, desc=f"对列表[{self.data_list_path}]进行长度排序"): 125 | # 分割数据文件路径和标签 126 | data_path, _ = line.split('\t') 127 | if data_path.endswith('.npy'): 128 | feature = np.load(data_path) 129 | length = feature.shape[0] 130 | lengths.append(length) 131 | else: 132 | # 读取音频 133 | audio_segment = AudioSegment.from_file(data_path) 134 | length = audio_segment.duration 135 | lengths.append(length) 136 | # 对长度排序并获取索引 137 | sorted_indexes = np.argsort(lengths) 138 | self.lines = [self.lines[i] for i in sorted_indexes] 139 | 140 | # 获取数据增强器 141 | def get_augmentor(self, aug_conf): 142 | if aug_conf.speed is not None: 143 | self.speed_augment = SpeedPerturbAugmentor(num_speakers=self.num_speakers, **aug_conf.speed) 144 | if aug_conf.volume is not None: 145 | self.volume_augment = VolumePerturbAugmentor(**aug_conf.volume) 146 | if aug_conf.noise is not None: 147 | self.noise_augment = 
NoisePerturbAugmentor(**aug_conf.noise) 148 | if aug_conf.reverb is not None: 149 | self.reverb_augment = ReverbPerturbAugmentor(**aug_conf.reverb) 150 | if aug_conf.spec_aug is not None: 151 | self.spec_augment = SpecAugmentor(**aug_conf.spec_aug) 152 | 153 | # 音频增强 154 | def augment_audio(self, audio_segment, spk_id): 155 | if self.speed_augment is not None: 156 | audio_segment, spk_id = self.speed_augment(audio_segment, spk_id) 157 | if self.volume_augment is not None: 158 | audio_segment = self.volume_augment(audio_segment) 159 | if self.noise_augment is not None: 160 | audio_segment = self.noise_augment(audio_segment) 161 | if self.reverb_augment is not None: 162 | audio_segment = self.reverb_augment(audio_segment) 163 | return audio_segment, spk_id 164 | -------------------------------------------------------------------------------- /ppvector/infer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/infer_utils/__init__.py -------------------------------------------------------------------------------- /ppvector/infer_utils/player.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import soundcard 4 | from yeaudio.audio import AudioSegment 5 | 6 | 7 | class AudioPlayer: 8 | def __init__(self, audio_path): 9 | """音频播放器 10 | 11 | Args: 12 | audio_path (str): 音频文件路径 13 | """ 14 | self.playing = False 15 | self.to_pause = False 16 | self.pos = 0 17 | self.audio_segment = AudioSegment.from_file(audio_path) 18 | self.audio_data = self.audio_segment.to_bytes(dtype="int16") 19 | self.audio_segment = AudioSegment.from_file(audio_path) 20 | self.audio_data = self.audio_segment.to_bytes(dtype="int16") 21 | self.samples = self.audio_segment.samples 22 | self.sample_rate = self.audio_segment.sample_rate 23 | self.default_speaker = soundcard.default_speaker() 24 | self.block_size = self.sample_rate // 2 25 | 26 | def _play(self): 27 | self.to_pause = False 28 | self.playing = True 29 | with self.default_speaker.player(samplerate=self.sample_rate) as p: 30 | for i in range(int(self.pos * self.sample_rate), len(self.samples), self.block_size): 31 | if self.to_pause: break 32 | self.pos = i / self.sample_rate 33 | p.play(self.samples[i:i + self.block_size]) 34 | self.playing = False 35 | 36 | # 播放音频 37 | def play(self): 38 | if not self.playing: 39 | thread = threading.Thread(target=self._play) 40 | thread.start() 41 | 42 | # 暂停播放 43 | def pause(self): 44 | self.to_pause = True 45 | 46 | # 跳转到指定时间 47 | def seek(self, seconds=0.0): 48 | self.pos = seconds 49 | 50 | # 获取当前播放时间 51 | def current_time(self): 52 | return self.pos 53 | -------------------------------------------------------------------------------- /ppvector/infer_utils/speaker_diarization.py: -------------------------------------------------------------------------------- 1 | # This implementation is adapted from https://github.com/modelscope/modelscope 2 | import numpy as np 3 | import scipy 4 | import sklearn 5 | from sklearn.cluster import k_means 6 | from yeaudio.audio import AudioSegment 7 | 8 | 9 | class SpeakerDiarization(object): 10 | 11 | def __init__(self, seg_duration=1.5, seg_shift=0.75, sample_rate=16000, merge_threshold=0.78): 12 | """说话人日志工具 13 | 14 | Args: 15 | seg_duration (float, optional): 每个分割片段的持续时间(秒),默认为1.5秒。 16 | seg_shift (float, optional): 分割片段之间的时间间隔(秒),默认为0.75秒。 17 | 
sample_rate (int, optional): 音频采样率,默认为16000Hz。 18 | merge_threshold (float, optional): 合并片段的阈值,默认为0.78。当两个片段之间的相似度大于此阈值时,将合并这两个片段。 19 | """ 20 | self.seg_duration = seg_duration 21 | self.seg_shift = seg_shift 22 | self.sample_rate = sample_rate 23 | self.merge_threshold = merge_threshold 24 | self.spectral_cluster = SpectralCluster() 25 | 26 | def segments_audio(self, audio_segment: AudioSegment) -> list: 27 | """ 从音频段中分割出有效的语音段。 28 | 29 | Args: 30 | audio_segment (AudioSegment): 要分割的音频段对象。 31 | Returns: 32 | list: 分割出的有效语音段列表,每个元素是一个包含起始时间戳、结束时间戳和对应音频样本的列表。 33 | """ 34 | vad_segments = [] 35 | samples = audio_segment.samples 36 | self.sample_rate = audio_segment.sample_rate 37 | vad_time_list = audio_segment.vad(return_seconds=True) 38 | for t in vad_time_list: 39 | st = round(t['start'], 3) 40 | ed = round(t['end'], 3) 41 | vad_segments.append([st, ed, samples[int(st * self.sample_rate):int(ed * self.sample_rate)]]) 42 | self._check_audio_list(vad_segments) 43 | segments = self._chunk(vad_segments) 44 | return segments 45 | 46 | # 检查分割的结果数据是否符合要求 47 | def _check_audio_list(self, audio: list): 48 | audio_duration = 0 49 | for i in range(len(audio)): 50 | seg = audio[i] 51 | assert seg[1] >= seg[0], '分割的时间戳错误' 52 | assert isinstance(seg[2], np.ndarray), '数据的类型不正确' 53 | assert int(seg[1] * self.sample_rate) - int(seg[0] * self.sample_rate) == seg[2].shape[0], '时间长度和数据长度不匹配' 54 | if i > 0: 55 | assert seg[0] >= audio[i - 1][1], 'modelscope error: Wrong time stamps.' 56 | audio_duration += seg[1] - seg[0] 57 | assert audio_duration > 5, f'音频时间过短,应当大于5秒,当前长度是{audio_duration}秒' 58 | 59 | # 将音频片段继续细分割成固定长度的片段 60 | def _chunk(self, vad_segments: list) -> list: 61 | 62 | def seg_chunk(seg_data): 63 | seg_st = seg_data[0] 64 | data = seg_data[2] 65 | chunk_len = int(self.seg_duration * self.sample_rate) 66 | chunk_shift = int(self.seg_shift * self.sample_rate) 67 | last_chunk_ed = 0 68 | seg_res = [] 69 | for chunk_st in range(0, data.shape[0], chunk_shift): 70 | chunk_ed = min(chunk_st + chunk_len, data.shape[0]) 71 | if chunk_ed <= last_chunk_ed: 72 | break 73 | last_chunk_ed = chunk_ed 74 | chunk_st = max(0, chunk_ed - chunk_len) 75 | chunk_data = data[chunk_st:chunk_ed] 76 | if chunk_data.shape[0] < chunk_len: 77 | chunk_data = np.pad(chunk_data, (0, chunk_len - chunk_data.shape[0]), 'constant') 78 | seg_res.append([ 79 | chunk_st / self.sample_rate + seg_st, chunk_ed / self.sample_rate + seg_st, 80 | chunk_data 81 | ]) 82 | return seg_res 83 | 84 | segs = [] 85 | for i, s in enumerate(vad_segments): 86 | segs.extend(seg_chunk(s)) 87 | return segs 88 | 89 | def clustering(self, embeddings: np.ndarray, speaker_num=None) -> [np.ndarray, np.ndarray]: 90 | """聚类音频特征向量,返回聚类后的标签数组 91 | 92 | Args: 93 | embeddings (np.ndarray): 音频特征向量数组,形状为 (n_samples, embedding_dim) 94 | speaker_num (int): 说话人数量,提供说话人数量可以提高准确率 95 | Returns: 96 | Dict[np.ndarray, dict]: 聚类后的标签数组,形状为 (n_samples,) 97 | """ 98 | labels = self.spectral_cluster(embeddings, oracle_num=speaker_num) 99 | labels = self._correct_labels(labels) 100 | # 每个说话人特征向量平均值 101 | spk_num = labels.max() + 1 102 | spk_center = [] 103 | for i in range(spk_num): 104 | spk_emb = embeddings[labels == i].mean(0) 105 | spk_center.append(spk_emb) 106 | assert len(spk_center) > 0 107 | spk_center_embeddings = np.stack(spk_center, axis=0) 108 | labels = self._merge_by_cos(labels, spk_center, self.merge_threshold) 109 | return labels, spk_center_embeddings 110 | 111 | # 通过余弦相似度合并相似说话人 112 | @staticmethod 113 | def _merge_by_cos(labels, spk_center_emb, cos_thr): 114 
| assert 0 < cos_thr <= 1 115 | while True: 116 | spk_num = labels.max() + 1 117 | if spk_num == 1: 118 | break 119 | spk_center = [] 120 | for i in range(spk_num): 121 | spk_emb = spk_center_emb[i] 122 | spk_center.append(spk_emb) 123 | assert len(spk_center) > 0 124 | spk_center = np.stack(spk_center, axis=0) 125 | norm_spk_center = spk_center / np.linalg.norm(spk_center, axis=1, keepdims=True) 126 | affinity = np.matmul(norm_spk_center, norm_spk_center.T) 127 | affinity = np.triu(affinity, 1) 128 | spks = np.unravel_index(np.argmax(affinity), affinity.shape) 129 | if affinity[spks] < cos_thr: 130 | break 131 | for i in range(len(labels)): 132 | if labels[i] == spks[1]: 133 | labels[i] = spks[0] 134 | elif labels[i] > spks[1]: 135 | labels[i] -= 1 136 | return labels 137 | 138 | def postprocess(self, segments: list, labels: np.ndarray) -> list: 139 | """对音频分割结果进行后处理,包括标签校正、片段合并、重叠区域分配和平滑处理。 140 | 141 | Args: 142 | segments (list): 包含分割的数据列表,每个元素是一个包含起始时间、结束时间,音频数据。 143 | labels (np.ndarray): 包含每个音频片段对应说话人标签的数组。 144 | Returns: 145 | list: 包含处理后的音频片段信息的列表,包含说话人标签、起始时间和结束时间。 146 | """ 147 | assert len(segments) == len(labels) 148 | distribute_res = [] 149 | for i in range(len(segments)): 150 | distribute_res.append([segments[i][0], segments[i][1], labels[i]]) 151 | # 按时间顺序合并相同的说话人 152 | distribute_res = self._merge_seque(distribute_res) 153 | 154 | def is_overlapped(t1, t2): 155 | if t1 > t2 + 1e-4: 156 | return True 157 | return False 158 | 159 | # 分割重叠区域 160 | for i in range(1, len(distribute_res)): 161 | if is_overlapped(distribute_res[i - 1][1], distribute_res[i][0]): 162 | p = (distribute_res[i][0] + distribute_res[i - 1][1]) / 2 163 | distribute_res[i][0] = p 164 | distribute_res[i - 1][1] = p 165 | 166 | # 平滑处理 167 | distribute_res = self._smooth(distribute_res) 168 | 169 | # 将结果转换为字典形式 170 | results = [] 171 | for result in distribute_res: 172 | results.append(dict(speaker=result[2], start=round(result[0], 3), end=round(result[1], 3))) 173 | 174 | return results 175 | 176 | # 重排序标签 177 | @staticmethod 178 | def _correct_labels(labels): 179 | labels_id = 0 180 | id2id = {} 181 | new_labels = [] 182 | for i in labels: 183 | if i not in id2id: 184 | id2id[i] = labels_id 185 | labels_id += 1 186 | new_labels.append(id2id[i]) 187 | return np.array(new_labels) 188 | 189 | # 合并连续且属于同一说话人的音频片段 190 | @staticmethod 191 | def _merge_seque(distribute_res): 192 | res = [distribute_res[0]] 193 | for i in range(1, len(distribute_res)): 194 | if distribute_res[i][2] != res[-1][2] or distribute_res[i][0] > res[-1][1]: 195 | res.append(distribute_res[i]) 196 | else: 197 | res[-1][1] = distribute_res[i][1] 198 | return res 199 | 200 | # 对结果进行平滑处理,主要是处理时间长度过短的片段 201 | def _smooth(self, res, min_duration=1): 202 | for i in range(len(res)): 203 | res[i][0] = round(res[i][0], 2) 204 | res[i][1] = round(res[i][1], 2) 205 | if res[i][1] - res[i][0] < min_duration: 206 | if i == 0: 207 | res[i][2] = res[i + 1][2] 208 | elif i == len(res) - 1: 209 | res[i][2] = res[i - 1][2] 210 | elif res[i][0] - res[i - 1][1] <= res[i + 1][0] - res[i][1]: 211 | res[i][2] = res[i - 1][2] 212 | else: 213 | res[i][2] = res[i + 1][2] 214 | # 合并说话人 215 | res = self._merge_seque(res) 216 | return res 217 | 218 | 219 | class SpectralCluster: 220 | def __init__(self, min_num_spks=1, max_num_spks=15, pval=0.022): 221 | """实现了基于相似度矩阵的非归一化拉普拉斯矩阵的谱聚类方法。 222 | 223 | :param min_num_spks: 聚类的最小数量,默认为1。 224 | :type min_num_spks: int 225 | :param max_num_spks: 聚类的最大数量,默认为15。 226 | :type max_num_spks: int 227 | :param pval: 
用于相似度矩阵修剪的阈值,默认为0.022。 228 | :type pval: float 229 | """ 230 | self.min_num_spks = min_num_spks 231 | self.max_num_spks = max_num_spks 232 | self.pval = pval 233 | 234 | # 对输入数据X进行谱聚类,返回聚类标签 235 | def __call__(self, X, oracle_num=None): 236 | """ 237 | 238 | :param X: 输入数据,形状为[n_samples, n_features] 239 | :type X: np.ndarray 240 | :param oracle_num: 聚类数量,默认为None,此时将根据特征间隙自动选择聚类数量。 241 | :type oracle_num: int 242 | :return: 聚类标签,形状为[n_samples] 243 | """ 244 | sim_mat = self.get_sim_mat(X) 245 | prunned_sim_mat = self.p_pruning(sim_mat) 246 | sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T) 247 | laplacian = self.get_laplacian(sym_prund_sim_mat) 248 | emb, num_of_spk = self.get_spec_embs(laplacian, oracle_num) 249 | labels = self.cluster_embs(emb, num_of_spk) 250 | return labels 251 | 252 | # 计算输入数据X的相似度矩阵 253 | @staticmethod 254 | def get_sim_mat(X): 255 | # Cosine similarities 256 | M = sklearn.metrics.pairwise.cosine_similarity(X, X) 257 | return M 258 | 259 | # 根据阈值pval修剪相似度矩阵A 260 | def p_pruning(self, A): 261 | if A.shape[0] * self.pval < 6: 262 | pval = 6. / A.shape[0] 263 | else: 264 | pval = self.pval 265 | n_elems = int((1 - pval) * A.shape[0]) 266 | 267 | # 关联矩阵中的每一行中的前n_elems个最小值下标 268 | for i in range(A.shape[0]): 269 | low_indexes = np.argsort(A[i, :]) 270 | low_indexes = low_indexes[0:n_elems] 271 | # 用0替换较小的相似度值 272 | A[i, low_indexes] = 0 273 | return A 274 | 275 | # 计算对称相似度矩阵M的拉普拉斯矩阵 276 | @staticmethod 277 | def get_laplacian(M): 278 | M[np.diag_indices(M.shape[0])] = 0 279 | D = np.sum(np.abs(M), axis=1) 280 | D = np.diag(D) 281 | L = D - M 282 | return L 283 | 284 | # 计算拉普拉斯矩阵L的谱嵌入,并根据特征间隙或指定的oracle_num确定聚类数量 285 | def get_spec_embs(self, L, k_oracle=None): 286 | lambdas, eig_vecs = scipy.linalg.eigh(L) 287 | 288 | if k_oracle is not None: 289 | num_of_spk = k_oracle 290 | else: 291 | lambda_gap_list = self.get_eigen_gaps(lambdas[self.min_num_spks - 1:self.max_num_spks + 1]) 292 | num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks 293 | 294 | emb = eig_vecs[:, :num_of_spk] 295 | return emb, num_of_spk 296 | 297 | # 使用k-means算法对谱嵌入emb进行聚类,返回聚类标签 298 | @staticmethod 299 | def cluster_embs(emb, k): 300 | _, labels, _ = k_means(emb, k, n_init="auto") 301 | return labels 302 | 303 | # 计算特征值的间隙列表 304 | @staticmethod 305 | def get_eigen_gaps(eig_vals): 306 | eig_vals_gap_list = [] 307 | for i in range(len(eig_vals) - 1): 308 | gap = float(eig_vals[i + 1]) - float(eig_vals[i]) 309 | eig_vals_gap_list.append(gap) 310 | return eig_vals_gap_list 311 | -------------------------------------------------------------------------------- /ppvector/infer_utils/viewer.py: -------------------------------------------------------------------------------- 1 | # This implementation is adapted from https://github.com/taylorlu/Speaker-Diarization 2 | import matplotlib.pyplot as plot 3 | 4 | from ppvector.infer_utils.player import AudioPlayer 5 | 6 | 7 | class PlotSpeaker: 8 | def __init__(self, speakers_data, audio_path=None, title="speaker-diarization", gui=True, size=(14, 6)): 9 | """绘制说话人结果 10 | 11 | Args: 12 | speakers_data (list): 包含说话人信息的列表,每个元素是一个包含起始时间戳、结束时间戳和说话人的字典。 13 | audio_path (str, optional): 音频文件的路径,默认为None。如果提供,则使用AudioPlayer播放音频。 14 | title (str, optional): 图形窗口的标题,默认为"speaker-diarization"。 15 | gui (bool, optional): 是否启用图形用户界面,默认为True。 16 | size (tuple, optional): 图形窗口的大小(宽度,高度),默认为(14, 6)。 17 | """ 18 | # 检测类别名称是否包含中文,是则设置相应字体 19 | s = ''.join([str(data["speaker"]) for data in speakers_data]) 20 | s += title 21 | is_ascii = all(ord(c) < 128 for c in s) 
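# Note: the SimHei font set below is assumed to be installed; matplotlib needs a CJK-capable font to render Chinese speaker labels, and 'axes.unicode_minus' is disabled so the minus sign still renders correctly under that font.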
22 | if not is_ascii: 23 | plot.rcParams['font.sans-serif'] = ['SimHei'] 24 | plot.rcParams['axes.unicode_minus'] = False 25 | # 定义颜色 26 | self.rect_color = (0.0, 0.6, 1.0, 1.0) 27 | self.rect_selected_color = (0.75, 0.75, 0, 1.0) 28 | self.cluster_colors = [(0.0, 0.6, 1.0, 1.0), (0.0, 1.0, 0.6, 1.0), (0.6, 0.0, 1.0, 1.0), 29 | (0.6, 1.0, 0.0, 1.0), (1.0, 0.0, 0.6, 1.0), (1.0, 0.6, 0.0, 1.0)] 30 | self.gui = gui 31 | self.title = title 32 | self.fig = plot.figure(figsize=size, facecolor='white', tight_layout=True) 33 | self.plot = plot 34 | 35 | self.ax = self.fig.add_subplot(1, 1, 1) 36 | if self.gui: 37 | self.fig.canvas.mpl_connect('key_press_event', self._on_keypress) 38 | self.fig.canvas.mpl_connect('button_press_event', self._on_click) 39 | self.height = 5 40 | self.maxx = 0 41 | self.audio = None 42 | if audio_path is not None and self.gui: 43 | self.audio = AudioPlayer(audio_path) 44 | self.timer = self.fig.canvas.new_timer(interval=500) 45 | self.timer.add_callback(self._update_timeline) 46 | self.timer.start() 47 | 48 | self.timeline = self.ax.plot([0, 0], [0, 0], color='r')[-1] 49 | segment_data = dict() 50 | for data in speakers_data: 51 | start, end, speaker = data['start'], data['end'], data['speaker'] 52 | if speaker not in segment_data: 53 | segment_data[speaker] = [] 54 | segment_data[speaker].append(dict(start=start, end=end)) 55 | self.speakers_data = segment_data 56 | 57 | # 根据音频播放器中的位置更新时间轴 58 | def _update_timeline(self): 59 | if self.audio is not None and self.audio.playing: 60 | t = self.audio.current_time() 61 | self._draw_timeline(t) 62 | self.fig.canvas.draw() 63 | 64 | # 绘制时间轴 65 | def _draw_timeline(self, t): 66 | min_y, max_y = self.ax.get_ylim() 67 | self.timeline.set_data([t, t], [min_y, max_y]) 68 | self._draw_info(t) 69 | 70 | # 绘制信息 71 | @staticmethod 72 | def _draw_info(t): 73 | h = int(t) // 3600 74 | t %= 3600 75 | m = int(t) // 60 76 | s = int(t % 60) 77 | plot.xlabel(f'time: {h:02}:{m:02}:{s:02}') 78 | 79 | def draw(self, save_path=None): 80 | """绘制说话人分割结果 81 | 82 | Args: 83 | save_path (str, optional): 保存图像的路径,默认为None。如果提供,则将绘制的图像保存到指定路径。 84 | """ 85 | y = 0 86 | labels_pos = [] 87 | labels = [] 88 | for i, cluster in enumerate(self.speakers_data.keys()): 89 | labels.append(cluster) 90 | labels_pos.append(y + self.height // 2) 91 | for row in self.speakers_data[cluster]: 92 | x = row['start'] 93 | w = row['end'] - row['start'] 94 | self.maxx = max(self.maxx, row['end']) 95 | c = self.cluster_colors[i % len(self.cluster_colors)] 96 | rect = plot.Rectangle((x, y), w, self.height, color=c) 97 | self.ax.add_patch(rect) 98 | y += self.height 99 | if self.gui: 100 | plot.xlim([0, min(600, self.maxx)]) 101 | else: 102 | plot.xlim([0, self.maxx]) 103 | 104 | plot.ylim([0, y]) 105 | plot.yticks(labels_pos, labels) 106 | for _ in self.speakers_data: 107 | self.ax.plot([0, self.maxx], [y, y], linestyle=':', color='#AAAAAA') 108 | y -= self.height 109 | 110 | plot.title(self.title) 111 | if self.gui: 112 | self._draw_info(0) 113 | plot.tight_layout() 114 | if save_path is not None: 115 | plot.savefig(save_path) 116 | 117 | # 键盘点击事件处理函数 118 | def _on_keypress(self, event): 119 | if event.key == ' ' and self.audio is not None: 120 | if self.audio.playing: 121 | self.audio.pause() 122 | else: 123 | self.audio.play() 124 | self.fig.canvas.draw() 125 | 126 | # 鼠标点击事件处理函数 127 | def _on_click(self, event): 128 | if event.xdata is not None: 129 | if self.audio is not None: 130 | self.audio.pause() 131 | self.audio.seek(event.xdata) 132 | 
self._draw_timeline(event.xdata) 133 | self.fig.canvas.draw() 134 | -------------------------------------------------------------------------------- /ppvector/loss/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | 5 | from .aamloss import AAMLoss 6 | from .amloss import AMLoss 7 | from .armloss import ARMLoss 8 | from .celoss import CELoss 9 | from .sphereface2 import SphereFace2 10 | from .subcenterloss import SubCenterLoss 11 | from .tripletangularmarginloss import TripletAngularMarginLoss 12 | 13 | __all__ = ['build_loss'] 14 | 15 | 16 | def build_loss(configs): 17 | use_loss = configs.loss_conf.get('loss', 'AAMLoss') 18 | loss_args = configs.loss_conf.get('loss_args', {}) 19 | los = importlib.import_module(__name__) 20 | loss = getattr(los, use_loss)(**loss_args) 21 | logger.info(f'成功创建损失函数:{use_loss},参数为:{loss_args}') 22 | return loss 23 | -------------------------------------------------------------------------------- /ppvector/loss/aamloss.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | class AAMLoss(nn.Layer): 9 | def __init__(self, margin=0.2, scale=32, easy_margin=False, label_smoothing=0.0): 10 | """The Implementation of Additive Angular Margin (AAM) proposed 11 | in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition''' 12 | (https://arxiv.org/abs/1906.07317) 13 | 14 | Args: 15 | margin (float, optional): margin factor. Defaults to 0.3. 16 | scale (float, optional): scale factor. Defaults to 32.0. 17 | easy_margin (bool, optional): easy_margin flag. Defaults to False. 
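        Example (a minimal usage sketch, not taken from this repository: the batch size, the 10-class head and the 192-dim embeddings are placeholders, and the random logits are clipped into (-1, 1) to stand in for the cosine scores a normalized classifier would produce):
            import paddle
            from ppvector.loss import AAMLoss
            loss_fn = AAMLoss(margin=0.2, scale=32)
            logits = paddle.clip(paddle.rand([4, 10]) * 2 - 1, -0.999, 0.999)  # stand-in cosine logits
            inputs = {'features': paddle.randn([4, 192]), 'logits': logits}
            labels = paddle.randint(0, 10, [4])
            loss = loss_fn(inputs, labels)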
18 | """ 19 | super(AAMLoss, self).__init__() 20 | self.scale = scale 21 | self.easy_margin = easy_margin 22 | self.cos_m = math.cos(margin) 23 | self.sin_m = math.sin(margin) 24 | self.th = math.cos(math.pi - margin) 25 | self.mmm = 1.0 + math.cos(math.pi - margin) 26 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 27 | 28 | def forward(self, inputs, labels): 29 | """ 30 | Args: 31 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 32 | labels(paddle.Tensor): 类别标签 (batch_size) 33 | """ 34 | features, logits = inputs['features'], inputs['logits'] 35 | sine = paddle.sqrt(1.0 - paddle.pow(logits, 2)) 36 | phi = logits * self.cos_m - sine * self.sin_m 37 | if self.easy_margin: 38 | phi = paddle.where(logits > 0, phi, logits) 39 | else: 40 | phi = paddle.where(logits > self.th, phi, logits - self.mmm) 41 | 42 | one_hot = F.one_hot(labels, logits.shape[1]) 43 | output = (one_hot * phi) + ((1.0 - one_hot) * logits) 44 | output *= self.scale 45 | 46 | loss = self.criterion(output, labels) 47 | return loss 48 | 49 | def update(self, margin=0.2): 50 | self.cos_m = math.cos(margin) 51 | self.sin_m = math.sin(margin) 52 | self.th = math.cos(math.pi - margin) 53 | self.mmm = 1.0 + math.cos(math.pi - margin) 54 | -------------------------------------------------------------------------------- /ppvector/loss/amloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class AMLoss(nn.Layer): 6 | def __init__(self, margin=0.2, scale=30, label_smoothing=0.0): 7 | super(AMLoss, self).__init__() 8 | self.margin = margin 9 | self.scale = scale 10 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 11 | 12 | def forward(self, inputs, labels): 13 | """ 14 | Args: 15 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 16 | labels(paddle.Tensor): 类别标签 (batch_size) 17 | """ 18 | features, logits = inputs['features'], inputs['logits'] 19 | delt_costh = paddle.zeros(logits.shape) 20 | for i, index in enumerate(labels): 21 | delt_costh[i, index] = self.margin 22 | costh_m = logits - delt_costh 23 | predictions = self.scale * costh_m 24 | loss = self.criterion(predictions, labels) / labels.shape[0] 25 | return loss 26 | 27 | def update(self, margin=0.2): 28 | self.margin = margin 29 | -------------------------------------------------------------------------------- /ppvector/loss/armloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class ARMLoss(nn.Layer): 6 | def __init__(self, margin=0.2, scale=30, label_smoothing=0.0): 7 | super(ARMLoss, self).__init__() 8 | self.margin = margin 9 | self.scale = scale 10 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 11 | 12 | def forward(self, inputs, labels): 13 | """ 14 | Args: 15 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 16 | labels(paddle.Tensor): 类别标签 (batch_size) 17 | """ 18 | features, logits = inputs['features'], inputs['logits'] 19 | delt_costh = paddle.zeros(logits.shape) 20 | for i, index in enumerate(labels): 21 | delt_costh[i, index] = self.margin 22 | costh_m = logits - delt_costh 23 | costh_m_s = self.scale * costh_m 24 | delt_costh_m_s = paddle.zeros([logits.shape[0], 1], dtype=paddle.float32) 25 | for i, index in enumerate(labels): 26 | delt_costh_m_s[i] = 
costh_m_s[i, index] 27 | delt_costh_m_s = delt_costh_m_s.tile([1, costh_m_s.shape[1]]) 28 | costh_m_s_reduct = costh_m_s - delt_costh_m_s 29 | predictions = paddle.where(costh_m_s_reduct < 0.0, paddle.zeros_like(costh_m_s), costh_m_s) 30 | loss = self.criterion(predictions, labels) / labels.shape[0] 31 | return loss 32 | 33 | def update(self, margin=0.2): 34 | self.margin = margin 35 | 36 | -------------------------------------------------------------------------------- /ppvector/loss/celoss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class CELoss(nn.Layer): 6 | def __init__(self, label_smoothing=0.0): 7 | super(CELoss, self).__init__() 8 | self.criterion = paddle.nn.CrossEntropyLoss(reduction="sum", label_smoothing=label_smoothing) 9 | 10 | def forward(self, inputs, labels): 11 | """ 12 | Args: 13 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 14 | labels(paddle.Tensor): 类别标签 (batch_size) 15 | """ 16 | features, logits = inputs['features'], inputs['logits'] 17 | loss = self.criterion(logits, labels) / labels.shape[0] 18 | return loss 19 | 20 | def update(self, margin=0.2): 21 | pass 22 | 23 | -------------------------------------------------------------------------------- /ppvector/loss/sphereface2.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | 9 | class SphereFace2(nn.Layer): 10 | def __init__(self, margin=0.2, scale=32.0, lanbuda=0.7, t=3, margin_type='C'): 11 | """Implement of sphereface2 for speaker verification: 12 | Reference: 13 | [1] Exploring Binary Classification Loss for Speaker Verification 14 | https://ieeexplore.ieee.org/abstract/document/10094954 15 | [2] Sphereface2: Binary classification is all you need for deep face recognition 16 | https://arxiv.org/pdf/2108.01513 17 | Args: 18 | scale: norm of logits feature 19 | margin: margin 20 | lanbuda: weight of positive and negative pairs 21 | t: parameter for adjust score distribution 22 | margin_type: A:cos(theta+margin) or C:cos(theta)-margin 23 | Recommend margin: 24 | training: 0.2 for C and 0.15 for A 25 | LMF: 0.3 for C and 0.25 for A 26 | """ 27 | super(SphereFace2, self).__init__() 28 | self.scale = scale 29 | self.bias = paddle.create_parameter([1, 1], dtype=paddle.float32, is_bias=True) 30 | self.t = t 31 | self.lanbuda = lanbuda 32 | self.margin_type = margin_type 33 | 34 | self.margin = margin 35 | self.cos_m = math.cos(margin) 36 | self.sin_m = math.sin(margin) 37 | self.th = math.cos(math.pi - margin) 38 | self.mmm = 1.0 + math.cos(math.pi - margin) 39 | 40 | def fun_g(self, z, t: int): 41 | gz = 2 * paddle.pow((z + 1) / 2, t) - 1 42 | return gz 43 | 44 | def forward(self, inputs, labels): 45 | """ 46 | Args: 47 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 48 | labels(paddle.Tensor): 类别标签 (batch_size) 49 | """ 50 | features, logits = inputs['features'], inputs['logits'] 51 | if self.margin_type == 'A': # arcface type 52 | sin = paddle.sqrt(1.0 - paddle.pow(logits, 2)) 53 | cos_m_theta_p = self.scale * self.fun_g( 54 | paddle.where(logits > self.th, logits * self.cos_m - sin * self.sin_m, logits - self.mmm), self.t) + \ 55 | self.bias[0][0] 56 | cos_m_theta_n = self.scale * self.fun_g(logits * self.cos_m + sin * self.sin_m, self.t) + self.bias[0][0] 57 | cos_p_theta = self.lanbuda * 
paddle.log(1 + paddle.exp(-1.0 * cos_m_theta_p)) 58 | cos_n_theta = (1 - self.lanbuda) * paddle.log(1 + paddle.exp(cos_m_theta_n)) 59 | else: 60 | # cosface type 61 | cos_m_theta_p = self.scale * (self.fun_g(logits, self.t) - self.margin) + self.bias[0][0] 62 | cos_m_theta_n = self.scale * (self.fun_g(logits, self.t) + self.margin) + self.bias[0][0] 63 | cos_p_theta = self.lanbuda * paddle.log(1 + paddle.exp(-1.0 * cos_m_theta_p)) 64 | cos_n_theta = (1 - self.lanbuda) * paddle.log(1 + paddle.exp(cos_m_theta_n)) 65 | 66 | target_mask = F.one_hot(labels, logits.shape[1]) 67 | nontarget_mask = 1 - target_mask 68 | loss = (target_mask * cos_p_theta + nontarget_mask * cos_n_theta).sum(1).mean() 69 | return loss 70 | 71 | def update(self, margin=0.2): 72 | self.margin = margin 73 | self.cos_m = math.cos(margin) 74 | self.sin_m = math.sin(margin) 75 | self.th = math.cos(math.pi - margin) 76 | self.mmm = 1.0 + math.cos(math.pi - margin) 77 | 78 | -------------------------------------------------------------------------------- /ppvector/loss/subcenterloss.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | class SubCenterLoss(nn.Layer): 9 | r"""Implement of large margin arc distance with subcenter: 10 | Reference:Sub-center ArcFace: Boosting Face Recognition byLarge-Scale Noisy 11 | Web Faces.https://ibug.doc.ic.ac.uk/media/uploads/documents/eccv_1445.pdf 12 | 13 | Args: 14 | margin (float, optional): margin factor. Defaults to 0.3. 15 | scale (float, optional): scale factor. Defaults to 32.0. 16 | easy_margin (bool, optional): easy_margin flag. Defaults to False. 17 | K: number of sub-centers, same classifier K. 
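        Example (an illustrative sketch only; it assumes 10 speakers and K=3 sub-centers, so the classifier head must emit 10 * 3 = 30 cosine logits, which the loss reshapes to (batch, 10, 3) and max-pools back to 10 classes):
            import paddle
            from ppvector.loss import SubCenterLoss
            loss_fn = SubCenterLoss(margin=0.2, scale=32, K=3)
            logits = paddle.clip(paddle.rand([4, 30]) * 2 - 1, -0.999, 0.999)  # 10 classes x 3 sub-centers
            inputs = {'features': paddle.randn([4, 192]), 'logits': logits}
            labels = paddle.randint(0, 10, [4])
            loss = loss_fn(inputs, labels)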
18 | """ 19 | 20 | def __init__(self, margin=0.2, scale=32, easy_margin=False, K=3, label_smoothing=0.0): 21 | super(SubCenterLoss, self).__init__() 22 | self.scale = scale 23 | # subcenter 24 | self.K = K 25 | self.easy_margin = easy_margin 26 | self.cos_m = math.cos(margin) 27 | self.sin_m = math.sin(margin) 28 | self.th = math.cos(math.pi - margin) 29 | self.mmm = 1.0 + math.cos(math.pi - margin) 30 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 31 | 32 | def forward(self, inputs, labels): 33 | """ 34 | Args: 35 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 36 | labels(paddle.Tensor): 类别标签 (batch_size) 37 | """ 38 | features, logits = inputs['features'], inputs['logits'] 39 | # (batch, out_dim, k) 40 | cosine = paddle.reshape(logits, (-1, logits.shape[1] // self.K, self.K)) 41 | # (batch, out_dim) 42 | cosine = paddle.max(cosine, 2) 43 | sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2)) 44 | phi = cosine * self.cos_m - sine * self.sin_m 45 | if self.easy_margin: 46 | phi = paddle.where(cosine > 0, phi, cosine) 47 | else: 48 | phi = paddle.where(cosine > self.th, phi, cosine - self.mmm) 49 | 50 | one_hot = F.one_hot(labels, cosine.shape[1]) 51 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 52 | output *= self.scale 53 | 54 | loss = self.criterion(output, labels) 55 | return loss 56 | 57 | def update(self, margin=0.2): 58 | self.cos_m = math.cos(margin) 59 | self.sin_m = math.sin(margin) 60 | self.th = math.cos(math.pi - margin) 61 | self.mmm = 1.0 + math.cos(math.pi - margin) 62 | -------------------------------------------------------------------------------- /ppvector/loss/tripletangularmarginloss.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | 5 | class TripletAngularMarginLoss(nn.Layer): 6 | """A more robust triplet loss with hard positive/negative mining on angular margin instead of relative distance between d(a,p) and d(a,n). 7 | 8 | Args: 9 | margin (float, optional): angular margin. Defaults to 0.5. 10 | normalize_feature (bool, optional): whether to apply L2-norm in feature before computing distance(cos-similarity). Defaults to True. 11 | add_absolute (bool, optional): whether add absolute loss within d(a,p) or d(a,n). Defaults to True. 12 | absolute_loss_weight (float, optional): weight for absolute loss. Defaults to 1.0. 13 | ap_value (float, optional): weight for d(a, p). Defaults to 0.8. 14 | an_value (float, optional): weight for d(a, n). Defaults to 0.4. 
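        Example (a minimal sketch with placeholder shapes; the hard positive/negative mining assumes every speaker appears the same number of times in the batch, which is what the project's PKSampler guarantees; here 2 speakers x 2 utterances and a 2-class logit head):
            import paddle
            from ppvector.loss import TripletAngularMarginLoss
            loss_fn = TripletAngularMarginLoss(margin=0.5)
            features = paddle.randn([4, 192])
            logits = paddle.randn([4, 2])
            labels = paddle.to_tensor([0, 0, 1, 1], dtype='int64')
            loss = loss_fn({'features': features, 'logits': logits}, labels)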
15 | """ 16 | 17 | def __init__(self, 18 | margin=0.5, 19 | normalize_feature=True, 20 | add_absolute=True, 21 | absolute_loss_weight=1.0, 22 | ap_value=0.8, 23 | an_value=0.4, 24 | label_smoothing=0.0): 25 | super(TripletAngularMarginLoss, self).__init__() 26 | self.margin = margin 27 | self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) 28 | self.normalize_feature = normalize_feature 29 | self.add_absolute = add_absolute 30 | self.ap_value = ap_value 31 | self.an_value = an_value 32 | self.absolute_loss_weight = absolute_loss_weight 33 | self.criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing) 34 | 35 | def forward(self, inputs, labels): 36 | """ 37 | Args: 38 | inputs(dict): 模型输出的特征向量 (batch_size, feat_dim) 和分类层输出的logits(batch_size, class_num) 39 | labels(paddle.Tensor): 类别标签 (batch_size) 40 | """ 41 | features, logits = inputs['features'], inputs['logits'] 42 | loss_ce = self.criterion(logits, labels) 43 | 44 | if self.normalize_feature: 45 | features = paddle.divide(features, paddle.norm(features, p=2, axis=-1, keepdim=True)) 46 | 47 | bs = features.shape[0] 48 | 49 | # compute distance(cos-similarity) 50 | dist = paddle.matmul(features, features.t()) 51 | 52 | # hard negative mining 53 | is_pos = paddle.expand(labels, (bs, bs)).equal(paddle.expand(labels, (bs, bs)).t()) 54 | is_neg = paddle.expand(labels, (bs, bs)).not_equal(paddle.expand(labels, (bs, bs)).t()) 55 | 56 | # `dist_ap` means distance(anchor, positive) 57 | # both `dist_ap` and `relative_p_inds` with shape [N, 1] 58 | d1 = paddle.masked_select(dist, is_pos) 59 | d2 = paddle.reshape(d1, (bs, -1)) 60 | dist_ap = paddle.min(d2, axis=1, keepdim=True) 61 | # `dist_an` means distance(anchor, negative) 62 | # both `dist_an` and `relative_n_inds` with shape [N, 1] 63 | dist_an = paddle.max(paddle.reshape( 64 | paddle.masked_select(dist, is_neg), (bs, -1)), axis=1, keepdim=True) 65 | # shape [N] 66 | dist_ap = paddle.squeeze(dist_ap, axis=1) 67 | dist_an = paddle.squeeze(dist_an, axis=1) 68 | 69 | # Compute ranking hinge loss 70 | y = paddle.ones_like(dist_an) 71 | loss = self.ranking_loss(dist_ap, dist_an, y) 72 | 73 | if self.add_absolute: 74 | absolut_loss_ap = self.ap_value - dist_ap 75 | absolut_loss_ap = paddle.where(absolut_loss_ap > 0, absolut_loss_ap, paddle.zeros_like(absolut_loss_ap)) 76 | 77 | absolut_loss_an = dist_an - self.an_value 78 | absolut_loss_an = paddle.where(absolut_loss_an > 0, absolut_loss_an, paddle.ones_like(absolut_loss_an)) 79 | 80 | loss = (absolut_loss_an.mean() + absolut_loss_ap.mean()) * self.absolute_loss_weight + loss.mean() 81 | loss = loss + loss_ce 82 | return loss 83 | 84 | def update(self, margin=0.5): 85 | self.ranking_loss.margin = margin 86 | -------------------------------------------------------------------------------- /ppvector/metric/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/metric/__init__.py -------------------------------------------------------------------------------- /ppvector/metric/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_fnr_fpr(scores, labels, weights=None): 5 | sorted_ndx = np.argsort(scores) 6 | thresholds = scores[sorted_ndx] 7 | labels = labels[sorted_ndx] 8 | if weights is not None: 9 | weights = weights[sorted_ndx] 10 | else: 11 | weights = 
np.ones(labels.shape, dtype='f8') 12 | 13 | tgt_wghts = weights * (labels == 1).astype('f8') 14 | imp_wghts = weights * (labels == 0).astype('f8') 15 | 16 | fnr = np.cumsum(tgt_wghts) / np.sum(tgt_wghts) 17 | fpr = 1 - np.cumsum(imp_wghts) / np.sum(imp_wghts) 18 | return fnr, fpr, thresholds 19 | 20 | 21 | def compute_eer(fnr, fpr, scores=None): 22 | diff_pm_fa = fnr - fpr 23 | x1 = np.flatnonzero(diff_pm_fa >= 0)[0] 24 | x2 = np.flatnonzero(diff_pm_fa < 0)[-1] 25 | a = (fnr[x1] - fpr[x1]) / (fpr[x2] - fpr[x1] - (fnr[x2] - fnr[x1])) 26 | 27 | if scores is not None: 28 | score_sort = np.sort(scores) 29 | return fnr[x1] + a * (fnr[x2] - fnr[x1]), score_sort[x1] 30 | 31 | return fnr[x1] + a * (fnr[x2] - fnr[x1]) 32 | 33 | 34 | def compute_dcf(fnr, fpr, p_target=0.01, c_miss=1, c_fa=1): 35 | c_det = min(c_miss * fnr * p_target + c_fa * fpr * (1 - p_target)) 36 | c_def = min(c_miss * p_target, c_fa * (1 - p_target)) 37 | return c_det / c_def 38 | -------------------------------------------------------------------------------- /ppvector/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | 5 | from .campplus import CAMPPlus 6 | from .ecapa_tdnn import EcapaTdnn 7 | from .eres2net import ERes2Net, ERes2NetV2 8 | from .res2net import Res2Net 9 | from .resnet_se import ResNetSE 10 | from .tdnn import TDNN 11 | 12 | __all__ = ['build_model'] 13 | 14 | 15 | def build_model(input_size, configs): 16 | use_model = configs.model_conf.get('model', 'CAMPPlus') 17 | model_args = configs.model_conf.get('model_args', {}) 18 | mod = importlib.import_module(__name__) 19 | model = getattr(mod, use_model)(input_size=input_size, **model_args) 20 | logger.info(f'成功创建模型:{use_model},参数为:{model_args}') 21 | return model 22 | -------------------------------------------------------------------------------- /ppvector/models/campplus.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn.functional as F 5 | from paddle import nn 6 | 7 | 8 | def get_nonlinear(config_str, channels): 9 | nonlinear = nn.Sequential() 10 | for name in config_str.split('-'): 11 | if name == 'relu': 12 | nonlinear.add_sublayer('relu', nn.ReLU()) 13 | elif name == 'prelu': 14 | nonlinear.add_sublayer('prelu', nn.PReLU(channels)) 15 | elif name == 'batchnorm': 16 | nonlinear.add_sublayer('batchnorm', nn.BatchNorm1D(channels)) 17 | elif name == 'batchnorm_': 18 | nonlinear.add_sublayer('batchnorm', nn.BatchNorm1D(channels)) 19 | else: 20 | raise ValueError('Unexpected module ({}).'.format(name)) 21 | return nonlinear 22 | 23 | 24 | def statistics_pooling(x, axis=-1, keepdim=False, unbiased=True, eps=1e-2): 25 | mean = x.mean(axis=axis) 26 | std = x.std(axis=axis, unbiased=unbiased) 27 | stats = paddle.concat([mean, std], axis=-1) 28 | if keepdim: 29 | stats = stats.unsqueeze(axis=axis) 30 | return stats 31 | 32 | 33 | class StatsPool(nn.Layer): 34 | def forward(self, x): 35 | return statistics_pooling(x) 36 | 37 | 38 | class TDNNLayer(nn.Layer): 39 | def __init__(self, 40 | in_channels, 41 | out_channels, 42 | kernel_size, 43 | stride=1, 44 | padding=0, 45 | dilation=1, 46 | bias=False, 47 | config_str='batchnorm-relu'): 48 | super(TDNNLayer, self).__init__() 49 | if padding < 0: 50 | assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format( 51 | kernel_size) 52 | padding = (kernel_size - 1) // 2 * dilation 53 | self.linear 
= nn.Conv1D(in_channels, 54 | out_channels, 55 | kernel_size, 56 | stride=stride, 57 | padding=padding, 58 | dilation=dilation) 59 | self.nonlinear = get_nonlinear(config_str, out_channels) 60 | 61 | def forward(self, x): 62 | x = self.linear(x) 63 | x = self.nonlinear(x) 64 | return x 65 | 66 | 67 | class CAMLayer(nn.Layer): 68 | def __init__(self, 69 | bn_channels, 70 | out_channels, 71 | kernel_size, 72 | stride, 73 | padding, 74 | dilation, 75 | bias, 76 | reduction=2): 77 | super(CAMLayer, self).__init__() 78 | self.linear_local = nn.Conv1D(bn_channels, 79 | out_channels, 80 | kernel_size, 81 | stride=stride, 82 | padding=padding, 83 | dilation=dilation) 84 | self.linear1 = nn.Conv1D(bn_channels, bn_channels // reduction, 1) 85 | self.relu = nn.ReLU() 86 | self.linear2 = nn.Conv1D(bn_channels // reduction, out_channels, 1) 87 | self.sigmoid = nn.Sigmoid() 88 | 89 | def forward(self, x): 90 | y = self.linear_local(x) 91 | context = x.mean(-1, keepdim=True) + self.seg_pooling(x) 92 | context = self.relu(self.linear1(context)) 93 | m = self.sigmoid(self.linear2(context)) 94 | return y * m 95 | 96 | def seg_pooling(self, x, seg_len=100, stype='avg'): 97 | if stype == 'avg': 98 | seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True) 99 | elif stype == 'max': 100 | seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True) 101 | else: 102 | raise ValueError('Wrong segment pooling type.') 103 | shape = seg.shape 104 | seg = seg.unsqueeze(-1).expand((*shape, seg_len)).reshape((*shape[:-1], -1)) 105 | seg = seg[..., :x.shape[-1]] 106 | return seg 107 | 108 | 109 | class CAMDenseTDNNLayer(nn.Layer): 110 | def __init__(self, 111 | in_channels, 112 | out_channels, 113 | bn_channels, 114 | kernel_size, 115 | stride=1, 116 | dilation=1, 117 | bias=False, 118 | config_str='batchnorm-relu', 119 | memory_efficient=False): 120 | super(CAMDenseTDNNLayer, self).__init__() 121 | assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format( 122 | kernel_size) 123 | padding = (kernel_size - 1) // 2 * dilation 124 | self.memory_efficient = memory_efficient 125 | self.nonlinear1 = get_nonlinear(config_str, in_channels) 126 | self.linear1 = nn.Conv1D(in_channels, bn_channels, 1) 127 | self.nonlinear2 = get_nonlinear(config_str, bn_channels) 128 | self.cam_layer = CAMLayer(bn_channels, 129 | out_channels, 130 | kernel_size, 131 | stride=stride, 132 | padding=padding, 133 | dilation=dilation, 134 | bias=bias) 135 | 136 | def bn_function(self, x): 137 | return self.linear1(self.nonlinear1(x)) 138 | 139 | def forward(self, x): 140 | x = self.bn_function(x) 141 | x = self.cam_layer(self.nonlinear2(x)) 142 | return x 143 | 144 | 145 | class CAMDenseTDNNBlock(nn.LayerList): 146 | def __init__(self, 147 | num_layers, 148 | in_channels, 149 | out_channels, 150 | bn_channels, 151 | kernel_size, 152 | stride=1, 153 | dilation=1, 154 | bias=False, 155 | config_str='batchnorm-relu', 156 | memory_efficient=False): 157 | super(CAMDenseTDNNBlock, self).__init__() 158 | for i in range(num_layers): 159 | layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels, 160 | out_channels=out_channels, 161 | bn_channels=bn_channels, 162 | kernel_size=kernel_size, 163 | stride=stride, 164 | dilation=dilation, 165 | bias=bias, 166 | config_str=config_str, 167 | memory_efficient=memory_efficient) 168 | self.add_sublayer('tdnnd%d' % (i + 1), layer) 169 | 170 | def forward(self, x): 171 | for layer in self: 172 | x = paddle.concat([x, layer(x)], axis=1) 173 | 
return x 174 | 175 | 176 | class TransitLayer(nn.Layer): 177 | def __init__(self, 178 | in_channels, 179 | out_channels, 180 | bias=True, 181 | config_str='batchnorm-relu'): 182 | super(TransitLayer, self).__init__() 183 | self.nonlinear = get_nonlinear(config_str, in_channels) 184 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 185 | 186 | def forward(self, x): 187 | x = self.nonlinear(x) 188 | x = self.linear(x) 189 | return x 190 | 191 | 192 | class DenseLayer(nn.Layer): 193 | def __init__(self, 194 | in_channels, 195 | out_channels, 196 | bias=False, 197 | config_str='batchnorm-relu'): 198 | super(DenseLayer, self).__init__() 199 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 200 | self.nonlinear = get_nonlinear(config_str, out_channels) 201 | 202 | def forward(self, x): 203 | if len(x.shape) == 2: 204 | x = self.linear(x.unsqueeze(axis=-1)).squeeze(axis=-1) 205 | else: 206 | x = self.linear(x) 207 | x = self.nonlinear(x) 208 | return x 209 | 210 | 211 | class BasicResBlock(nn.Layer): 212 | expansion = 1 213 | 214 | def __init__(self, in_planes, planes, stride=1): 215 | super(BasicResBlock, self).__init__() 216 | self.conv1 = nn.Conv2D(in_planes, 217 | planes, 218 | kernel_size=3, 219 | stride=(stride, 1), 220 | padding=1) 221 | self.bn1 = nn.BatchNorm2D(planes) 222 | self.conv2 = nn.Conv2D(planes, 223 | planes, 224 | kernel_size=3, 225 | stride=1, 226 | padding=1) 227 | self.bn2 = nn.BatchNorm2D(planes) 228 | 229 | self.shortcut = nn.Sequential() 230 | if stride != 1 or in_planes != self.expansion * planes: 231 | self.shortcut = nn.Sequential( 232 | nn.Conv2D(in_planes, 233 | self.expansion * planes, 234 | kernel_size=1, 235 | stride=(stride, 1)), 236 | nn.BatchNorm2D(self.expansion * planes)) 237 | 238 | def forward(self, x): 239 | out = F.relu(self.bn1(self.conv1(x))) 240 | out = self.bn2(self.conv2(out)) 241 | out += self.shortcut(x) 242 | out = F.relu(out) 243 | return out 244 | 245 | 246 | class FCM(nn.Layer): 247 | def __init__(self, 248 | block=BasicResBlock, 249 | num_blocks=[2, 2], 250 | m_channels=32, 251 | feat_dim=80): 252 | super(FCM, self).__init__() 253 | self.in_planes = m_channels 254 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 255 | self.bn1 = nn.BatchNorm2D(m_channels) 256 | 257 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 258 | self.layer2 = self._make_layer(block, m_channels, num_blocks[0], stride=2) 259 | 260 | self.conv2 = nn.Conv2D(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1) 261 | self.bn2 = nn.BatchNorm2D(m_channels) 262 | self.out_channels = m_channels * (math.ceil(feat_dim / 8)) 263 | 264 | def _make_layer(self, block, planes, num_blocks, stride): 265 | strides = [stride] + [1] * (num_blocks - 1) 266 | layers = [] 267 | for stride in strides: 268 | layers.append(block(self.in_planes, planes, stride)) 269 | self.in_planes = planes * block.expansion 270 | return nn.Sequential(*layers) 271 | 272 | def forward(self, x): 273 | x = x.unsqueeze(1) 274 | out = F.relu(self.bn1(self.conv1(x))) 275 | out = self.layer1(out) 276 | out = self.layer2(out) 277 | out = F.relu(self.bn2(self.conv2(out))) 278 | 279 | shape = out.shape 280 | out = out.reshape((shape[0], shape[1] * shape[2], shape[3])) 281 | return out 282 | 283 | 284 | class CAMPPlus(nn.Layer): 285 | def __init__(self, 286 | input_size, 287 | embd_dim=512, 288 | growth_rate=32, 289 | bn_size=4, 290 | init_channels=128, 291 | config_str='batchnorm-relu', 292 | memory_efficient=True): 293 | super(CAMPPlus, 
self).__init__() 294 | 295 | self.head = FCM(feat_dim=input_size) 296 | channels = self.head.out_channels 297 | self.embd_dim = embd_dim 298 | 299 | self.xvector = nn.Sequential(('tdnn', TDNNLayer(channels, 300 | init_channels, 301 | 5, 302 | stride=2, 303 | dilation=1, 304 | padding=-1, 305 | config_str=config_str))) 306 | channels = init_channels 307 | for i, (num_layers, kernel_size, 308 | dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 2))): 309 | block = CAMDenseTDNNBlock(num_layers=num_layers, 310 | in_channels=channels, 311 | out_channels=growth_rate, 312 | bn_channels=bn_size * growth_rate, 313 | kernel_size=kernel_size, 314 | dilation=dilation, 315 | config_str=config_str, 316 | memory_efficient=memory_efficient) 317 | self.xvector.add_sublayer('block%d' % (i + 1), block) 318 | channels = channels + num_layers * growth_rate 319 | self.xvector.add_sublayer('transit%d' % (i + 1), 320 | TransitLayer(channels, 321 | channels // 2, 322 | bias=False, 323 | config_str=config_str)) 324 | channels //= 2 325 | 326 | self.xvector.add_sublayer('out_nonlinear', get_nonlinear(config_str, channels)) 327 | 328 | self.xvector.add_sublayer('stats', StatsPool()) 329 | self.xvector.add_sublayer('dense', DenseLayer(channels * 2, embd_dim, config_str='batchnorm_')) 330 | 331 | def forward(self, x): 332 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 333 | x = self.head(x) 334 | x = self.xvector(x) 335 | return x 336 | -------------------------------------------------------------------------------- /ppvector/models/ecapa_tdnn.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | 4 | from ppvector.models.pooling import AttentiveStatisticsPooling, SelfAttentivePooling 5 | from ppvector.models.pooling import TemporalAveragePooling, TemporalStatisticsPooling 6 | from ppvector.models.utils import BatchNorm1d, Conv1d, TDNNBlock, length_to_mask 7 | 8 | __all__ = ['EcapaTdnn'] 9 | 10 | 11 | class Res2NetBlock(nn.Layer): 12 | def __init__(self, in_channels, out_channels, scale=8, dilation=1): 13 | """Implementation of Res2Net Block with dilation 14 | The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture", 15 | whose url is https://arxiv.org/abs/1904.01169 16 | Args: 17 | in_channels (int): input channels or input dimensions 18 | out_channels (int): output channels or output dimensions 19 | scale (int, optional): scale in res2net bolck. Defaults to 8. 20 | dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. 
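        Example (a shape-only sketch; the 64 channels, scale=8 and 100 frames are arbitrary, and the output is expected to keep the (batch, channels, time) shape assuming the TDNN sub-blocks use same-padding convolutions as in this project's Conv1d wrapper):
            import paddle
            from ppvector.models.ecapa_tdnn import Res2NetBlock
            block = Res2NetBlock(64, 64, scale=8, dilation=1)
            x = paddle.randn([2, 64, 100])  # (batch, channels, time)
            y = block(x)                    # expected to keep the same shape as x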
21 | """ 22 | super().__init__() 23 | assert in_channels % scale == 0 24 | assert out_channels % scale == 0 25 | 26 | in_channel = in_channels // scale 27 | hidden_channel = out_channels // scale 28 | 29 | self.blocks = nn.LayerList([ 30 | TDNNBlock( 31 | in_channel, hidden_channel, kernel_size=3, dilation=dilation) 32 | for i in range(scale - 1) 33 | ]) 34 | self.scale = scale 35 | 36 | def forward(self, x): 37 | y = [] 38 | for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): 39 | if i == 0: 40 | y_i = x_i 41 | elif i == 1: 42 | y_i = self.blocks[i - 1](x_i) 43 | else: 44 | y_i = self.blocks[i - 1](x_i + y_i) 45 | y.append(y_i) 46 | y = paddle.concat(y, axis=1) 47 | return y 48 | 49 | 50 | class SEBlock(nn.Layer): 51 | def __init__(self, in_channels, se_channels, out_channels): 52 | """Implementation of SEBlock 53 | The paper is refered as "Squeeze-and-Excitation Networks" 54 | whose url is https://arxiv.org/abs/1709.01507 55 | Args: 56 | in_channels (int): input channels or input data dimensions 57 | se_channels (_type_): _description_ 58 | out_channels (int): output channels or output data dimensions 59 | """ 60 | super().__init__() 61 | 62 | self.conv1 = Conv1d( 63 | in_channels=in_channels, out_channels=se_channels, kernel_size=1) 64 | self.relu = paddle.nn.ReLU() 65 | self.conv2 = Conv1d( 66 | in_channels=se_channels, out_channels=out_channels, kernel_size=1) 67 | self.sigmoid = paddle.nn.Sigmoid() 68 | 69 | def forward(self, x, lengths=None): 70 | L = x.shape[-1] 71 | if lengths is not None: 72 | mask = length_to_mask(lengths * L, max_len=L) 73 | mask = mask.unsqueeze(1) 74 | total = mask.sum(axis=2, keepdim=True) 75 | s = (x * mask).sum(axis=2, keepdim=True) / total 76 | else: 77 | s = x.mean(axis=2, keepdim=True) 78 | 79 | s = self.relu(self.conv1(s)) 80 | s = self.sigmoid(self.conv2(s)) 81 | 82 | return s * x 83 | 84 | 85 | class SERes2NetBlock(nn.Layer): 86 | def __init__( 87 | self, 88 | in_channels, 89 | out_channels, 90 | res2net_scale=8, 91 | se_channels=128, 92 | kernel_size=1, 93 | dilation=1, 94 | activation=nn.ReLU, ): 95 | """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model 96 | The paper is refered "Squeeze-and-Excitation Networks" 97 | whose url is: https://arxiv.org/pdf/1709.01507.pdf 98 | Args: 99 | in_channels (int): input channels or input data dimensions 100 | out_channels (int): output channels or output data dimensions 101 | res2net_scale (int, optional): scale in the res2net block. Defaults to 8. 102 | se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128. 103 | kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1. 104 | dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. 105 | activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 
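        Example (an illustrative sketch, not part of the original source; the shapes and
        the relative-length values are arbitrary placeholders)::

            import paddle
            from ppvector.models.ecapa_tdnn import SERes2NetBlock

            block = SERes2NetBlock(in_channels=512, out_channels=512,
                                   res2net_scale=8, se_channels=128,
                                   kernel_size=3, dilation=2)
            x = paddle.randn([2, 512, 150])          # (batch, channels, frames)
            lengths = paddle.to_tensor([1.0, 0.8])   # relative utterance lengths used by the SE mask
            y = block(x, lengths=lengths)            # (2, 512, 150), with the residual shortcut added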
106 | """ 107 | super().__init__() 108 | self.out_channels = out_channels 109 | self.tdnn1 = TDNNBlock( 110 | in_channels, 111 | out_channels, 112 | kernel_size=1, 113 | dilation=1, 114 | activation=activation, ) 115 | self.res2net_block = Res2NetBlock(out_channels, out_channels, 116 | res2net_scale, dilation) 117 | self.tdnn2 = TDNNBlock( 118 | out_channels, 119 | out_channels, 120 | kernel_size=1, 121 | dilation=1, 122 | activation=activation, ) 123 | self.se_block = SEBlock(out_channels, se_channels, out_channels) 124 | 125 | self.shortcut = None 126 | if in_channels != out_channels: 127 | self.shortcut = Conv1d( 128 | in_channels=in_channels, 129 | out_channels=out_channels, 130 | kernel_size=1, ) 131 | 132 | def forward(self, x, lengths=None): 133 | residual = x 134 | if self.shortcut: 135 | residual = self.shortcut(x) 136 | 137 | x = self.tdnn1(x) 138 | x = self.res2net_block(x) 139 | x = self.tdnn2(x) 140 | x = self.se_block(x, lengths) 141 | 142 | return x + residual 143 | 144 | 145 | class EcapaTdnn(nn.Layer): 146 | def __init__( 147 | self, 148 | input_size, 149 | embd_dim=192, 150 | pooling_type="ASP", 151 | activation=nn.ReLU, 152 | channels=[512, 512, 512, 512, 1536], 153 | kernel_sizes=[5, 3, 3, 3, 1], 154 | dilations=[1, 2, 3, 4, 1], 155 | attention_channels=128, 156 | res2net_scale=8, 157 | se_channels=128, 158 | global_context=True, ): 159 | """Implementation of ECAPA-TDNN backbone model network 160 | The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" 161 | whose url is: https://arxiv.org/abs/2005.07143 162 | Args: 163 | input_size (_type_): input fature dimension 164 | embd_dim (int, optional): speaker embedding size. Defaults to 192. 165 | activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 166 | channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536]. 167 | kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1]. 168 | dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1]. 169 | attention_channels (int, optional): attention dimensions. Defaults to 128. 170 | res2net_scale (int, optional): scale value in res2net. Defaults to 8. 171 | se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128. 172 | global_context (bool, optional): global context flag. Defaults to True. 
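        Example (an illustrative end-to-end sketch, not part of the original source; the
        80-dim features, batch size and frame count are assumed placeholder values)::

            import paddle
            from ppvector.models.ecapa_tdnn import EcapaTdnn

            model = EcapaTdnn(input_size=80, embd_dim=192, pooling_type="ASP")
            feats = paddle.randn([4, 200, 80])     # (batch, time, freq) acoustic features
            lengths = paddle.ones([4])             # relative lengths, 1.0 = full utterance
            emb = model(feats, lengths=lengths)    # speaker embeddings, last dimension = embd_dim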
173 | """ 174 | super().__init__() 175 | assert len(channels) == len(kernel_sizes) 176 | assert len(channels) == len(dilations) 177 | self.input_size = input_size 178 | self.channels = channels 179 | self.blocks = nn.LayerList() 180 | self.embd_dim = embd_dim 181 | 182 | # The initial TDNN layer 183 | self.blocks.append( 184 | TDNNBlock( 185 | input_size, 186 | channels[0], 187 | kernel_sizes[0], 188 | dilations[0], 189 | activation, )) 190 | 191 | # SE-Res2Net layers 192 | for i in range(1, len(channels) - 1): 193 | self.blocks.append( 194 | SERes2NetBlock( 195 | channels[i - 1], 196 | channels[i], 197 | res2net_scale=res2net_scale, 198 | se_channels=se_channels, 199 | kernel_size=kernel_sizes[i], 200 | dilation=dilations[i], 201 | activation=activation, )) 202 | 203 | # Multi-layer feature aggregation 204 | self.mfa = TDNNBlock( 205 | channels[-1], 206 | channels[-1], 207 | kernel_sizes[-1], 208 | dilations[-1], 209 | activation, ) 210 | 211 | cat_channels = channels[-1] 212 | if pooling_type == "ASP": 213 | self.asp = AttentiveStatisticsPooling(channels[-1], 214 | attention_channels=attention_channels, 215 | global_context=global_context) 216 | self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) 217 | # Final linear transformation 218 | self.fc = Conv1d(in_channels=channels[-1] * 2, 219 | out_channels=self.embd_dim, 220 | kernel_size=1) 221 | elif pooling_type == "SAP": 222 | self.asp = SelfAttentivePooling(cat_channels, 128) 223 | self.asp_bn = nn.BatchNorm1D(cat_channels) 224 | # Final linear transformation 225 | self.fc = Conv1d(in_channels=cat_channels, 226 | out_channels=self.embd_dim, 227 | kernel_size=1) 228 | elif pooling_type == "TAP": 229 | self.asp = TemporalAveragePooling() 230 | self.asp_bn = nn.BatchNorm1D(cat_channels) 231 | # Final linear transformation 232 | self.fc = Conv1d(in_channels=cat_channels, 233 | out_channels=self.embd_dim, 234 | kernel_size=1) 235 | elif pooling_type == "TSP": 236 | self.asp = TemporalStatisticsPooling() 237 | self.asp_bn = nn.BatchNorm1D(cat_channels * 2) 238 | # Final linear transformation 239 | self.fc = Conv1d(in_channels=cat_channels * 2, 240 | out_channels=self.embd_dim, 241 | kernel_size=1) 242 | else: 243 | raise Exception(f'没有{pooling_type}池化层!') 244 | 245 | def forward(self, x, lengths=None): 246 | """ 247 | Compute embeddings. 248 | 249 | Args: 250 | x (paddle.Tensor): Input data with shape (N, time, freq). 251 | lengths (paddle.Tensor, optional): Length proportions of batch length with shape (N). Defaults to None. 
252 | 253 | Returns: 254 | paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) 255 | """ 256 | x = x.transpose([0, 2, 1]) 257 | xl = [] 258 | for layer in self.blocks: 259 | try: 260 | x = layer(x, lengths=lengths) 261 | except TypeError: 262 | x = layer(x) 263 | xl.append(x) 264 | 265 | # Multi-layer feature aggregation 266 | x = paddle.concat(xl[1:], axis=1) 267 | x = self.mfa(x) 268 | 269 | # Attentive Statistical Pooling 270 | x = self.asp(x, lengths=lengths) 271 | x = self.asp_bn(x) 272 | x = x.unsqueeze(2) 273 | # Final linear transformation 274 | x = self.fc(x).squeeze(-1) # (N, emb_size, 1) -> (N, emb_size) 275 | 276 | return x 277 | -------------------------------------------------------------------------------- /ppvector/models/eres2net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | from ppvector.models.pooling import TemporalStatsPool 8 | 9 | __all__ = ['ERes2Net', 'ERes2NetV2'] 10 | 11 | 12 | class ReLU(nn.Hardtanh): 13 | 14 | def __init__(self, inplace=False): 15 | super(ReLU, self).__init__(0, 20, inplace) 16 | 17 | def __repr__(self): 18 | inplace_str = 'inplace' if self.inplace else '' 19 | return self.__class__.__name__ + ' (' + inplace_str + ')' 20 | 21 | 22 | def conv1x1(in_planes, out_planes, stride=1): 23 | "1x1 convolution without padding" 24 | return nn.Conv2D(in_planes, out_planes, kernel_size=1, stride=stride, padding=0) 25 | 26 | 27 | def conv3x3(in_planes, out_planes, stride=1): 28 | "3x3 convolution with padding" 29 | return nn.Conv2D(in_planes, out_planes, kernel_size=3, stride=stride, padding=1) 30 | 31 | 32 | class AFF(nn.Layer): 33 | 34 | def __init__(self, channels=64, r=4): 35 | super(AFF, self).__init__() 36 | inter_channels = int(channels // r) 37 | 38 | self.local_att = nn.Sequential( 39 | nn.Conv2D(channels * 2, inter_channels, kernel_size=1, stride=1, padding=0), 40 | nn.BatchNorm2D(inter_channels), 41 | nn.Silu(), 42 | nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), 43 | nn.BatchNorm2D(channels), 44 | ) 45 | 46 | def forward(self, x, ds_y): 47 | xa = paddle.concat((x, ds_y), axis=1) 48 | x_att = self.local_att(xa) 49 | x_att = 1.0 + paddle.tanh(x_att) 50 | xo = paddle.multiply(x, x_att) + paddle.multiply(ds_y, 2.0 - x_att) 51 | 52 | return xo 53 | 54 | 55 | class BasicBlockERes2Net(nn.Layer): 56 | 57 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2): 58 | super(BasicBlockERes2Net, self).__init__() 59 | self.expansion = expansion 60 | width = int(math.floor(planes * (base_width / 64.0))) 61 | self.conv1 = conv1x1(in_planes, width * scale, stride) 62 | self.bn1 = nn.BatchNorm2D(width * scale) 63 | self.nums = scale 64 | 65 | convs = [] 66 | bns = [] 67 | for i in range(self.nums): 68 | convs.append(conv3x3(width, width)) 69 | bns.append(nn.BatchNorm2D(width)) 70 | self.convs = nn.LayerList(convs) 71 | self.bns = nn.LayerList(bns) 72 | self.relu = ReLU(inplace=True) 73 | 74 | self.conv3 = conv1x1(width * scale, planes * self.expansion) 75 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 76 | self.shortcut = nn.Sequential() 77 | if stride != 1 or in_planes != self.expansion * planes: 78 | self.shortcut = nn.Sequential( 79 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 80 | nn.BatchNorm2D(self.expansion * planes)) 81 | self.stride = stride 82 | self.width = width 83 | self.scale = scale 84 | 85 | def 
forward(self, x): 86 | out = self.conv1(x) 87 | out = self.bn1(out) 88 | out = self.relu(out) 89 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 90 | for i in range(self.nums): 91 | if i == 0: 92 | sp = spx[i] 93 | else: 94 | sp = sp + spx[i] 95 | sp = self.convs[i](sp) 96 | sp = self.relu(self.bns[i](sp)) 97 | if i == 0: 98 | out = sp 99 | else: 100 | out = paddle.concat((out, sp), 1) 101 | out = self.conv3(out) 102 | out = self.bn3(out) 103 | 104 | residual = self.shortcut(x) 105 | out += residual 106 | out = self.relu(out) 107 | 108 | return out 109 | 110 | 111 | class BasicBlockERes2Net_diff_AFF(nn.Layer): 112 | 113 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=32, scale=2): 114 | super(BasicBlockERes2Net_diff_AFF, self).__init__() 115 | self.expansion = expansion 116 | width = int(math.floor(planes * (base_width / 64.0))) 117 | self.conv1 = conv1x1(in_planes, width * scale, stride) 118 | self.bn1 = nn.BatchNorm2D(width * scale) 119 | 120 | self.nums = scale 121 | 122 | convs = [] 123 | fuse_models = [] 124 | bns = [] 125 | for i in range(self.nums): 126 | convs.append(conv3x3(width, width)) 127 | bns.append(nn.BatchNorm2D(width)) 128 | for j in range(self.nums - 1): 129 | fuse_models.append(AFF(channels=width)) 130 | 131 | self.convs = nn.LayerList(convs) 132 | self.bns = nn.LayerList(bns) 133 | self.fuse_models = nn.LayerList(fuse_models) 134 | self.relu = ReLU(inplace=True) 135 | 136 | self.conv3 = conv1x1(width * scale, planes * self.expansion) 137 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 138 | self.shortcut = nn.Sequential() 139 | if stride != 1 or in_planes != self.expansion * planes: 140 | self.shortcut = nn.Sequential( 141 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 142 | nn.BatchNorm2D(self.expansion * planes)) 143 | self.stride = stride 144 | self.width = width 145 | self.scale = scale 146 | 147 | def forward(self, x): 148 | out = self.conv1(x) 149 | out = self.bn1(out) 150 | out = self.relu(out) 151 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 152 | for i in range(self.nums): 153 | if i == 0: 154 | sp = spx[i] 155 | else: 156 | sp = self.fuse_models[i - 1](sp, spx[i]) 157 | sp = self.convs[i](sp) 158 | sp = self.relu(self.bns[i](sp)) 159 | if i == 0: 160 | out = sp 161 | else: 162 | out = paddle.concat((out, sp), 1) 163 | out = self.conv3(out) 164 | out = self.bn3(out) 165 | 166 | residual = self.shortcut(x) 167 | out += residual 168 | out = self.relu(out) 169 | 170 | return out 171 | 172 | 173 | class ERes2Net(nn.Layer): 174 | def __init__(self, 175 | input_size, 176 | block=BasicBlockERes2Net, 177 | block_fuse=BasicBlockERes2Net_diff_AFF, 178 | num_blocks=[3, 4, 6, 3], 179 | m_channels=32, 180 | mul_channel=1, 181 | expansion=2, 182 | base_width=32, 183 | scale=2, 184 | embd_dim=192, 185 | pooling_type='TSTP', 186 | two_emb_layer=False): 187 | super(ERes2Net, self).__init__() 188 | self.in_planes = m_channels 189 | self.expansion = expansion 190 | self.feat_dim = input_size 191 | self.embd_dim = embd_dim 192 | self.stats_dim = int(input_size / 8) * m_channels * 8 193 | self.two_emb_layer = two_emb_layer 194 | 195 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 196 | self.bn1 = nn.BatchNorm2D(m_channels) 197 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1, 198 | base_width=base_width, scale=scale) 199 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2, 200 | base_width=base_width, 
scale=scale) 201 | self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2, 202 | base_width=base_width, scale=scale) 203 | self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2, 204 | base_width=base_width, scale=scale) 205 | 206 | # Downsampling module for each layer 207 | self.layer1_downsample = nn.Conv2D(m_channels * 2 * mul_channel, m_channels * 4 * mul_channel, kernel_size=3, 208 | padding=1, stride=2) 209 | self.layer2_downsample = nn.Conv2D(m_channels * 4 * mul_channel, m_channels * 8 * mul_channel, kernel_size=3, 210 | padding=1, stride=2) 211 | self.layer3_downsample = nn.Conv2D(m_channels * 8 * mul_channel, m_channels * 16 * mul_channel, kernel_size=3, 212 | padding=1, stride=2) 213 | self.fuse_mode12 = AFF(channels=m_channels * 4 * mul_channel) 214 | self.fuse_mode123 = AFF(channels=m_channels * 8 * mul_channel) 215 | self.fuse_mode1234 = AFF(channels=m_channels * 16 * mul_channel) 216 | 217 | self.n_stats = 1 if pooling_type == 'TAP' else 2 218 | if pooling_type == "TSTP": 219 | self.pooling = TemporalStatsPool() 220 | else: 221 | raise Exception(f'没有{pooling_type}池化层!') 222 | 223 | self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim) 224 | if self.two_emb_layer: 225 | self.seg_bn_1 = nn.BatchNorm1D(embd_dim) 226 | self.seg_2 = nn.Linear(embd_dim, embd_dim) 227 | else: 228 | self.seg_bn_1 = nn.Identity() 229 | self.seg_2 = nn.Identity() 230 | 231 | def _make_layer(self, block, planes, num_blocks, stride, base_width, scale): 232 | strides = [stride] + [1] * (num_blocks - 1) 233 | layers = [] 234 | for stride in strides: 235 | layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale)) 236 | self.in_planes = planes * self.expansion 237 | return nn.Sequential(*layers) 238 | 239 | def forward(self, x): 240 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 241 | 242 | x = x.unsqueeze_(1) 243 | out = F.relu(self.bn1(self.conv1(x))) 244 | out1 = self.layer1(out) 245 | out2 = self.layer2(out1) 246 | out1_downsample = self.layer1_downsample(out1) 247 | fuse_out12 = self.fuse_mode12(out2, out1_downsample) 248 | out3 = self.layer3(out2) 249 | fuse_out12_downsample = self.layer2_downsample(fuse_out12) 250 | fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample) 251 | out4 = self.layer4(out3) 252 | fuse_out123_downsample = self.layer3_downsample(fuse_out123) 253 | fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample) 254 | stats = self.pooling(fuse_out1234) 255 | 256 | embed_a = self.seg_1(stats) 257 | if self.two_emb_layer: 258 | out = F.relu(embed_a) 259 | out = self.seg_bn_1(out) 260 | embed_b = self.seg_2(out) 261 | return embed_b 262 | else: 263 | return embed_a 264 | 265 | 266 | class BasicBlockERes2NetV2(nn.Layer): 267 | 268 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2): 269 | super(BasicBlockERes2NetV2, self).__init__() 270 | self.expansion = expansion 271 | width = int(math.floor(planes * (base_width / 64.0))) 272 | self.conv1 = nn.Conv2D(in_planes, width * scale, kernel_size=1, stride=stride) 273 | self.bn1 = nn.BatchNorm2D(width * scale) 274 | self.nums = scale 275 | 276 | convs = [] 277 | bns = [] 278 | for i in range(self.nums): 279 | convs.append(nn.Conv2D(width, width, kernel_size=3, padding=1)) 280 | bns.append(nn.BatchNorm2D(width)) 281 | self.convs = nn.LayerList(convs) 282 | self.bns = nn.LayerList(bns) 283 | self.relu = ReLU(inplace=True) 284 | 285 | self.conv3 = nn.Conv2D(width * scale, planes * 
self.expansion, kernel_size=1) 286 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 287 | self.shortcut = nn.Sequential() 288 | if stride != 1 or in_planes != self.expansion * planes: 289 | self.shortcut = nn.Sequential( 290 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 291 | nn.BatchNorm2D(self.expansion * planes)) 292 | self.stride = stride 293 | self.width = width 294 | self.scale = scale 295 | 296 | def forward(self, x): 297 | out = self.conv1(x) 298 | out = self.bn1(out) 299 | out = self.relu(out) 300 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 301 | for i in range(self.nums): 302 | if i == 0: 303 | sp = spx[i] 304 | else: 305 | sp = sp + spx[i] 306 | sp = self.convs[i](sp) 307 | sp = self.relu(self.bns[i](sp)) 308 | if i == 0: 309 | out = sp 310 | else: 311 | out = paddle.concat((out, sp), 1) 312 | out = self.conv3(out) 313 | out = self.bn3(out) 314 | 315 | residual = self.shortcut(x) 316 | out += residual 317 | out = self.relu(out) 318 | 319 | return out 320 | 321 | 322 | class BasicBlockERes2NetV2_AFF(nn.Layer): 323 | 324 | def __init__(self, expansion, in_planes, planes, stride=1, base_width=26, scale=2): 325 | super(BasicBlockERes2NetV2_AFF, self).__init__() 326 | self.expansion = expansion 327 | width = int(math.floor(planes * (base_width / 64.0))) 328 | self.conv1 = nn.Conv2D(in_planes, width * scale, kernel_size=1, stride=stride) 329 | self.bn1 = nn.BatchNorm2D(width * scale) 330 | self.nums = scale 331 | 332 | convs = [] 333 | fuse_models = [] 334 | bns = [] 335 | for i in range(self.nums): 336 | convs.append(nn.Conv2D(width, width, kernel_size=3, padding=1)) 337 | bns.append(nn.BatchNorm2D(width)) 338 | for j in range(self.nums - 1): 339 | fuse_models.append(AFF(channels=width, r=4)) 340 | 341 | self.convs = nn.LayerList(convs) 342 | self.bns = nn.LayerList(bns) 343 | self.fuse_models = nn.LayerList(fuse_models) 344 | self.relu = ReLU(inplace=True) 345 | 346 | self.conv3 = nn.Conv2D(width * scale, planes * self.expansion, kernel_size=1) 347 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 348 | self.shortcut = nn.Sequential() 349 | if stride != 1 or in_planes != self.expansion * planes: 350 | self.shortcut = nn.Sequential( 351 | nn.Conv2D(in_planes, self.expansion * planes, kernel_size=1, stride=stride), 352 | nn.BatchNorm2D(self.expansion * planes)) 353 | self.stride = stride 354 | self.width = width 355 | self.scale = scale 356 | 357 | def forward(self, x): 358 | out = self.conv1(x) 359 | out = self.bn1(out) 360 | out = self.relu(out) 361 | spx = paddle.split(out, int(out.shape[1] / self.width), 1) 362 | for i in range(self.nums): 363 | if i == 0: 364 | sp = spx[i] 365 | else: 366 | sp = self.fuse_models[i - 1](sp, spx[i]) 367 | sp = self.convs[i](sp) 368 | sp = self.relu(self.bns[i](sp)) 369 | if i == 0: 370 | out = sp 371 | else: 372 | out = paddle.concat((out, sp), 1) 373 | out = self.conv3(out) 374 | out = self.bn3(out) 375 | 376 | residual = self.shortcut(x) 377 | out += residual 378 | out = self.relu(out) 379 | 380 | return out 381 | 382 | 383 | class ERes2NetV2(nn.Layer): 384 | def __init__(self, 385 | input_size, 386 | block=BasicBlockERes2NetV2, 387 | block_fuse=BasicBlockERes2NetV2_AFF, 388 | num_blocks=[3, 4, 6, 3], 389 | m_channels=32, 390 | expansion=2, 391 | base_width=26, 392 | scale=2, 393 | embd_dim=192, 394 | pooling_type='TSTP', 395 | two_emb_layer=False): 396 | super(ERes2NetV2, self).__init__() 397 | self.in_planes = m_channels 398 | self.expansion = expansion 399 | self.embd_dim = embd_dim 400 
| self.stats_dim = int(input_size / 8) * m_channels * 8 401 | self.two_emb_layer = two_emb_layer 402 | 403 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=3, stride=1, padding=1) 404 | self.bn1 = nn.BatchNorm2D(m_channels) 405 | self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1, 406 | base_width=base_width, scale=scale) 407 | self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2, 408 | base_width=base_width, scale=scale) 409 | self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2, 410 | base_width=base_width, scale=scale) 411 | self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2, 412 | base_width=base_width, scale=scale) 413 | 414 | # Downsampling module 415 | self.layer3_ds = nn.Conv2D(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2) 416 | 417 | # Bottom-up fusion module 418 | self.fuse34 = AFF(channels=m_channels * 16, r=4) 419 | 420 | self.n_stats = 1 if pooling_type == 'TAP' else 2 421 | if pooling_type == "TSTP": 422 | self.pooling = TemporalStatsPool() 423 | else: 424 | raise Exception(f'没有{pooling_type}池化层!') 425 | 426 | self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embd_dim) 427 | if self.two_emb_layer: 428 | self.seg_bn_1 = nn.BatchNorm1D(embd_dim) 429 | self.seg_2 = nn.Linear(embd_dim, embd_dim) 430 | else: 431 | self.seg_bn_1 = nn.Identity() 432 | self.seg_2 = nn.Identity() 433 | 434 | def _make_layer(self, block, planes, num_blocks, stride, base_width, scale): 435 | strides = [stride] + [1] * (num_blocks - 1) 436 | layers = [] 437 | for stride in strides: 438 | layers.append(block(self.expansion, self.in_planes, planes, stride, base_width, scale)) 439 | self.in_planes = planes * self.expansion 440 | return nn.Sequential(*layers) 441 | 442 | def forward(self, x): 443 | x = x.transpose((0, 2, 1)) # (B,T,F) => (B,F,T) 444 | 445 | x = x.unsqueeze_(1) 446 | out = F.relu(self.bn1(self.conv1(x))) 447 | out1 = self.layer1(out) 448 | out2 = self.layer2(out1) 449 | out3 = self.layer3(out2) 450 | out4 = self.layer4(out3) 451 | out3_ds = self.layer3_ds(out3) 452 | fuse_out34 = self.fuse34(out4, out3_ds) 453 | stats = self.pooling(fuse_out34) 454 | 455 | embed_a = self.seg_1(stats) 456 | if self.two_emb_layer: 457 | out = F.relu(embed_a) 458 | out = self.seg_bn_1(out) 459 | embed_b = self.seg_2(out) 460 | return embed_b 461 | else: 462 | return embed_a 463 | -------------------------------------------------------------------------------- /ppvector/models/fc.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | import paddle.nn.functional as F 4 | 5 | 6 | class SpeakerIdentification(nn.Layer): 7 | def __init__(self, 8 | input_dim, 9 | num_speakers, 10 | classifier_type='Cosine', 11 | K=1, 12 | num_blocks=0, 13 | inter_dim=512): 14 | """The speaker identification model, which includes the speaker backbone network 15 | and the a linear transform to speaker class num in training 16 | 17 | Args: 18 | input_dim (nn.Module, class): embedding model output dim. 19 | num_speakers (_type_): the speaker class num in the training dataset 20 | classifier_type (str, optional): type of output layer to uses. 21 | num_blocks (int, optional): the linear layer transform between the embedding and the final linear layer. Defaults to 0. 22 | inter_dim (int, optional): the output dimension of dense layer. Defaults to 512. 
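        Example (an illustrative sketch, not part of the original source; the embedding
        dimension and speaker count are arbitrary placeholder values)::

            import paddle
            from ppvector.models.fc import SpeakerIdentification

            head = SpeakerIdentification(input_dim=192, num_speakers=1000)  # default Cosine classifier, num_blocks=0
            embeddings = paddle.randn([4, 192])     # (batch, embedding_dim) from the backbone
            out = head(embeddings)
            # out["logits"] has shape (4, 1000); out["features"] is the input embedding passed through unchanged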
23 | """ 24 | super(SpeakerIdentification, self).__init__() 25 | self.classifier_type = classifier_type 26 | self.blocks = nn.LayerList() 27 | 28 | for index in range(num_blocks): 29 | self.blocks.append(DenseLayer(input_dim, inter_dim, config_str='batchnorm')) 30 | input_dim = inter_dim 31 | 32 | if self.classifier_type == 'Cosine': 33 | self.weight = paddle.create_parameter(shape=[input_dim, num_speakers * K], 34 | dtype='float32', 35 | attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), ) 36 | elif self.classifier_type == 'Linear': 37 | self.output = nn.Linear(input_dim, num_speakers) 38 | else: 39 | raise ValueError(f'不支持该输出层:{self.classifier_type}') 40 | 41 | def forward(self, features): 42 | # x: [B, dim] 43 | x = features 44 | for layer in self.blocks: 45 | x = layer(x) 46 | 47 | # normalized 48 | if self.classifier_type == 'Cosine': 49 | logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0)) 50 | else: 51 | logits = self.output(x) 52 | 53 | return {"features": features, "logits": logits} 54 | 55 | 56 | class DenseLayer(nn.Layer): 57 | def __init__(self, 58 | in_channels, 59 | out_channels, 60 | config_str='batchnorm-relu'): 61 | super(DenseLayer, self).__init__() 62 | self.linear = nn.Conv1D(in_channels, out_channels, 1) 63 | self.nonlinear = get_nonlinear(config_str, out_channels) 64 | 65 | def forward(self, x): 66 | if len(x.shape) == 2: 67 | x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1) 68 | else: 69 | x = self.linear(x) 70 | x = self.nonlinear(x) 71 | return x 72 | 73 | 74 | def get_nonlinear(config_str, channels): 75 | nonlinear = nn.Sequential() 76 | for name in config_str.split('-'): 77 | if name == 'relu': 78 | nonlinear.add_module('relu', nn.ReLU()) 79 | elif name == 'prelu': 80 | nonlinear.add_module('prelu', nn.PReLU(channels)) 81 | elif name == 'batchnorm': 82 | nonlinear.add_module('batchnorm', nn.BatchNorm1D(channels)) 83 | elif name == 'batchnorm_': 84 | nonlinear.add_module('batchnorm', nn.BatchNorm1D(channels)) 85 | else: 86 | raise ValueError('Unexpected module ({}).'.format(name)) 87 | return nonlinear 88 | -------------------------------------------------------------------------------- /ppvector/models/pooling.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | import paddle.nn as nn 3 | import paddle.nn.functional as F 4 | 5 | from ppvector.models.utils import length_to_mask, Conv1d, TDNNBlock 6 | 7 | 8 | class TemporalAveragePooling(nn.Layer): 9 | def __init__(self): 10 | """TAP 11 | Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification 12 | Link: https://arxiv.org/pdf/1903.12058.pdf 13 | """ 14 | super(TemporalAveragePooling, self).__init__() 15 | 16 | def forward(self, x, lengths=None): 17 | """Computes Temporal Average Pooling Module 18 | Args: 19 | x (torch.Tensor): Input tensor (#batch, channels, frames). 20 | Returns: 21 | torch.Tensor: Output tensor (#batch, channels) 22 | """ 23 | x = paddle.mean(x, axis=2) 24 | x = x.unsqueeze(2) 25 | return x 26 | 27 | 28 | class TemporalStatisticsPooling(nn.Layer): 29 | def __init__(self): 30 | """TSP 31 | Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition 32 | Link: http://www.danielpovey.com/files/2018_icassp_xvectors.pdf 33 | """ 34 | super(TemporalStatisticsPooling, self).__init__() 35 | 36 | def forward(self, x, lengths=None): 37 | """Computes Temporal Statistics Pooling Module 38 | Args: 39 | x (torch.Tensor): Input tensor (#batch, channels, frames). 
40 | Returns: 41 | torch.Tensor: Output tensor (#batch, channels*2) 42 | """ 43 | mean = paddle.mean(x, axis=2) 44 | var = paddle.var(x, axis=2) 45 | x = paddle.concat((mean, var), axis=1) 46 | x = x.unsqueeze(2) 47 | return x 48 | 49 | 50 | class SelfAttentivePooling(nn.Layer): 51 | """SAP""" 52 | 53 | def __init__(self, in_dim, bottleneck_dim=128): 54 | # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs. 55 | # attention dim = 128 56 | super(SelfAttentivePooling, self).__init__() 57 | self.linear1 = nn.Conv1D(in_dim, bottleneck_dim, kernel_size=1) # equals W and b in the paper 58 | self.linear2 = nn.Conv1D(bottleneck_dim, in_dim, kernel_size=1) # equals V and k in the paper 59 | 60 | def forward(self, x, lengths=None): 61 | # DON'T use ReLU here! In experiments, I find ReLU hard to converge. 62 | alpha = paddle.tanh(self.linear1(x)) 63 | alpha = paddle.nn.functional.softmax(self.linear2(alpha), axis=2) 64 | mean = paddle.sum(alpha * x, axis=2) 65 | mean = mean.unsqueeze(2) 66 | return mean 67 | 68 | 69 | class AttentiveStatisticsPooling(nn.Layer): 70 | """TSP""" 71 | 72 | def __init__(self, channels, attention_channels=128, global_context=True): 73 | super().__init__() 74 | self.eps = 1e-12 75 | self.global_context = global_context 76 | if global_context: 77 | self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) 78 | else: 79 | self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) 80 | self.tanh = nn.Tanh() 81 | self.conv = Conv1d( 82 | in_channels=attention_channels, 83 | out_channels=channels, 84 | kernel_size=1) 85 | 86 | def forward(self, x, lengths=None): 87 | C, L = x.shape[1], x.shape[2] # KP: (N, C, L) 88 | 89 | def _compute_statistics(x, m, axis=2, eps=self.eps): 90 | mean = (m * x).sum(axis) 91 | std = paddle.sqrt((m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) 92 | return mean, std 93 | 94 | if lengths is None: 95 | lengths = paddle.ones([x.shape[0]]) 96 | 97 | # Make binary mask of shape [N, 1, L] 98 | mask = length_to_mask(lengths * L, max_len=L) 99 | mask = mask.unsqueeze(1) 100 | 101 | # 通过允许自我注意观察话语的全局属性,扩展汇集层的时间上下文。 102 | if self.global_context: 103 | total = mask.sum(axis=2, keepdim=True).astype('float32') 104 | mean, std = _compute_statistics(x, mask / total) 105 | mean = mean.unsqueeze(2).tile((1, 1, L)) 106 | std = std.unsqueeze(2).tile((1, 1, L)) 107 | attn = paddle.concat([x, mean, std], axis=1) 108 | else: 109 | attn = x 110 | 111 | # Apply layers 112 | attn = self.conv(self.tanh(self.tdnn(attn))) 113 | 114 | # Filter out zero-paddings 115 | attn = paddle.where( 116 | mask.tile((1, C, 1)) == 0, 117 | paddle.ones_like(attn) * float("-inf"), attn) 118 | 119 | attn = F.softmax(attn, axis=2) 120 | mean, std = _compute_statistics(x, attn) 121 | 122 | # Append mean and std of the batch 123 | pooled_stats = paddle.concat((mean, std), axis=1) 124 | 125 | return pooled_stats 126 | 127 | 128 | class TemporalStatsPool(nn.Layer): 129 | """TSTP 130 | Temporal statistics pooling, concatenate mean and std, which is used in 131 | x-vector 132 | Comment: simple concatenation can not make full use of both statistics 133 | """ 134 | 135 | def __init__(self): 136 | super(TemporalStatsPool, self).__init__() 137 | 138 | def forward(self, x, lengths=None): 139 | # The last dimension is the temporal axis 140 | pooling_mean = x.mean(axis=-1) 141 | pooling_std = paddle.sqrt(paddle.var(x, axis=-1) + 1e-8) 142 | pooling_mean = pooling_mean.flatten(start_axis=1) 143 | pooling_std = pooling_std.flatten(start_axis=1) 144 | 145 | 
stats = paddle.concat((pooling_mean, pooling_std), 1) 146 | return stats 147 | -------------------------------------------------------------------------------- /ppvector/models/res2net.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | 6 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 7 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 8 | from ppvector.models.utils import BatchNorm1d 9 | 10 | 11 | class Bottle2neck(nn.Layer): 12 | expansion = 4 13 | 14 | def __init__(self, inplanes, planes, stride=1, downsample=None, baseWidth=26, scale=4, stype='normal'): 15 | """ Constructor 16 | Args: 17 | inplanes: input channel dimensionality 18 | planes: output channel dimensionality 19 | stride: conv stride. Replaces pooling layer. 20 | downsample: None when stride = 1 21 | baseWidth: basic width of conv3x3 22 | scale: number of scale. 23 | type: 'normal': normal set. 'stage': first block of a new stage. 24 | """ 25 | super(Bottle2neck, self).__init__() 26 | 27 | width = int(math.floor(planes * (baseWidth / 64.0))) 28 | self.conv1 = nn.Conv2D(inplanes, width * scale, kernel_size=1) 29 | self.bn1 = nn.BatchNorm2D(width * scale) 30 | 31 | if scale == 1: 32 | self.nums = 1 33 | else: 34 | self.nums = scale - 1 35 | if stype == 'stage': 36 | self.pool = nn.AvgPool2D(kernel_size=3, stride=stride, padding=1) 37 | convs = [] 38 | bns = [] 39 | for i in range(self.nums): 40 | convs.append(nn.Conv2D(width, width, kernel_size=3, stride=stride, padding=1)) 41 | bns.append(nn.BatchNorm2D(width)) 42 | self.convs = nn.LayerList(convs) 43 | self.bns = nn.LayerList(bns) 44 | 45 | self.conv3 = nn.Conv2D(width * scale, planes * self.expansion, kernel_size=1) 46 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 47 | 48 | self.relu = nn.ReLU() 49 | self.downsample = downsample 50 | self.stype = stype 51 | self.scale = scale 52 | self.width = width 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | spx = paddle.split(out, self.scale, 1) 62 | for i in range(self.nums): 63 | if i == 0 or self.stype == 'stage': 64 | sp = spx[i] 65 | else: 66 | sp = sp + spx[i] 67 | sp = self.convs[i](sp) 68 | sp = self.relu(self.bns[i](sp)) 69 | if i == 0: 70 | out = sp 71 | else: 72 | out = paddle.concat((out, sp), 1) 73 | if self.scale != 1 and self.stype == 'normal': 74 | out = paddle.concat((out, spx[self.nums]), 1) 75 | elif self.scale != 1 and self.stype == 'stage': 76 | out = paddle.concat((out, self.pool(spx[self.nums])), 1) 77 | 78 | out = self.conv3(out) 79 | out = self.bn3(out) 80 | 81 | if self.downsample is not None: 82 | residual = self.downsample(x) 83 | 84 | out += residual 85 | out = self.relu(out) 86 | 87 | return out 88 | 89 | 90 | class Res2Net(nn.Layer): 91 | 92 | def __init__(self, input_size, m_channels=32, layers=[3, 4, 6, 3], base_width=32, scale=2, embd_dim=192, 93 | pooling_type="ASP"): 94 | super(Res2Net, self).__init__() 95 | self.inplanes = m_channels 96 | self.base_width = base_width 97 | self.scale = scale 98 | self.embd_dim = embd_dim 99 | self.conv1 = nn.Conv2D(1, m_channels, kernel_size=7, stride=3, padding=1) 100 | self.bn1 = nn.BatchNorm2D(m_channels) 101 | self.relu = nn.ReLU() 102 | self.max_pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) 103 | self.layer1 = self._make_layer(Bottle2neck, m_channels, layers[0]) 104 | 
self.layer2 = self._make_layer(Bottle2neck, m_channels*2, layers[1], stride=2) 105 | self.layer3 = self._make_layer(Bottle2neck, m_channels * 4, layers[2], stride=2) 106 | self.layer4 = self._make_layer(Bottle2neck, m_channels * 8, layers[3], stride=2) 107 | 108 | if input_size < 96: 109 | cat_channels = m_channels * 8 * Bottle2neck.expansion * (input_size // self.base_width) 110 | else: 111 | cat_channels = m_channels * 8 * Bottle2neck.expansion * ( 112 | input_size // self.base_width - int(math.sqrt(input_size / 64))) 113 | if pooling_type == "ASP": 114 | self.pooling = AttentiveStatisticsPooling(cat_channels, attention_channels=128) 115 | self.bn2 = BatchNorm1d(cat_channels * 2) 116 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 117 | self.bn3 = BatchNorm1d(embd_dim) 118 | elif pooling_type == "SAP": 119 | self.pooling = SelfAttentivePooling(cat_channels, 128) 120 | self.bn2 = BatchNorm1d(cat_channels) 121 | self.linear = nn.Linear(cat_channels, embd_dim) 122 | self.bn3 = BatchNorm1d(embd_dim) 123 | elif pooling_type == "TAP": 124 | self.pooling = TemporalAveragePooling() 125 | self.bn2 = BatchNorm1d(cat_channels) 126 | self.linear = nn.Linear(cat_channels, embd_dim) 127 | self.bn3 = BatchNorm1d(embd_dim) 128 | elif pooling_type == "TSP": 129 | self.pooling = TemporalStatisticsPooling() 130 | self.bn2 = BatchNorm1d(cat_channels * 2) 131 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 132 | self.bn3 = BatchNorm1d(embd_dim) 133 | else: 134 | raise Exception(f'没有{pooling_type}池化层!') 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2D(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride), 141 | nn.BatchNorm2D(planes * block.expansion), 142 | ) 143 | 144 | layers = [block(self.inplanes, planes, stride, downsample=downsample, 145 | stype='stage', baseWidth=self.base_width, scale=self.scale)] 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes, baseWidth=self.base_width, scale=self.scale)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = x.transpose([0, 2, 1]) 154 | x = x.unsqueeze(1) 155 | x = self.conv1(x) 156 | x = self.bn1(x) 157 | x = self.relu(x) 158 | x = self.max_pool(x) 159 | 160 | x = self.layer1(x) 161 | x = self.layer2(x) 162 | x = self.layer3(x) 163 | x = self.layer4(x) 164 | 165 | x = x.reshape([x.shape[0], -1, x.shape[-1]]) 166 | x = self.pooling(x) 167 | x = self.bn2(x) 168 | x = self.linear(x) 169 | x = self.bn3(x) 170 | 171 | return x 172 | -------------------------------------------------------------------------------- /ppvector/models/resnet_se.py: -------------------------------------------------------------------------------- 1 | import paddle.nn as nn 2 | 3 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 4 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 5 | from ppvector.models.utils import BatchNorm1d 6 | 7 | 8 | class SEBottleneck(nn.Layer): 9 | expansion = 2 10 | 11 | def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): 12 | super(SEBottleneck, self).__init__() 13 | self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1) 14 | self.bn1 = nn.BatchNorm2D(planes) 15 | self.conv2 = nn.Conv2D(planes, planes, kernel_size=3, stride=stride, padding=1) 16 | self.bn2 = 
nn.BatchNorm2D(planes) 17 | self.conv3 = nn.Conv2D(planes, planes * self.expansion, kernel_size=1) 18 | self.bn3 = nn.BatchNorm2D(planes * self.expansion) 19 | self.relu = nn.ReLU() 20 | self.se = SELayer(planes * self.expansion, reduction) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv3(out) 36 | out = self.bn3(out) 37 | out = self.se(out) 38 | 39 | if self.downsample is not None: 40 | residual = self.downsample(x) 41 | 42 | out += residual 43 | out = self.relu(out) 44 | 45 | return out 46 | 47 | 48 | class SELayer(nn.Layer): 49 | def __init__(self, channel, reduction=8): 50 | super(SELayer, self).__init__() 51 | self.avg_pool = nn.AdaptiveAvgPool2D(1) 52 | self.fc = nn.Sequential( 53 | nn.Linear(channel, channel // reduction), 54 | nn.ReLU(), 55 | nn.Linear(channel // reduction, channel), 56 | nn.Sigmoid() 57 | ) 58 | 59 | def forward(self, x): 60 | b, c, _, _ = x.shape 61 | y = self.avg_pool(x).reshape([b, c]) 62 | y = self.fc(y).reshape([b, c, 1, 1]) 63 | return x * y 64 | 65 | 66 | class ResNetSE(nn.Layer): 67 | def __init__(self, input_size, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], embd_dim=192, 68 | pooling_type="ASP"): 69 | super(ResNetSE, self).__init__() 70 | self.inplanes = num_filters[0] 71 | self.embd_dim = embd_dim 72 | self.conv1 = nn.Conv2D(1, num_filters[0], kernel_size=3, stride=(1, 1), padding=1) 73 | self.bn1 = nn.BatchNorm2D(num_filters[0]) 74 | self.relu = nn.ReLU() 75 | 76 | self.layer1 = self._make_layer(SEBottleneck, num_filters[0], layers[0]) 77 | self.layer2 = self._make_layer(SEBottleneck, num_filters[1], layers[1], stride=(2, 2)) 78 | self.layer3 = self._make_layer(SEBottleneck, num_filters[2], layers[2], stride=(2, 2)) 79 | self.layer4 = self._make_layer(SEBottleneck, num_filters[3], layers[3], stride=(2, 2)) 80 | 81 | cat_channels = num_filters[3] * SEBottleneck.expansion * (input_size // 8) 82 | if pooling_type == "ASP": 83 | self.pooling = AttentiveStatisticsPooling(cat_channels, attention_channels=128) 84 | self.bn2 = BatchNorm1d(cat_channels * 2) 85 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 86 | self.bn3 = BatchNorm1d(embd_dim) 87 | elif pooling_type == "SAP": 88 | self.pooling = SelfAttentivePooling(cat_channels, 128) 89 | self.bn2 = BatchNorm1d(cat_channels) 90 | self.linear = nn.Linear(cat_channels, embd_dim) 91 | self.bn3 = BatchNorm1d(embd_dim) 92 | elif pooling_type == "TAP": 93 | self.pooling = TemporalAveragePooling() 94 | self.bn2 = BatchNorm1d(cat_channels) 95 | self.linear = nn.Linear(cat_channels, embd_dim) 96 | self.bn3 = BatchNorm1d(embd_dim) 97 | elif pooling_type == "TSP": 98 | self.pooling = TemporalStatisticsPooling() 99 | self.bn2 = BatchNorm1d(cat_channels * 2) 100 | self.linear = nn.Linear(cat_channels * 2, embd_dim) 101 | self.bn3 = BatchNorm1d(embd_dim) 102 | else: 103 | raise Exception(f'没有{pooling_type}池化层!') 104 | 105 | def _make_layer(self, block, planes, blocks, stride=1): 106 | downsample = None 107 | if stride != 1 or self.inplanes != planes * block.expansion: 108 | downsample = nn.Sequential( 109 | nn.Conv2D(self.inplanes, planes * block.expansion, 110 | kernel_size=1, stride=stride), 111 | nn.BatchNorm2D(planes * block.expansion), 112 | ) 113 | 114 | layers = [block(self.inplanes, planes, stride, downsample)] 115 | self.inplanes = planes * 
block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = x.transpose([0, 2, 1]) 123 | x = x.unsqueeze(1) 124 | x = self.conv1(x) 125 | x = self.bn1(x) 126 | x = self.relu(x) 127 | 128 | x = self.layer1(x) 129 | x = self.layer2(x) 130 | x = self.layer3(x) 131 | x = self.layer4(x) 132 | 133 | x = x.reshape([x.shape[0], -1, x.shape[-1]]) 134 | 135 | x = self.pooling(x) 136 | x = self.bn2(x) 137 | x = self.linear(x) 138 | x = self.bn3(x) 139 | return x 140 | -------------------------------------------------------------------------------- /ppvector/models/tdnn.py: -------------------------------------------------------------------------------- 1 | import paddle.nn as nn 2 | import paddle.nn.functional as F 3 | 4 | from ppvector.models.pooling import AttentiveStatisticsPooling, TemporalAveragePooling 5 | from ppvector.models.pooling import SelfAttentivePooling, TemporalStatisticsPooling 6 | from ppvector.models.utils import BatchNorm1d 7 | 8 | 9 | class TDNN(nn.Layer): 10 | def __init__(self, input_size, channels=512, embd_dim=192, pooling_type="ASP"): 11 | super(TDNN, self).__init__() 12 | self.embd_dim = embd_dim 13 | self.td_layer1 = nn.Conv1D(in_channels=input_size, out_channels=channels, dilation=1, kernel_size=5, stride=1) 14 | self.bn1 = nn.BatchNorm1D(channels) 15 | self.td_layer2 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=2, kernel_size=3, stride=1) 16 | self.bn2 = nn.BatchNorm1D(channels) 17 | self.td_layer3 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=3, kernel_size=3, stride=1) 18 | self.bn3 = nn.BatchNorm1D(channels) 19 | self.td_layer4 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1) 20 | self.bn4 = nn.BatchNorm1D(channels) 21 | self.td_layer5 = nn.Conv1D(in_channels=channels, out_channels=channels, dilation=1, kernel_size=1, stride=1) 22 | 23 | if pooling_type == "ASP": 24 | self.pooling = AttentiveStatisticsPooling(channels, attention_channels=128) 25 | self.bn5 = BatchNorm1d(channels * 2) 26 | self.linear = nn.Linear(channels * 2, embd_dim) 27 | self.bn6 = BatchNorm1d(embd_dim) 28 | elif pooling_type == "SAP": 29 | self.pooling = SelfAttentivePooling(channels, 128) 30 | self.bn5 = BatchNorm1d(channels) 31 | self.linear = nn.Linear(channels, embd_dim) 32 | self.bn6 = BatchNorm1d(embd_dim) 33 | elif pooling_type == "TAP": 34 | self.pooling = TemporalAveragePooling() 35 | self.bn5 = BatchNorm1d(channels) 36 | self.linear = nn.Linear(channels, embd_dim) 37 | self.bn6 = BatchNorm1d(embd_dim) 38 | elif pooling_type == "TSP": 39 | self.pooling = TemporalStatisticsPooling() 40 | self.bn5 = BatchNorm1d(channels * 2) 41 | self.linear = nn.Linear(channels * 2, embd_dim) 42 | self.bn6 = BatchNorm1d(embd_dim) 43 | else: 44 | raise Exception(f'没有{pooling_type}池化层!') 45 | 46 | def forward(self, x): 47 | """ 48 | Compute embeddings. 49 | 50 | Args: 51 | x (paddle.Tensor): Input data with shape (N, time, freq). 
52 | 53 | Returns: 54 | paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) 55 | """ 56 | x = x.transpose([0, 2, 1]) 57 | x = F.relu(self.td_layer1(x)) 58 | x = self.bn1(x) 59 | x = F.relu(self.td_layer2(x)) 60 | x = self.bn2(x) 61 | x = F.relu(self.td_layer3(x)) 62 | x = self.bn3(x) 63 | x = F.relu(self.td_layer4(x)) 64 | x = self.bn4(x) 65 | x = F.relu(self.td_layer5(x)) 66 | out = self.bn5(self.pooling(x)) 67 | out = self.bn6(self.linear(out)) 68 | return out 69 | -------------------------------------------------------------------------------- /ppvector/models/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | import paddle.nn as nn 5 | import paddle.nn.functional as F 6 | 7 | 8 | def length_to_mask(length, max_len=None, dtype=None): 9 | assert len(length.shape) == 1 10 | 11 | if max_len is None: 12 | max_len = length.max().astype('int').item() # using arange to generate mask 13 | mask = paddle.arange(max_len, dtype=length.dtype).expand((len(length), max_len)) < length.unsqueeze(1) 14 | 15 | if dtype is None: 16 | dtype = length.dtype 17 | 18 | mask = paddle.to_tensor(mask, dtype=dtype) 19 | return mask 20 | 21 | 22 | class Conv1d(nn.Layer): 23 | def __init__( 24 | self, 25 | in_channels, 26 | out_channels, 27 | kernel_size, 28 | stride=1, 29 | padding="same", 30 | dilation=1, 31 | groups=1, 32 | bias=True, 33 | padding_mode="reflect", ): 34 | """_summary_ 35 | 36 | Args: 37 | in_channels (int): intput channel or input data dimensions 38 | out_channels (int): output channel or output data dimensions 39 | kernel_size (int): kernel size of 1-d convolution 40 | stride (int, optional): strid in 1-d convolution . Defaults to 1. 41 | padding (str, optional): padding value. Defaults to "same". 42 | dilation (int, optional): dilation in 1-d convolution. Defaults to 1. 43 | groups (int, optional): groups in 1-d convolution. Defaults to 1. 44 | bias (bool, optional): bias in 1-d convolution . Defaults to True. 45 | padding_mode (str, optional): padding mode. Defaults to "reflect". 46 | """ 47 | super().__init__() 48 | 49 | self.kernel_size = kernel_size 50 | self.stride = stride 51 | self.dilation = dilation 52 | self.padding = padding 53 | self.padding_mode = padding_mode 54 | 55 | self.conv = nn.Conv1D( 56 | in_channels, 57 | out_channels, 58 | self.kernel_size, 59 | stride=self.stride, 60 | padding=0, 61 | dilation=self.dilation, 62 | groups=groups, 63 | bias_attr=bias, ) 64 | 65 | def forward(self, x): 66 | if self.padding == "same": 67 | x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) 68 | else: 69 | raise ValueError(f"Padding must be 'same'. 
Got {self.padding}") 70 | 71 | return self.conv(x) 72 | 73 | def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): 74 | L_in = x.shape[-1] # Detecting input shape 75 | padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding 76 | x = F.pad(x, padding, mode=self.padding_mode, data_format="NCL") # Applying padding 77 | return x 78 | 79 | def _get_padding_elem(self, 80 | L_in: int, 81 | stride: int, 82 | kernel_size: int, 83 | dilation: int): 84 | if stride > 1: 85 | n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) 86 | L_out = stride * (n_steps - 1) + kernel_size * dilation 87 | padding = [kernel_size // 2, kernel_size // 2] 88 | else: 89 | L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 90 | 91 | padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] 92 | 93 | return padding 94 | 95 | 96 | class BatchNorm1d(nn.Layer): 97 | def __init__( 98 | self, 99 | input_size, 100 | eps=1e-05, 101 | momentum=0.9, 102 | weight_attr=None, 103 | bias_attr=None, 104 | data_format='NCL', 105 | use_global_stats=None, ): 106 | super().__init__() 107 | 108 | self.norm = nn.BatchNorm1D( 109 | input_size, 110 | epsilon=eps, 111 | momentum=momentum, 112 | weight_attr=weight_attr, 113 | bias_attr=bias_attr, 114 | data_format=data_format, 115 | use_global_stats=use_global_stats, ) 116 | 117 | def forward(self, x): 118 | x_n = self.norm(x) 119 | return x_n 120 | 121 | 122 | class TDNNBlock(nn.Layer): 123 | def __init__( 124 | self, 125 | in_channels, 126 | out_channels, 127 | kernel_size, 128 | dilation, 129 | activation=nn.ReLU, ): 130 | """Implementation of TDNN network 131 | 132 | Args: 133 | in_channels (int): input channels or input embedding dimensions 134 | out_channels (int): output channels or output embedding dimensions 135 | kernel_size (int): the kernel size of the TDNN network block 136 | dilation (int): the dilation of the TDNN network block 137 | activation (paddle class, optional): the activation layers. Defaults to nn.ReLU. 
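        Example (an illustrative sketch, not part of the original source; the channel
        counts and frame count are arbitrary placeholder values)::

            import paddle
            from ppvector.models.utils import TDNNBlock

            block = TDNNBlock(in_channels=80, out_channels=512, kernel_size=5, dilation=1)
            x = paddle.randn([2, 80, 200])   # (batch, channels, frames)
            y = block(x)                     # (2, 512, 200): "same" padding keeps the frame count unchanged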
138 | """ 139 | super().__init__() 140 | self.conv = Conv1d(in_channels=in_channels, 141 | out_channels=out_channels, 142 | kernel_size=kernel_size, 143 | dilation=dilation, ) 144 | self.activation = activation() 145 | self.norm = BatchNorm1d(input_size=out_channels) 146 | 147 | def forward(self, x): 148 | return self.norm(self.activation(self.conv(x))) 149 | -------------------------------------------------------------------------------- /ppvector/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | from loguru import logger 4 | from paddle.optimizer import * 5 | from .scheduler import cosine_decay_with_warmup as WarmupCosineSchedulerLR 6 | from paddle.optimizer.lr import * 7 | 8 | 9 | __all__ = ['build_optimizer', 'build_lr_scheduler'] 10 | 11 | 12 | def build_optimizer(parameters, learning_rate, configs): 13 | use_optimizer = configs.optimizer_conf.get('optimizer', 'Adam') 14 | optimizer_args = configs.optimizer_conf.get('optimizer_args', {}) 15 | optim = importlib.import_module(__name__) 16 | optimizer = getattr(optim, use_optimizer)(parameters=parameters, learning_rate=learning_rate, **optimizer_args) 17 | logger.info(f'成功创建优化方法:{use_optimizer},参数为:{optimizer_args}') 18 | return optimizer 19 | 20 | 21 | def build_lr_scheduler(step_per_epoch, configs): 22 | use_scheduler = configs.optimizer_conf.get('scheduler', 'WarmupCosineSchedulerLR') 23 | scheduler_args = configs.optimizer_conf.get('scheduler_args', {}) 24 | if configs.optimizer_conf.scheduler == 'CosineAnnealingDecay' and 'T_max' not in scheduler_args: 25 | scheduler_args.T_max = int(configs.train_conf.max_epoch * 1.2) * step_per_epoch 26 | if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'fix_epoch' not in scheduler_args: 27 | scheduler_args.fix_epoch = configs.train_conf.max_epoch 28 | if configs.optimizer_conf.scheduler == 'WarmupCosineSchedulerLR' and 'step_per_epoch' not in scheduler_args: 29 | scheduler_args.step_per_epoch = step_per_epoch 30 | optim = importlib.import_module(__name__) 31 | scheduler = getattr(optim, use_scheduler)(**scheduler_args) 32 | logger.info(f'成功创建学习率衰减:{use_scheduler},参数为:{scheduler_args}') 33 | return scheduler 34 | -------------------------------------------------------------------------------- /ppvector/optimizer/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import paddle 4 | 5 | 6 | def cosine_decay_with_warmup(learning_rate, step_per_epoch, fix_epoch=1000, warmup_epoch=5, min_lr=0.0): 7 | """ 8 | :param learning_rate: 学习率 9 | :param step_per_epoch: 每个epoch的步数 10 | :param fix_epoch: 最大epoch数 11 | :param warmup_epoch: 预热步数 12 | :param min_lr: 最小学习率 13 | :return: 14 | """ 15 | # 预热步数 16 | boundary = [] 17 | value = [] 18 | warmup_steps = warmup_epoch * step_per_epoch 19 | # 初始化预热步数 20 | for i in range(warmup_steps + 1): 21 | if warmup_steps > 0: 22 | alpha = i / warmup_steps 23 | lr = learning_rate * alpha 24 | value.append(lr) 25 | if i > 0: 26 | boundary.append(i) 27 | 28 | max_iters = fix_epoch * int(step_per_epoch) 29 | warmup_iters = len(boundary) 30 | # 初始化最大步数 31 | for i in range(int(boundary[-1]), max_iters): 32 | boundary.append(i) 33 | # 如果当前步数小于最大步数,则将当前步数设置为最小学习率 34 | if i < max_iters: 35 | decayed_lr = min_lr + (learning_rate - min_lr) * 0.5 * (math.cos( 36 | (i - warmup_iters) * math.pi / (max_iters - warmup_iters)) + 1) 37 | value.append(decayed_lr) 38 | else: 39 | value.append(min_lr) 40 | return 
paddle.optimizer.lr.PiecewiseDecay(boundary, value) 41 | 42 | 43 | class MarginScheduler: 44 | def __init__( 45 | self, 46 | criterion, 47 | increase_start_epoch, 48 | fix_epoch, 49 | step_per_epoch, 50 | initial_margin=0.0, 51 | final_margin=0.3, 52 | increase_type='exp', 53 | ): 54 | assert hasattr(criterion, 'update'), "Loss function not has 'update()' attributes." 55 | self.criterion = criterion 56 | self.increase_start_step = increase_start_epoch * step_per_epoch 57 | self.fix_step = fix_epoch * step_per_epoch 58 | self.initial_margin = initial_margin 59 | self.final_margin = final_margin 60 | self.increase_type = increase_type 61 | self.margin = initial_margin 62 | 63 | self.current_step = 0 64 | self.increase_step = self.fix_step - self.increase_start_step 65 | 66 | self.init_margin() 67 | 68 | def init_margin(self): 69 | self.criterion.update(margin=self.initial_margin) 70 | 71 | def step(self, current_step=None): 72 | if current_step is not None: 73 | self.current_step = current_step 74 | 75 | self.margin = self.iter_margin() 76 | self.criterion.update(margin=self.margin) 77 | self.current_step += 1 78 | 79 | def iter_margin(self): 80 | if self.current_step < self.increase_start_step: 81 | return self.initial_margin 82 | 83 | if self.current_step >= self.fix_step: 84 | return self.final_margin 85 | 86 | a = 1.0 87 | b = 1e-3 88 | 89 | current_step = self.current_step - self.increase_start_step 90 | if self.increase_type == 'exp': 91 | # exponentially increase the margin 92 | ratio = 1.0 - math.exp( 93 | (current_step / self.increase_step) * 94 | math.log(b / (a + 1e-6))) * a 95 | else: 96 | # linearly increase the margin 97 | ratio = 1.0 * current_step / self.increase_step 98 | return self.initial_margin + (self.final_margin - 99 | self.initial_margin) * ratio 100 | 101 | def get_margin(self): 102 | return self.margin 103 | -------------------------------------------------------------------------------- /ppvector/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle/9802e856e51bb54b648f242b98c45666e8590e0e/ppvector/utils/__init__.py -------------------------------------------------------------------------------- /ppvector/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | import paddle 6 | 7 | from loguru import logger 8 | from ppvector import __version__ 9 | 10 | 11 | def load_pretrained(model, pretrained_model): 12 | """加载预训练模型 13 | 14 | :param model: 使用的模型 15 | :param pretrained_model: 预训练模型路径 16 | """ 17 | # 加载预训练模型 18 | if pretrained_model is None: return model 19 | if os.path.isdir(pretrained_model): 20 | pretrained_model = os.path.join(pretrained_model, 'model.pdparams') 21 | assert os.path.exists(pretrained_model), f"{pretrained_model} 模型不存在!" 22 | model_dict = model.state_dict() 23 | model_state_dict = paddle.load(pretrained_model) 24 | # 过滤不存在的参数 25 | for name, weight in model_dict.items(): 26 | if name in model_state_dict.keys(): 27 | if list(weight.shape) != list(model_state_dict[name].shape): 28 | logger.warning('{} not used, shape {} unmatched with {} in model.'. 
29 | format(name, list(model_state_dict[name].shape), list(weight.shape))) 30 | model_state_dict.pop(name, None) 31 | else: 32 | logger.warning('Lack weight: {}'.format(name)) 33 | # 加载权重 34 | missing_keys, unexpected_keys = model.set_state_dict(model_state_dict) 35 | if len(unexpected_keys) > 0: 36 | logger.warning('Unexpected key(s) in state_dict: {}. ' 37 | .format(', '.join('"{}"'.format(k) for k in unexpected_keys))) 38 | if len(missing_keys) > 0: 39 | logger.warning('Missing key(s) in state_dict: {}. ' 40 | .format(', '.join('"{}"'.format(k) for k in missing_keys))) 41 | logger.info('成功加载预训练模型:{}'.format(pretrained_model)) 42 | return model 43 | 44 | 45 | def load_checkpoint(configs, model, optimizer, amp_scaler, scheduler,margin_scheduler, 46 | step_epoch, save_model_path, resume_model): 47 | """加载模型 48 | 49 | :param configs: 配置信息 50 | :param model: 使用的模型 51 | :param optimizer: 使用的优化方法 52 | :param amp_scaler: 使用的自动混合精度 53 | :param scheduler: 使用的学习率调整策略 54 | :param margin_scheduler: margin调整策略 55 | :param step_epoch: 每个epoch的step数量 56 | :param save_model_path: 模型保存路径 57 | :param resume_model: 恢复训练的模型路径 58 | """ 59 | last_epoch1 = 0 60 | best_eer1 = 1 61 | 62 | def load_model(model_path): 63 | assert os.path.exists(os.path.join(model_path, 'model.pdparams')), "模型参数文件不存在!" 64 | assert os.path.exists(os.path.join(model_path, 'optimizer.pdopt')), "优化方法参数文件不存在!" 65 | state_dict = paddle.load(os.path.join(model_path, 'model.pdparams')) 66 | missing_keys, unexpected_keys = model.set_state_dict(state_dict) 67 | assert len(missing_keys) == len(unexpected_keys) == 0, "模型参数加载失败,参数权重不匹配,请可以考虑当做预训练模型!" 68 | optimizer.set_state_dict(paddle.load(os.path.join(model_path, 'optimizer.pdopt'))) 69 | # 自动混合精度参数 70 | if amp_scaler is not None and os.path.exists(os.path.join(model_path, 'scaler.pdparams')): 71 | amp_scaler.set_state_dict(paddle.load(os.path.join(model_path, 'scaler.pdparams'))) 72 | with open(os.path.join(model_path, 'model.state'), 'r', encoding='utf-8') as f: 73 | json_data = json.load(f) 74 | last_epoch = json_data['last_epoch'] 75 | best_eer = 1 76 | if 'eer' in json_data.keys(): 77 | best_eer = json_data['eer'] 78 | logger.info('成功恢复模型参数和优化方法参数:{}'.format(model_path)) 79 | optimizer.step() 80 | [scheduler.step() for _ in range(last_epoch * step_epoch)] 81 | if margin_scheduler is not None: 82 | margin_scheduler.step(current_step=last_epoch * step_epoch) 83 | return last_epoch, best_eer 84 | 85 | # 获取最后一个保存的模型 86 | save_feature_method = configs.preprocess_conf.feature_method 87 | last_model_dir = os.path.join(save_model_path, 88 | f'{configs.model_conf.model}_{save_feature_method}', 89 | 'last_model') 90 | if resume_model is not None or (os.path.exists(os.path.join(last_model_dir, 'model.pdparams')) 91 | and os.path.exists(os.path.join(last_model_dir, 'optimizer.pdopt'))): 92 | if resume_model is not None: 93 | last_epoch1, best_eer1 = load_model(resume_model) 94 | else: 95 | try: 96 | # 自动获取最新保存的模型 97 | last_epoch1, best_eer1 = load_model(last_model_dir) 98 | except Exception as e: 99 | logger.warning(f'尝试自动恢复最新模型失败,错误信息:{e}') 100 | return model, optimizer, amp_scaler, scheduler, margin_scheduler, last_epoch1, best_eer1 101 | 102 | 103 | # 保存模型 104 | def save_checkpoint(configs, model, optimizer, amp_scaler, margin_scheduler, save_model_path, epoch_id, 105 | eer=None, min_dcf=None, threshold=None, best_model=False): 106 | """保存模型 107 | 108 | :param configs: 配置信息 109 | :param model: 使用的模型 110 | :param optimizer: 使用的优化方法 111 | :param amp_scaler: 使用的自动混合精度 112 | :param margin_scheduler: 
margin调整策略 113 | :param save_model_path: 模型保存路径 114 | :param epoch_id: 当前epoch 115 | :param eer: 当前eer 116 | :param min_dcf: 当前min_dcf 117 | :param threshold: 当前threshold 118 | :param best_model: 是否为最佳模型 119 | """ 120 | # 保存模型的路径 121 | save_feature_method = configs.preprocess_conf.feature_method 122 | if best_model: 123 | model_path = os.path.join(save_model_path, 124 | f'{configs.model_conf.model}_{save_feature_method}', 'best_model') 125 | else: 126 | model_path = os.path.join(save_model_path, 127 | f'{configs.model_conf.model}_{save_feature_method}', 'epoch_{}'.format(epoch_id)) 128 | if os.path.exists(model_path): 129 | shutil.rmtree(model_path) 130 | os.makedirs(model_path, exist_ok=True) 131 | # 保存模型参数 132 | paddle.save(optimizer.state_dict(), os.path.join(model_path, 'optimizer.pdopt')) 133 | paddle.save(model.state_dict(), os.path.join(model_path, 'model.pdparams')) 134 | # 自动混合精度参数 135 | if amp_scaler is not None: 136 | paddle.save(amp_scaler.state_dict(), os.path.join(model_path, 'scaler.pdparams')) 137 | with open(os.path.join(model_path, 'model.state'), 'w', encoding='utf-8') as f: 138 | use_loss = configs.loss_conf.get('use_loss', 'AAMLoss') 139 | data = {"last_epoch": epoch_id, "version": __version__, "model_conf.model": configs.model_conf.model, 140 | "feature_method": save_feature_method, "loss": use_loss} 141 | if eer is not None: 142 | data['threshold'] = threshold 143 | data['eer'] = eer 144 | data['min_dcf'] = min_dcf 145 | if margin_scheduler: 146 | data['margin'] = margin_scheduler.get_margin() 147 | f.write(json.dumps(data, indent=4, ensure_ascii=False)) 148 | if not best_model: 149 | last_model_path = os.path.join(save_model_path, 150 | f'{configs.model_conf.model}_{save_feature_method}', 'last_model') 151 | shutil.rmtree(last_model_path, ignore_errors=True) 152 | shutil.copytree(model_path, last_model_path) 153 | # 删除旧的模型 154 | old_model_path = os.path.join(save_model_path, 155 | f'{configs.model_conf.model}_{save_feature_method}', 156 | 'epoch_{}'.format(epoch_id - 3)) 157 | if os.path.exists(old_model_path): 158 | shutil.rmtree(old_model_path) 159 | logger.info('已保存模型:{}'.format(model_path)) 160 | -------------------------------------------------------------------------------- /ppvector/utils/record.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import soundcard 5 | import soundfile 6 | 7 | 8 | class RecordAudio: 9 | def __init__(self, channels=1, sample_rate=16000): 10 | # 录音参数 11 | self.channels = channels 12 | self.sample_rate = sample_rate 13 | 14 | # 获取麦克风 15 | self.default_mic = soundcard.default_microphone() 16 | 17 | def record(self, record_seconds=3, save_path=None): 18 | """录音 19 | 20 | :param record_seconds: 录音时间,默认3秒 21 | :param save_path: 录音保存的路径,后缀名为wav 22 | :return: 音频的numpy数据 23 | """ 24 | print("开始录音......") 25 | num_frames = int(record_seconds * self.sample_rate) 26 | start_time = time.time() 27 | data = self.default_mic.record(samplerate=self.sample_rate, numframes=num_frames, channels=self.channels) 28 | if int(time.time() - start_time) < record_seconds: 29 | raise Exception('录音错误,请检查录音设备,或者卸载soundcard,使用命令重新安装:' 30 | 'pip install git+https://github.com/bastibe/SoundCard.git') 31 | audio_data = data.squeeze() 32 | print("录音已结束!") 33 | if save_path is not None: 34 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 35 | soundfile.write(save_path, data=data, samplerate=self.sample_rate) 36 | return audio_data 37 |
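A minimal usage sketch for the `RecordAudio` helper above (not part of the repository source; it assumes `ppvector` and its dependencies are installed, a default microphone is available, and the save path is only an example):

```python
from ppvector.utils.record import RecordAudio

# Same defaults as RecordAudio above: mono audio at 16 kHz
recorder = RecordAudio(channels=1, sample_rate=16000)
# Record 3 seconds and optionally save it as a wav file (path is illustrative)
audio = recorder.record(record_seconds=3, save_path='dataset/record.wav')
print(audio.shape)  # roughly (record_seconds * sample_rate,)
```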
-------------------------------------------------------------------------------- /ppvector/utils/utils.py: -------------------------------------------------------------------------------- 1 | import distutils.util 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | from loguru import logger 6 | 7 | 8 | def print_arguments(args=None, configs=None, title=None): 9 | if args: 10 | logger.info("----------- 额外配置参数 -----------") 11 | for arg, value in sorted(vars(args).items()): 12 | logger.info("%s: %s" % (arg, value)) 13 | logger.info("------------------------------------------------") 14 | if configs: 15 | title = title if title else "配置文件参数" 16 | logger.info(f"----------- {title} -----------") 17 | for arg, value in sorted(configs.items()): 18 | if isinstance(value, dict): 19 | logger.info(f"{arg}:") 20 | for a, v in sorted(value.items()): 21 | if isinstance(v, dict): 22 | logger.info(f"\t{a}:") 23 | for a1, v1 in sorted(v.items()): 24 | logger.info("\t\t%s: %s" % (a1, v1)) 25 | else: 26 | logger.info("\t%s: %s" % (a, v)) 27 | else: 28 | logger.info("%s: %s" % (arg, value)) 29 | logger.info("------------------------------------------------") 30 | 31 | 32 | def add_arguments(argname, type, default, help, argparser, **kwargs): 33 | type = distutils.util.strtobool if type == bool else type 34 | argparser.add_argument("--" + argname, 35 | default=default, 36 | type=type, 37 | help=help + ' 默认: %(default)s.', 38 | **kwargs) 39 | 40 | 41 | class Dict(dict): 42 | __setattr__ = dict.__setitem__ 43 | __getattr__ = dict.__getitem__ 44 | 45 | 46 | def dict_to_object(dict_obj): 47 | if not isinstance(dict_obj, dict): 48 | return dict_obj 49 | inst = Dict() 50 | for k, v in dict_obj.items(): 51 | inst[k] = dict_to_object(v) 52 | return inst 53 | 54 | 55 | # 根据对角余弦值计算准确率和最优的阈值 56 | def cal_accuracy_threshold(y_score, y_true): 57 | y_score = np.asarray(y_score) 58 | y_true = np.asarray(y_true) 59 | best_accuracy = 0 60 | best_threshold = 0 61 | for i in tqdm(range(0, 100)): 62 | threshold = i * 0.01 63 | y_test = (y_score >= threshold) 64 | acc = np.mean((y_test == y_true).astype(int)) 65 | if acc > best_accuracy: 66 | best_accuracy = acc 67 | best_threshold = threshold 68 | 69 | return best_accuracy, best_threshold 70 | 71 | 72 | # 根据对角余弦值计算准确率 73 | def cal_accuracy(y_score, y_true, threshold=0.5): 74 | y_score = np.asarray(y_score) 75 | y_true = np.asarray(y_true) 76 | y_test = (y_score >= threshold) 77 | accuracy = np.mean((y_test == y_true).astype(int)) 78 | return accuracy 79 | 80 | 81 | # 计算对角余弦值 82 | def cosin_metric(x1, x2): 83 | return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)) 84 | 85 | 86 | # 根据a的类型,将b转换为相应的类型 87 | def convert_string_based_on_type(a, b): 88 | if isinstance(a, bool):  # 注意:bool是int的子类,必须先于int判断 89 | b = b.lower() == 'true' 90 | elif isinstance(a, int): 91 | try: 92 | b = int(b) 93 | except ValueError: 94 | logger.error("无法将字符串转换为整数") 95 | elif isinstance(a, float): 96 | try: 97 | b = float(b) 98 | except ValueError: 99 | logger.error("无法将字符串转换为浮点数") 100 | elif isinstance(a, str): 101 | return b 102 | else: 103 | try: 104 | b = eval(b) 105 | except Exception as e: 106 | logger.exception("无法将字符串转换为其他类型,将忽略该参数类型转换") 107 | return b 108 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.2 2 | tqdm>=4.59.0 3 | visualdl==2.5.3 4 | resampy>=0.2.2 5 | soundfile>=0.12.1 6 | soundcard>=0.4.2 7 | pyyaml>=5.4.1 8 | paddleaudio>=1.0.1 9 |
scikit-learn>=1.5.2 10 | pydub>=0.25.1 11 | loguru>=0.7.2 12 | yeaudio>=0.0.7 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from setuptools import setup, find_packages 4 | 5 | import ppvector 6 | 7 | VERSION = ppvector.__version__ 8 | 9 | 10 | # 复制配置文件到项目目录下 11 | shutil.rmtree('./ppvector/configs/', ignore_errors=True) 12 | shutil.copytree('./configs/', './ppvector/configs/') 13 | 14 | 15 | def readme(): 16 | with open('README.md', encoding='utf-8') as f: 17 | content = f.read() 18 | return content 19 | 20 | 21 | def parse_requirements(): 22 | with open('./requirements.txt', encoding="utf-8") as f: 23 | requirements = f.readlines() 24 | return requirements 25 | 26 | 27 | if __name__ == "__main__": 28 | setup( 29 | name='ppvector', 30 | packages=find_packages(), 31 | package_data={'': ['configs/*']}, 32 | author='yeyupiaoling', 33 | version=VERSION, 34 | install_requires=parse_requirements(), 35 | description='Voice Print Recognition toolkit on PaddlePaddle', 36 | long_description=readme(), 37 | long_description_content_type='text/markdown', 38 | url='https://github.com/yeyupiaoling/VoiceprintRecognition_PaddlePaddle', 39 | download_url='https://github.com/yeyupiaoling/VoiceprintRecognition_PaddlePaddle.git', 40 | keywords=['Voice', 'paddle'], 41 | classifiers=[ 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: Apache Software License', 44 | 'Operating System :: OS Independent', 45 | 'Natural Language :: Chinese (Simplified)', 46 | 'Programming Language :: Python :: 3', 47 | 'Programming Language :: Python :: 3.5', 48 | 'Programming Language :: Python :: 3.6', 49 | 'Programming Language :: Python :: 3.7', 50 | 'Programming Language :: Python :: 3.8', 51 | 'Programming Language :: Python :: 3.9', 'Topic :: Utilities' 52 | ], 53 | license='Apache License 2.0', 54 | ext_modules=[]) 55 | shutil.rmtree('./ppvector/configs/', ignore_errors=True) 56 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/README.md: -------------------------------------------------------------------------------- 1 | # 说话人日志效果评估 2 | 3 | 1. 安装依赖库 4 | 5 | ```shell 6 | pip install pyannote.audio[separation]==3.3.0 7 | ``` 8 | 9 | 2. 下载[AIShell-4](https://us.openslr.org/resources/111)的测试数据并解压到当前目录的`dataset`下。 10 | 3. 执行`create_aishell4_test_rttm.py`,创建数据类别和rttm文件。 11 | 4. 执行`infer_data.py`预测数据。 12 | 5. 
执行`compute_metrics.py`获取评估结果。 13 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/compute_metrics.py: -------------------------------------------------------------------------------- 1 | from pyannote.database.util import load_rttm 2 | from pyannote.metrics.diarization import DiarizationErrorRate 3 | 4 | metric = DiarizationErrorRate() 5 | 6 | 7 | false_alarms, confusions, missed_detections, error_rates = [], [], [], [] 8 | references = load_rttm('dataset/references.rttm') 9 | hypotheses = load_rttm('dataset/hypotheses.rttm') 10 | for uri, reference in references.items(): 11 | hypothesis = hypotheses[uri] 12 | result = metric(reference, hypothesis, detailed=True) 13 | print(uri, ":", result) 14 | false_alarms.append(result["false alarm"]) 15 | confusions.append(result["confusion"]) 16 | missed_detections.append(result["missed detection"]) 17 | error_rates.append(result["diarization error rate"]) 18 | print("False alarm:", round(sum(false_alarms) / len(false_alarms), 5)) 19 | print("Confusion:", round(sum(confusions) / len(confusions), 5)) 20 | print("Missed detection:", round(sum(missed_detections) / len(missed_detections), 5)) 21 | print("Diarization error rate:", round(sum(error_rates) / len(error_rates), 5)) 22 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/create_aishell4_test_rttm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import soundfile 4 | from pyannote.database.util import load_rttm 5 | from tqdm import tqdm 6 | from yeaudio.audio import AudioSegment 7 | 8 | 9 | def create_rttm(annotation_dir, output_path): 10 | with open(output_path, 'w', encoding='utf-8') as f_w: 11 | for file in os.listdir(annotation_dir): 12 | if not file.endswith(".rttm"): continue 13 | with open(os.path.join(annotation_dir, file), 'r', encoding='utf-8') as f_r: 14 | lines = f_r.readlines() 15 | for line in lines: 16 | f_w.write(line) 17 | 18 | 19 | def create_audio_path_list(audio_dir, list_path): 20 | with open(list_path, 'w', encoding='utf-8') as f_w: 21 | for file in os.listdir(audio_dir): 22 | if not file.endswith(".flac"): continue 23 | file_path = os.path.join(audio_dir, file).replace('\\', '/') 24 | name = file.split('.')[0] 25 | f_w.write(f'{file_path}\t{name}\n') 26 | 27 | 28 | def create_audio_db(data_list_path, rttm_path, output_dir): 29 | annotations = load_rttm(rttm_path) 30 | with open(data_list_path, 'r') as f_r: 31 | for line in tqdm(f_r.readlines(), desc='裁剪说话人音频'): 32 | audio_path, name = line.strip().split('\t') 33 | audio_segment = AudioSegment.from_file(audio_path) 34 | sample_rate = audio_segment.sample_rate 35 | audio = audio_segment.samples 36 | annotation = annotations[name] 37 | for segment, track, label in annotation.itertracks(yield_label=True): 38 | if segment.end - segment.start < 0.3: continue 39 | save_path = os.path.join(output_dir, name, label, f'{track}.wav') 40 | os.makedirs(os.path.dirname(save_path), exist_ok=True) 41 | audio_sub = audio[int(segment.start * sample_rate):int(segment.end * sample_rate)] 42 | soundfile.write(save_path, audio_sub, sample_rate) 43 | 44 | 45 | if __name__ == '__main__': 46 | create_rttm(annotation_dir='dataset/test/TextGrid', output_path='dataset/references.rttm') 47 | create_audio_path_list(audio_dir='dataset/test/wav', list_path='dataset/data_list.txt') 48 | create_audio_db(data_list_path='dataset/data_list.txt', 
rttm_path='dataset/references.rttm', 49 | output_dir='dataset/audio_db/') 50 | -------------------------------------------------------------------------------- /tools/eval_speaker_diarization/infer_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import os 4 | 5 | from pyannote.core import Annotation 6 | from pyannote.core import Segment 7 | from tqdm import tqdm 8 | 9 | from ppvector.predict import PPVectorPredictor 10 | from ppvector.utils.utils import add_arguments, print_arguments 11 | 12 | parser = argparse.ArgumentParser(description=__doc__) 13 | add_arg = functools.partial(add_arguments, argparser=parser) 14 | add_arg('configs', str, '../../configs/cam++.yml', '配置文件') 15 | add_arg('use_gpu', bool, True, '是否使用GPU预测') 16 | add_arg('data_list_path', str, 'dataset/data_list.txt', '要预测的音频路径列表') 17 | add_arg('result_path', str, 'dataset/hypotheses.rttm', '预测结果') 18 | add_arg('audio_db_path', str, 'dataset/audio_db/', '测试数据的音频库的路径') 19 | add_arg('threshold', float, 0.6, '判断是否为同一个人的阈值') 20 | add_arg('model_path', str, '../../models/CAMPPlus_Fbank/best_model/', '导出的预测模型文件路径') 21 | args = parser.parse_args() 22 | print_arguments(args=args) 23 | 24 | 25 | # 进行说话人日志识别 26 | with open(args.data_list_path, 'r') as f_r, open(args.result_path, 'w', encoding='utf-8') as f_w: 27 | for line in tqdm(f_r.readlines()): 28 | audio_path, name = line.strip().split('\t') 29 | # 每条音频说话人的数据库 30 | audio_db_path = os.path.join(args.audio_db_path, name) 31 | # 获取识别器 32 | predictor = PPVectorPredictor(configs=args.configs, 33 | model_path=args.model_path, 34 | threshold=args.threshold, 35 | audio_db_path=audio_db_path, 36 | use_gpu=args.use_gpu) 37 | 38 | results = predictor.speaker_diarization(audio_path, search_audio_db=True) 39 | 40 | annotation = Annotation(uri=name) 41 | for i, result in enumerate(results): 42 | annotation[Segment(result['start'], result['end']), i] = str(result['speaker']) 43 | f_w.write(annotation.to_rttm()) 44 | os.remove(os.path.join(args.audio_db_path, name, "audio_indexes.bin")) 45 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | 4 | from ppvector.trainer import PPVectorTrainer 5 | from ppvector.utils.utils import add_arguments, print_arguments 6 | 7 | parser = argparse.ArgumentParser(description=__doc__) 8 | add_arg = functools.partial(add_arguments, argparser=parser) 9 | add_arg('configs', str, 'configs/cam++.yml', '配置文件') 10 | add_arg('data_augment_configs', str, 'configs/augmentation.yml', '数据增强配置文件') 11 | add_arg("use_gpu", bool, True, '是否使用GPU训练') 12 | add_arg("do_eval", bool, True, '训练时是否评估模型') 13 | add_arg('save_model_path', str, 'models/', '模型保存的路径') 14 | add_arg('log_dir', str, 'log/', '保存VisualDL日志文件的路径') 15 | add_arg('resume_model', str, None, '恢复训练,当为None则不使用预训练模型') 16 | add_arg('pretrained_model', str, None, '预训练模型的路径,当为None则不使用预训练模型') 17 | add_arg('overwrites', str, None, '覆盖配置文件中的参数,比如"train_conf.max_epoch=100",多个用逗号隔开') 18 | args = parser.parse_args() 19 | print_arguments(args=args) 20 | 21 | # 获取训练器 22 | trainer = PPVectorTrainer(configs=args.configs, 23 | use_gpu=args.use_gpu, 24 | data_augment_configs=args.data_augment_configs, 25 | overwrites=args.overwrites) 26 | 27 | trainer.train(save_model_path=args.save_model_path, 28 | log_dir=args.log_dir, 29 | resume_model=args.resume_model, 30 | 
pretrained_model=args.pretrained_model, 31 | do_eval=args.do_eval) 32 | --------------------------------------------------------------------------------
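To close, a minimal programmatic sketch of the same training flow that `train.py` wires up via argparse (not part of the repository source; it assumes the default `configs/cam++.yml` and `configs/augmentation.yml` exist and that the data lists referenced by the config have already been prepared):

```python
from ppvector.trainer import PPVectorTrainer

# Build the trainer from a config file, mirroring the defaults used in train.py
trainer = PPVectorTrainer(configs='configs/cam++.yml',
                          use_gpu=True,
                          data_augment_configs='configs/augmentation.yml',
                          overwrites='train_conf.max_epoch=100')  # optional, same syntax as --overwrites
# Train, evaluate after each epoch, and save checkpoints under models/
trainer.train(save_model_path='models/',
              log_dir='log/',
              resume_model=None,
              pretrained_model=None,
              do_eval=True)
```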