├── README copy.md ├── README.md ├── checkpts ├── spk_encoder │ ├── LICENSE │ └── pretrained.pt ├── vc │ ├── train_dec_libritts_wodyn.log │ ├── train_dec_vctk_wodyn.log │ ├── train_enc_libritts.log │ └── train_enc_vctk.log └── vocoder │ ├── LICENSE │ └── config.json ├── data.py ├── demos ├── change_csv.py ├── cmu_dictionary.txt ├── create_csv.py ├── create_demo.py ├── create_scps.py ├── data_preparation_UASpeech.py ├── emb_demo.py ├── english.zip ├── inference_allaty.py ├── inference_allaugdata.py ├── inference_librispeech.py ├── librispeech-lexicon.txt ├── listening_test_demo.py ├── modified_cmu_dictionary.txt ├── modified_librispeech-lexicon.txt ├── modify_lexicon.py └── prepare_phonemetime.py ├── hifi-gan ├── LICENSE ├── README.md ├── __pycache__ │ ├── env.cpython-36.pyc │ ├── models.cpython-36.pyc │ └── xutils.cpython-36.pyc ├── env.py ├── meldataset.py ├── models.py └── xutils.py ├── libritts_data ├── global_mean_var.txt ├── mels_mode.pkl ├── phonemes.json └── phonemes.pkl ├── model ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── base.cpython-36.pyc │ ├── diffusion.cpython-36.pyc │ ├── encoder.cpython-36.pyc │ ├── modules.cpython-36.pyc │ ├── postnet.cpython-36.pyc │ ├── utils.cpython-36.pyc │ └── vc.cpython-36.pyc ├── base.py ├── diffusion.py ├── encoder.py ├── modules.py ├── postnet.py ├── utils.py └── vc.py ├── params.py ├── prepare_data.py ├── requirements.txt ├── speaker_encoder ├── LICENSE ├── README.md ├── encoder │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── audio.cpython-36.pyc │ │ ├── inference.cpython-36.pyc │ │ ├── model.cpython-36.pyc │ │ ├── params_data.cpython-36.pyc │ │ └── params_model.cpython-36.pyc │ ├── audio.py │ ├── config.py │ ├── data_objects │ │ ├── __init__.py │ │ ├── random_cycler.py │ │ ├── speaker.py │ │ ├── speaker_batch.py │ │ ├── speaker_verification_dataset.py │ │ └── utterance.py │ ├── inference.py │ ├── model.py │ ├── params_data.py │ ├── params_model.py │ ├── preprocess.py │ ├── train.py │ └── visualizations.py └── utils │ ├── __init__.py │ ├── argutils.py │ ├── logmmse.py │ └── profiler.py ├── train_dec.py ├── train_enc.py └── utils.py /README copy.md: -------------------------------------------------------------------------------- 1 | # DuTa-VC 2 | Source code for INTERSPEECH 2023 paper: [DuTa-VC: A Duration-aware Typical-to-atypical Voice Conversion Approach with Diffusion Probabilistic Model](https://arxiv.org/pdf/2306.10588.pdf) 3 | 4 | You are welcome to take a look at our [demo page](https://wanghelin1997.github.io/DuTa-VC-Demo/)! 5 | 6 | ## Updating! 7 | 8 | 9 | ## Reference 10 | 11 | [DiffVC](https://github.com/huawei-noah/Speech-Backbones/tree/main/DiffVC) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DuTa-VC 2 | Source code for INTERSPEECH 2023 paper: [DuTa-VC: A Duration-aware Typical-to-atypical Voice Conversion Approach with Diffusion Probabilistic Model](https://arxiv.org/pdf/2306.10588.pdf) 3 | 4 | You are welcome to take a look at our [demo page](https://wanghelin1997.github.io/DuTa-VC-Demo/)! 5 | 6 | A follow-up work can be found at [Aty-TTS](https://github.com/WangHelin1997/Aty-TTS). 7 | 8 | ## Updating!
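A minimal conversion sketch (the checkpoint path and the mel/embedding tensors below are placeholders; the complete pipeline with duration modification, spectral subtraction and HiFi-GAN vocoding is in `demos/inference_allaty.py`):

```python
import torch
import params
from model import DiffVC

# Build DiffVC with the hyperparameters from params.py and load a trained decoder
# checkpoint (the path below is a placeholder).
generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads,
                   params.layers, params.kernel, params.dropout, params.window_size,
                   params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim,
                   params.beta_min, params.beta_max).cuda()
generator.load_state_dict(torch.load('checkpts/vc/vc.pt'))
generator.eval()

# Dummy inputs only to show the expected shapes: normalized log-mel spectrograms
# (batch, n_mels, frames) and a target speaker embedding (batch, spk_dim).
mel_source = torch.randn(1, params.n_mels, 128).cuda()
mel_target = torch.randn(1, params.n_mels, 128).cuda()
lengths = torch.LongTensor([128]).cuda()
embed_target = torch.randn(1, params.spk_dim).cuda()

with torch.no_grad():
    _, mel_converted = generator(mel_source, lengths, mel_target, lengths,
                                 embed_target, n_timesteps=100, mode='ml')
```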
9 | 10 | 11 | ## References 12 | 13 | If you find the code useful for your research, please consider citing: 14 | 15 | ```bibtex 16 | @inproceedings{wang23qa_interspeech, 17 | author={Helin Wang and Thomas Thebaud and Jesús Villalba and Myra Sydnor and Becky Lammers and Najim Dehak and Laureano Moro-Velazquez}, 18 | title={{DuTa-VC: A Duration-aware Typical-to-atypical Voice Conversion Approach with Diffusion Probabilistic Model}}, 19 | year={2023}, 20 | booktitle={Proc. INTERSPEECH 2023}, 21 | pages={1548--1552}, 22 | doi={10.21437/Interspeech.2023-2203} 23 | } 24 | ``` 25 | 26 | ```bibtex 27 | @inproceedings{wang2023improving, 28 | title={Improving fairness for spoken language understanding in atypical speech with Text-to-Speech}, 29 | author={Helin Wang and Venkatesh Ravichandran and Milind Rao and Becky Lammers and Myra Sydnor and Nicholas Maragakis and Ankur A. Butala and Jayne Zhang and Lora Clawson and Victoria Chovaz and Laureano Moro-Velazquez}, 30 | booktitle={NeurIPS 2023 Workshop on Synthetic Data Generation with Generative AI}, 31 | year={2023}, 32 | url={https://openreview.net/forum?id=YU228ZUCOU} 33 | } 34 | 35 | ``` 36 | 37 | This repo is inspired by: 38 | 39 | [DiffVC](https://github.com/huawei-noah/Speech-Backbones/tree/main/DiffVC) 40 | -------------------------------------------------------------------------------- /checkpts/spk_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
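For reference, the pretrained speaker-encoder checkpoint below is loaded through `speaker_encoder/encoder/inference.py`, as done in `demos/emb_demo.py` and `demos/inference_allaty.py`; a minimal sketch, with the input wav path as a placeholder:

```python
import sys
from pathlib import Path
import numpy as np

sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder

# Load the bundled checkpoint once, then embed an utterance into a fixed-size
# speaker embedding used as the conditioning vector for voice conversion.
spk_encoder.load_model(Path('checkpts/spk_encoder/pretrained.pt'), device='cpu')
wav = spk_encoder.preprocess_wav('example.wav')  # placeholder wav path
embed = spk_encoder.embed_utterance(wav)
np.save('example_embed.npy', embed)
```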
25 | -------------------------------------------------------------------------------- /checkpts/spk_encoder/pretrained.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/checkpts/spk_encoder/pretrained.pt -------------------------------------------------------------------------------- /checkpts/vc/train_dec_libritts_wodyn.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1397 2 | Epoch 2: loss = 0.1209 3 | Epoch 3: loss = 0.1192 4 | Epoch 4: loss = 0.1185 5 | Epoch 5: loss = 0.1170 6 | Epoch 6: loss = 0.1161 7 | Epoch 7: loss = 0.1157 8 | Epoch 8: loss = 0.1157 9 | Epoch 9: loss = 0.1152 10 | Epoch 10: loss = 0.1141 11 | Epoch 11: loss = 0.1140 12 | Epoch 12: loss = 0.1139 13 | Epoch 13: loss = 0.1132 14 | Epoch 14: loss = 0.1137 15 | Epoch 15: loss = 0.1136 16 | Epoch 16: loss = 0.1138 17 | Epoch 17: loss = 0.1130 18 | Epoch 18: loss = 0.1124 19 | Epoch 19: loss = 0.1121 20 | Epoch 20: loss = 0.1123 21 | Epoch 21: loss = 0.1121 22 | Epoch 22: loss = 0.1122 23 | Epoch 23: loss = 0.1126 24 | Epoch 24: loss = 0.1122 25 | Epoch 25: loss = 0.1118 26 | Epoch 26: loss = 0.1118 27 | Epoch 27: loss = 0.1120 28 | Epoch 28: loss = 0.1112 29 | Epoch 29: loss = 0.1106 30 | Epoch 30: loss = 0.1111 31 | Epoch 31: loss = 0.1111 32 | Epoch 32: loss = 0.1107 33 | Epoch 33: loss = 0.1115 34 | Epoch 34: loss = 0.1111 35 | Epoch 35: loss = 0.1118 36 | Epoch 36: loss = 0.1111 37 | Epoch 37: loss = 0.1106 38 | Epoch 38: loss = 0.1108 39 | Epoch 39: loss = 0.1101 40 | Epoch 40: loss = 0.1109 41 | Epoch 41: loss = 0.1110 42 | Epoch 42: loss = 0.1106 43 | Epoch 43: loss = 0.1107 44 | Epoch 44: loss = 0.1104 45 | Epoch 45: loss = 0.1099 46 | Epoch 46: loss = 0.1093 47 | Epoch 47: loss = 0.1105 48 | Epoch 48: loss = 0.1107 49 | Epoch 49: loss = 0.1092 50 | Epoch 50: loss = 0.1100 51 | Epoch 51: loss = 0.1098 52 | Epoch 52: loss = 0.1097 53 | Epoch 53: loss = 0.1103 54 | Epoch 54: loss = 0.1103 55 | Epoch 55: loss = 0.1101 56 | Epoch 56: loss = 0.1090 57 | Epoch 57: loss = 0.1095 58 | Epoch 58: loss = 0.1105 59 | Epoch 59: loss = 0.1098 60 | Epoch 60: loss = 0.1098 61 | Epoch 61: loss = 0.1098 62 | Epoch 62: loss = 0.1095 63 | Epoch 63: loss = 0.1107 64 | Epoch 64: loss = 0.1097 65 | Epoch 65: loss = 0.1088 66 | Epoch 66: loss = 0.1099 67 | Epoch 67: loss = 0.1085 68 | Epoch 68: loss = 0.1091 69 | Epoch 69: loss = 0.1092 70 | Epoch 70: loss = 0.1093 71 | Epoch 71: loss = 0.1094 72 | Epoch 72: loss = 0.1094 73 | Epoch 73: loss = 0.1084 74 | Epoch 74: loss = 0.1090 75 | Epoch 75: loss = 0.1102 76 | Epoch 76: loss = 0.1083 77 | Epoch 77: loss = 0.1085 78 | Epoch 78: loss = 0.1092 79 | Epoch 79: loss = 0.1088 80 | Epoch 80: loss = 0.1083 81 | Epoch 81: loss = 0.1082 82 | Epoch 82: loss = 0.1083 83 | Epoch 83: loss = 0.1089 84 | Epoch 84: loss = 0.1077 85 | Epoch 85: loss = 0.1089 86 | Epoch 86: loss = 0.1087 87 | Epoch 87: loss = 0.1086 88 | Epoch 88: loss = 0.1086 89 | Epoch 89: loss = 0.1089 90 | Epoch 90: loss = 0.1086 91 | Epoch 91: loss = 0.1082 92 | Epoch 92: loss = 0.1090 93 | Epoch 93: loss = 0.1087 94 | Epoch 94: loss = 0.1081 95 | Epoch 95: loss = 0.1082 96 | Epoch 96: loss = 0.1082 97 | Epoch 97: loss = 0.1079 98 | Epoch 98: loss = 0.1079 99 | Epoch 99: loss = 0.1094 100 | Epoch 100: loss = 0.1092 101 | Epoch 101: loss = 0.1084 102 | Epoch 102: loss = 0.1086 103 | Epoch 103: loss = 0.1082 104 | Epoch 104: loss = 0.1081 105 | 
Epoch 105: loss = 0.1084 106 | Epoch 106: loss = 0.1081 107 | Epoch 107: loss = 0.1086 108 | Epoch 108: loss = 0.1093 109 | Epoch 109: loss = 0.1070 110 | Epoch 110: loss = 0.1081 111 | -------------------------------------------------------------------------------- /checkpts/vc/train_dec_vctk_wodyn.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1779 2 | Epoch 2: loss = 0.1237 3 | Epoch 3: loss = 0.1198 4 | Epoch 4: loss = 0.1165 5 | Epoch 5: loss = 0.1158 6 | Epoch 6: loss = 0.1162 7 | Epoch 7: loss = 0.1158 8 | Epoch 8: loss = 0.1129 9 | Epoch 9: loss = 0.1115 10 | Epoch 10: loss = 0.1124 11 | Epoch 11: loss = 0.1107 12 | Epoch 12: loss = 0.1116 13 | Epoch 13: loss = 0.1095 14 | Epoch 14: loss = 0.1079 15 | Epoch 15: loss = 0.1108 16 | Epoch 16: loss = 0.1060 17 | Epoch 17: loss = 0.1081 18 | Epoch 18: loss = 0.1066 19 | Epoch 19: loss = 0.1087 20 | Epoch 20: loss = 0.1057 21 | Epoch 21: loss = 0.1062 22 | Epoch 22: loss = 0.1070 23 | Epoch 23: loss = 0.1078 24 | Epoch 24: loss = 0.1064 25 | Epoch 25: loss = 0.1063 26 | Epoch 26: loss = 0.1066 27 | Epoch 27: loss = 0.1068 28 | Epoch 28: loss = 0.1058 29 | Epoch 29: loss = 0.1052 30 | Epoch 30: loss = 0.1057 31 | Epoch 31: loss = 0.1057 32 | Epoch 32: loss = 0.1055 33 | Epoch 33: loss = 0.1046 34 | Epoch 34: loss = 0.1046 35 | Epoch 35: loss = 0.1052 36 | Epoch 36: loss = 0.1046 37 | Epoch 37: loss = 0.1053 38 | Epoch 38: loss = 0.1049 39 | Epoch 39: loss = 0.1034 40 | Epoch 40: loss = 0.1037 41 | Epoch 41: loss = 0.1051 42 | Epoch 42: loss = 0.1039 43 | Epoch 43: loss = 0.1033 44 | Epoch 44: loss = 0.1058 45 | Epoch 45: loss = 0.1039 46 | Epoch 46: loss = 0.1025 47 | Epoch 47: loss = 0.1031 48 | Epoch 48: loss = 0.1037 49 | Epoch 49: loss = 0.1034 50 | Epoch 50: loss = 0.1046 51 | Epoch 51: loss = 0.1037 52 | Epoch 52: loss = 0.1044 53 | Epoch 53: loss = 0.1029 54 | Epoch 54: loss = 0.1022 55 | Epoch 55: loss = 0.1026 56 | Epoch 56: loss = 0.1031 57 | Epoch 57: loss = 0.1031 58 | Epoch 58: loss = 0.1030 59 | Epoch 59: loss = 0.1036 60 | Epoch 60: loss = 0.1025 61 | Epoch 61: loss = 0.1031 62 | Epoch 62: loss = 0.1042 63 | Epoch 63: loss = 0.1038 64 | Epoch 64: loss = 0.1034 65 | Epoch 65: loss = 0.1031 66 | Epoch 66: loss = 0.1023 67 | Epoch 67: loss = 0.1029 68 | Epoch 68: loss = 0.1018 69 | Epoch 69: loss = 0.1007 70 | Epoch 70: loss = 0.1022 71 | Epoch 71: loss = 0.1020 72 | Epoch 72: loss = 0.1026 73 | Epoch 73: loss = 0.1008 74 | Epoch 74: loss = 0.1024 75 | Epoch 75: loss = 0.1012 76 | Epoch 76: loss = 0.1016 77 | Epoch 77: loss = 0.1036 78 | Epoch 78: loss = 0.1018 79 | Epoch 79: loss = 0.1009 80 | Epoch 80: loss = 0.1009 81 | Epoch 81: loss = 0.1011 82 | Epoch 82: loss = 0.1012 83 | Epoch 83: loss = 0.1024 84 | Epoch 84: loss = 0.1025 85 | Epoch 85: loss = 0.1015 86 | Epoch 86: loss = 0.0998 87 | Epoch 87: loss = 0.1011 88 | Epoch 88: loss = 0.1033 89 | Epoch 89: loss = 0.1024 90 | Epoch 90: loss = 0.1032 91 | Epoch 91: loss = 0.1033 92 | Epoch 92: loss = 0.1014 93 | Epoch 93: loss = 0.1008 94 | Epoch 94: loss = 0.1011 95 | Epoch 95: loss = 0.1010 96 | Epoch 96: loss = 0.1001 97 | Epoch 97: loss = 0.1001 98 | Epoch 98: loss = 0.1011 99 | Epoch 99: loss = 0.1024 100 | Epoch 100: loss = 0.1007 101 | Epoch 101: loss = 0.0998 102 | Epoch 102: loss = 0.1010 103 | Epoch 103: loss = 0.1004 104 | Epoch 104: loss = 0.1014 105 | Epoch 105: loss = 0.1002 106 | Epoch 106: loss = 0.1003 107 | Epoch 107: loss = 0.0998 108 | Epoch 108: loss = 0.0996 109 | Epoch 109: loss = 0.0994 110 
| Epoch 110: loss = 0.0997 111 | Epoch 111: loss = 0.1007 112 | Epoch 112: loss = 0.0990 113 | Epoch 113: loss = 0.0997 114 | Epoch 114: loss = 0.0994 115 | Epoch 115: loss = 0.1003 116 | Epoch 116: loss = 0.1011 117 | Epoch 117: loss = 0.1009 118 | Epoch 118: loss = 0.0991 119 | Epoch 119: loss = 0.0992 120 | Epoch 120: loss = 0.0998 121 | Epoch 121: loss = 0.1002 122 | Epoch 122: loss = 0.1007 123 | Epoch 123: loss = 0.1004 124 | Epoch 124: loss = 0.0995 125 | Epoch 125: loss = 0.1004 126 | Epoch 126: loss = 0.0998 127 | Epoch 127: loss = 0.0994 128 | Epoch 128: loss = 0.1007 129 | Epoch 129: loss = 0.0991 130 | Epoch 130: loss = 0.1009 131 | Epoch 131: loss = 0.0994 132 | Epoch 132: loss = 0.0990 133 | Epoch 133: loss = 0.1015 134 | Epoch 134: loss = 0.0986 135 | Epoch 135: loss = 0.1002 136 | Epoch 136: loss = 0.1000 137 | Epoch 137: loss = 0.0996 138 | Epoch 138: loss = 0.0994 139 | Epoch 139: loss = 0.0988 140 | Epoch 140: loss = 0.0996 141 | Epoch 141: loss = 0.0989 142 | Epoch 142: loss = 0.0991 143 | Epoch 143: loss = 0.1002 144 | Epoch 144: loss = 0.0985 145 | Epoch 145: loss = 0.1004 146 | Epoch 146: loss = 0.0998 147 | Epoch 147: loss = 0.0981 148 | Epoch 148: loss = 0.0989 149 | Epoch 149: loss = 0.0997 150 | Epoch 150: loss = 0.0993 151 | Epoch 151: loss = 0.0984 152 | Epoch 152: loss = 0.0993 153 | Epoch 153: loss = 0.0993 154 | Epoch 154: loss = 0.1006 155 | Epoch 155: loss = 0.1009 156 | Epoch 156: loss = 0.0989 157 | Epoch 157: loss = 0.0974 158 | Epoch 158: loss = 0.0978 159 | Epoch 159: loss = 0.0988 160 | Epoch 160: loss = 0.0984 161 | Epoch 161: loss = 0.0985 162 | Epoch 162: loss = 0.1005 163 | Epoch 163: loss = 0.0987 164 | Epoch 164: loss = 0.0992 165 | Epoch 165: loss = 0.0987 166 | Epoch 166: loss = 0.1003 167 | Epoch 167: loss = 0.1000 168 | Epoch 168: loss = 0.0983 169 | Epoch 169: loss = 0.0988 170 | Epoch 170: loss = 0.1004 171 | Epoch 171: loss = 0.0991 172 | Epoch 172: loss = 0.0985 173 | Epoch 173: loss = 0.0999 174 | Epoch 174: loss = 0.1012 175 | Epoch 175: loss = 0.0993 176 | Epoch 176: loss = 0.0980 177 | Epoch 177: loss = 0.0987 178 | Epoch 178: loss = 0.0991 179 | Epoch 179: loss = 0.0987 180 | Epoch 180: loss = 0.0986 181 | Epoch 181: loss = 0.0985 182 | Epoch 182: loss = 0.0968 183 | Epoch 183: loss = 0.0993 184 | Epoch 184: loss = 0.0973 185 | Epoch 185: loss = 0.0981 186 | Epoch 186: loss = 0.0993 187 | Epoch 187: loss = 0.0974 188 | Epoch 188: loss = 0.0989 189 | Epoch 189: loss = 0.0974 190 | Epoch 190: loss = 0.0985 191 | Epoch 191: loss = 0.0989 192 | Epoch 192: loss = 0.0992 193 | Epoch 193: loss = 0.0973 194 | Epoch 194: loss = 0.0980 195 | Epoch 195: loss = 0.0975 196 | Epoch 196: loss = 0.0990 197 | Epoch 197: loss = 0.0969 198 | Epoch 198: loss = 0.0973 199 | Epoch 199: loss = 0.0981 200 | Epoch 200: loss = 0.0978 201 | -------------------------------------------------------------------------------- /checkpts/vc/train_enc_libritts.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.5523 2 | Epoch 2: loss = 0.2962 3 | Epoch 3: loss = 0.2634 4 | Epoch 4: loss = 0.2445 5 | Epoch 5: loss = 0.2324 6 | Epoch 6: loss = 0.2246 7 | Epoch 7: loss = 0.2179 8 | Epoch 8: loss = 0.2124 9 | Epoch 9: loss = 0.2083 10 | Epoch 10: loss = 0.2052 11 | Epoch 11: loss = 0.2023 12 | Epoch 12: loss = 0.2001 13 | Epoch 13: loss = 0.1970 14 | Epoch 14: loss = 0.1947 15 | Epoch 15: loss = 0.1933 16 | Epoch 16: loss = 0.1918 17 | Epoch 17: loss = 0.1904 18 | Epoch 18: loss = 0.1890 19 | Epoch 19: loss = 0.1874 
20 | Epoch 20: loss = 0.1867 21 | Epoch 21: loss = 0.1859 22 | Epoch 22: loss = 0.1833 23 | Epoch 23: loss = 0.1827 24 | Epoch 24: loss = 0.1822 25 | Epoch 25: loss = 0.1815 26 | Epoch 26: loss = 0.1803 27 | Epoch 27: loss = 0.1795 28 | Epoch 28: loss = 0.1790 29 | Epoch 29: loss = 0.1784 30 | Epoch 30: loss = 0.1777 31 | Epoch 31: loss = 0.1771 32 | Epoch 32: loss = 0.1761 33 | Epoch 33: loss = 0.1761 34 | Epoch 34: loss = 0.1748 35 | Epoch 35: loss = 0.1740 36 | Epoch 36: loss = 0.1735 37 | Epoch 37: loss = 0.1730 38 | Epoch 38: loss = 0.1722 39 | Epoch 39: loss = 0.1717 40 | Epoch 40: loss = 0.1715 41 | Epoch 41: loss = 0.1705 42 | Epoch 42: loss = 0.1706 43 | Epoch 43: loss = 0.1700 44 | Epoch 44: loss = 0.1694 45 | Epoch 45: loss = 0.1688 46 | Epoch 46: loss = 0.1686 47 | Epoch 47: loss = 0.1684 48 | Epoch 48: loss = 0.1678 49 | Epoch 49: loss = 0.1670 50 | Epoch 50: loss = 0.1670 51 | Epoch 51: loss = 0.1666 52 | Epoch 52: loss = 0.1666 53 | Epoch 53: loss = 0.1659 54 | Epoch 54: loss = 0.1656 55 | Epoch 55: loss = 0.1651 56 | Epoch 56: loss = 0.1647 57 | Epoch 57: loss = 0.1646 58 | Epoch 58: loss = 0.1639 59 | Epoch 59: loss = 0.1638 60 | Epoch 60: loss = 0.1635 61 | Epoch 61: loss = 0.1629 62 | Epoch 62: loss = 0.1635 63 | Epoch 63: loss = 0.1625 64 | Epoch 64: loss = 0.1622 65 | Epoch 65: loss = 0.1622 66 | Epoch 66: loss = 0.1617 67 | Epoch 67: loss = 0.1614 68 | Epoch 68: loss = 0.1614 69 | Epoch 69: loss = 0.1606 70 | Epoch 70: loss = 0.1607 71 | Epoch 71: loss = 0.1603 72 | Epoch 72: loss = 0.1601 73 | Epoch 73: loss = 0.1600 74 | Epoch 74: loss = 0.1594 75 | Epoch 75: loss = 0.1593 76 | Epoch 76: loss = 0.1594 77 | Epoch 77: loss = 0.1590 78 | Epoch 78: loss = 0.1584 79 | Epoch 79: loss = 0.1582 80 | Epoch 80: loss = 0.1581 81 | Epoch 81: loss = 0.1578 82 | Epoch 82: loss = 0.1581 83 | Epoch 83: loss = 0.1578 84 | Epoch 84: loss = 0.1571 85 | Epoch 85: loss = 0.1571 86 | Epoch 86: loss = 0.1572 87 | Epoch 87: loss = 0.1566 88 | Epoch 88: loss = 0.1562 89 | Epoch 89: loss = 0.1566 90 | Epoch 90: loss = 0.1556 91 | Epoch 91: loss = 0.1553 92 | Epoch 92: loss = 0.1559 93 | Epoch 93: loss = 0.1562 94 | Epoch 94: loss = 0.1556 95 | Epoch 95: loss = 0.1553 96 | Epoch 96: loss = 0.1553 97 | Epoch 97: loss = 0.1548 98 | Epoch 98: loss = 0.1544 99 | Epoch 99: loss = 0.1544 100 | Epoch 100: loss = 0.1545 101 | Epoch 101: loss = 0.1538 102 | Epoch 102: loss = 0.1538 103 | Epoch 103: loss = 0.1538 104 | Epoch 104: loss = 0.1538 105 | Epoch 105: loss = 0.1533 106 | Epoch 106: loss = 0.1535 107 | Epoch 107: loss = 0.1528 108 | Epoch 108: loss = 0.1529 109 | Epoch 109: loss = 0.1528 110 | Epoch 110: loss = 0.1523 111 | Epoch 111: loss = 0.1526 112 | Epoch 112: loss = 0.1522 113 | Epoch 113: loss = 0.1518 114 | Epoch 114: loss = 0.1518 115 | Epoch 115: loss = 0.1522 116 | Epoch 116: loss = 0.1514 117 | Epoch 117: loss = 0.1510 118 | Epoch 118: loss = 0.1517 119 | Epoch 119: loss = 0.1519 120 | Epoch 120: loss = 0.1508 121 | Epoch 121: loss = 0.1508 122 | Epoch 122: loss = 0.1515 123 | Epoch 123: loss = 0.1508 124 | Epoch 124: loss = 0.1505 125 | Epoch 125: loss = 0.1507 126 | Epoch 126: loss = 0.1508 127 | Epoch 127: loss = 0.1497 128 | Epoch 128: loss = 0.1497 129 | Epoch 129: loss = 0.1497 130 | Epoch 130: loss = 0.1498 131 | Epoch 131: loss = 0.1498 132 | Epoch 132: loss = 0.1493 133 | Epoch 133: loss = 0.1498 134 | Epoch 134: loss = 0.1488 135 | Epoch 135: loss = 0.1490 136 | Epoch 136: loss = 0.1493 137 | Epoch 137: loss = 0.1488 138 | Epoch 138: loss = 0.1485 139 | Epoch 139: loss = 
0.1486 140 | Epoch 140: loss = 0.1486 141 | Epoch 141: loss = 0.1481 142 | Epoch 142: loss = 0.1483 143 | Epoch 143: loss = 0.1475 144 | Epoch 144: loss = 0.1483 145 | Epoch 145: loss = 0.1483 146 | Epoch 146: loss = 0.1476 147 | Epoch 147: loss = 0.1477 148 | Epoch 148: loss = 0.1475 149 | Epoch 149: loss = 0.1473 150 | Epoch 150: loss = 0.1474 151 | Epoch 151: loss = 0.1469 152 | Epoch 152: loss = 0.1473 153 | Epoch 153: loss = 0.1472 154 | Epoch 154: loss = 0.1465 155 | Epoch 155: loss = 0.1467 156 | Epoch 156: loss = 0.1469 157 | Epoch 157: loss = 0.1466 158 | Epoch 158: loss = 0.1468 159 | Epoch 159: loss = 0.1459 160 | Epoch 160: loss = 0.1463 161 | Epoch 161: loss = 0.1461 162 | Epoch 162: loss = 0.1459 163 | Epoch 163: loss = 0.1461 164 | Epoch 164: loss = 0.1455 165 | Epoch 165: loss = 0.1458 166 | Epoch 166: loss = 0.1457 167 | Epoch 167: loss = 0.1455 168 | Epoch 168: loss = 0.1457 169 | Epoch 169: loss = 0.1452 170 | Epoch 170: loss = 0.1457 171 | Epoch 171: loss = 0.1451 172 | Epoch 172: loss = 0.1448 173 | Epoch 173: loss = 0.1445 174 | Epoch 174: loss = 0.1451 175 | Epoch 175: loss = 0.1451 176 | Epoch 176: loss = 0.1451 177 | Epoch 177: loss = 0.1446 178 | Epoch 178: loss = 0.1442 179 | Epoch 179: loss = 0.1452 180 | Epoch 180: loss = 0.1447 181 | Epoch 181: loss = 0.1445 182 | Epoch 182: loss = 0.1444 183 | Epoch 183: loss = 0.1440 184 | Epoch 184: loss = 0.1446 185 | Epoch 185: loss = 0.1442 186 | Epoch 186: loss = 0.1442 187 | Epoch 187: loss = 0.1441 188 | Epoch 188: loss = 0.1438 189 | Epoch 189: loss = 0.1441 190 | Epoch 190: loss = 0.1433 191 | Epoch 191: loss = 0.1436 192 | Epoch 192: loss = 0.1435 193 | Epoch 193: loss = 0.1431 194 | Epoch 194: loss = 0.1431 195 | Epoch 195: loss = 0.1431 196 | Epoch 196: loss = 0.1432 197 | Epoch 197: loss = 0.1434 198 | Epoch 198: loss = 0.1427 199 | Epoch 199: loss = 0.1429 200 | Epoch 200: loss = 0.1428 201 | Epoch 201: loss = 0.1425 202 | Epoch 202: loss = 0.1420 203 | Epoch 203: loss = 0.1431 204 | Epoch 204: loss = 0.1424 205 | Epoch 205: loss = 0.1422 206 | Epoch 206: loss = 0.1425 207 | Epoch 207: loss = 0.1426 208 | Epoch 208: loss = 0.1425 209 | Epoch 209: loss = 0.1419 210 | Epoch 210: loss = 0.1422 211 | Epoch 211: loss = 0.1420 212 | Epoch 212: loss = 0.1419 213 | Epoch 213: loss = 0.1418 214 | Epoch 214: loss = 0.1416 215 | Epoch 215: loss = 0.1415 216 | Epoch 216: loss = 0.1418 217 | Epoch 217: loss = 0.1414 218 | Epoch 218: loss = 0.1417 219 | Epoch 219: loss = 0.1418 220 | Epoch 220: loss = 0.1418 221 | Epoch 221: loss = 0.1414 222 | Epoch 222: loss = 0.1414 223 | Epoch 223: loss = 0.1414 224 | Epoch 224: loss = 0.1410 225 | Epoch 225: loss = 0.1410 226 | Epoch 226: loss = 0.1408 227 | Epoch 227: loss = 0.1409 228 | Epoch 228: loss = 0.1406 229 | Epoch 229: loss = 0.1409 230 | Epoch 230: loss = 0.1407 231 | Epoch 231: loss = 0.1406 232 | Epoch 232: loss = 0.1407 233 | Epoch 233: loss = 0.1412 234 | Epoch 234: loss = 0.1405 235 | Epoch 235: loss = 0.1398 236 | Epoch 236: loss = 0.1402 237 | Epoch 237: loss = 0.1405 238 | Epoch 238: loss = 0.1401 239 | Epoch 239: loss = 0.1401 240 | Epoch 240: loss = 0.1401 241 | Epoch 241: loss = 0.1402 242 | Epoch 242: loss = 0.1398 243 | Epoch 243: loss = 0.1400 244 | Epoch 244: loss = 0.1399 245 | Epoch 245: loss = 0.1395 246 | Epoch 246: loss = 0.1398 247 | Epoch 247: loss = 0.1391 248 | Epoch 248: loss = 0.1397 249 | Epoch 249: loss = 0.1391 250 | Epoch 250: loss = 0.1398 251 | Epoch 251: loss = 0.1394 252 | Epoch 252: loss = 0.1394 253 | Epoch 253: loss = 0.1400 254 | Epoch 
254: loss = 0.1395 255 | Epoch 255: loss = 0.1396 256 | Epoch 256: loss = 0.1388 257 | Epoch 257: loss = 0.1391 258 | Epoch 258: loss = 0.1390 259 | Epoch 259: loss = 0.1392 260 | Epoch 260: loss = 0.1391 261 | Epoch 261: loss = 0.1390 262 | Epoch 262: loss = 0.1385 263 | Epoch 263: loss = 0.1383 264 | Epoch 264: loss = 0.1395 265 | Epoch 265: loss = 0.1386 266 | Epoch 266: loss = 0.1382 267 | Epoch 267: loss = 0.1387 268 | Epoch 268: loss = 0.1382 269 | Epoch 269: loss = 0.1384 270 | Epoch 270: loss = 0.1385 271 | Epoch 271: loss = 0.1382 272 | Epoch 272: loss = 0.1385 273 | Epoch 273: loss = 0.1380 274 | Epoch 274: loss = 0.1381 275 | Epoch 275: loss = 0.1385 276 | Epoch 276: loss = 0.1384 277 | Epoch 277: loss = 0.1381 278 | Epoch 278: loss = 0.1380 279 | Epoch 279: loss = 0.1382 280 | Epoch 280: loss = 0.1384 281 | Epoch 281: loss = 0.1376 282 | Epoch 282: loss = 0.1379 283 | Epoch 283: loss = 0.1379 284 | Epoch 284: loss = 0.1378 285 | Epoch 285: loss = 0.1379 286 | Epoch 286: loss = 0.1376 287 | Epoch 287: loss = 0.1373 288 | Epoch 288: loss = 0.1374 289 | Epoch 289: loss = 0.1375 290 | Epoch 290: loss = 0.1372 291 | Epoch 291: loss = 0.1378 292 | Epoch 292: loss = 0.1373 293 | Epoch 293: loss = 0.1375 294 | Epoch 294: loss = 0.1373 295 | Epoch 295: loss = 0.1375 296 | Epoch 296: loss = 0.1372 297 | Epoch 297: loss = 0.1372 298 | Epoch 298: loss = 0.1370 299 | Epoch 299: loss = 0.1367 300 | Epoch 300: loss = 0.1368 301 | -------------------------------------------------------------------------------- /checkpts/vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
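The vocoder config below is consumed at inference time roughly as in `demos/inference_allaty.py`; a minimal sketch, with the generator checkpoint path and the input mel-spectrogram as placeholders:

```python
import json
import sys
import torch

sys.path.append('hifi-gan/')
from env import AttrDict
from models import Generator as HiFiGAN

# Parse the HiFi-GAN hyperparameters and restore a trained generator checkpoint
# (the 'g' path below is a placeholder; the scripts load it from hifi-gan/checkpoints).
with open('checkpts/vocoder/config.json') as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h).cuda()
vocoder.load_state_dict(torch.load('checkpts/vocoder/g')['generator'])
vocoder.eval()
vocoder.remove_weight_norm()

# Dummy mel only to show the expected shape: (batch, num_mels, frames) -> 22050 Hz audio.
mel = torch.randn(1, h.num_mels, 200).cuda()
with torch.no_grad():
    audio = vocoder(mel).cpu().squeeze().reshape(1, -1)
```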
-------------------------------------------------------------------------------- /checkpts/vocoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import os 10 | import random 11 | import numpy as np 12 | import torch 13 | from params import seed as random_seed 14 | from params import n_mels, train_frames 15 | 16 | 17 | class ATYDecDataset(torch.utils.data.Dataset): 18 | def __init__(self, data_dir, spk): 19 | self.mel_dir = os.path.join(data_dir, 'mels') 20 | self.emb_dir = os.path.join(data_dir, 'embeds') 21 | self.speakers = [spk] 22 | stats = np.loadtxt(os.path.join(data_dir, 'stats', spk, 'global_mean_var.txt')) 23 | self.mean = stats[0] 24 | self.std = stats[1] 25 | self.train_info = [] 26 | self.valid_info = [] 27 | self.read_info() 28 | for spk in self.speakers: 29 | mel_ids = [] 30 | for root, dirs, files in os.walk(os.path.join(self.mel_dir, spk)): 31 | for f in files: 32 | if f.endswith('.npy'): 33 | mel_ids.append(f.split('.npy')[0]) 34 | self.train_info += [(m, spk) for m in mel_ids] 35 | 36 | print("Total number of training wavs is %d." % len(self.train_info)) 37 | print("Total number of training speakers is %d." 
% len(self.speakers)) 38 | random.seed(random_seed) 39 | random.shuffle(self.train_info) 40 | 41 | def read_info(self): 42 | allnames = [] 43 | for dys in self.speakers: 44 | for root, dirs, files in os.walk(os.path.join(self.mel_dir, dys)): 45 | for f in files: 46 | if f.endswith('.npy'): 47 | allnames.append(f.split('.npy')[0]) 48 | random.shuffle(allnames) 49 | 50 | def mean_var_norm(self, x): 51 | x = (x - self.mean[:, None]) / self.std[:, None] 52 | return x 53 | 54 | def inv_mean_var_norm(self, x): 55 | x = (x * self.std[:, None]) + self.mean[:, None] 56 | return x 57 | 58 | def get_vc_data(self, audio_info): 59 | audio_id, spk = audio_info 60 | mels = self.get_mels(audio_id, spk) 61 | embed = self.get_embed(audio_id, spk) 62 | return (mels, embed) 63 | 64 | def get_mels(self, audio_id, spk): 65 | mel_path = os.path.join(self.mel_dir, spk, audio_id + '.npy') 66 | mels = np.load(mel_path) 67 | mels = self.mean_var_norm(mels) 68 | mels = torch.from_numpy(mels).float() 69 | return mels 70 | 71 | def get_embed(self, audio_id, spk): 72 | embed_path = os.path.join(self.emb_dir, spk, audio_id + '.npy') 73 | embed = np.load(embed_path) 74 | embed = torch.from_numpy(embed).float() 75 | return embed 76 | 77 | def __getitem__(self, index): 78 | mels, embed = self.get_vc_data(self.train_info[index]) 79 | item = {'mel': mels, 'c': embed} 80 | return item 81 | 82 | def __len__(self): 83 | return len(self.train_info) 84 | 85 | def get_valid_dataset(self): 86 | pairs = [] 87 | for i in range(len(self.valid_info)): 88 | mels, embed = self.get_vc_data(self.valid_info[i]) 89 | pairs.append((mels, embed)) 90 | return pairs 91 | 92 | class ATYDecBatchCollate(object): 93 | def __call__(self, batch): 94 | B = len(batch) 95 | mels1 = torch.zeros((B, n_mels, train_frames), dtype=torch.float32) 96 | mels2 = torch.zeros((B, n_mels, train_frames), dtype=torch.float32) 97 | max_starts = [max(item['mel'].shape[-1] - train_frames, 0) 98 | for item in batch] 99 | starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] 100 | starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] 101 | mel_lengths = [] 102 | for i, item in enumerate(batch): 103 | mel = item['mel'] 104 | if mel.shape[-1] < train_frames: 105 | mel_length = mel.shape[-1] 106 | else: 107 | mel_length = train_frames 108 | mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length] 109 | mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length] 110 | mel_lengths.append(mel_length) 111 | mel_lengths = torch.LongTensor(mel_lengths) 112 | embed = torch.stack([item['c'] for item in batch], 0) 113 | return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths, 'c': embed} -------------------------------------------------------------------------------- /demos/change_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from tqdm import tqdm 4 | 5 | csvpath = '/scratch4/lmorove1/hwang258/data/atypicalspeech/atypicalspeech/excel_out_slurp_metadata.csv' 6 | savepath = '/scratch4/lmorove1/hwang258/data/atypicalspeech/atypicalspeech/excel_out_slurp_metadata_named.csv' 7 | lines = [] 8 | # opening the CSV file 9 | with open(csvpath, mode ='r')as file: 10 | csvFile = csv.reader(file) 11 | for line in csvFile: 12 | lines.append(line) 13 | fields = lines[0] 14 | lines = lines[1:] 15 | dicts = {} 16 | savelines = [] 17 | for line in tqdm(lines): 18 | name = line[2] 19 | if name not in dicts.keys(): 20 | dicts[name] = 1 21 | else: 22 | dicts[name] 
+= 1 23 | savename = 'XXXX_script'+name+'_line000'+str(dicts[name])+'.wav' 24 | line[2] = savename 25 | savelines.append(line) 26 | 27 | with open(savepath, 'w') as csvfile: 28 | csvwriter = csv.writer(csvfile) 29 | csvwriter.writerow(fields) 30 | csvwriter.writerows(savelines) 31 | -------------------------------------------------------------------------------- /demos/cmu_dictionary.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/demos/cmu_dictionary.txt -------------------------------------------------------------------------------- /demos/create_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import random 4 | 5 | rows = [] 6 | fields = ['audio_url'] 7 | temp = 'https://mostest2023.s3.us-east-2.amazonaws.com/' 8 | for root, dirs, files in os.walk('/data/dean/whl-2022/Speech-Backbones/DiffVC/am_data_mos'): 9 | for f in files: 10 | if '_generated_' in f: 11 | rows.append([temp+f]) 12 | random.shuffle(rows) 13 | # name of csv file 14 | filename = "am_demo.csv" 15 | 16 | # writing to csv file 17 | with open(filename, 'w') as csvfile: 18 | # creating a csv writer object 19 | csvwriter = csv.writer(csvfile) 20 | # writing the fields 21 | csvwriter.writerow(fields) 22 | # writing the data rows 23 | csvwriter.writerows(rows) -------------------------------------------------------------------------------- /demos/create_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import shutil 4 | import random 5 | 6 | resultdir = '/data/dean/whl-2022/Speech-Backbones/DiffVC/results_allaugdata_removesil' 7 | audiodir = '/data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_data' 8 | savedir = '/data/dean/whl-2022/Speech-Backbones/DiffVC/am_data_mos' 9 | 10 | # alldys = ['F02','F03','F04','F05','M01','M04','M05','M07','M08','M09','M10','M11','M12','M14','M16'] 11 | alldys = ['M08','M10','M05','M11','M04','M12'] 12 | 13 | for dys in alldys: 14 | cmds = [] 15 | for root, dirs, files in os.walk(os.path.join(resultdir, dys)): 16 | for f in files: 17 | if f.endswith('.wav') and f.split('_')[1] == 'B2': 18 | cmds.append([root, f]) 19 | print(len(cmds)) 20 | random.shuffle(cmds) 21 | cmds = cmds[:50] 22 | t_cmds = [] 23 | for root, dirs, files in os.walk(os.path.join(audiodir, dys)): 24 | for f in files: 25 | if f.endswith('.wav') and f.split('_')[1] != 'B2': 26 | t_cmds.append(os.path.join(root, f)) 27 | print(len(t_cmds)) 28 | random.shuffle(t_cmds) 29 | t_cmds = t_cmds[:50] 30 | datas = [] 31 | for i, c in enumerate(cmds): 32 | gt = os.path.join(audiodir, dys, dys+'_'+c[1].split('_',1)[-1]) 33 | if os.path.exists(gt): 34 | generated = os.path.join(c[0], c[1]) 35 | source = os.path.join(audiodir, c[1].split('_')[0], c[1]) 36 | target = t_cmds[i] 37 | datas.append([source, target, generated, gt]) 38 | print(len(datas)) 39 | os.makedirs(os.path.join(savedir, dys), exist_ok=True) 40 | count = 1 41 | for d in datas: 42 | shutil.copyfile(d[0], os.path.join(savedir, dys, str(count)+'_'+dys+'_source_'+d[0].split('/')[-1])) 43 | shutil.copyfile(d[1], os.path.join(savedir, dys, str(count)+'_'+dys + '_target_' + d[1].split('/')[-1])) 44 | shutil.copyfile(d[2], os.path.join(savedir, dys, str(count)+'_'+dys + '_generated_' + d[2].split('/')[-1])) 45 | # shutil.copyfile(d[3], os.path.join(savedir, dys, str(count) + '_gt_' + 
d[3].split('/')[-1])) 46 | count += 1 47 | -------------------------------------------------------------------------------- /demos/create_scps.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import shutil 4 | 5 | datapath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_data' 6 | textpath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_data' 7 | savepath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_asrdata_removesil' 8 | augdatapath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/results_allaugdata_removesil' 9 | 10 | alldys = ['F02','F03','F04','F05','M01','M04','M05','M07','M08','M09','M10','M11','M12','M14','M16'] 11 | 12 | def train_ctrl(): 13 | os.makedirs(os.path.join(savepath, 'ctrls', 'train'), exist_ok=True) 14 | wav_scp_train = open(os.path.join(savepath, 'ctrls', 'train', 'wav.scp'), 'w') 15 | text_train = open(os.path.join(savepath, 'ctrls', 'train', 'text'), 'w') 16 | utt2spk_train = open(os.path.join(savepath, 'ctrls', 'train', 'utt2spk'), 'w') 17 | for root, dirs, files in os.walk(os.path.join(datapath)): 18 | for f in files: 19 | if f.endswith('.wav') and f.split('_')[0].startswith('C'): 20 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 21 | wav_scp_train.write('\n') 22 | utt2spk_train.write(f.split('.wav')[0] + " " + f.split('_')[0]) 23 | utt2spk_train.write('\n') 24 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 25 | t = fi.read().replace('\n', '').upper() 26 | text_train.write(f.split('.wav')[0] + " " + t) 27 | text_train.write('\n') 28 | wav_scp_train.close() 29 | text_train.close() 30 | utt2spk_train.close() 31 | 32 | def train_ctrl_valid(): 33 | os.makedirs(os.path.join(savepath, 'ctrls', 'valid'), exist_ok=True) 34 | wav_scp_train = open(os.path.join(savepath, 'ctrls', 'valid', 'wav.scp'), 'w') 35 | text_train = open(os.path.join(savepath, 'ctrls', 'valid', 'text'), 'w') 36 | utt2spk_train = open(os.path.join(savepath, 'ctrls', 'valid', 'utt2spk'), 'w') 37 | cmds = [] 38 | for root, dirs, files in os.walk(os.path.join(datapath)): 39 | for f in files: 40 | if f.endswith('.wav') and not f.split('_')[0].startswith('C'): 41 | cmds.append([f, root]) 42 | random.shuffle(cmds) 43 | cmds = cmds[:800] 44 | for c in cmds: 45 | f = c[0] 46 | root = c[1] 47 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 48 | wav_scp_train.write('\n') 49 | utt2spk_train.write(f.split('.wav')[0] + " " + 'dummy') 50 | utt2spk_train.write('\n') 51 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 52 | t = fi.read().replace('\n', '').upper() 53 | text_train.write(f.split('.wav')[0] + " " + t) 54 | text_train.write('\n') 55 | wav_scp_train.close() 56 | text_train.close() 57 | utt2spk_train.close() 58 | 59 | def train_ctrl_test(): 60 | os.makedirs(os.path.join(savepath, 'ctrls', 'test'), exist_ok=True) 61 | for dys in alldys: 62 | shutil.copytree(os.path.join(savepath, dys, 'test'), os.path.join(savepath, 'ctrls', 'test_'+dys)) 63 | 64 | def train_dys(): 65 | for dys in alldys: 66 | os.makedirs(os.path.join(savepath, dys, 'train'), exist_ok=True) 67 | os.makedirs(os.path.join(savepath, dys, 'test'), exist_ok=True) 68 | # split train, valid and test 69 | wav_scp_train = open(os.path.join(savepath, dys, 'train', 'wav.scp'), 'w') 70 | text_train = open(os.path.join(savepath, dys, 'train', 'text'), 'w') 71 | utt2spk_train = open(os.path.join(savepath, dys, 'train', 'utt2spk'), 'w') 72 | wav_scp_test = open(os.path.join(savepath, dys, 'test', 
'wav.scp'), 'w') 73 | text_test = open(os.path.join(savepath, dys, 'test', 'text'), 'w') 74 | utt2spk_test = open(os.path.join(savepath, dys, 'test', 'utt2spk'), 'w') 75 | for root, dirs, files in os.walk(os.path.join(datapath, dys)): 76 | for f in files: 77 | if f.endswith('.wav'): 78 | if f.split('_')[1] != 'B2': 79 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 80 | wav_scp_train.write('\n') 81 | utt2spk_train.write(f.split('.wav')[0] + " " + dys) 82 | utt2spk_train.write('\n') 83 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 84 | t = fi.read().replace('\n', '').upper() 85 | text_train.write(f.split('.wav')[0] + " " + t) 86 | text_train.write('\n') 87 | else: 88 | wav_scp_test.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 89 | wav_scp_test.write('\n') 90 | utt2spk_test.write(f.split('.wav')[0] + " " + dys) 91 | utt2spk_test.write('\n') 92 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 93 | t = fi.read().replace('\n', '').upper() 94 | text_test.write(f.split('.wav')[0] + " " + t) 95 | text_test.write('\n') 96 | for root, dirs, files in os.walk(os.path.join(datapath)): 97 | for f in files: 98 | if f.endswith('.wav') and f.split('_')[0].startswith('C'): 99 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 100 | wav_scp_train.write('\n') 101 | utt2spk_train.write(f.split('.wav')[0] + " " + f.split('_')[0]) 102 | utt2spk_train.write('\n') 103 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 104 | t = fi.read().replace('\n', '').upper() 105 | text_train.write(f.split('.wav')[0] + " " + t) 106 | text_train.write('\n') 107 | 108 | wav_scp_train.close() 109 | wav_scp_test.close() 110 | text_train.close() 111 | text_test.close() 112 | utt2spk_train.close() 113 | utt2spk_test.close() 114 | 115 | def train_dys_aug(): 116 | for dys in alldys: 117 | os.makedirs(os.path.join(savepath, dys+'_aug', 'train'), exist_ok=True) 118 | os.makedirs(os.path.join(savepath, dys+'_aug', 'test'), exist_ok=True) 119 | # split train, valid and test 120 | wav_scp_train = open(os.path.join(savepath, dys+'_aug', 'train', 'wav.scp'), 'w') 121 | text_train = open(os.path.join(savepath, dys+'_aug', 'train', 'text'), 'w') 122 | utt2spk_train = open(os.path.join(savepath, dys+'_aug', 'train', 'utt2spk'), 'w') 123 | wav_scp_test = open(os.path.join(savepath, dys+'_aug', 'test', 'wav.scp'), 'w') 124 | text_test = open(os.path.join(savepath, dys+'_aug', 'test', 'text'), 'w') 125 | utt2spk_test = open(os.path.join(savepath, dys+'_aug', 'test', 'utt2spk'), 'w') 126 | for root, dirs, files in os.walk(os.path.join(datapath, dys)): 127 | for f in files: 128 | if f.endswith('.wav'): 129 | if f.split('_')[1] != 'B2': 130 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 131 | wav_scp_train.write('\n') 132 | utt2spk_train.write(f.split('.wav')[0] + " " + 'dummy') 133 | utt2spk_train.write('\n') 134 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 135 | t = fi.read().replace('\n', '').upper() 136 | text_train.write(f.split('.wav')[0] + " " + t) 137 | text_train.write('\n') 138 | else: 139 | wav_scp_test.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 140 | wav_scp_test.write('\n') 141 | utt2spk_test.write(f.split('.wav')[0] + " " + 'dummy') 142 | utt2spk_test.write('\n') 143 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 144 | t = fi.read().replace('\n', '').upper() 145 | text_test.write(f.split('.wav')[0] + " " + t) 146 | text_test.write('\n') 147 | 148 | for 
root, dirs, files in os.walk(os.path.join(augdatapath, dys)): 149 | for f in files: 150 | if f.endswith('.wav'): 151 | wav_scp_train.write(f.split('.wav')[0]+'_aug' + " " + os.path.join(root, f)) 152 | wav_scp_train.write('\n') 153 | utt2spk_train.write(f.split('.wav')[0]+'_aug' + " " + 'dummy') 154 | utt2spk_train.write('\n') 155 | with open(os.path.join(datapath, f.split('_')[0], f.replace('.wav', '.lab'))) as fi: 156 | t = fi.read().replace('\n', '').upper() 157 | text_train.write(f.split('.wav')[0]+'_aug' + " " + t) 158 | text_train.write('\n') 159 | 160 | for root, dirs, files in os.walk(os.path.join(datapath)): 161 | for f in files: 162 | if f.endswith('.wav') and f.split('_')[0].startswith('C'): 163 | wav_scp_train.write(f.split('.wav')[0] + " " + os.path.join(root, f)) 164 | wav_scp_train.write('\n') 165 | utt2spk_train.write(f.split('.wav')[0] + " " + 'dummy') 166 | utt2spk_train.write('\n') 167 | with open(os.path.join(root, f.replace('.wav', '.lab'))) as fi: 168 | t = fi.read().replace('\n', '').upper() 169 | text_train.write(f.split('.wav')[0] + " " + t) 170 | text_train.write('\n') 171 | 172 | wav_scp_train.close() 173 | wav_scp_test.close() 174 | text_train.close() 175 | text_test.close() 176 | utt2spk_train.close() 177 | utt2spk_test.close() 178 | 179 | # egs2/TEMPLATE/asr1/setup.sh egs2/uaspeech_ctrl/asr1 180 | # scp -r egs2/uaspeech/asr1/asr.sh egs2/uaspeech_ctrl/asr1/asr.sh 181 | # scp -r egs2/uaspeech/asr1/run.sh egs2/uaspeech_ctrl/asr1/run.sh 182 | ## modify run.sh 183 | 184 | # 185 | # scp -r /data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_asrdata/F02_aug /data/dean/whl-2022/espnet/egs2/uaspeech_F02_aug/asr1/data 186 | 187 | # cd /data/dean/whl-2022/espnet/egs2/wsj/asr1/ 188 | # utils/fix_data_dir.sh /data/dean/whl-2022/espnet/egs2/uaspeech_F02_aug/asr1/data/train 189 | # utils/spk2utt_to_utt2spk.pl data/train/spk2utt > data/train/utt2spk 190 | 191 | # cd /data/dean/whl-2022/espnet/egs2/uaspeech_F02_aug/asr1 192 | 193 | #./asr.sh --stage 2 --ngpu 1 --train_set train --valid_set test --test_sets "test" --lm_train_text "data/train/text" 194 | 195 | train_dys() 196 | train_dys_aug() 197 | train_ctrl() 198 | train_ctrl_valid() 199 | train_ctrl_test() -------------------------------------------------------------------------------- /demos/emb_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pytsmod as tsm 4 | import tgt 5 | from scipy.stats import mode 6 | import librosa 7 | from librosa.core import load 8 | from librosa.filters import mel as librosa_mel_fn 9 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 10 | import pickle 11 | import numpy as np 12 | import torch 13 | use_gpu = torch.cuda.is_available() 14 | import sys 15 | sys.path.append('speaker_encoder/') 16 | from encoder import inference as spk_encoder 17 | from pathlib import Path 18 | import multiprocessing 19 | import shutil 20 | from tqdm import tqdm 21 | import soundfile as sf 22 | import json 23 | from numpy.linalg import norm 24 | import pandas as pd 25 | 26 | def get_embed(wav_path, spk_encoder, savepath): 27 | if not os.path.exists(savepath): 28 | wav_preprocessed = spk_encoder.preprocess_wav(wav_path) 29 | embed = spk_encoder.embed_utterance(wav_preprocessed) 30 | np.save(savepath, embed) 31 | # print(savepath) 32 | 33 | def generate_emb_GE(source_dys, target_dys): 34 | datapath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/results_allaugdata' 35 | savepath = 
'/data/dean/whl-2022/Speech-Backbones/DiffVC/embs_demo_all/' 36 | # loading speaker encoder 37 | enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path 38 | spk_encoder.load_model(enc_model_fpath, device="cpu") 39 | cmds = [] 40 | for root, dir, files in os.walk(os.path.join(datapath, target_dys)): 41 | for f in files: 42 | if f.endswith('.wav'): 43 | if f.startswith(source_dys): 44 | savename = os.path.join(savepath, source_dys+'to'+target_dys, f).replace('.wav', '.npy') 45 | os.makedirs(os.path.join(savepath, source_dys+'to'+target_dys), exist_ok=True) 46 | cmds.append((os.path.join(root, f), spk_encoder, savename)) 47 | random.shuffle(cmds) 48 | cmds = cmds[:20] 49 | for c in tqdm(cmds): 50 | get_embed(c[0], c[1], c[2]) 51 | 52 | def cal_similarity(source_dys, target_dys): 53 | generate_emb_GE(source_dys, target_dys) 54 | datapath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/avgmel_data/embeds' 55 | embpath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/embs_demo_all/' 56 | source_embs = [] 57 | target_embs = [] 58 | generated_embs = [] 59 | for root, dirs, files in os.walk(os.path.join(datapath, source_dys)): 60 | for f in files: 61 | if f.endswith('.npy'): 62 | source_embs.append(np.load(os.path.join(root, f))) 63 | for root, dirs, files in os.walk(os.path.join(datapath, target_dys)): 64 | for f in files: 65 | if f.endswith('.npy'): 66 | target_embs.append(np.load(os.path.join(root, f))) 67 | for root, dirs, files in os.walk(os.path.join(embpath, source_dys+'to'+target_dys)): 68 | for f in files: 69 | if f.endswith('.npy'): 70 | generated_embs.append(np.load(os.path.join(root, f))) 71 | source_embs = np.array(source_embs) 72 | target_embs = np.array(target_embs) 73 | generated_embs = np.array(generated_embs) 74 | source_embs = np.mean(source_embs, 0) 75 | target_embs = np.mean(target_embs, 0) 76 | generated_embs = np.mean(generated_embs, 0) 77 | cos_sg = np.dot(source_embs, generated_embs) / (norm(source_embs) * norm(generated_embs)) 78 | cos_tg = np.dot(target_embs, generated_embs) / (norm(target_embs) * norm(generated_embs)) 79 | cos_st = np.dot(target_embs, source_embs) / (norm(target_embs) * norm(source_embs)) 80 | print(source_dys, target_dys, cos_st, cos_sg, cos_tg) 81 | return source_dys, target_dys, cos_st, cos_sg, cos_tg 82 | 83 | dysspks = ['F02', 'F03', 'F04', 'F05', 'M01', 'M04', 'M05', 'M07', 'M08', 'M09', 'M10', 'M11', 'M12', 'M14', 'M16'] 84 | ctrlspks = ['CF02', 'CF03', 'CF04', 'CF05', 'CM01', 'CM04', 'CM05', 'CM06', 'CM08', 'CM09', 'CM10', 'CM12', 'CM13'] 85 | 86 | dicts = {'Source':[], 87 | 'Target':[], 88 | 'ST':[], 89 | 'SG':[], 90 | 'TG':[] 91 | } 92 | df = pd.DataFrame(dicts) 93 | allst, allsg,alltg=0.,0.,0. 
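# The loop that follows pairs every control speaker with every dysarthric speaker,
# computes cosine similarities between mean speaker embeddings (ST: source-target,
# SG: source-generated, TG: target-generated), accumulates an overall average across
# all pairs, and writes the rounded results to SpeakerSimilarity.xlsx.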
94 | for c in ctrlspks: 95 | for d in dysspks: 96 | source_dys, target_dys, cos_st, cos_sg, cos_tg = cal_similarity(c, d) 97 | df2 = {'Source': source_dys, 'Target': target_dys, 'ST': cos_st, 'SG': cos_sg, 'TG':cos_tg} 98 | df = df.append(df2, ignore_index=True) 99 | allsg+=cos_sg 100 | allst+=cos_st 101 | alltg+=cos_tg 102 | allsg/=len(dysspks)*len(ctrlspks) 103 | allst/=len(dysspks)*len(ctrlspks) 104 | alltg/=len(dysspks)*len(ctrlspks) 105 | df2 = {'Source': 'All', 'Target': 'All', 'ST': allst, 'SG': allsg, 'TG': alltg} 106 | df = df.append(df2, ignore_index=True) 107 | df = df.round(3) 108 | df.to_excel('SpeakerSimilarity.xlsx') -------------------------------------------------------------------------------- /demos/english.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/demos/english.zip -------------------------------------------------------------------------------- /demos/inference_allaty.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | import pytsmod as tsm 6 | import torchaudio 7 | import torch 8 | use_gpu = torch.cuda.is_available() 9 | import librosa 10 | from librosa.core import load 11 | from librosa.filters import mel as librosa_mel_fn 12 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 13 | 14 | import params 15 | from model import DiffVC 16 | import sys 17 | sys.path.append('hifi-gan/') 18 | from env import AttrDict 19 | from models import Generator as HiFiGAN 20 | import pickle 21 | sys.path.append('speaker_encoder/') 22 | from encoder import inference as spk_encoder 23 | from tqdm import tqdm 24 | 25 | def get_mel(wav_path, ratio=1.0, mode=None): 26 | # mode: tempo, speed or None 27 | wav, _ = load(wav_path, sr=22050) 28 | if mode == 'tempo': 29 | wav = tsm.wsola(wav, ratio) 30 | elif mode == 'speed': 31 | wav = librosa.effects.time_stretch(wav, rate=1./ratio) 32 | wav = wav[:(wav.shape[0] // 256)*256] 33 | wav = np.pad(wav, 384, mode='reflect') 34 | stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) 35 | stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) 36 | mel_spectrogram = np.matmul(mel_basis, stftm) 37 | log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) 38 | return log_mel_spectrogram 39 | 40 | def get_embed(wav_path): 41 | wav_preprocessed = spk_encoder.preprocess_wav(wav_path) 42 | embed = spk_encoder.embed_utterance(wav_preprocessed) 43 | return embed 44 | 45 | def noise_median_smoothing(x, w=5): 46 | y = np.copy(x) 47 | x = np.pad(x, w, "edge") 48 | for i in range(y.shape[0]): 49 | med = np.median(x[i:i+2*w+1]) 50 | y[i] = min(x[i+w+1], med) 51 | return y 52 | 53 | def mel_spectral_subtraction(mel_synth, mel_source, spectral_floor=0.02, silence_window=5, smoothing_window=5): 54 | mel_len = mel_source.shape[-1] 55 | energy_min = 100000.0 56 | i_min = 0 57 | for i in range(mel_len - silence_window): 58 | energy_cur = np.sum(np.exp(2.0 * mel_source[:, i:i+silence_window])) 59 | if energy_cur < energy_min: 60 | i_min = i 61 | energy_min = energy_cur 62 | estimated_noise_energy = np.min(np.exp(2.0 * mel_synth[:, i_min:i_min+silence_window]), axis=-1) 63 | if smoothing_window is not None: 64 | estimated_noise_energy = noise_median_smoothing(estimated_noise_energy, smoothing_window) 65 | mel_denoised = np.copy(mel_synth) 66 | 
for i in range(mel_len): 67 | signal_subtract_noise = np.exp(2.0 * mel_synth[:, i]) - estimated_noise_energy 68 | estimated_signal_energy = np.maximum(signal_subtract_noise, spectral_floor * estimated_noise_energy) 69 | mel_denoised[:, i] = np.log(np.sqrt(estimated_signal_energy)) 70 | return mel_denoised 71 | 72 | 73 | def count_dups(nums): 74 | element = [] 75 | freque = [] 76 | if not nums: 77 | return element 78 | running_count = 1 79 | for i in range(len(nums)-1): 80 | if nums[i] == nums[i+1]: 81 | running_count += 1 82 | else: 83 | freque.append(running_count) 84 | element.append(nums[i]) 85 | running_count = 1 86 | freque.append(running_count) 87 | element.append(nums[i+1]) 88 | return element, freque 89 | 90 | def inference(args, dys, generator, hifigan_universal, src_path, tgt_path, save_path, mean, std, emb): 91 | with open(os.path.join(args.phoneme_uaspeech, dys+'_phonemes.pkl'), 'rb') as f: 92 | ua_dict = pickle.load(f) 93 | phoneme_list = ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 94 | 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 95 | 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 96 | 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 97 | 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 98 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 99 | 'UH2', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', 'sil', 100 | 'sp', 'spn'] 101 | os.makedirs(save_path, exist_ok=True) 102 | 103 | # loading source and reference wavs, calculating mel-spectrograms and speaker embeddings 104 | mel_target = torch.from_numpy(get_mel(tgt_path)).float().unsqueeze(0) 105 | mel_target = mel_target.cuda() 106 | mel_target_lengths = torch.LongTensor([mel_target.shape[-1]]) 107 | mel_target_lengths = mel_target_lengths.cuda() 108 | embed_target = torch.from_numpy(emb).float().unsqueeze(0) 109 | embed_target = embed_target.cuda() 110 | 111 | phoneme_logits = np.load(os.path.join(args.phoneme_dir, src_path.split('/')[-1].split('.')[0]+'_phonemes.npy')) 112 | allp, freque = count_dups(list(phoneme_logits)) 113 | num = 0 114 | duration = 0. 115 | duration_gt = 0. 
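# Duration-aware step: the loop below sums, for every phoneme run except silence ('sil'),
# the source duration in samples (frame count * hop size 256) and the target speaker's
# average duration for that phoneme (seconds * 22050); their ratio, floored at
# args.fast_ratio, becomes the WSOLA time-stretch factor applied to the source wav
# before mel extraction and diffusion-based conversion.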
116 | for i, px in enumerate(allp): 117 | if px != 69: 118 | phoneme = phoneme_list[px] 119 | duration += freque[i]*256 120 | duration_gt += ua_dict[phoneme]['avg_duration']* 22050.0 121 | num += 1 122 | ratio = max(duration_gt / duration, args.fast_ratio) 123 | # print(duration/num, duration_gt/num, ratio) 124 | 125 | mel_source_tempo = get_mel(src_path, ratio, 'tempo') 126 | mel_source_tempo = (mel_source_tempo - mean[:, None]) / std[:, None] 127 | mel_source_tempo = torch.from_numpy(mel_source_tempo).float().unsqueeze(0) 128 | mel_source_tempo = mel_source_tempo.cuda() 129 | mel_source_lengths_tempo = torch.LongTensor([mel_source_tempo.shape[-1]]) 130 | mel_source_lengths_tempo = mel_source_lengths_tempo.cuda() 131 | 132 | _, mel_modified = generator(mel_source_tempo, mel_source_lengths_tempo, mel_target, mel_target_lengths, embed_target, 133 | n_timesteps=100, mode='ml') 134 | 135 | mel_synth_np_modified = mel_modified.cpu().detach().squeeze().numpy() 136 | mel_synth_np_modified = (mel_synth_np_modified * std[:, None]) + mean[:, None] 137 | mel_modified = torch.from_numpy(mel_spectral_subtraction(mel_synth_np_modified, mel_synth_np_modified, smoothing_window=1)).float().unsqueeze( 138 | 0) 139 | mel_modified = mel_modified.cuda() 140 | # converted speech modified 141 | with torch.no_grad(): 142 | audio = hifigan_universal.forward(mel_modified).cpu().squeeze().reshape(1, -1) 143 | torchaudio.save(os.path.join(save_path, src_path.split('/')[-1]), audio, 22050) 144 | 145 | def get_avg_emb(emb_dir): 146 | allembs = [] 147 | for root, dirs, files in os.walk(emb_dir): 148 | for f in files: 149 | if f.endswith('.npy'): 150 | allembs.append(np.load(os.path.join(root, f))) 151 | allembs = np.array(allembs) 152 | allembs = np.mean(allembs, 0) 153 | print(f'Embedding shape: {allembs.shape}') 154 | return allembs 155 | 156 | def main(args, dys): 157 | stats = np.loadtxt(os.path.join(args.mean_std_file_ua, 'global_mean_var.txt')) 158 | mean = stats[0] 159 | std = stats[1] 160 | vc_path = os.path.join(args.model_path_dir, dys, 'vc.pt') 161 | emb_dir = os.path.join(args.emb_dir, dys) 162 | vocoder_path = os.path.join(args.vocoder_dir, dys) 163 | results_dir = os.path.join(args.results_dir, dys) 164 | cmds = [] 165 | target_cmds = [] 166 | for root, dir, files in os.walk(args.gsc_dir): 167 | for f in files: 168 | if f.endswith('.wav'): 169 | cmds.append(os.path.join(root, f)) 170 | print(len(cmds)) 171 | for root, dir, files in os.walk(os.path.join(args.aty_dir, dys)): 172 | for f in files: 173 | if f.endswith('.wav'): 174 | target_cmds.append(os.path.join(root, f)) 175 | print(len(target_cmds)) 176 | if args.debug: 177 | cmds = cmds[:2] 178 | target_cmds = target_cmds[:2] 179 | 180 | allembs = get_avg_emb(emb_dir) 181 | # loading voice conversion model 182 | generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 183 | params.layers, params.kernel, params.dropout, params.window_size, 184 | params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 185 | params.beta_min, params.beta_max) 186 | generator = generator.cuda() 187 | generator.load_state_dict(torch.load(vc_path)) 188 | generator.eval() 189 | # loading HiFi-GAN vocoder 190 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 191 | with open(hfg_path + 'config.json') as f: 192 | h = AttrDict(json.load(f)) 193 | hifigan_universal = HiFiGAN(h).cuda() 194 | hifigan_universal.load_state_dict(torch.load(vocoder_path + '/g')['generator']) 195 | _ = hifigan_universal.eval() 196 | hifigan_universal.remove_weight_norm() 197 
| 198 | for c in tqdm(cmds): 199 | tgt_path = target_cmds[0] 200 | try: 201 | inference(args, dys, generator, hifigan_universal, src_path=c, tgt_path=tgt_path, save_path=results_dir, mean=mean, std=std, emb=allembs) 202 | except: 203 | print(c) 204 | 205 | if __name__ == "__main__": 206 | parser = argparse.ArgumentParser() 207 | parser.add_argument('--results_dir', type=str, 208 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/results_allaty') 209 | parser.add_argument('--model_path_dir', type=str, 210 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/logs_dec_aty') 211 | parser.add_argument('--vocoder_dir', type=str, 212 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/hifi-gan/checkpoints') 213 | parser.add_argument('--aty_dir', type=str, 214 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/atypical_data/wav') 215 | parser.add_argument('--gsc_dir', type=str, 216 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/gsc_data/wav') 217 | parser.add_argument('--phoneme_dir', type=str, 218 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/gsc_data/phonemes') 219 | parser.add_argument('--mean_std_file', type=str, 220 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/gsc_data/') 221 | parser.add_argument('--mean_std_file_ua', type=str, 222 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/atypical_data/') 223 | parser.add_argument('--phoneme_uaspeech', type=str, 224 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/atypical_data/') 225 | parser.add_argument('--emb_dir', type=str, 226 | default='/data/dean/whl-2022/Speech-Backbones/DiffVC/atypical_data/embeds') 227 | parser.add_argument('--slow_ratio', type=float, default=1.2) 228 | parser.add_argument('--fast_ratio', type=float, default=0.8) 229 | parser.add_argument('--dys', type=str, default='0005') 230 | parser.add_argument('--debug', action='store_true') 231 | parser.set_defaults(debug=False) 232 | args = parser.parse_args() 233 | main(args, args.dys) 234 | # alldys = ['0005','0006','0007','0008','0009','0010','0011','0012','0013','0014','0015','0017','0018','0019','0020'] 235 | # for dys in alldys: 236 | # main(args, dys) -------------------------------------------------------------------------------- /demos/listening_test_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import soundfile as sf 4 | import random 5 | import numpy as np 6 | from librosa.core import load 7 | import csv 8 | 9 | source_path = '/data/dean/whl-2022/Speech-Backbones/DiffVC/mfa_data' 10 | fake_path = '/data/dean/whl-2022/Speech-Backbones/DiffVC/results_allaugdata' 11 | save_path = '/data/dean/whl-2022/Speech-Backbones/DiffVC/listening_test' 12 | sample_number = 15 13 | count = 1 14 | total_sample = 3 15 | alldys = ['F02','F03','F04','F05','M01','M04','M05','M07','M08','M09','M10','M11','M12','M14','M16'] 16 | fields1 = ['filename', 'speaker', 'severity', 'type of dysarthric', 'others', 'transcriptions'] 17 | fields2 = ['filename', 'severity', 'type of dysarthric', 'others', 'transcriptions'] 18 | rows1 = [] 19 | rows2 = [] 20 | os.makedirs(save_path, exist_ok=True) 21 | allcmds = [] 22 | for i in range(total_sample): 23 | for dys in alldys: 24 | source_cmds = [] 25 | for root, dirs, files in os.walk(os.path.join(source_path, dys)): 26 | for f in files: 27 | if f.endswith('.wav'): 28 | source_cmds.append(os.path.join(root, f)) 29 | random.shuffle(source_cmds) 30 | 31 | cmds = source_cmds[:sample_number] 32 | text = '' 33 | audio = np.zeros(10) 34 
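        # The loop below stitches the sampled clips into a single listening-test
        # track: each 22.05 kHz clip is appended to `audio`, followed by one
        # second of silence (np.zeros(22050)), and the matching .lab
        # transcription is appended to `text`.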
| for c in cmds: 35 | wav, _ = load(c, sr=22050) 36 | audio = np.append(audio, wav, 0) 37 | audio = np.append(audio, np.zeros(22050), 0) 38 | with open(os.path.join(source_path, dys, c.split('/')[-1].replace('.wav', '.lab'))) as fi: 39 | tt = fi.readline() 40 | text += tt + ' ' 41 | print(audio.shape) 42 | allcmds.append([dys, audio, text]) 43 | 44 | ge_cmds = [] 45 | text = '' 46 | for root, dirs, files in os.walk(os.path.join(fake_path, dys)): 47 | for f in files: 48 | if f.endswith('.wav'): 49 | ge_cmds.append(os.path.join(root, f)) 50 | random.shuffle(ge_cmds) 51 | cmds = ge_cmds[:sample_number-3] 52 | ge_cmds = [] 53 | for root, dirs, files in os.walk(os.path.join(source_path, dys)): 54 | for f in files: 55 | if f.endswith('.wav'): 56 | ge_cmds.append(os.path.join(root, f)) 57 | random.shuffle(ge_cmds) 58 | cmds = cmds + ge_cmds[:3] 59 | random.shuffle(cmds) 60 | audio = np.zeros(10) 61 | for c in cmds: 62 | wav, _ = load(c, sr=22050) 63 | audio = np.append(audio, wav, 0) 64 | audio = np.append(audio, np.zeros(22050), 0) 65 | with open(os.path.join(source_path, c.split('/')[-1].split('_')[0], c.split('/')[-1].replace('.wav', '.lab'))) as fi: 66 | tt = fi.readline() 67 | text += tt + ' ' 68 | print(audio.shape) 69 | allcmds.append([dys+'_syn', audio, text]) 70 | 71 | random.shuffle(allcmds) 72 | os.makedirs(os.path.join(save_path, 'audios'), exist_ok=True) 73 | os.makedirs(os.path.join(save_path, 'transcriptions'), exist_ok=True) 74 | for c in allcmds: 75 | savename = '{:03}'.format(count) + '.wav' 76 | rows1.append([savename, c[0], '', '', '', c[2]]) 77 | rows2.append([savename, '', '', '', c[2]]) 78 | sf.write(os.path.join(save_path, 'audios', savename), c[1], 22050, 'PCM_24') 79 | with open(os.path.join(save_path, 'transcriptions', savename.replace('.wav', '.txt')), 'w') as fi: 80 | fi.write(c[2]) 81 | count += 1 82 | 83 | filename1 = os.path.join(save_path, "listening_test_spk.csv") 84 | filename2 = os.path.join(save_path, "listening_test.csv") 85 | 86 | with open(filename1, 'w') as csvfile: 87 | csvwriter = csv.writer(csvfile) 88 | csvwriter.writerow(fields1) 89 | csvwriter.writerows(rows1) 90 | 91 | with open(filename2, 'w') as csvfile: 92 | csvwriter = csv.writer(csvfile) 93 | csvwriter.writerow(fields2) 94 | csvwriter.writerows(rows2) -------------------------------------------------------------------------------- /demos/modify_lexicon.py: -------------------------------------------------------------------------------- 1 | # see: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/480 2 | import re 3 | # lexicon = open("/data/lmorove1/hwang258/Speech-Backbones/DiffVC/demos/librispeech-lexicon.txt").readlines() 4 | # sp = re.compile("\s+") 5 | # with open("modified_librispeech-lexicon.txt", "w") as f: 6 | # for line in lexicon: 7 | # word, *phonemes = sp.split(line.strip()) 8 | # phonemes = " ".join(phonemes) 9 | # f.write(f"{word}\t{phonemes}\n") 10 | 11 | lexicon = open("./cmu_dictionary.txt", encoding = "ISO-8859-1").readlines() 12 | sp = re.compile("\s+") 13 | with open("./modified_cmu_dictionary.txt", "w") as f: 14 | for line in lexicon: 15 | word, *phonemes = sp.split(line.strip()) 16 | phonemes = " ".join(phonemes) 17 | f.write(f"{word}\t{phonemes}\n") -------------------------------------------------------------------------------- /demos/prepare_phonemetime.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tgt 3 | import numpy as np 4 | from tqdm import tqdm 5 | import multiprocessing 6 | 7 | 
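# The 72-entry inventory below is the ARPAbet phone set with lexical-stress
# digits plus the alignment symbols 'sil' (index 69), 'sp' (70) and 'spn' (71);
# other scripts in this repo (e.g. inference_allaty.py) rely on index 69
# marking silence.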
phoneme_list = ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 8 | 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 9 | 'D', 'DH', 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 10 | 'EY2', 'F', 'G', 'HH', 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 11 | 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1', 'OW2', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 13 | 'UH2', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', 'sil', 14 | 'sp', 'spn'] 15 | 16 | 17 | def process_one(textgrid, savepath): 18 | try: 19 | t = tgt.io.read_textgrid(textgrid) 20 | tp = t.get_tier_by_name('phones') 21 | tw = t.get_tier_by_name('words') 22 | allphones = [] 23 | alldurations = [] 24 | 25 | words = [] 26 | for i in range(len(tw)): 27 | if (i == 0 or i == len(tw) - 1) and ( 28 | tp[i].text == '' or tp[i].text == 'sil' or tp[i].text != 'sp' or tp[i].text == 'spn'): 29 | continue 30 | words.append([tw[i].start_time, tw[i].end_time]) 31 | 32 | j = 0 33 | for i in range(len(tp)): 34 | phoneme = tp[i].text 35 | start = tp[i].start_time 36 | end = tp[i].end_time 37 | if words[j][0] <= start and words[j][1] >= end: 38 | frame_num = int(np.ceil((end - start) * 22050.0 // 256)) 39 | if phoneme == '' or phoneme == 'sil' or phoneme == 'sp' or phoneme == 'spn': 40 | allphones.append(phoneme_list.index('sil')) 41 | else: 42 | allphones.append(phoneme_list.index(phoneme)) 43 | alldurations.append(frame_num) 44 | if words[j][1] == end: 45 | j += 1 46 | if j == len(words): 47 | break 48 | allphones = np.array(allphones) 49 | alldurations = np.array(alldurations) 50 | # print(allphones) 51 | # print(alldurations) 52 | print(textgrid) 53 | np.save(os.path.join(savepath, 'ttsphonemes', textgrid.split('/')[-1].replace('.TextGrid', '.npy')), allphones) 54 | np.save(os.path.join(savepath, 'ttsdurations', textgrid.split('/')[-1].replace('.TextGrid', '.npy')), alldurations) 55 | except: 56 | print(f'error:{textgrid}') 57 | 58 | def process_files(textgrids): 59 | for textgrid in tqdm(textgrids): 60 | process_one(textgrid) 61 | 62 | # textgrid = '/data/dean/whl-2022/LibriMix/data/librispeech/text/dev-clean/84/121550/84-121550-0000.TextGrid' 63 | # textgrids = '/data/dean/whl-2022/Speech-Backbones/DiffVC/librispeechData/textgrids' 64 | # savepath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/librispeechData' 65 | textgrids = '/data/dean/whl-2022/Speech-Backbones/TextGrid/LJSpeech' 66 | savepath = '/data/dean/whl-2022/Speech-Backbones/DiffVC/LJSpeechData' 67 | os.makedirs(os.path.join(savepath, 'ttsphonemes'), exist_ok=True) 68 | os.makedirs(os.path.join(savepath, 'ttsdurations'), exist_ok=True) 69 | cmds = [] 70 | for root, dirs, files in os.walk(textgrids): 71 | for f in files: 72 | if f.endswith('.TextGrid'): 73 | cmds.append((os.path.join(root, f), savepath)) 74 | print(len(cmds)) 75 | with multiprocessing.Pool(processes=50) as pool: 76 | pool.starmap(process_one, cmds) -------------------------------------------------------------------------------- /hifi-gan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons 
to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /hifi-gan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | 25 | ## Pre-requisites 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | 33 | ## Training 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
38 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
39 | You can change the path by adding the `--checkpoint_path` option. 40 | 41 | Validation loss during training with V1 generator.
42 | ![validation loss](./validation_loss.png) 43 | 44 | ## Pretrained Model 45 | You can also use the pretrained models we provide.
46 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
47 | Details of each folder are as follows: 48 | 49 | |Folder Name|Generator|Dataset|Fine-Tuned| 50 | |------|---|---|---| 51 | |LJ_V1|V1|LJSpeech|No| 52 | |LJ_V2|V2|LJSpeech|No| 53 | |LJ_V3|V3|LJSpeech|No| 54 | |LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 55 | |LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 56 | |LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 57 | |VCTK_V1|V1|VCTK|No| 58 | |VCTK_V2|V2|VCTK|No| 59 | |VCTK_V3|V3|VCTK|No| 60 | |UNIVERSAL_V1|V1|Universal|No| 61 | 62 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 63 | 64 | ## Fine-Tuning 65 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
66 | The file name of the generated mel-spectrogram should match that of the audio file, and the extension should be `.npy`.
67 | Example: 68 | ``` 69 | Audio File : LJ001-0001.wav 70 | Mel-Spectrogram File : LJ001-0001.npy 71 | ``` 72 | 2. Create a `ft_dataset` folder and copy the generated mel-spectrogram files into it.
73 | 3. Run the following command. 74 | ``` 75 | python train.py --fine_tuning True --config config_v1.json 76 | ``` 77 | For other command line options, please refer to the training section. 78 | 79 | 80 | ## Inference from wav file 81 | 1. Make a `test_files` directory and copy wav files into the directory. 82 | 2. Run the following command. 83 | ``` 84 | python inference.py --checkpoint_file [generator checkpoint file path] 85 | ``` 86 | Generated wav files are saved in `generated_files` by default.
87 | You can change the path by adding the `--output_dir` option. 88 | 89 | 90 | ## Inference for end-to-end speech synthesis 91 | 1. Make a `test_mel_files` directory and copy the generated mel-spectrogram files into the directory.
92 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 93 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 94 | 2. Run the following command. 95 | ``` 96 | python inference_e2e.py --checkpoint_file [generator checkpoint file path] 97 | ``` 98 | Generated wav files are saved in `generated_files_from_mel` by default.
99 | You can change the path by adding `--output_dir` option. 100 | 101 | 102 | ## Acknowledgements 103 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 104 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 105 | 106 | -------------------------------------------------------------------------------- /hifi-gan/__pycache__/env.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/hifi-gan/__pycache__/env.cpython-36.pyc -------------------------------------------------------------------------------- /hifi-gan/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/hifi-gan/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /hifi-gan/__pycache__/xutils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/hifi-gan/__pycache__/xutils.cpython-36.pyc -------------------------------------------------------------------------------- /hifi-gan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super(AttrDict, self).__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /hifi-gan/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | import torch 7 | import torch.utils.data 8 | import numpy as np 9 | from librosa.util import normalize 10 | from scipy.io.wavfile import read 11 | from librosa.filters import mel as librosa_mel_fn 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def load_wav(full_path): 17 | sampling_rate, data = read(full_path) 18 | return data, sampling_rate 19 | 20 | 21 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 22 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 23 | 24 | 25 | def dynamic_range_decompression(x, C=1): 26 | return np.exp(x) / C 27 | 28 | 29 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 30 | return torch.log(torch.clamp(x, min=clip_val) * C) 31 | 32 | 33 | def dynamic_range_decompression_torch(x, C=1): 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 52 | if torch.min(y) < -1.: 53 
| print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global mel_basis, hann_window 58 | if fmax not in mel_basis: 59 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 60 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 61 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 140 | 141 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 142 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, 
self.fmax, 143 | center=False) 144 | else: 145 | mel = np.load( 146 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 147 | mel = torch.from_numpy(mel) 148 | 149 | if len(mel.shape) < 3: 150 | mel = mel.unsqueeze(0) 151 | 152 | if self.split: 153 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 154 | 155 | if audio.size(1) >= self.segment_size: 156 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 157 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 158 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 159 | else: 160 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 161 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 162 | 163 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 164 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 165 | center=False) 166 | 167 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 168 | 169 | def __len__(self): 170 | return len(self.audio_files) 171 | -------------------------------------------------------------------------------- /hifi-gan/models.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.nn as nn 6 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 7 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 8 | from xutils import init_weights, get_padding 9 | 10 | LRELU_SLOPE = 0.1 11 | 12 | 13 | class ResBlock1(torch.nn.Module): 14 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 15 | super(ResBlock1, self).__init__() 16 | self.h = h 17 | self.convs1 = nn.ModuleList([ 18 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 19 | padding=get_padding(kernel_size, dilation[0]))), 20 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 21 | padding=get_padding(kernel_size, dilation[1]))), 22 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 23 | padding=get_padding(kernel_size, dilation[2]))) 24 | ]) 25 | self.convs1.apply(init_weights) 26 | 27 | self.convs2 = nn.ModuleList([ 28 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 29 | padding=get_padding(kernel_size, 1))), 30 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 31 | padding=get_padding(kernel_size, 1))), 32 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 33 | padding=get_padding(kernel_size, 1))) 34 | ]) 35 | self.convs2.apply(init_weights) 36 | 37 | def forward(self, x): 38 | for c1, c2 in zip(self.convs1, self.convs2): 39 | xt = F.leaky_relu(x, LRELU_SLOPE) 40 | xt = c1(xt) 41 | xt = F.leaky_relu(xt, LRELU_SLOPE) 42 | xt = c2(xt) 43 | x = xt + x 44 | return x 45 | 46 | def remove_weight_norm(self): 47 | for l in self.convs1: 48 | remove_weight_norm(l) 49 | for l in self.convs2: 50 | remove_weight_norm(l) 51 | 52 | 53 | class ResBlock2(torch.nn.Module): 54 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 55 | super(ResBlock2, self).__init__() 56 | self.h = h 57 | self.convs = nn.ModuleList([ 58 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 59 | padding=get_padding(kernel_size, dilation[0]))), 60 | weight_norm(Conv1d(channels, 
channels, kernel_size, 1, dilation=dilation[1], 61 | padding=get_padding(kernel_size, dilation[1]))) 62 | ]) 63 | self.convs.apply(init_weights) 64 | 65 | def forward(self, x): 66 | for c in self.convs: 67 | xt = F.leaky_relu(x, LRELU_SLOPE) 68 | xt = c(xt) 69 | x = xt + x 70 | return x 71 | 72 | def remove_weight_norm(self): 73 | for l in self.convs: 74 | remove_weight_norm(l) 75 | 76 | 77 | class Generator(torch.nn.Module): 78 | def __init__(self, h): 79 | super(Generator, self).__init__() 80 | self.h = h 81 | self.num_kernels = len(h.resblock_kernel_sizes) 82 | self.num_upsamples = len(h.upsample_rates) 83 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) 84 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2 85 | 86 | self.ups = nn.ModuleList() 87 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 88 | self.ups.append(weight_norm( 89 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), 90 | k, u, padding=(k-u)//2))) 91 | 92 | self.resblocks = nn.ModuleList() 93 | for i in range(len(self.ups)): 94 | ch = h.upsample_initial_channel//(2**(i+1)) 95 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): 96 | self.resblocks.append(resblock(h, ch, k, d)) 97 | 98 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 99 | self.ups.apply(init_weights) 100 | self.conv_post.apply(init_weights) 101 | 102 | def forward(self, x): 103 | x = self.conv_pre(x) 104 | for i in range(self.num_upsamples): 105 | x = F.leaky_relu(x, LRELU_SLOPE) 106 | x = self.ups[i](x) 107 | xs = None 108 | for j in range(self.num_kernels): 109 | if xs is None: 110 | xs = self.resblocks[i*self.num_kernels+j](x) 111 | else: 112 | xs += self.resblocks[i*self.num_kernels+j](x) 113 | x = xs / self.num_kernels 114 | x = F.leaky_relu(x) 115 | x = self.conv_post(x) 116 | x = torch.tanh(x) 117 | 118 | return x 119 | 120 | def remove_weight_norm(self): 121 | print('Removing weight norm...') 122 | for l in self.ups: 123 | remove_weight_norm(l) 124 | for l in self.resblocks: 125 | l.remove_weight_norm() 126 | remove_weight_norm(self.conv_pre) 127 | remove_weight_norm(self.conv_post) 128 | 129 | 130 | class DiscriminatorP(torch.nn.Module): 131 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 132 | super(DiscriminatorP, self).__init__() 133 | self.period = period 134 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 135 | self.convs = nn.ModuleList([ 136 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 137 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 138 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 139 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 140 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 141 | ]) 142 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 143 | 144 | def forward(self, x): 145 | fmap = [] 146 | 147 | # 1d to 2d 148 | b, c, t = x.shape 149 | if t % self.period != 0: # pad first 150 | n_pad = self.period - (t % self.period) 151 | x = F.pad(x, (0, n_pad), "reflect") 152 | t = t + n_pad 153 | x = x.view(b, c, t // self.period, self.period) 154 | 155 | for l in self.convs: 156 | x = l(x) 157 | x = F.leaky_relu(x, LRELU_SLOPE) 158 | fmap.append(x) 159 | x = self.conv_post(x) 160 | fmap.append(x) 161 | x = 
torch.flatten(x, 1, -1) 162 | 163 | return x, fmap 164 | 165 | 166 | class MultiPeriodDiscriminator(torch.nn.Module): 167 | def __init__(self): 168 | super(MultiPeriodDiscriminator, self).__init__() 169 | self.discriminators = nn.ModuleList([ 170 | DiscriminatorP(2), 171 | DiscriminatorP(3), 172 | DiscriminatorP(5), 173 | DiscriminatorP(7), 174 | DiscriminatorP(11), 175 | ]) 176 | 177 | def forward(self, y, y_hat): 178 | y_d_rs = [] 179 | y_d_gs = [] 180 | fmap_rs = [] 181 | fmap_gs = [] 182 | for i, d in enumerate(self.discriminators): 183 | y_d_r, fmap_r = d(y) 184 | y_d_g, fmap_g = d(y_hat) 185 | y_d_rs.append(y_d_r) 186 | fmap_rs.append(fmap_r) 187 | y_d_gs.append(y_d_g) 188 | fmap_gs.append(fmap_g) 189 | 190 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 191 | 192 | 193 | class DiscriminatorS(torch.nn.Module): 194 | def __init__(self, use_spectral_norm=False): 195 | super(DiscriminatorS, self).__init__() 196 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 197 | self.convs = nn.ModuleList([ 198 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 199 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 200 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 201 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 202 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 203 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 204 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 205 | ]) 206 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 207 | 208 | def forward(self, x): 209 | fmap = [] 210 | for l in self.convs: 211 | x = l(x) 212 | x = F.leaky_relu(x, LRELU_SLOPE) 213 | fmap.append(x) 214 | x = self.conv_post(x) 215 | fmap.append(x) 216 | x = torch.flatten(x, 1, -1) 217 | 218 | return x, fmap 219 | 220 | 221 | class MultiScaleDiscriminator(torch.nn.Module): 222 | def __init__(self): 223 | super(MultiScaleDiscriminator, self).__init__() 224 | self.discriminators = nn.ModuleList([ 225 | DiscriminatorS(use_spectral_norm=True), 226 | DiscriminatorS(), 227 | DiscriminatorS(), 228 | ]) 229 | self.meanpools = nn.ModuleList([ 230 | AvgPool1d(4, 2, padding=2), 231 | AvgPool1d(4, 2, padding=2) 232 | ]) 233 | 234 | def forward(self, y, y_hat): 235 | y_d_rs = [] 236 | y_d_gs = [] 237 | fmap_rs = [] 238 | fmap_gs = [] 239 | for i, d in enumerate(self.discriminators): 240 | if i != 0: 241 | y = self.meanpools[i-1](y) 242 | y_hat = self.meanpools[i-1](y_hat) 243 | y_d_r, fmap_r = d(y) 244 | y_d_g, fmap_g = d(y_hat) 245 | y_d_rs.append(y_d_r) 246 | fmap_rs.append(fmap_r) 247 | y_d_gs.append(y_d_g) 248 | fmap_gs.append(fmap_g) 249 | 250 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 251 | 252 | 253 | def feature_loss(fmap_r, fmap_g): 254 | loss = 0 255 | for dr, dg in zip(fmap_r, fmap_g): 256 | for rl, gl in zip(dr, dg): 257 | loss += torch.mean(torch.abs(rl - gl)) 258 | 259 | return loss*2 260 | 261 | 262 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 263 | loss = 0 264 | r_losses = [] 265 | g_losses = [] 266 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 267 | r_loss = torch.mean((1-dr)**2) 268 | g_loss = torch.mean(dg**2) 269 | loss += (r_loss + g_loss) 270 | r_losses.append(r_loss.item()) 271 | g_losses.append(g_loss.item()) 272 | 273 | return loss, r_losses, g_losses 274 | 275 | 276 | def generator_loss(disc_outputs): 277 | loss = 0 278 | gen_losses = [] 279 | for dg in disc_outputs: 280 | l = torch.mean((1-dg)**2) 281 | gen_losses.append(l) 282 | loss += l 283 | 284 | return loss, gen_losses 285 | 286 | 
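# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the upstream HiFi-GAN code): how the
# discriminators and loss helpers defined above are typically combined for one
# adversarial step, with random tensors standing in for real and generated
# waveforms. HiFi-GAN training additionally adds a mel-spectrogram
# reconstruction loss on top of these terms.
if __name__ == '__main__':
    y = torch.randn(2, 1, 8192)        # reference waveforms, shape (B, 1, T)
    y_hat = torch.randn(2, 1, 8192)    # generator outputs of the same shape
    mpd = MultiPeriodDiscriminator()
    msd = MultiScaleDiscriminator()

    # Discriminator side: generated audio is detached from the generator graph.
    for disc in (mpd, msd):
        y_d_rs, y_d_gs, _, _ = disc(y, y_hat.detach())
        d_loss, _, _ = discriminator_loss(y_d_rs, y_d_gs)

    # Generator side: adversarial loss plus feature matching on the
    # discriminator activations.
    _, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
    g_loss, _ = generator_loss(y_d_gs)
    fm_loss = feature_loss(fmap_rs, fmap_gs)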
-------------------------------------------------------------------------------- /hifi-gan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | import matplotlib 6 | import torch 7 | from torch.nn.utils import weight_norm 8 | matplotlib.use("Agg") 9 | import matplotlib.pylab as plt 10 | 11 | 12 | def plot_spectrogram(spectrogram): 13 | fig, ax = plt.subplots(figsize=(10, 2)) 14 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 15 | interpolation='none') 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def get_padding(kernel_size, dilation=1): 37 | return int((kernel_size*dilation - dilation)/2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + '????????') 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] 60 | 61 | -------------------------------------------------------------------------------- /libritts_data/global_mean_var.txt: -------------------------------------------------------------------------------- 1 | -4.711277961730957031e+00 -4.554810047149658203e+00 -4.247941493988037109e+00 -3.973285913467407227e+00 -3.864546060562133789e+00 -3.895205736160278320e+00 -4.058072566986083984e+00 -4.236612319946289062e+00 -4.285600662231445312e+00 -4.288820266723632812e+00 -4.219392776489257812e+00 -4.347094535827636719e+00 -4.411542415618896484e+00 -4.491096019744873047e+00 -4.559501647949218750e+00 -4.769878387451171875e+00 -4.903300285339355469e+00 -4.998268604278564453e+00 -5.120747566223144531e+00 -5.239724159240722656e+00 -5.314760684967041016e+00 -5.313437938690185547e+00 -5.437182903289794922e+00 -5.493862152099609375e+00 -5.546514511108398438e+00 -5.534139633178710938e+00 -5.645017147064208984e+00 -5.662632942199707031e+00 -5.688502788543701172e+00 -5.714745998382568359e+00 -5.752959251403808594e+00 -5.783332824707031250e+00 -5.783766269683837891e+00 -5.752015590667724609e+00 -5.719962596893310547e+00 -5.660527229309082031e+00 -5.748497009277343750e+00 -5.719967365264892578e+00 -5.705787181854248047e+00 -5.726572990417480469e+00 -5.738905906677246094e+00 -5.762210845947265625e+00 -5.810612201690673828e+00 -5.839531421661376953e+00 -5.876708984375000000e+00 -5.915248394012451172e+00 -5.950809478759765625e+00 -5.964234828948974609e+00 -5.977433204650878906e+00 -5.966416835784912109e+00 -5.993427753448486328e+00 -5.997142314910888672e+00 -6.039862155914306641e+00 -6.077965259552001953e+00 -6.100508213043212891e+00 -6.137642383575439453e+00 -6.190366268157958984e+00 -6.248041152954101562e+00 -6.302127361297607422e+00 -6.355373859405517578e+00 -6.411968231201171875e+00 
-6.452272891998291016e+00 -6.509038448333740234e+00 -6.600063323974609375e+00 -6.718620777130126953e+00 -6.828404426574707031e+00 -6.928081035614013672e+00 -7.030965328216552734e+00 -7.127524852752685547e+00 -7.220531940460205078e+00 -7.291595935821533203e+00 -7.363456249237060547e+00 -7.424736976623535156e+00 -7.462667942047119141e+00 -7.499318599700927734e+00 -7.554832458496093750e+00 -7.617550849914550781e+00 -7.831916332244873047e+00 -8.495668411254882812e+00 -9.581151008605957031e+00 2 | 1.806026961573231171e+00 1.829964422828602499e+00 2.079274397335943103e+00 2.224284283658068517e+00 2.330778794957382161e+00 2.346406584190550593e+00 2.291178057413357561e+00 2.248080852438071542e+00 2.269868309538877416e+00 2.309865203968303415e+00 2.327623276605532254e+00 2.318144416362247551e+00 2.298856797527346174e+00 2.272057701279480035e+00 2.248071508876109625e+00 2.221812090428491704e+00 2.190683659224375024e+00 2.161632477004224118e+00 2.135066138696764870e+00 2.105877374913217359e+00 2.081034013321177767e+00 2.059485395915956030e+00 2.032511986409669014e+00 2.010391886973289743e+00 2.002211725383170826e+00 1.990431573632058138e+00 1.974618541832638741e+00 1.976688162228032342e+00 1.974514896511844420e+00 1.969818211995926882e+00 1.966250457035454868e+00 1.961667688323555980e+00 1.956306095341123097e+00 1.931143493219268592e+00 1.814319267509993949e+00 1.884574907361773688e+00 1.956895463079087127e+00 1.955901196106792694e+00 1.953307458891687665e+00 1.949594439635556675e+00 1.944320473291063722e+00 1.937408359736509711e+00 1.928664042093138908e+00 1.919592705224304474e+00 1.906776270829876996e+00 1.910166789461186898e+00 1.910597115514641064e+00 1.916642804740659844e+00 1.928730385996895613e+00 1.944056617477656124e+00 1.950095813557436220e+00 1.948252261740533120e+00 1.932999773200206217e+00 1.932356545854892138e+00 1.927388576496080530e+00 1.926037127567277629e+00 1.923769781037170779e+00 1.915874294031020719e+00 1.912745883091910182e+00 1.912050388688832703e+00 1.912944952287476408e+00 1.911259160303136806e+00 1.894563779004795778e+00 1.872803694305986477e+00 1.903277554156513007e+00 1.899463088181439163e+00 1.896223458269415296e+00 1.889534690684917972e+00 1.883335887989682877e+00 1.885361082589714243e+00 1.879416353992391597e+00 1.875132668304287487e+00 1.865431913086422755e+00 1.858704498786842407e+00 1.853345224726899554e+00 1.845997833540454325e+00 1.828212656540552539e+00 1.833446537772392304e+00 1.808671236640067681e+00 1.578110571386494021e+00 3 | -------------------------------------------------------------------------------- /libritts_data/mels_mode.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/libritts_data/mels_mode.pkl -------------------------------------------------------------------------------- /libritts_data/phonemes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/libritts_data/phonemes.json -------------------------------------------------------------------------------- /libritts_data/phonemes.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/libritts_data/phonemes.pkl -------------------------------------------------------------------------------- /model/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | from .vc import DiffVC -------------------------------------------------------------------------------- /model/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/base.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/base.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/diffusion.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/diffusion.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/encoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/encoder.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/postnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/postnet.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/vc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/model/__pycache__/vc.cpython-36.pyc -------------------------------------------------------------------------------- /model/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 
2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | class BaseModule(torch.nn.Module): 14 | def __init__(self): 15 | super(BaseModule, self).__init__() 16 | 17 | @property 18 | def nparams(self): 19 | num_params = 0 20 | for name, param in self.named_parameters(): 21 | if param.requires_grad: 22 | num_params += np.prod(param.detach().cpu().numpy().shape) 23 | return num_params 24 | 25 | 26 | def relocate_input(self, x: list): 27 | device = next(self.parameters()).device 28 | for i in range(len(x)): 29 | if isinstance(x[i], torch.Tensor) and x[i].device != device: 30 | x[i] = x[i].to(device) 31 | return x 32 | -------------------------------------------------------------------------------- /model/diffusion.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import math 10 | import torch 11 | 12 | from model.base import BaseModule 13 | from model.modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock 14 | from model.modules import LinearAttention, Residual, SinusoidalPosEmb, RefBlock 15 | 16 | 17 | class GradLogPEstimator(BaseModule): 18 | def __init__(self, dim_base, dim_cond, use_ref_t, dim_mults=(1, 2, 4)): 19 | super(GradLogPEstimator, self).__init__() 20 | self.use_ref_t = use_ref_t 21 | dims = [2 + dim_cond, *map(lambda m: dim_base * m, dim_mults)] 22 | in_out = list(zip(dims[:-1], dims[1:])) 23 | 24 | self.time_pos_emb = SinusoidalPosEmb(dim_base) 25 | self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4), 26 | Mish(), torch.nn.Linear(dim_base * 4, dim_base)) 27 | 28 | cond_total = dim_base + 256 29 | if use_ref_t: 30 | self.ref_block = RefBlock(out_dim=dim_cond, time_emb_dim=dim_base) 31 | cond_total += dim_cond 32 | self.cond_block = torch.nn.Sequential(torch.nn.Linear(cond_total, 4 * dim_cond), 33 | Mish(), torch.nn.Linear(4 * dim_cond, dim_cond)) 34 | 35 | self.downs = torch.nn.ModuleList([]) 36 | self.ups = torch.nn.ModuleList([]) 37 | num_resolutions = len(in_out) 38 | 39 | for ind, (dim_in, dim_out) in enumerate(in_out): 40 | is_last = ind >= (num_resolutions - 1) 41 | self.downs.append(torch.nn.ModuleList([ 42 | ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base), 43 | ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base), 44 | Residual(Rezero(LinearAttention(dim_out))), 45 | Downsample(dim_out) if not is_last else torch.nn.Identity()])) 46 | 47 | mid_dim = dims[-1] 48 | self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 49 | self.mid_attn = Residual(Rezero(LinearAttention(mid_dim))) 50 | self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 51 | 52 | for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): 53 | self.ups.append(torch.nn.ModuleList([ 54 | 
ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base), 55 | ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base), 56 | Residual(Rezero(LinearAttention(dim_in))), 57 | Upsample(dim_in)])) 58 | self.final_block = Block(dim_base, dim_base) 59 | self.final_conv = torch.nn.Conv2d(dim_base, 1, 1) 60 | 61 | def forward(self, x, x_mask, mean, ref, ref_mask, c, t): 62 | condition = self.time_pos_emb(t) 63 | t = self.mlp(condition) 64 | 65 | x = torch.stack([mean, x], 1) 66 | x_mask = x_mask.unsqueeze(1) 67 | ref_mask = ref_mask.unsqueeze(1) 68 | 69 | if self.use_ref_t: 70 | condition = torch.cat([condition, self.ref_block(ref, ref_mask, t)], 1) 71 | condition = torch.cat([condition, c], 1) 72 | 73 | condition = self.cond_block(condition).unsqueeze(-1).unsqueeze(-1) 74 | condition = torch.cat(x.shape[2]*[condition], 2) 75 | condition = torch.cat(x.shape[3]*[condition], 3) 76 | x = torch.cat([x, condition], 1) 77 | 78 | hiddens = [] 79 | masks = [x_mask] 80 | for resnet1, resnet2, attn, downsample in self.downs: 81 | mask_down = masks[-1] 82 | x = resnet1(x, mask_down, t) 83 | x = resnet2(x, mask_down, t) 84 | x = attn(x) 85 | hiddens.append(x) 86 | x = downsample(x * mask_down) 87 | masks.append(mask_down[:, :, :, ::2]) 88 | 89 | masks = masks[:-1] 90 | mask_mid = masks[-1] 91 | x = self.mid_block1(x, mask_mid, t) 92 | x = self.mid_attn(x) 93 | x = self.mid_block2(x, mask_mid, t) 94 | 95 | for resnet1, resnet2, attn, upsample in self.ups: 96 | mask_up = masks.pop() 97 | x = torch.cat((x, hiddens.pop()), dim=1) 98 | x = resnet1(x, mask_up, t) 99 | x = resnet2(x, mask_up, t) 100 | x = attn(x) 101 | x = upsample(x * mask_up) 102 | 103 | x = self.final_block(x, x_mask) 104 | output = self.final_conv(x * x_mask) 105 | 106 | return (output * x_mask).squeeze(1) 107 | 108 | 109 | class Diffusion(BaseModule): 110 | def __init__(self, n_feats, dim_unet, dim_spk, use_ref_t, beta_min, beta_max): 111 | super(Diffusion, self).__init__() 112 | self.estimator = GradLogPEstimator(dim_unet, dim_spk, use_ref_t) 113 | self.n_feats = n_feats 114 | self.dim_unet = dim_unet 115 | self.dim_spk = dim_spk 116 | self.use_ref_t = use_ref_t 117 | self.beta_min = beta_min 118 | self.beta_max = beta_max 119 | 120 | def get_beta(self, t): 121 | beta = self.beta_min + (self.beta_max - self.beta_min) * t 122 | return beta 123 | 124 | def get_gamma(self, s, t, p=1.0, use_torch=False): 125 | beta_integral = self.beta_min + 0.5*(self.beta_max - self.beta_min)*(t + s) 126 | beta_integral *= (t - s) 127 | if use_torch: 128 | gamma = torch.exp(-0.5*p*beta_integral).unsqueeze(-1).unsqueeze(-1) 129 | else: 130 | gamma = math.exp(-0.5*p*beta_integral) 131 | return gamma 132 | 133 | def get_mu(self, s, t): 134 | a = self.get_gamma(s, t) 135 | b = 1.0 - self.get_gamma(0, s, p=2.0) 136 | c = 1.0 - self.get_gamma(0, t, p=2.0) 137 | return a * b / c 138 | 139 | def get_nu(self, s, t): 140 | a = self.get_gamma(0, s) 141 | b = 1.0 - self.get_gamma(s, t, p=2.0) 142 | c = 1.0 - self.get_gamma(0, t, p=2.0) 143 | return a * b / c 144 | 145 | def get_sigma(self, s, t): 146 | a = 1.0 - self.get_gamma(0, s, p=2.0) 147 | b = 1.0 - self.get_gamma(s, t, p=2.0) 148 | c = 1.0 - self.get_gamma(0, t, p=2.0) 149 | return math.sqrt(a * b / c) 150 | 151 | def compute_diffused_mean(self, x0, mask, mean, t, use_torch=False): 152 | x0_weight = self.get_gamma(0, t, use_torch=use_torch) 153 | mean_weight = 1.0 - x0_weight 154 | xt_mean = x0 * x0_weight + mean * mean_weight 155 | return xt_mean * mask 156 | 157 | def forward_diffusion(self, x0, mask, mean, t): 158 | 
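        # Sketch of the math implemented below: with
        #   gamma(s, t) = exp(-0.5 * \int_s^t beta(u) du)   (see get_gamma)
        # the forward process has the Gaussian marginal
        #   x_t | x_0 ~ N( gamma(0,t) * x0 + (1 - gamma(0,t)) * mean,
        #                  (1 - gamma(0,t)^2) * I ),
        # so a sample is xt_mean + sqrt(variance) * z with z ~ N(0, I),
        # which is what the lines below compute (masked to the valid frames).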
xt_mean = self.compute_diffused_mean(x0, mask, mean, t, use_torch=True) 159 | variance = 1.0 - self.get_gamma(0, t, p=2.0, use_torch=True) 160 | z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device, requires_grad=False) 161 | xt = xt_mean + z * torch.sqrt(variance) 162 | return xt * mask, z * mask 163 | 164 | @torch.no_grad() 165 | def reverse_diffusion(self, z, mask, mean, ref, ref_mask, mean_ref, c, 166 | n_timesteps, mode): 167 | h = 1.0 / n_timesteps 168 | xt = z * mask 169 | for i in range(n_timesteps): 170 | t = 1.0 - i*h 171 | time = t * torch.ones(z.shape[0], dtype=z.dtype, device=z.device) 172 | beta_t = self.get_beta(t) 173 | xt_ref = [self.compute_diffused_mean(ref, ref_mask, mean_ref, t)] 174 | # for j in range(15): 175 | # xt_ref += [self.compute_diffused_mean(ref, ref_mask, mean_ref, (j+0.5)/15.0)] 176 | xt_ref = torch.stack(xt_ref, 1) 177 | if mode == 'pf': 178 | dxt = 0.5 * (mean - xt - self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time)) * (beta_t * h) 179 | else: 180 | if mode == 'ml': 181 | kappa = self.get_gamma(0, t - h) * (1.0 - self.get_gamma(t - h, t, p=2.0)) 182 | kappa /= (self.get_gamma(0, t) * beta_t * h) 183 | kappa -= 1.0 184 | omega = self.get_nu(t - h, t) / self.get_gamma(0, t) 185 | omega += self.get_mu(t - h, t) 186 | omega -= (0.5 * beta_t * h + 1.0) 187 | sigma = self.get_sigma(t - h, t) 188 | else: 189 | kappa = 0.0 190 | omega = 0.0 191 | sigma = math.sqrt(beta_t * h) 192 | dxt = (mean - xt) * (0.5 * beta_t * h + omega) 193 | dxt -= self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time) * (1.0 + kappa) * (beta_t * h) 194 | dxt += torch.randn_like(z, device=z.device) * sigma 195 | xt = (xt - dxt) * mask 196 | return xt 197 | 198 | @torch.no_grad() 199 | def forward(self, z, mask, mean, ref, ref_mask, mean_ref, c, 200 | n_timesteps, mode): 201 | if mode not in ['pf', 'em', 'ml']: 202 | print('Inference mode must be one of [pf, em, ml]!') 203 | return z 204 | return self.reverse_diffusion(z, mask, mean, ref, ref_mask, mean_ref, c, 205 | n_timesteps, mode) 206 | 207 | def loss_t(self, x0, mask, mean, x_ref, mean_ref, c, t): 208 | xt, z = self.forward_diffusion(x0, mask, mean, t) 209 | xt_ref = [self.compute_diffused_mean(x_ref, mask, mean_ref, t, use_torch=True)] 210 | # for j in range(15): 211 | # xt_ref += [self.compute_diffused_mean(x_ref, mask, mean_ref, (j+0.5)/15.0)] 212 | xt_ref = torch.stack(xt_ref, 1) 213 | z_estimation = self.estimator(xt, mask, mean, xt_ref, mask, c, t) 214 | z_estimation *= torch.sqrt(1.0 - self.get_gamma(0, t, p=2.0, use_torch=True)) 215 | loss = torch.sum((z_estimation + z)**2) / (torch.sum(mask)*self.n_feats) 216 | return loss 217 | 218 | def compute_loss(self, x0, mask, mean, x_ref, mean_ref, c, offset=1e-5): 219 | b = x0.shape[0] 220 | t = torch.rand(b, dtype=x0.dtype, device=x0.device, requires_grad=False) 221 | t = torch.clamp(t, offset, 1.0 - offset) 222 | return self.loss_t(x0, mask, mean, x_ref, mean_ref, c, t) 223 | -------------------------------------------------------------------------------- /model/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 7 | # MIT License for more details. 8 | 9 | import math 10 | import torch 11 | from einops import rearrange 12 | 13 | from model.base import BaseModule 14 | 15 | 16 | class Mish(BaseModule): 17 | def forward(self, x): 18 | return x * torch.tanh(torch.nn.functional.softplus(x)) 19 | 20 | 21 | class Upsample(BaseModule): 22 | def __init__(self, dim): 23 | super(Upsample, self).__init__() 24 | self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1) 25 | 26 | def forward(self, x): 27 | return self.conv(x) 28 | 29 | 30 | class Downsample(BaseModule): 31 | def __init__(self, dim): 32 | super(Downsample, self).__init__() 33 | self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) 34 | 35 | def forward(self, x): 36 | return self.conv(x) 37 | 38 | 39 | class Rezero(BaseModule): 40 | def __init__(self, fn): 41 | super(Rezero, self).__init__() 42 | self.fn = fn 43 | self.g = torch.nn.Parameter(torch.zeros(1)) 44 | 45 | def forward(self, x): 46 | return self.fn(x) * self.g 47 | 48 | 49 | class Block(BaseModule): 50 | def __init__(self, dim, dim_out, groups=8): 51 | super(Block, self).__init__() 52 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3, 53 | padding=1), torch.nn.GroupNorm( 54 | groups, dim_out), Mish()) 55 | 56 | def forward(self, x, mask): 57 | output = self.block(x * mask) 58 | return output * mask 59 | 60 | 61 | class ResnetBlock(BaseModule): 62 | def __init__(self, dim, dim_out, time_emb_dim, groups=8): 63 | super(ResnetBlock, self).__init__() 64 | self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 65 | dim_out)) 66 | 67 | self.block1 = Block(dim, dim_out, groups=groups) 68 | self.block2 = Block(dim_out, dim_out, groups=groups) 69 | if dim != dim_out: 70 | self.res_conv = torch.nn.Conv2d(dim, dim_out, 1) 71 | else: 72 | self.res_conv = torch.nn.Identity() 73 | 74 | def forward(self, x, mask, time_emb): 75 | h = self.block1(x, mask) 76 | h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1) 77 | h = self.block2(h, mask) 78 | output = h + self.res_conv(x * mask) 79 | return output 80 | 81 | 82 | class LinearAttention(BaseModule): 83 | def __init__(self, dim, heads=4, dim_head=32): 84 | super(LinearAttention, self).__init__() 85 | self.heads = heads 86 | hidden_dim = dim_head * heads 87 | self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) 88 | self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1) 89 | 90 | def forward(self, x): 91 | b, c, h, w = x.shape 92 | qkv = self.to_qkv(x) 93 | q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', 94 | heads = self.heads, qkv=3) 95 | k = k.softmax(dim=-1) 96 | context = torch.einsum('bhdn,bhen->bhde', k, v) 97 | out = torch.einsum('bhde,bhdn->bhen', context, q) 98 | out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', 99 | heads=self.heads, h=h, w=w) 100 | return self.to_out(out) 101 | 102 | 103 | class Residual(BaseModule): 104 | def __init__(self, fn): 105 | super(Residual, self).__init__() 106 | self.fn = fn 107 | 108 | def forward(self, x, *args, **kwargs): 109 | output = self.fn(x, *args, **kwargs) + x 110 | return output 111 | 112 | 113 | class SinusoidalPosEmb(BaseModule): 114 | def __init__(self, dim): 115 | super(SinusoidalPosEmb, self).__init__() 116 | self.dim = dim 117 | 118 | def forward(self, x): 119 | device = x.device 120 | half_dim = self.dim // 2 121 | emb = math.log(10000) / (half_dim - 1) 122 | emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) 123 | emb = 1000.0 * x.unsqueeze(1) * emb.unsqueeze(0) 124 | emb = torch.cat((emb.sin(), emb.cos()), 
dim=-1) 125 | return emb 126 | 127 | 128 | class RefBlock(BaseModule): 129 | def __init__(self, out_dim, time_emb_dim): 130 | super(RefBlock, self).__init__() 131 | base_dim = out_dim // 4 132 | self.mlp1 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 133 | base_dim)) 134 | self.mlp2 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 135 | 2 * base_dim)) 136 | self.block11 = torch.nn.Sequential(torch.nn.Conv2d(1, 2 * base_dim, 137 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 138 | torch.nn.GLU(dim=1)) 139 | self.block12 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 2 * base_dim, 140 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 141 | torch.nn.GLU(dim=1)) 142 | self.block21 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 4 * base_dim, 143 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 144 | torch.nn.GLU(dim=1)) 145 | self.block22 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 4 * base_dim, 146 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 147 | torch.nn.GLU(dim=1)) 148 | self.block31 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 8 * base_dim, 149 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 150 | torch.nn.GLU(dim=1)) 151 | self.block32 = torch.nn.Sequential(torch.nn.Conv2d(4 * base_dim, 8 * base_dim, 152 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 153 | torch.nn.GLU(dim=1)) 154 | self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1) 155 | 156 | def forward(self, x, mask, time_emb): 157 | y = self.block11(x * mask) 158 | y = self.block12(y * mask) 159 | y += self.mlp1(time_emb).unsqueeze(-1).unsqueeze(-1) 160 | y = self.block21(y * mask) 161 | y = self.block22(y * mask) 162 | y += self.mlp2(time_emb).unsqueeze(-1).unsqueeze(-1) 163 | y = self.block31(y * mask) 164 | y = self.block32(y * mask) 165 | y = self.final_conv(y * mask) 166 | return (y * mask).sum((2, 3)) / (mask.sum((2, 3)) * x.shape[2]) 167 | -------------------------------------------------------------------------------- /model/postnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
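# PostNet (defined below) is a shallow 2-D residual refinement network: a 1x1 convolution
# lifts the mel-spectrogram to `dim` channels, one residual block of two masked 7x7
# convolutions (GroupNorm + Mish) refines it, and a final 1x1 convolution projects back to
# a single channel. In model/vc.py it is stacked on top of MelEncoder inside FwdDiffusion
# to produce the "average voice" mean used as the diffusion prior.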
8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.modules import Mish 13 | 14 | 15 | class Block(BaseModule): 16 | def __init__(self, dim, groups=8): 17 | super(Block, self).__init__() 18 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim, 7, 19 | padding=3), torch.nn.GroupNorm(groups, dim), Mish()) 20 | 21 | def forward(self, x, mask): 22 | output = self.block(x * mask) 23 | return output * mask 24 | 25 | 26 | class ResnetBlock(BaseModule): 27 | def __init__(self, dim, groups=8): 28 | super(ResnetBlock, self).__init__() 29 | self.block1 = Block(dim, groups=groups) 30 | self.block2 = Block(dim, groups=groups) 31 | self.res = torch.nn.Conv2d(dim, dim, 1) 32 | 33 | def forward(self, x, mask): 34 | h = self.block1(x, mask) 35 | h = self.block2(h, mask) 36 | output = self.res(x * mask) + h 37 | return output 38 | 39 | 40 | class PostNet(BaseModule): 41 | def __init__(self, dim, groups=8): 42 | super(PostNet, self).__init__() 43 | self.init_conv = torch.nn.Conv2d(1, dim, 1) 44 | self.res_block = ResnetBlock(dim, groups=groups) 45 | self.final_conv = torch.nn.Conv2d(dim, 1, 1) 46 | 47 | def forward(self, x, mask): 48 | x = x.unsqueeze(1) 49 | mask = mask.unsqueeze(1) 50 | x = self.init_conv(x * mask) 51 | x = self.res_block(x, mask) 52 | output = self.final_conv(x * mask) 53 | return output.squeeze(1) 54 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
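# Helper layers and functions shared across the model: mse_loss (mask-aware MSE),
# sequence_mask / convert_pad_shape / fix_len_compatibility (length utilities; the latter
# rounds lengths up to a multiple of 2**num_downsamplings_in_unet for the U-Net decoder),
# PseudoInversion (approximate mel-to-STFT magnitude via the pseudo-inverse mel filterbank),
# InitialReconstruction (zero-phase ISTFT initialisation) and FastGL (a fast Griffin-Lim
# module for rough waveform reconstruction from mel-spectrograms).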
8 | 9 | import torch 10 | import torchaudio 11 | import numpy as np 12 | from librosa.filters import mel as librosa_mel_fn 13 | 14 | from model.base import BaseModule 15 | 16 | 17 | def mse_loss(x, y, mask, n_feats): 18 | loss = torch.sum(((x - y)**2) * mask) 19 | return loss / (torch.sum(mask) * n_feats) 20 | 21 | 22 | def sequence_mask(length, max_length=None): 23 | if max_length is None: 24 | max_length = length.max() 25 | x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) 26 | return x.unsqueeze(0) < length.unsqueeze(1) 27 | 28 | 29 | def convert_pad_shape(pad_shape): 30 | l = pad_shape[::-1] 31 | pad_shape = [item for sublist in l for item in sublist] 32 | return pad_shape 33 | 34 | 35 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 36 | while True: 37 | if length % (2**num_downsamplings_in_unet) == 0: 38 | return length 39 | length += 1 40 | 41 | 42 | class PseudoInversion(BaseModule): 43 | def __init__(self, n_mels, sampling_rate, n_fft): 44 | super(PseudoInversion, self).__init__() 45 | self.n_mels = n_mels 46 | self.sampling_rate = sampling_rate 47 | self.n_fft = n_fft 48 | mel_basis = librosa_mel_fn(sampling_rate, n_fft, n_mels, 0, 8000) 49 | mel_basis_inverse = np.linalg.pinv(mel_basis) 50 | mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float() 51 | self.register_buffer("mel_basis_inverse", mel_basis_inverse) 52 | 53 | def forward(self, log_mel_spectrogram): 54 | mel_spectrogram = torch.exp(log_mel_spectrogram) 55 | stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram) 56 | return stftm 57 | 58 | 59 | class InitialReconstruction(BaseModule): 60 | def __init__(self, n_fft, hop_size): 61 | super(InitialReconstruction, self).__init__() 62 | self.n_fft = n_fft 63 | self.hop_size = hop_size 64 | window = torch.hann_window(n_fft).float() 65 | self.register_buffer("window", window) 66 | 67 | def forward(self, stftm): 68 | real_part = torch.ones_like(stftm, device=stftm.device) 69 | imag_part = torch.zeros_like(stftm, device=stftm.device) 70 | stft = torch.stack([real_part, imag_part], -1)*stftm.unsqueeze(-1) 71 | istft = torchaudio.functional.istft(stft, n_fft=self.n_fft, 72 | hop_length=self.hop_size, win_length=self.n_fft, 73 | window=self.window, center=True) 74 | return istft.unsqueeze(1) 75 | 76 | 77 | # Fast Griffin-Lim algorithm as a PyTorch module 78 | class FastGL(BaseModule): 79 | def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99): 80 | super(FastGL, self).__init__() 81 | self.n_mels = n_mels 82 | self.sampling_rate = sampling_rate 83 | self.n_fft = n_fft 84 | self.hop_size = hop_size 85 | self.momentum = momentum 86 | self.pi = PseudoInversion(n_mels, sampling_rate, n_fft) 87 | self.ir = InitialReconstruction(n_fft, hop_size) 88 | window = torch.hann_window(n_fft).float() 89 | self.register_buffer("window", window) 90 | 91 | @torch.no_grad() 92 | def forward(self, s, n_iters=32): 93 | c = self.pi(s) 94 | x = self.ir(c) 95 | x = x.squeeze(1) 96 | c = c.unsqueeze(-1) 97 | prev_angles = torch.zeros_like(c, device=c.device) 98 | for _ in range(n_iters): 99 | s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size, 100 | win_length=self.n_fft, window=self.window, 101 | center=True) 102 | real_part, imag_part = s.unbind(-1) 103 | stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8)) 104 | angles = s / stftm.unsqueeze(-1) 105 | s = c * (angles + self.momentum * (angles - prev_angles)) 106 | x = torchaudio.functional.istft(s, n_fft=self.n_fft, hop_length=self.hop_size, 107 | 
win_length=self.n_fft, window=self.window, 108 | center=True) 109 | prev_angles = angles 110 | return x.unsqueeze(1) 111 | -------------------------------------------------------------------------------- /model/vc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.encoder import MelEncoder 13 | from model.postnet import PostNet 14 | from model.diffusion import Diffusion 15 | from model.utils import sequence_mask, fix_len_compatibility, mse_loss 16 | 17 | 18 | # "average voice" encoder as the module parameterizing the diffusion prior 19 | class FwdDiffusion(BaseModule): 20 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 21 | dropout, window_size, dim): 22 | super(FwdDiffusion, self).__init__() 23 | self.n_feats = n_feats 24 | self.channels = channels 25 | self.filters = filters 26 | self.heads = heads 27 | self.layers = layers 28 | self.kernel = kernel 29 | self.dropout = dropout 30 | self.window_size = window_size 31 | self.dim = dim 32 | self.encoder = MelEncoder(n_feats, channels, filters, heads, layers, 33 | kernel, dropout, window_size) 34 | self.postnet = PostNet(dim) 35 | 36 | @torch.no_grad() 37 | def forward(self, x, mask): 38 | x, mask = self.relocate_input([x, mask]) 39 | z = self.encoder(x, mask) 40 | z_output = self.postnet(z, mask) 41 | return z_output 42 | 43 | def compute_loss(self, x, y, mask): 44 | x, y, mask = self.relocate_input([x, y, mask]) 45 | z = self.encoder(x, mask) 46 | z_output = self.postnet(z, mask) 47 | loss = mse_loss(z_output, y, mask, self.n_feats) 48 | return loss 49 | 50 | 51 | # the whole voice conversion model consisting of the "average voice" encoder 52 | # and the diffusion-based speaker-conditional decoder 53 | class DiffVC(BaseModule): 54 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 55 | dropout, window_size, enc_dim, spk_dim, use_ref_t, dec_dim, 56 | beta_min, beta_max): 57 | super(DiffVC, self).__init__() 58 | self.n_feats = n_feats 59 | self.channels = channels 60 | self.filters = filters 61 | self.heads = heads 62 | self.layers = layers 63 | self.kernel = kernel 64 | self.dropout = dropout 65 | self.window_size = window_size 66 | self.enc_dim = enc_dim 67 | self.spk_dim = spk_dim 68 | self.use_ref_t = use_ref_t 69 | self.dec_dim = dec_dim 70 | self.beta_min = beta_min 71 | self.beta_max = beta_max 72 | self.encoder = FwdDiffusion(n_feats, channels, filters, heads, layers, 73 | kernel, dropout, window_size, enc_dim) 74 | self.decoder = Diffusion(n_feats, dec_dim, spk_dim, use_ref_t, 75 | beta_min, beta_max) 76 | 77 | def load_encoder(self, enc_path): 78 | enc_dict = torch.load(enc_path, map_location=lambda loc, storage: loc) 79 | self.encoder.load_state_dict(enc_dict, strict=False) 80 | 81 | @torch.no_grad() 82 | def forward(self, x, x_lengths, x_ref, x_ref_lengths, c, n_timesteps, 83 | mode='ml'): 84 | """ 85 | Generates mel-spectrogram from source mel-spectrogram conditioned on 86 | target speaker embedding. Returns: 87 | 1. 
'average voice' encoder outputs 88 | 2. decoder outputs 89 | 90 | Args: 91 | x (torch.Tensor): batch of source mel-spectrograms. 92 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 93 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 94 | x_ref_lengths (torch.Tensor): numbers of frames in reference mel-spectrograms. 95 | c (torch.Tensor): batch of reference speaker embeddings 96 | n_timesteps (int): number of steps to use for reverse diffusion in decoder. 97 | mode (string, optional): sampling method. Can be one of: 98 | 'pf' - probability flow sampling (Euler scheme for ODE) 99 | 'em' - Euler-Maruyama SDE solver 100 | 'ml' - Maximum Likelihood SDE solver 101 | """ 102 | x, x_lengths = self.relocate_input([x, x_lengths]) 103 | x_ref, x_ref_lengths, c = self.relocate_input([x_ref, x_ref_lengths, c]) 104 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 105 | x_ref_mask = sequence_mask(x_ref_lengths).unsqueeze(1).to(x_ref.dtype) 106 | mean = self.encoder(x, x_mask) 107 | mean_x = self.decoder.compute_diffused_mean(x, x_mask, mean, 1.0) 108 | mean_ref = self.encoder(x_ref, x_ref_mask) 109 | 110 | b = x.shape[0] 111 | max_length = int(x_lengths.max()) 112 | max_length_new = fix_len_compatibility(max_length) 113 | x_mask_new = sequence_mask(x_lengths, max_length_new).unsqueeze(1).to(x.dtype) 114 | mean_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 115 | device=x.device) 116 | mean_x_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 117 | device=x.device) 118 | for i in range(b): 119 | mean_new[i, :, :x_lengths[i]] = mean[i, :, :x_lengths[i]] 120 | mean_x_new[i, :, :x_lengths[i]] = mean_x[i, :, :x_lengths[i]] 121 | 122 | z = mean_x_new 123 | z += torch.randn_like(mean_x_new, device=mean_x_new.device) 124 | 125 | y = self.decoder(z, x_mask_new, mean_new, x_ref, x_ref_mask, mean_ref, c, 126 | n_timesteps, mode) 127 | return mean_x, y[:, :, :max_length] 128 | 129 | def compute_loss(self, x, x_lengths, x_ref, c): 130 | """ 131 | Computes diffusion (score matching) loss. 132 | 133 | Args: 134 | x (torch.Tensor): batch of source mel-spectrograms. 135 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 136 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 137 | c (torch.Tensor): batch of reference speaker embeddings 138 | """ 139 | x, x_lengths, x_ref, c = self.relocate_input([x, x_lengths, x_ref, c]) 140 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 141 | mean = self.encoder(x, x_mask).detach() 142 | mean_ref = self.encoder(x_ref, x_mask).detach() 143 | diff_loss = self.decoder.compute_loss(x, x_mask, mean, x_ref, mean_ref, c) 144 | return diff_loss 145 | -------------------------------------------------------------------------------- /params.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
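# The hyperparameters below are consumed by the classes in model/ (MelEncoder, PostNet,
# Diffusion, DiffVC). As an illustrative, non-executable sketch of how they map onto the
# DiffVC constructor and its sampling call (see model/vc.py for the exact signatures):
#
#     from model.vc import DiffVC
#     model = DiffVC(n_mels, channels, filters, heads, layers, kernel, dropout,
#                    window_size, enc_dim, spk_dim, use_ref_t, dec_dim,
#                    beta_min, beta_max)
#     mean_x, mel_converted = model(x, x_lengths, x_ref, x_ref_lengths, c,
#                                   n_timesteps=30, mode='ml')
#
# where mode is one of 'pf', 'em' or 'ml' (probability flow, Euler-Maruyama, or the
# maximum-likelihood SDE solver).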
8 | 9 | # data parameters 10 | n_mels = 80 11 | sampling_rate = 22050 12 | n_fft = 1024 13 | hop_size = 256 14 | 15 | # "average voice" encoder parameters 16 | channels = 192 17 | filters = 768 18 | layers = 6 19 | kernel = 3 20 | dropout = 0.1 21 | heads = 2 22 | window_size = 4 23 | enc_dim = 128 24 | 25 | # diffusion-based decoder parameters 26 | dec_dim = 256 27 | spk_dim = 128 28 | use_ref_t = True 29 | beta_min = 0.05 30 | beta_max = 20.0 31 | 32 | # training parameters 33 | seed = 37 34 | test_size = 1 35 | train_frames = 128 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torchaudio==0.5.1 2 | torch==1.7.1 3 | einops==0.3.0 4 | librosa==0.6.0 5 | webrtcvad==2.0.10 6 | numpy==1.19.0 7 | scipy==1.5.1 8 | matplotlib==3.3.3 9 | tb-nightly 10 | future 11 | tqdm 12 | tgt -------------------------------------------------------------------------------- /speaker_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /speaker_encoder/README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Voice Cloning 2 | This repository is an implementation of [Transfer Learning from Speaker Verification to 3 | Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). 4 | 5 | SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. 
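The first stage is the speaker encoder, implemented under `encoder/`. Below is a minimal, illustrative sketch of computing a speaker embedding with it; the checkpoint and audio paths are placeholders, and the calls mirror `encoder/inference.py` and `encoder/audio.py`:

```python
from pathlib import Path

from encoder import inference as encoder
from encoder.audio import preprocess_wav

# Load the GE2E speaker encoder weights (placeholder path).
encoder.load_model(Path("pretrained.pt"), device="cpu")

# Resample, normalise volume and trim long silences, then embed the utterance.
wav = preprocess_wav(Path("some_speech.wav"))
embed = encoder.embed_utterance(wav)  # L2-normalised vector of length model_embedding_size (256)
```

The returned embedding is the length-normalised average of partial-utterance embeddings, which is why a few seconds of audio are enough to characterise a voice.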
6 | 7 | **Video demonstration** (click the picture): 8 | 9 | [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) 10 | 11 | 12 | 13 | ### Papers implemented 14 | | URL | Designation | Title | Implementation source | 15 | | --- | ----------- | ----- | --------------------- | 16 | |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | 17 | |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | 18 | |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) 19 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | 20 | 21 | ## News 22 | **10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. 23 | 24 | **28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. 25 | 26 | **14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. 27 | 28 | **13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: 29 | - **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. 30 | - **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. 31 | 32 | **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. 33 | 34 | 35 | ## Setup 36 | 37 | ### 1. Install Requirements 38 | 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. 39 | 2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. 40 | 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. 41 | 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. 42 | 5. Install the remaining requirements with `pip install -r requirements.txt` 43 | 44 | ### 2. (Optional) Download Pretrained Models 45 | Pretrained models are now downloaded automatically. 
If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 46 | 47 | ### 3. (Optional) Test Configuration 48 | Before you download any dataset, you can begin by testing your configuration with: 49 | 50 | `python demo_cli.py` 51 | 52 | If all tests pass, you're good to go. 53 | 54 | ### 4. (Optional) Download Datasets 55 | For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. 56 | 57 | ### 5. Launch the Toolbox 58 | You can then try the toolbox: 59 | 60 | `python demo_toolbox.py -d <datasets_root>` 61 | or 62 | `python demo_toolbox.py` 63 | 64 | depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). 65 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/audio.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/audio.cpython-36.pyc -------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/inference.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/inference.cpython-36.pyc -------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/params_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/params_data.cpython-36.pyc 
-------------------------------------------------------------------------------- /speaker_encoder/encoder/__pycache__/params_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WangHelin1997/DuTa-VC/f87418c912d46292b08d94102016fdc942c679b3/speaker_encoder/encoder/__pycache__/params_model.cpython-36.pyc -------------------------------------------------------------------------------- /speaker_encoder/encoder/audio.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from scipy.ndimage.morphology import binary_dilation 4 | from encoder.params_data import * 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | import numpy as np 8 | import webrtcvad 9 | import librosa 10 | import struct 11 | 12 | import torch 13 | from torchaudio.transforms import Resample 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | 17 | int16_max = (2 ** 15) - 1 18 | 19 | 20 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 21 | source_sr: Optional[int] = None): 22 | """ 23 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 24 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 25 | 26 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 27 | just .wav), either the waveform as a numpy array of floats. 28 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 29 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 30 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 31 | this argument will be ignored. 32 | """ 33 | # Load the wav from disk if needed 34 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 35 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 36 | else: 37 | wav = fpath_or_wav 38 | 39 | # Resample the wav if needed 40 | if source_sr is not None and source_sr != sampling_rate: 41 | wav = librosa.resample(wav, source_sr, sampling_rate) 42 | 43 | # Apply the preprocessing: normalize volume and shorten long silences 44 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 45 | wav = trim_long_silences(wav) 46 | 47 | return wav 48 | 49 | 50 | def preprocess_wav_batch(wavs, source_sr=22050): 51 | # This torch version is designed to cope with a batch of same lengths wavs 52 | if sampling_rate != source_sr: 53 | resample = Resample(source_sr, sampling_rate) 54 | wavs = resample(wavs) 55 | wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, 56 | increase_only=True) 57 | # Trimming silence is not implemented in this version yet! 58 | return wavs_preprocessed 59 | 60 | 61 | def wav_to_mel_spectrogram(wav): 62 | """ 63 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 64 | Note: this not a log-mel spectrogram. 
65 | """ 66 | frames = librosa.feature.melspectrogram( 67 | wav, 68 | sampling_rate, 69 | n_fft=int(sampling_rate * mel_window_length / 1000), 70 | hop_length=int(sampling_rate * mel_window_step / 1000), 71 | n_mels=mel_n_channels 72 | ) 73 | return frames.astype(np.float32).T 74 | 75 | 76 | def wav_to_mel_spectrogram_batch(wavs): 77 | # This torch version is designed to cope with a batch of same lengths wavs 78 | n_fft = int(sampling_rate * mel_window_length / 1000) 79 | hop_length = int(sampling_rate * mel_window_step / 1000) 80 | win_length = int(sampling_rate * mel_window_length / 1000) 81 | window = torch.hann_window(n_fft).to(wavs) 82 | mel_basis = torch.from_numpy(librosa_mel_fn(sampling_rate, n_fft, 83 | mel_n_channels)).to(wavs) 84 | s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, 85 | win_length=win_length, window=window, center=True) 86 | real_part, imag_part = s.unbind(-1) 87 | stftm = real_part**2 + imag_part**2 88 | mels = torch.matmul(mel_basis, stftm) 89 | return torch.transpose(mels, 1, 2) 90 | 91 | 92 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 93 | if increase_only and decrease_only: 94 | raise ValueError("Both increase only and decrease only are set") 95 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 96 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 97 | return wav 98 | return wav * (10 ** (dBFS_change / 20)) 99 | 100 | 101 | def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): 102 | # This torch version is designed to cope with a batch of same lengths wavs 103 | if increase_only and decrease_only: 104 | raise ValueError("Both increase only and decrease only are set") 105 | dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) 106 | scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) 107 | if increase_only: 108 | mask = (dBFS_change > 0).to(scales) 109 | elif decrease_only: 110 | mask = (dBFS_change < 0).to(scales) 111 | else: 112 | mask = torch.zeros_like(scales) 113 | scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) 114 | return wavs * scales.unsqueeze(-1) 115 | 116 | 117 | def trim_long_silences(wav): 118 | """ 119 | Ensures that segments without voice in the waveform remain no longer than a 120 | threshold determined by the VAD parameters in params.py. 
121 | 122 | :param wav: the raw waveform as a numpy array of floats 123 | :return: the same waveform with silences trimmed away (length <= original wav length) 124 | """ 125 | # Compute the voice detection window size 126 | samples_per_window = (vad_window_length * sampling_rate) // 1000 127 | 128 | # Trim the end of the audio to have a multiple of the window size 129 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 130 | 131 | # Convert the float waveform to 16-bit mono PCM 132 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 133 | 134 | # Perform voice activation detection 135 | voice_flags = [] 136 | vad = webrtcvad.Vad(mode=3) 137 | for window_start in range(0, len(wav), samples_per_window): 138 | window_end = window_start + samples_per_window 139 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 140 | sample_rate=sampling_rate)) 141 | voice_flags = np.array(voice_flags) 142 | 143 | # Smooth the voice detection with a moving average 144 | def moving_average(array, width): 145 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 146 | ret = np.cumsum(array_padded, dtype=float) 147 | ret[width:] = ret[width:] - ret[:-width] 148 | return ret[width - 1:] / width 149 | 150 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 151 | audio_mask = np.round(audio_mask).astype(np.bool) 152 | 153 | # Dilate the voiced regions 154 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 155 | audio_mask = np.repeat(audio_mask, samples_per_window) 156 | 157 | return wav[audio_mask == True] 158 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/config.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | librispeech_datasets = { 4 | "train": { 5 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 6 | "other": ["LibriSpeech/train-other-500"] 7 | }, 8 | "test": { 9 | "clean": ["LibriSpeech/test-clean"], 10 | "other": ["LibriSpeech/test-other"] 11 | }, 12 | "dev": { 13 | "clean": ["LibriSpeech/dev-clean"], 14 | "other": ["LibriSpeech/dev-other"] 15 | }, 16 | } 17 | libritts_datasets = { 18 | "train": { 19 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 20 | "other": ["LibriTTS/train-other-500"] 21 | }, 22 | "test": { 23 | "clean": ["LibriTTS/test-clean"], 24 | "other": ["LibriTTS/test-other"] 25 | }, 26 | "dev": { 27 | "clean": ["LibriTTS/dev-clean"], 28 | "other": ["LibriTTS/dev-other"] 29 | }, 30 | } 31 | voxceleb_datasets = { 32 | "voxceleb1" : { 33 | "train": ["VoxCeleb1/wav"], 34 | "test": ["VoxCeleb1/test_wav"] 35 | }, 36 | "voxceleb2" : { 37 | "train": ["VoxCeleb2/dev/aac"], 38 | "test": ["VoxCeleb2/test_wav"] 39 | } 40 | } 41 | 42 | other_datasets = [ 43 | "LJSpeech-1.1", 44 | "VCTK-Corpus/wav48", 45 | ] 46 | 47 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 48 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from encoder.data_objects.speaker_verification_dataset import 
SpeakerVerificationDataLoader 5 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import random 4 | 5 | class RandomCycler: 6 | """ 7 | Creates an internal copy of a sequence and allows access to its items in a constrained random 8 | order. For a source sequence of n items and one or several consecutive queries of a total 9 | of m items, the following guarantees hold (one implies the other): 10 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 11 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 12 | """ 13 | 14 | def __init__(self, source): 15 | if len(source) == 0: 16 | raise Exception("Can't create RandomCycler from an empty collection") 17 | self.all_items = list(source) 18 | self.next_items = [] 19 | 20 | def sample(self, count: int): 21 | shuffle = lambda l: random.sample(l, len(l)) 22 | 23 | out = [] 24 | while count > 0: 25 | if count >= len(self.all_items): 26 | out.extend(shuffle(list(self.all_items))) 27 | count -= len(self.all_items) 28 | continue 29 | n = min(count, len(self.next_items)) 30 | out.extend(self.next_items[:n]) 31 | count -= n 32 | self.next_items = self.next_items[n:] 33 | if len(self.next_items) == 0: 34 | self.next_items = shuffle(list(self.all_items)) 35 | return out 36 | 37 | def __next__(self): 38 | return self.sample(1)[0] 39 | 40 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.utterance import Utterance 5 | from pathlib import Path 6 | 7 | # Contains the set of utterances of a single speaker 8 | class Speaker: 9 | def __init__(self, root: Path): 10 | self.root = root 11 | self.name = root.name 12 | self.utterances = None 13 | self.utterance_cycler = None 14 | 15 | def _load_utterances(self): 16 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 17 | sources = [l.split(",") for l in sources_file] 18 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 19 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 20 | self.utterance_cycler = RandomCycler(self.utterances) 21 | 22 | def random_partial(self, count, n_frames): 23 | """ 24 | Samples a batch of unique partial utterances from the disk in a way that all 25 | utterances come up at least once every two cycles and in a random order every time. 26 | 27 | :param count: The number of partial utterances to sample from the set of utterances from 28 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 29 | the number of utterances available. 30 | :param n_frames: The number of frames in the partial utterance. 31 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 32 | frames are the frames of the partial utterances and range is the range of the partial 33 | utterance with regard to the complete utterance. 
34 | """ 35 | if self.utterances is None: 36 | self._load_utterances() 37 | 38 | utterances = self.utterance_cycler.sample(count) 39 | 40 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 41 | 42 | return a 43 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | from typing import List 5 | from encoder.data_objects.speaker import Speaker 6 | 7 | class SpeakerBatch: 8 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 9 | self.speakers = speakers 10 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 11 | 12 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with 13 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 14 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 15 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.speaker_batch import SpeakerBatch 5 | from encoder.data_objects.speaker import Speaker 6 | from encoder.params_data import partials_n_frames 7 | from torch.utils.data import Dataset, DataLoader 8 | from pathlib import Path 9 | 10 | # TODO: improve with a pool of speakers for data efficiency 11 | 12 | class SpeakerVerificationDataset(Dataset): 13 | def __init__(self, datasets_root: Path): 14 | self.root = datasets_root 15 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 16 | if len(speaker_dirs) == 0: 17 | raise Exception("No speakers found. 
Make sure you are pointing to the directory " 18 | "containing all preprocessed speaker directories.") 19 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 20 | self.speaker_cycler = RandomCycler(self.speakers) 21 | 22 | def __len__(self): 23 | return int(1e10) 24 | 25 | def __getitem__(self, index): 26 | return next(self.speaker_cycler) 27 | 28 | def get_logs(self): 29 | log_string = "" 30 | for log_fpath in self.root.glob("*.txt"): 31 | with log_fpath.open("r") as log_file: 32 | log_string += "".join(log_file.readlines()) 33 | return log_string 34 | 35 | 36 | class SpeakerVerificationDataLoader(DataLoader): 37 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 38 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 39 | worker_init_fn=None): 40 | self.utterances_per_speaker = utterances_per_speaker 41 | 42 | super().__init__( 43 | dataset=dataset, 44 | batch_size=speakers_per_batch, 45 | shuffle=False, 46 | sampler=sampler, 47 | batch_sampler=batch_sampler, 48 | num_workers=num_workers, 49 | collate_fn=self.collate, 50 | pin_memory=pin_memory, 51 | drop_last=False, 52 | timeout=timeout, 53 | worker_init_fn=worker_init_fn 54 | ) 55 | 56 | def collate(self, speakers): 57 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 58 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | 5 | 6 | class Utterance: 7 | def __init__(self, frames_fpath, wave_fpath): 8 | self.frames_fpath = frames_fpath 9 | self.wave_fpath = wave_fpath 10 | 11 | def get_frames(self): 12 | return np.load(self.frames_fpath) 13 | 14 | def random_partial(self, n_frames): 15 | """ 16 | Crops the frames into a partial utterance of n_frames 17 | 18 | :param n_frames: The number of frames of the partial utterance 19 | :return: the partial utterance frames and a tuple indicating the start and end of the 20 | partial utterance in the complete utterance. 21 | """ 22 | frames = self.get_frames() 23 | if frames.shape[0] == n_frames: 24 | start = 0 25 | else: 26 | start = np.random.randint(0, frames.shape[0] - n_frames) 27 | end = start + n_frames 28 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /speaker_encoder/encoder/inference.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_data import * 4 | from encoder.model import SpeakerEncoder 5 | from encoder.audio import preprocess_wav, preprocess_wav_batch 6 | from matplotlib import cm 7 | from encoder import audio 8 | from pathlib import Path 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | 13 | _model = None # type: SpeakerEncoder 14 | _device = None # type: torch.device 15 | 16 | 17 | def load_model(weights_fpath: Path, device="cpu"): 18 | """ 19 | Loads the model in memory. If this function is not explicitely called, it will be run on the 20 | first call to embed_frames() with the default weights file. 21 | 22 | :param weights_fpath: the path to saved model weights. 23 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). 
The 24 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 25 | If None, will default to your GPU if it's available, otherwise your CPU. 26 | """ 27 | # TODO: I think the slow loading of the encoder might have something to do with the device it 28 | # was saved on. Worth investigating. 29 | global _model, _device 30 | if device is None: 31 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | elif isinstance(device, str): 33 | _device = torch.device(device) 34 | _model = SpeakerEncoder(_device, torch.device("cpu")) 35 | checkpoint = torch.load(weights_fpath, map_location="cpu") 36 | _model.load_state_dict(checkpoint["model_state"]) 37 | _model.eval() 38 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 39 | 40 | 41 | def is_loaded(): 42 | return _model is not None 43 | 44 | 45 | def embed_frames_batch(frames, use_torch=False): 46 | if _model is None: 47 | raise Exception("Model was not loaded. Call load_model() before inference.") 48 | 49 | if not use_torch: 50 | frames = torch.from_numpy(frames) 51 | frames = frames.to(_device) 52 | 53 | embeds = _model.forward(frames) 54 | if not use_torch: 55 | embeds = embeds.detach().cpu().numpy() 56 | return embeds 57 | 58 | 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 60 | min_pad_coverage=0.75, overlap=0.5): 61 | """ 62 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 63 | partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel 64 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 65 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 66 | defined in params_data.py. 67 | 68 | The returned ranges may be indexing further than the length of the waveform. It is 69 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 70 | 71 | :param n_samples: the number of samples in the waveform 72 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 73 | utterance 74 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 75 | enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, 76 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 77 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 78 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 79 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 80 | utterances are entirely disjoint. 81 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 82 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 83 | utterances. 
84 | """ 85 | assert 0 <= overlap < 1 86 | assert 0 < min_pad_coverage <= 1 87 | 88 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 89 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 90 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 91 | 92 | # Compute the slices 93 | wav_slices, mel_slices = [], [] 94 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 95 | for i in range(0, steps, frame_step): 96 | mel_range = np.array([i, i + partial_utterance_n_frames]) 97 | wav_range = mel_range * samples_per_frame 98 | mel_slices.append(slice(*mel_range)) 99 | wav_slices.append(slice(*wav_range)) 100 | 101 | # Evaluate whether extra padding is warranted or not 102 | last_wav_range = wav_slices[-1] 103 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 104 | if coverage < min_pad_coverage and len(mel_slices) > 1: 105 | mel_slices = mel_slices[:-1] 106 | wav_slices = wav_slices[:-1] 107 | 108 | return wav_slices, mel_slices 109 | 110 | 111 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 112 | """ 113 | Computes an embedding for a single utterance. 114 | 115 | # TODO: handle multiple wavs to benefit from batching on GPU 116 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 117 | :param using_partials: if True, then the utterance is split in partial utterances of 118 | <partial_utterance_n_frames> frames and the utterance embedding is computed from their 119 | normalized average. If False, the utterance is instead computed from feeding the entire 120 | spectrogram to the network. 121 | :param return_partials: if True, the partial embeddings will also be returned along with the 122 | wav slices that correspond to the partial embeddings. 123 | :param kwargs: additional arguments to compute_partial_slices() 124 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 125 | <return_partials> is True, the partial utterances as a numpy array of float32 of shape 126 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 127 | returned. If <using_partials> is simultaneously set to False, both these values will be None 128 | instead. 
129 | """ 130 | # Process the entire utterance if not using partials 131 | if not using_partials: 132 | frames = audio.wav_to_mel_spectrogram(wav) 133 | embed = embed_frames_batch(frames[None, ...])[0] 134 | if return_partials: 135 | return embed, None, None 136 | return embed 137 | 138 | # Compute where to split the utterance into partials and pad if necessary 139 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 140 | max_wave_length = wave_slices[-1].stop 141 | if max_wave_length >= len(wav): 142 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 143 | 144 | # Split the utterance into partials 145 | frames = audio.wav_to_mel_spectrogram(wav) 146 | frames_batch = np.array([frames[s] for s in mel_slices]) 147 | partial_embeds = embed_frames_batch(frames_batch) 148 | 149 | # Compute the utterance embedding from the partial embeddings 150 | raw_embed = np.mean(partial_embeds, axis=0) 151 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 152 | 153 | if return_partials: 154 | return embed, partial_embeds, wave_slices 155 | return embed 156 | 157 | 158 | def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): 159 | # This torch version is designed to cope with a batch of same lengths wavs 160 | if not using_partials: 161 | print(wavs.shape) 162 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 163 | embeds = embed_frames_batch(frames) 164 | if return_partials: 165 | return embeds, None, None 166 | return embeds 167 | 168 | wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) 169 | max_wave_length = wave_slices[-1].stop 170 | if max_wave_length >= wavs.shape[-1]: 171 | wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), 172 | dtype=wavs.dtype, device=wavs.device)], 1) 173 | 174 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 175 | frames_batch = [] 176 | for i in range(len(frames)): 177 | frames_batch += [frames[i][s] for s in mel_slices] 178 | frames_batch = torch.stack(frames_batch, 0) 179 | partial_embeds = embed_frames_batch(frames_batch, use_torch=True) 180 | partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) 181 | 182 | raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) 183 | embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) 184 | 185 | if return_partials: 186 | return embeds, partial_embeds, wave_slices 187 | return embeds 188 | 189 | 190 | def embed_speaker(wavs, **kwargs): 191 | raise NotImplemented() 192 | 193 | 194 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 195 | if ax is None: 196 | ax = plt.gca() 197 | 198 | if shape is None: 199 | height = int(np.sqrt(len(embed))) 200 | shape = (height, -1) 201 | embed = embed.reshape(shape) 202 | 203 | cmap = cm.get_cmap() 204 | mappable = ax.imshow(embed, cmap=cmap) 205 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 206 | cbar.set_clim(*color_range) 207 | 208 | ax.set_xticks([]), ax.set_yticks([]) 209 | ax.set_title(title) 210 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_model import * 4 | from encoder.params_data import * 5 | from scipy.interpolate import interp1d 6 | from sklearn.metrics import roc_curve 7 | from torch.nn.utils import 
clip_grad_norm_ 8 | from scipy.optimize import brentq 9 | from torch import nn 10 | import numpy as np 11 | import torch 12 | 13 | 14 | class SpeakerEncoder(nn.Module): 15 | def __init__(self, device, loss_device): 16 | super().__init__() 17 | self.loss_device = loss_device 18 | 19 | # Network defition 20 | self.lstm = nn.LSTM(input_size=mel_n_channels, 21 | hidden_size=model_hidden_size, 22 | num_layers=model_num_layers, 23 | batch_first=True).to(device) 24 | self.linear = nn.Linear(in_features=model_hidden_size, 25 | out_features=model_embedding_size).to(device) 26 | self.relu = torch.nn.ReLU().to(device) 27 | 28 | # Cosine similarity scaling (with fixed initial parameter values) 29 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 30 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 31 | 32 | # Loss 33 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 34 | 35 | def do_gradient_ops(self): 36 | # Gradient scale 37 | self.similarity_weight.grad *= 0.01 38 | self.similarity_bias.grad *= 0.01 39 | 40 | # Gradient clipping 41 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 42 | 43 | def forward(self, utterances, hidden_init=None): 44 | """ 45 | Computes the embeddings of a batch of utterance spectrograms. 46 | 47 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 48 | (batch_size, n_frames, n_channels) 49 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 50 | batch_size, hidden_size). Will default to a tensor of zeros if None. 51 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 52 | """ 53 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 54 | # and the final cell state. 55 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 56 | 57 | # We take only the hidden state of the last layer 58 | embeds_raw = self.relu(self.linear(hidden[-1])) 59 | 60 | # L2-normalize it 61 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 62 | 63 | return embeds 64 | 65 | def similarity_matrix(self, embeds): 66 | """ 67 | Computes the similarity matrix according the section 2.1 of GE2E. 68 | 69 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, embedding_size) 71 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 72 | utterances_per_speaker, speakers_per_batch) 73 | """ 74 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 75 | 76 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 77 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 78 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 79 | 80 | # Exclusive centroids (1 per utterance) 81 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 82 | centroids_excl /= (utterances_per_speaker - 1) 83 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 84 | 85 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 86 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 87 | # We vectorize the computation for efficiency. 
88 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 89 | speakers_per_batch).to(self.loss_device) 90 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 91 | for j in range(speakers_per_batch): 92 | mask = np.where(mask_matrix[j])[0] 93 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 94 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 95 | 96 | ## Even more vectorized version (slower maybe because of transpose) 97 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 98 | # ).to(self.loss_device) 99 | # eye = np.eye(speakers_per_batch, dtype=np.int) 100 | # mask = np.where(1 - eye) 101 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 102 | # mask = np.where(eye) 103 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 104 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 105 | 106 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 107 | return sim_matrix 108 | 109 | def loss(self, embeds): 110 | """ 111 | Computes the softmax loss according the section 2.1 of GE2E. 112 | 113 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 114 | utterances_per_speaker, embedding_size) 115 | :return: the loss and the EER for this batch of embeddings. 116 | """ 117 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 118 | 119 | # Loss 120 | sim_matrix = self.similarity_matrix(embeds) 121 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 122 | speakers_per_batch)) 123 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 124 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 125 | loss = self.loss_fn(sim_matrix, target) 126 | 127 | # EER (not backpropagated) 128 | with torch.no_grad(): 129 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 130 | labels = np.array([inv_argmax(i) for i in ground_truth]) 131 | preds = sim_matrix.detach().cpu().numpy() 132 | 133 | # Snippet from https://yangcha.github.io/EER-ROC/ 134 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 135 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 136 | 137 | return loss, eer -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_data.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Mel-filterbank 4 | mel_window_length = 25 # In milliseconds 5 | mel_window_step = 10 # In milliseconds 6 | mel_n_channels = 40 7 | 8 | 9 | ## Audio 10 | sampling_rate = 16000 11 | # Number of spectrogram frames in a partial utterance 12 | partials_n_frames = 160 # 1600 ms 13 | # Number of spectrogram frames at inference 14 | inference_n_frames = 80 # 800 ms 15 | 16 | 17 | ## Voice Activation Detection 18 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 19 | # This sets the granularity of the VAD. Should not need to be changed. 20 | vad_window_length = 30 # In milliseconds 21 | # Number of frames to average together when performing the moving average smoothing. 22 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 23 | vad_moving_average_width = 8 24 | # Maximum number of consecutive silent frames a segment can have. 
25 | vad_max_silence_length = 6 26 | 27 | 28 | ## Audio volume normalization 29 | audio_norm_target_dBFS = -30 30 | 31 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Model parameters 4 | model_hidden_size = 256 5 | model_embedding_size = 256 6 | model_num_layers = 3 7 | 8 | 9 | ## Training parameters 10 | learning_rate_init = 1e-4 11 | speakers_per_batch = 64 12 | utterances_per_speaker = 10 13 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/preprocess.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from multiprocess.pool import ThreadPool 4 | from encoder.params_data import * 5 | from encoder.config import librispeech_datasets, anglophone_nationalites 6 | from datetime import datetime 7 | from encoder import audio 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | 13 | class DatasetLog: 14 | """ 15 | Registers metadata about the dataset in a text file. 16 | """ 17 | def __init__(self, root, name): 18 | self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") 19 | self.sample_data = dict() 20 | 21 | start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 22 | self.write_line("Creating dataset %s on %s" % (name, start_time)) 23 | self.write_line("-----") 24 | self._log_params() 25 | 26 | def _log_params(self): 27 | from encoder import params_data 28 | self.write_line("Parameter values:") 29 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 30 | value = getattr(params_data, param_name) 31 | self.write_line("\t%s: %s" % (param_name, value)) 32 | self.write_line("-----") 33 | 34 | def write_line(self, line): 35 | self.text_file.write("%s\n" % line) 36 | 37 | def add_sample(self, **kwargs): 38 | for param_name, value in kwargs.items(): 39 | if not param_name in self.sample_data: 40 | self.sample_data[param_name] = [] 41 | self.sample_data[param_name].append(value) 42 | 43 | def finalize(self): 44 | self.write_line("Statistics:") 45 | for param_name, values in self.sample_data.items(): 46 | self.write_line("\t%s:" % param_name) 47 | self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) 48 | self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) 49 | self.write_line("-----") 50 | end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 51 | self.write_line("Finished on %s" % end_time) 52 | self.text_file.close() 53 | 54 | 55 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): 56 | dataset_root = datasets_root.joinpath(dataset_name) 57 | if not dataset_root.exists(): 58 | print("Couldn\'t find %s, skipping this dataset." % dataset_root) 59 | return None, None 60 | return dataset_root, DatasetLog(out_dir, dataset_name) 61 | 62 | 63 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, 64 | skip_existing, logger): 65 | print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) 66 | 67 | # Function to preprocess utterances for one speaker 68 | def preprocess_speaker(speaker_dir: Path): 69 | # Give a name to the speaker that includes its dataset 70 | speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) 71 | 72 | # Create an output directory with that name, as well as a txt file containing a 73 | # reference to each source file. 74 | speaker_out_dir = out_dir.joinpath(speaker_name) 75 | speaker_out_dir.mkdir(exist_ok=True) 76 | sources_fpath = speaker_out_dir.joinpath("_sources.txt") 77 | 78 | # There's a possibility that the preprocessing was interrupted earlier, check if 79 | # there already is a sources file. 80 | if sources_fpath.exists(): 81 | try: 82 | with sources_fpath.open("r") as sources_file: 83 | existing_fnames = {line.split(",")[0] for line in sources_file} 84 | except: 85 | existing_fnames = {} 86 | else: 87 | existing_fnames = {} 88 | 89 | # Gather all audio files for that speaker recursively 90 | sources_file = sources_fpath.open("a" if skip_existing else "w") 91 | for in_fpath in speaker_dir.glob("**/*.%s" % extension): 92 | # Check if the target output file already exists 93 | out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) 94 | out_fname = out_fname.replace(".%s" % extension, ".npy") 95 | if skip_existing and out_fname in existing_fnames: 96 | continue 97 | 98 | # Load and preprocess the waveform 99 | wav = audio.preprocess_wav(in_fpath) 100 | if len(wav) == 0: 101 | continue 102 | 103 | # Create the mel spectrogram, discard those that are too short 104 | frames = audio.wav_to_mel_spectrogram(wav) 105 | if len(frames) < partials_n_frames: 106 | continue 107 | 108 | out_fpath = speaker_out_dir.joinpath(out_fname) 109 | np.save(out_fpath, frames) 110 | logger.add_sample(duration=len(wav) / sampling_rate) 111 | sources_file.write("%s,%s\n" % (out_fname, in_fpath)) 112 | 113 | sources_file.close() 114 | 115 | # Process the utterances for each speaker 116 | with ThreadPool(8) as pool: 117 | list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), 118 | unit="speakers")) 119 | logger.finalize() 120 | print("Done preprocessing %s.\n" % dataset_name) 121 | 122 | 123 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): 124 | for dataset_name in librispeech_datasets["train"]["other"]: 125 | # Initialize the preprocessing 126 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 127 | if not dataset_root: 128 | return 129 | 130 | # Preprocess all speakers 131 | speaker_dirs = list(dataset_root.glob("*")) 132 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", 133 | skip_existing, logger) 134 | 135 | 136 | def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): 137 | # Initialize the preprocessing 138 | dataset_name = "VoxCeleb1" 139 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 140 | if not dataset_root: 141 | return 142 | 143 | # Get the contents of the meta file 144 | with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: 145 | metadata = [line.split("\t") for line in metafile][1:] 146 | 147 | # Select the ID and the nationality, filter out non-anglophone speakers 148 | nationalities = {line[0]: line[3] for line in metadata} 149 | keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if 150 | nationality.lower() in anglophone_nationalites] 151 | 
print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % 152 | (len(keep_speaker_ids), len(nationalities))) 153 | 154 | # Get the speaker directories for anglophone speakers only 155 | speaker_dirs = dataset_root.joinpath("wav").glob("*") 156 | speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if 157 | speaker_dir.name in keep_speaker_ids] 158 | print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % 159 | (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) 160 | 161 | # Preprocess all speakers 162 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", 163 | skip_existing, logger) 164 | 165 | 166 | def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): 167 | # Initialize the preprocessing 168 | dataset_name = "VoxCeleb2" 169 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 170 | if not dataset_root: 171 | return 172 | 173 | # Get the speaker directories 174 | # Preprocess all speakers 175 | speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) 176 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", 177 | skip_existing, logger) 178 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/train.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.visualizations import Visualizations 4 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 5 | from encoder.params_model import * 6 | from encoder.model import SpeakerEncoder 7 | from utils.profiler import Profiler 8 | from pathlib import Path 9 | import torch 10 | 11 | def sync(device: torch.device): 12 | # FIXME 13 | return 14 | # For correct profiling (cuda operations are async) 15 | if device.type == "cuda": 16 | torch.cuda.synchronize(device) 17 | 18 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 19 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 20 | no_visdom: bool): 21 | # Create a dataset and a dataloader 22 | dataset = SpeakerVerificationDataset(clean_data_root) 23 | loader = SpeakerVerificationDataLoader( 24 | dataset, 25 | speakers_per_batch, 26 | utterances_per_speaker, 27 | num_workers=8, 28 | ) 29 | 30 | # Setup the device on which to run the forward pass and the loss. These can be different, 31 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 32 | # hyperparameters) faster on the CPU. 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | # FIXME: currently, the gradient is None if loss_device is cuda 35 | loss_device = torch.device("cpu") 36 | 37 | # Create the model and the optimizer 38 | model = SpeakerEncoder(device, loss_device) 39 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 40 | init_step = 1 41 | 42 | # Configure file path for the model 43 | state_fpath = models_dir.joinpath(run_id + ".pt") 44 | backup_dir = models_dir.joinpath(run_id + "_backups") 45 | 46 | # Load any existing model 47 | if not force_restart: 48 | if state_fpath.exists(): 49 | print("Found existing model \"%s\", loading it and resuming training." 
% run_id) 50 | checkpoint = torch.load(state_fpath) 51 | init_step = checkpoint["step"] 52 | model.load_state_dict(checkpoint["model_state"]) 53 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 54 | optimizer.param_groups[0]["lr"] = learning_rate_init 55 | else: 56 | print("No model \"%s\" found, starting training from scratch." % run_id) 57 | else: 58 | print("Starting the training from scratch.") 59 | model.train() 60 | 61 | # Initialize the visualization environment 62 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 63 | vis.log_dataset(dataset) 64 | vis.log_params() 65 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 66 | vis.log_implementation({"Device": device_name}) 67 | 68 | # Training loop 69 | profiler = Profiler(summarize_every=10, disabled=False) 70 | for step, speaker_batch in enumerate(loader, init_step): 71 | profiler.tick("Blocking, waiting for batch (threaded)") 72 | 73 | # Forward pass 74 | inputs = torch.from_numpy(speaker_batch.data).to(device) 75 | sync(device) 76 | profiler.tick("Data to %s" % device) 77 | embeds = model(inputs) 78 | sync(device) 79 | profiler.tick("Forward pass") 80 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 81 | loss, eer = model.loss(embeds_loss) 82 | sync(loss_device) 83 | profiler.tick("Loss") 84 | 85 | # Backward pass 86 | model.zero_grad() 87 | loss.backward() 88 | profiler.tick("Backward pass") 89 | model.do_gradient_ops() 90 | optimizer.step() 91 | profiler.tick("Parameter update") 92 | 93 | # Update visualizations 94 | # learning_rate = optimizer.param_groups[0]["lr"] 95 | vis.update(loss.item(), eer, step) 96 | 97 | # Draw projections and save them to the backup folder 98 | if umap_every != 0 and step % umap_every == 0: 99 | print("Drawing and saving projections (step %d)" % step) 100 | backup_dir.mkdir(exist_ok=True) 101 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 102 | embeds = embeds.detach().cpu().numpy() 103 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 104 | vis.save() 105 | 106 | # Overwrite the latest version of the model 107 | if save_every != 0 and step % save_every == 0: 108 | print("Saving the model (step %d)" % step) 109 | torch.save({ 110 | "step": step + 1, 111 | "model_state": model.state_dict(), 112 | "optimizer_state": optimizer.state_dict(), 113 | }, state_fpath) 114 | 115 | # Make a backup 116 | if backup_every != 0 and step % backup_every == 0: 117 | print("Making a backup (step %d)" % step) 118 | backup_dir.mkdir(exist_ok=True) 119 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 120 | torch.save({ 121 | "step": step + 1, 122 | "model_state": model.state_dict(), 123 | "optimizer_state": optimizer.state_dict(), 124 | }, backup_fpath) 125 | 126 | profiler.tick("Extras (visualizations, saving)") 127 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from datetime import datetime 5 | from time import perf_counter as timer 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | # import webbrowser 9 | import visdom 10 | import umap 11 | 12 | colormap = np.array([ 13 | 
[76, 255, 0], 14 | [0, 127, 70], 15 | [255, 0, 0], 16 | [255, 217, 38], 17 | [0, 135, 255], 18 | [165, 0, 165], 19 | [255, 167, 255], 20 | [0, 255, 255], 21 | [255, 96, 38], 22 | [142, 76, 0], 23 | [33, 0, 127], 24 | [0, 0, 0], 25 | [183, 183, 183], 26 | ], dtype=np.float) / 255 27 | 28 | 29 | class Visualizations: 30 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 31 | # Tracking data 32 | self.last_update_timestamp = timer() 33 | self.update_every = update_every 34 | self.step_times = [] 35 | self.losses = [] 36 | self.eers = [] 37 | print("Updating the visualizations every %d steps." % update_every) 38 | 39 | # If visdom is disabled TODO: use a better paradigm for that 40 | self.disabled = disabled 41 | if self.disabled: 42 | return 43 | 44 | # Set the environment name 45 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 46 | if env_name is None: 47 | self.env_name = now 48 | else: 49 | self.env_name = "%s (%s)" % (env_name, now) 50 | 51 | # Connect to visdom and open the corresponding window in the browser 52 | try: 53 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 54 | except ConnectionError: 55 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 56 | "start it.") 57 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 58 | 59 | # Create the windows 60 | self.loss_win = None 61 | self.eer_win = None 62 | # self.lr_win = None 63 | self.implementation_win = None 64 | self.projection_win = None 65 | self.implementation_string = "" 66 | 67 | def log_params(self): 68 | if self.disabled: 69 | return 70 | from encoder import params_data 71 | from encoder import params_model 72 | param_string = "Model parameters:
" 73 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 74 | value = getattr(params_model, param_name) 75 | param_string += "\t%s: %s
" % (param_name, value) 76 | param_string += "Data parameters:
" 77 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 78 | value = getattr(params_data, param_name) 79 | param_string += "\t%s: %s
" % (param_name, value) 80 | self.vis.text(param_string, opts={"title": "Parameters"}) 81 | 82 | def log_dataset(self, dataset: SpeakerVerificationDataset): 83 | if self.disabled: 84 | return 85 | dataset_string = "" 86 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 87 | dataset_string += "\n" + dataset.get_logs() 88 | dataset_string = dataset_string.replace("\n", "
") 89 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 90 | 91 | def log_implementation(self, params): 92 | if self.disabled: 93 | return 94 | implementation_string = "" 95 | for param, value in params.items(): 96 | implementation_string += "%s: %s\n" % (param, value) 97 | implementation_string = implementation_string.replace("\n", "
") 98 | self.implementation_string = implementation_string 99 | self.implementation_win = self.vis.text( 100 | implementation_string, 101 | opts={"title": "Training implementation"} 102 | ) 103 | 104 | def update(self, loss, eer, step): 105 | # Update the tracking data 106 | now = timer() 107 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 108 | self.last_update_timestamp = now 109 | self.losses.append(loss) 110 | self.eers.append(eer) 111 | print(".", end="") 112 | 113 | # Update the plots every steps 114 | if step % self.update_every != 0: 115 | return 116 | time_string = "Step time: mean: %5dms std: %5dms" % \ 117 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 118 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 119 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 120 | if not self.disabled: 121 | self.loss_win = self.vis.line( 122 | [np.mean(self.losses)], 123 | [step], 124 | win=self.loss_win, 125 | update="append" if self.loss_win else None, 126 | opts=dict( 127 | legend=["Avg. loss"], 128 | xlabel="Step", 129 | ylabel="Loss", 130 | title="Loss", 131 | ) 132 | ) 133 | self.eer_win = self.vis.line( 134 | [np.mean(self.eers)], 135 | [step], 136 | win=self.eer_win, 137 | update="append" if self.eer_win else None, 138 | opts=dict( 139 | legend=["Avg. EER"], 140 | xlabel="Step", 141 | ylabel="EER", 142 | title="Equal error rate" 143 | ) 144 | ) 145 | if self.implementation_win is not None: 146 | self.vis.text( 147 | self.implementation_string + ("%s" % time_string), 148 | win=self.implementation_win, 149 | opts={"title": "Training implementation"}, 150 | ) 151 | 152 | # Reset the tracking 153 | self.losses.clear() 154 | self.eers.clear() 155 | self.step_times.clear() 156 | 157 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 158 | max_speakers=10): 159 | max_speakers = min(max_speakers, len(colormap)) 160 | embeds = embeds[:max_speakers * utterances_per_speaker] 161 | 162 | n_speakers = len(embeds) // utterances_per_speaker 163 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 164 | colors = [colormap[i] for i in ground_truth] 165 | 166 | reducer = umap.UMAP() 167 | projected = reducer.fit_transform(embeds) 168 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 169 | plt.gca().set_aspect("equal", "datalim") 170 | plt.title("UMAP projection (step %d)" % step) 171 | if not self.disabled: 172 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 173 | if out_fpath is not None: 174 | plt.savefig(out_fpath) 175 | plt.clf() 176 | 177 | def save(self): 178 | if not self.disabled: 179 | self.vis.save([self.env_name]) 180 | -------------------------------------------------------------------------------- /speaker_encoder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/utils/argutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from pathlib import Path 4 | import numpy as np 5 | import argparse 6 | 7 | _type_priorities = [ # In decreasing order 8 | Path, 9 | str, 10 | int, 11 | float, 12 | bool, 13 | ] 14 | 15 | def _priority(o): 16 | p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) 17 | if p is not None: 
18 | return p 19 | p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) 20 | if p is not None: 21 | return p 22 | return len(_type_priorities) 23 | 24 | def print_args(args: argparse.Namespace, parser=None): 25 | args = vars(args) 26 | if parser is None: 27 | priorities = list(map(_priority, args.values())) 28 | else: 29 | all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] 30 | priority = lambda p: all_params.index(p) if p in all_params else len(all_params) 31 | priorities = list(map(priority, args.keys())) 32 | 33 | pad = max(map(len, args.keys())) + 3 34 | indices = np.lexsort((list(args.keys()), priorities)) 35 | items = list(args.items()) 36 | 37 | print("Arguments:") 38 | for i in indices: 39 | param, value = items[i] 40 | print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) 41 | print("") 42 | -------------------------------------------------------------------------------- /speaker_encoder/utils/logmmse.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | import math 5 | from scipy.special import expn 6 | from collections import namedtuple 7 | 8 | NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") 9 | 10 | 11 | def profile_noise(noise, sampling_rate, window_size=0): 12 | """ 13 | Creates a profile of the noise in a given waveform. 14 | 15 | :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. 16 | :param sampling_rate: the sampling rate of the audio 17 | :param window_size: the size of the window the logmmse algorithm operates on. A default value 18 | will be picked if left as 0. 19 | :return: a NoiseProfile object 20 | """ 21 | noise, dtype = to_float(noise) 22 | noise += np.finfo(np.float64).eps 23 | 24 | if window_size == 0: 25 | window_size = int(math.floor(0.02 * sampling_rate)) 26 | 27 | if window_size % 2 == 1: 28 | window_size = window_size + 1 29 | 30 | perc = 50 31 | len1 = int(math.floor(window_size * perc / 100)) 32 | len2 = int(window_size - len1) 33 | 34 | win = np.hanning(window_size) 35 | win = win * len2 / np.sum(win) 36 | n_fft = 2 * window_size 37 | 38 | noise_mean = np.zeros(n_fft) 39 | n_frames = len(noise) // window_size 40 | for j in range(0, window_size * n_frames, window_size): 41 | noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) 42 | noise_mu2 = (noise_mean / n_frames) ** 2 43 | 44 | return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) 45 | 46 | 47 | def denoise(wav, noise_profile: NoiseProfile, eta=0.15): 48 | """ 49 | Cleans the noise from a speech waveform given a noise profile. The waveform must have the 50 | same sampling rate as the one used to create the noise profile. 51 | 52 | :param wav: a speech waveform as a numpy array of floats or ints. 53 | :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of 54 | the same) waveform. 55 | :param eta: voice threshold for noise update. While the voice activation detection value is 56 | below this threshold, the noise profile will be continuously updated throughout the audio. 57 | Set to 0 to disable updating the noise profile. 58 | :return: the clean wav as a numpy array of floats or ints of the same length. 
59 | """ 60 | wav, dtype = to_float(wav) 61 | wav += np.finfo(np.float64).eps 62 | p = noise_profile 63 | 64 | nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) 65 | x_final = np.zeros(nframes * p.len2) 66 | 67 | aa = 0.98 68 | mu = 0.98 69 | ksi_min = 10 ** (-25 / 10) 70 | 71 | x_old = np.zeros(p.len1) 72 | xk_prev = np.zeros(p.len1) 73 | noise_mu2 = p.noise_mu2 74 | for k in range(0, nframes * p.len2, p.len2): 75 | insign = p.win * wav[k:k + p.window_size] 76 | 77 | spec = np.fft.fft(insign, p.n_fft, axis=0) 78 | sig = np.absolute(spec) 79 | sig2 = sig ** 2 80 | 81 | gammak = np.minimum(sig2 / noise_mu2, 40) 82 | 83 | if xk_prev.all() == 0: 84 | ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 85 | else: 86 | ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 87 | ksi = np.maximum(ksi_min, ksi) 88 | 89 | log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) 90 | vad_decision = np.sum(log_sigma_k) / p.window_size 91 | if vad_decision < eta: 92 | noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 93 | 94 | a = ksi / (1 + ksi) 95 | vk = a * gammak 96 | ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 97 | hw = a * np.exp(ei_vk) 98 | sig = sig * hw 99 | xk_prev = sig ** 2 100 | xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) 101 | xi_w = np.real(xi_w) 102 | 103 | x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] 104 | x_old = xi_w[p.len1:p.window_size] 105 | 106 | output = from_float(x_final, dtype) 107 | output = np.pad(output, (0, len(wav) - len(output)), mode="constant") 108 | return output 109 | 110 | 111 | ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that 112 | ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of 113 | ## webrctvad 114 | # def vad(wav, sampling_rate, eta=0.15, window_size=0): 115 | # """ 116 | # TODO: fix doc 117 | # Creates a profile of the noise in a given waveform. 118 | # 119 | # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. 120 | # :param sampling_rate: the sampling rate of the audio 121 | # :param window_size: the size of the window the logmmse algorithm operates on. A default value 122 | # will be picked if left as 0. 123 | # :param eta: voice threshold for noise update. While the voice activation detection value is 124 | # below this threshold, the noise profile will be continuously updated throughout the audio. 125 | # Set to 0 to disable updating the noise profile. 
126 | # """ 127 | # wav, dtype = to_float(wav) 128 | # wav += np.finfo(np.float64).eps 129 | # 130 | # if window_size == 0: 131 | # window_size = int(math.floor(0.02 * sampling_rate)) 132 | # 133 | # if window_size % 2 == 1: 134 | # window_size = window_size + 1 135 | # 136 | # perc = 50 137 | # len1 = int(math.floor(window_size * perc / 100)) 138 | # len2 = int(window_size - len1) 139 | # 140 | # win = np.hanning(window_size) 141 | # win = win * len2 / np.sum(win) 142 | # n_fft = 2 * window_size 143 | # 144 | # wav_mean = np.zeros(n_fft) 145 | # n_frames = len(wav) // window_size 146 | # for j in range(0, window_size * n_frames, window_size): 147 | # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) 148 | # noise_mu2 = (wav_mean / n_frames) ** 2 149 | # 150 | # wav, dtype = to_float(wav) 151 | # wav += np.finfo(np.float64).eps 152 | # 153 | # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) 154 | # vad = np.zeros(nframes * len2, dtype=np.bool) 155 | # 156 | # aa = 0.98 157 | # mu = 0.98 158 | # ksi_min = 10 ** (-25 / 10) 159 | # 160 | # xk_prev = np.zeros(len1) 161 | # noise_mu2 = noise_mu2 162 | # for k in range(0, nframes * len2, len2): 163 | # insign = win * wav[k:k + window_size] 164 | # 165 | # spec = np.fft.fft(insign, n_fft, axis=0) 166 | # sig = np.absolute(spec) 167 | # sig2 = sig ** 2 168 | # 169 | # gammak = np.minimum(sig2 / noise_mu2, 40) 170 | # 171 | # if xk_prev.all() == 0: 172 | # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 173 | # else: 174 | # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 175 | # ksi = np.maximum(ksi_min, ksi) 176 | # 177 | # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) 178 | # vad_decision = np.sum(log_sigma_k) / window_size 179 | # if vad_decision < eta: 180 | # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 181 | # print(vad_decision) 182 | # 183 | # a = ksi / (1 + ksi) 184 | # vk = a * gammak 185 | # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 186 | # hw = a * np.exp(ei_vk) 187 | # sig = sig * hw 188 | # xk_prev = sig ** 2 189 | # 190 | # vad[k:k + len2] = vad_decision >= eta 191 | # 192 | # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") 193 | # return vad 194 | 195 | 196 | def to_float(_input): 197 | if _input.dtype == np.float64: 198 | return _input, _input.dtype 199 | elif _input.dtype == np.float32: 200 | return _input.astype(np.float64), _input.dtype 201 | elif _input.dtype == np.uint8: 202 | return (_input - 128) / 128., _input.dtype 203 | elif _input.dtype == np.int16: 204 | return _input / 32768., _input.dtype 205 | elif _input.dtype == np.int32: 206 | return _input / 2147483648., _input.dtype 207 | raise ValueError('Unsupported wave file format') 208 | 209 | 210 | def from_float(_input, dtype): 211 | if dtype == np.float64: 212 | return _input, np.float64 213 | elif dtype == np.float32: 214 | return _input.astype(np.float32) 215 | elif dtype == np.uint8: 216 | return ((_input * 128) + 128).astype(np.uint8) 217 | elif dtype == np.int16: 218 | return (_input * 32768).astype(np.int16) 219 | elif dtype == np.int32: 220 | print(_input) 221 | return (_input * 2147483648).astype(np.int32) 222 | raise ValueError('Unsupported wave file format') 223 | -------------------------------------------------------------------------------- /speaker_encoder/utils/profiler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from time import 
perf_counter as timer 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | 8 | class Profiler: 9 | def __init__(self, summarize_every=5, disabled=False): 10 | self.last_tick = timer() 11 | self.logs = OrderedDict() 12 | self.summarize_every = summarize_every 13 | self.disabled = disabled 14 | 15 | def tick(self, name): 16 | if self.disabled: 17 | return 18 | 19 | # Log the time needed to execute that function 20 | if not name in self.logs: 21 | self.logs[name] = [] 22 | if len(self.logs[name]) >= self.summarize_every: 23 | self.summarize() 24 | self.purge_logs() 25 | self.logs[name].append(timer() - self.last_tick) 26 | 27 | self.reset_timer() 28 | 29 | def purge_logs(self): 30 | for name in self.logs: 31 | self.logs[name].clear() 32 | 33 | def reset_timer(self): 34 | self.last_tick = timer() 35 | 36 | def summarize(self): 37 | n = max(map(len, self.logs.values())) 38 | assert n == self.summarize_every 39 | print("\nAverage execution time over %d steps:" % n) 40 | 41 | name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] 42 | pad = max(map(len, name_msgs)) 43 | for name_msg, deltas in zip(name_msgs, self.logs.values()): 44 | print(" %s mean: %4.0fms std: %4.0fms" % 45 | (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) 46 | print("", flush=True) 47 | -------------------------------------------------------------------------------- /train_dec.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
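For reference, the `Profiler` listed above is meant to be ticked once per pipeline stage, as `train.py` does; a minimal standalone sketch (not repository code — stage names and sleep times are stand-ins):

```python
# Each named stage accumulates time deltas; once any stage has `summarize_every`
# timings, the next tick prints mean/std for every stage and clears the logs.
import time
from utils.profiler import Profiler

profiler = Profiler(summarize_every=5, disabled=False)
for step in range(10):
    time.sleep(0.01)               # stand-in for "load batch"
    profiler.tick("Load batch")
    time.sleep(0.02)               # stand-in for "forward pass"
    profiler.tick("Forward pass")
```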
8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import ATYDecDataset, ATYDecBatchCollate 18 | from model.vc import DiffVC 19 | 20 | n_mels = params.n_mels 21 | sampling_rate = params.sampling_rate 22 | n_fft = params.n_fft 23 | hop_size = params.hop_size 24 | 25 | channels = params.channels 26 | filters = params.filters 27 | layers = params.layers 28 | kernel = params.kernel 29 | dropout = params.dropout 30 | heads = params.heads 31 | window_size = params.window_size 32 | enc_dim = params.enc_dim 33 | 34 | dec_dim = params.dec_dim 35 | spk_dim = params.spk_dim 36 | use_ref_t = params.use_ref_t 37 | beta_min = params.beta_min 38 | beta_max = params.beta_max 39 | 40 | random_seed = params.seed 41 | test_size = params.test_size 42 | 43 | data_dir = '/data/lmorove1/hwang258/Speech-Backbones/DiffVC/aty_data' 44 | log_dir = 'logs_dec_aty' 45 | vc_path = 'logs_dec_LT/vc.pt' 46 | allspks = [ 47 | '0005', '0006', '0007', '0008', '0009', '0010', '0011', '0012', '0013', 48 | '0014', '0015', '0017', '0018', '0019', '0020', '0021', '0022', '0023', 49 | '0024', '0025', '0026' 50 | ] 51 | 52 | epochs = 40 53 | batch_size = 32 54 | learning_rate = 5e-5 55 | save_every = 1 56 | 57 | 58 | def main(dys): 59 | torch.manual_seed(random_seed) 60 | np.random.seed(random_seed) 61 | log_dir_dys = os.path.join(log_dir, dys) 62 | os.makedirs(log_dir_dys, exist_ok=True) 63 | 64 | print('Initializing data loaders...') 65 | train_set = ATYDecDataset(data_dir, dys) 66 | collate_fn = ATYDecBatchCollate() 67 | train_loader = DataLoader(train_set, batch_size=batch_size, 68 | collate_fn=collate_fn, num_workers=16, drop_last=True) 69 | print(len(train_set)) 70 | print('Initializing and loading models...') 71 | model = DiffVC(n_mels, channels, filters, heads, layers, kernel, 72 | dropout, window_size, enc_dim, spk_dim, use_ref_t, 73 | dec_dim, beta_min, beta_max) 74 | model.load_state_dict(torch.load(vc_path, map_location='cpu')) 75 | model = model.cuda() 76 | print('Encoder:') 77 | print(model.encoder) 78 | print('Number of parameters = %.2fm\n' % (model.encoder.nparams / 1e6)) 79 | print('Decoder:') 80 | print(model.decoder) 81 | print('Number of parameters = %.2fm\n' % (model.decoder.nparams / 1e6)) 82 | 83 | print('Initializing optimizers...') 84 | optimizer = torch.optim.Adam(params=model.decoder.parameters(), lr=learning_rate) 85 | 86 | print('Start training.') 87 | torch.backends.cudnn.benchmark = True 88 | iteration = 0 89 | for epoch in range(1, epochs + 1): 90 | print(f'Epoch: {epoch} [iteration: {iteration}]') 91 | model.train() 92 | losses = [] 93 | for batch in tqdm(train_loader, total=len(train_set) // batch_size): 94 | mel, mel_ref = batch['mel1'].cuda(), batch['mel2'].cuda() 95 | c, mel_lengths = batch['c'].cuda(), batch['mel_lengths'].cuda() 96 | model.zero_grad() 97 | loss = model.compute_loss(mel, mel_lengths, mel_ref, c) 98 | loss.backward() 99 | torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), max_norm=1) 100 | optimizer.step() 101 | losses.append(loss.item()) 102 | iteration += 1 103 | 104 | losses = np.asarray(losses) 105 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 106 | print(msg) 107 | with open(f'{log_dir_dys}/train_dec.log', 'a') as f: 108 | f.write(msg) 109 | 110 | if epoch % save_every > 0: 111 | continue 112 | 113 | print('Saving model...\n') 114 | ckpt = model.state_dict() 115 | torch.save(ckpt, f=f"{log_dir_dys}/vc.pt") 116 | 117 | if 
__name__ == "__main__": 118 | for spk in allspks: 119 | main(spk) -------------------------------------------------------------------------------- /train_enc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import VCEncDataset, VCEncBatchCollate 18 | from model.vc import FwdDiffusion 19 | from model.utils import FastGL, sequence_mask 20 | from utils import save_plot, save_audio 21 | 22 | n_mels = params.n_mels 23 | sampling_rate = params.sampling_rate 24 | n_fft = params.n_fft 25 | hop_size = params.hop_size 26 | 27 | channels = params.channels 28 | filters = params.filters 29 | layers = params.layers 30 | kernel = params.kernel 31 | dropout = params.dropout 32 | heads = params.heads 33 | window_size = params.window_size 34 | dim = params.enc_dim 35 | 36 | random_seed = params.seed 37 | test_size = params.test_size 38 | 39 | data_dir = '../data/LibriTTS' 40 | exc_file = 'filelists/exceptions_libritts.txt' 41 | avg_type = 'mode' 42 | 43 | log_dir = 'logs_enc' 44 | epochs = 300 45 | batch_size = 128 46 | learning_rate = 5e-4 47 | save_every = 1 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | torch.manual_seed(random_seed) 53 | np.random.seed(random_seed) 54 | 55 | os.makedirs(log_dir, exist_ok=True) 56 | 57 | print('Initializing data loaders...') 58 | train_set = VCEncDataset(data_dir, exc_file, avg_type) 59 | collate_fn = VCEncBatchCollate() 60 | train_loader = DataLoader(train_set, batch_size=batch_size, 61 | collate_fn=collate_fn, num_workers=4, 62 | drop_last=True) 63 | 64 | print('Initializing models...') 65 | fgl = FastGL(n_mels, sampling_rate, n_fft, hop_size).cuda() 66 | model = FwdDiffusion(n_mels, channels, filters, heads, layers, kernel, 67 | dropout, window_size, dim).cuda() 68 | 69 | print('Encoder:') 70 | print(model) 71 | print('Number of parameters = %.2fm\n' % (model.nparams/1e6)) 72 | 73 | print('Initializing optimizers...') 74 | optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate) 75 | 76 | print('Start training.') 77 | torch.backends.cudnn.benchmark = True 78 | iteration = 0 79 | for epoch in range(1, epochs + 1): 80 | print(f'Epoch: {epoch} [iteration: {iteration}]') 81 | model.train() 82 | losses = [] 83 | for batch in tqdm(train_loader, total=len(train_set)//batch_size): 84 | mel_x, mel_y = batch['x'].cuda(), batch['y'].cuda() 85 | mel_lengths = batch['lengths'].cuda() 86 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 87 | 88 | model.zero_grad() 89 | loss = model.compute_loss(mel_x, mel_y, mel_mask) 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) 92 | optimizer.step() 93 | 94 | losses.append(loss.item()) 95 | iteration += 1 96 | 97 | losses = np.asarray(losses) 98 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 99 | print(msg) 100 | with open(f'{log_dir}/train_enc.log', 'a') as f: 101 | f.write(msg) 102 | losses = [] 103 | 104 | if 
epoch % save_every > 0: 105 | continue 106 | 107 | model.eval() 108 | print('Inference...\n') 109 | with torch.no_grad(): 110 | mels = train_set.get_test_dataset() 111 | for i, (mel_x, mel_y) in enumerate(mels): 112 | if i >= test_size: 113 | break 114 | mel_x = mel_x.unsqueeze(0).float().cuda() 115 | mel_y = mel_y.unsqueeze(0).float().cuda() 116 | mel_lengths = torch.LongTensor([mel_x.shape[-1]]).cuda() 117 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 118 | mel = model(mel_x, mel_mask) 119 | save_plot(mel.squeeze().cpu(), f'{log_dir}/generated_{i}.png') 120 | audio = fgl(mel) 121 | save_audio(f'{log_dir}/generated_{i}.wav', sampling_rate, audio) 122 | if epoch == save_every: 123 | save_plot(mel_x.squeeze().cpu(), f'{log_dir}/source_{i}.png') 124 | audio = fgl(mel_x) 125 | save_audio(f'{log_dir}/source_{i}.wav', sampling_rate, audio) 126 | save_plot(mel_y.squeeze().cpu(), f'{log_dir}/target_{i}.png') 127 | audio = fgl(mel_y) 128 | save_audio(f'{log_dir}/target_{i}.wav', sampling_rate, audio) 129 | 130 | print('Saving model...\n') 131 | ckpt = model.state_dict() 132 | torch.save(ckpt, f=f"{log_dir}/enc.pt") 133 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy.io import wavfile 12 | 13 | 14 | def save_plot(tensor, savepath): 15 | plt.style.use('default') 16 | fig, ax = plt.subplots(figsize=(12, 3)) 17 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') 18 | plt.colorbar(im, ax=ax) 19 | plt.tight_layout() 20 | fig.canvas.draw() 21 | plt.savefig(savepath) 22 | plt.close() 23 | 24 | 25 | def save_audio(file_path, sampling_rate, audio): 26 | audio = np.clip(audio.detach().cpu().squeeze().numpy(), -0.999, 0.999) 27 | wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) 28 | --------------------------------------------------------------------------------
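Finally, the two helpers in `utils.py` can be smoke-tested in isolation. The values below are assumptions standing in for the real configuration (80 mel bins for `params.n_mels`, 22.05 kHz for `params.sampling_rate`), and the output file names are arbitrary:

```python
# Illustrative only: write a spectrogram image and a 16-bit PCM wav from dummy tensors.
import torch
from utils import save_plot, save_audio

sampling_rate = 22050                          # assumed; use params.sampling_rate in practice
mel = torch.randn(80, 200)                     # (n_mels, n_frames), CPU tensor
audio = 0.1 * torch.randn(1, sampling_rate)    # ~1 s of quiet random noise in [-1, 1]

save_plot(mel, 'mel.png')                      # saves the heatmap via matplotlib
save_audio('sample.wav', sampling_rate, audio) # clips, scales to int16, writes with scipy
```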