├── models ├── __init__.py ├── utils.py ├── convolutional_transformer.py └── model.py ├── docs ├── imgs │ └── model.png ├── favicons │ ├── favicon.ico │ ├── apple-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon-96x96.png │ ├── ms-icon-70x70.png │ ├── ms-icon-144x144.png │ ├── ms-icon-150x150.png │ ├── ms-icon-310x310.png │ ├── android-icon-36x36.png │ ├── android-icon-48x48.png │ ├── android-icon-72x72.png │ ├── android-icon-96x96.png │ ├── apple-icon-114x114.png │ ├── apple-icon-120x120.png │ ├── apple-icon-144x144.png │ ├── apple-icon-152x152.png │ ├── apple-icon-180x180.png │ ├── apple-icon-57x57.png │ ├── apple-icon-60x60.png │ ├── apple-icon-72x72.png │ ├── apple-icon-76x76.png │ ├── android-icon-144x144.png │ ├── android-icon-192x192.png │ ├── apple-icon-precomposed.png │ ├── browserconfig.xml │ └── manifest.json ├── wavs │ ├── apc-apc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── apc-cpc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── apc-mel │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── apc-ppg │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── apc-w2v │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── cpc-apc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── cpc-cpc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── cpc-mel │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── cpc-ppg │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── cpc-w2v │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── mel-apc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── mel-cpc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── mel-mel │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── mel-ppg │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── mel-w2v │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── ppg-apc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── ppg-cpc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── ppg-mel │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── ppg-ppg │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── ppg-w2v │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── w2v-apc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── w2v-cpc │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── w2v-mel │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── w2v-ppg │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ ├── w2v-w2v │ │ ├── seen-f2f.wav │ │ ├── seen-f2m.wav │ │ ├── seen-m2f.wav │ │ ├── seen-m2m.wav │ │ ├── unseen-f2f.wav │ │ ├── unseen-f2m.wav │ │ ├── unseen-m2f.wav │ │ └── unseen-m2m.wav │ └── Ground-truth │ │ ├── seen-f2f-source.wav │ │ ├── seen-f2f-target.wav │ │ ├── seen-f2m-source.wav │ │ ├── seen-f2m-target.wav │ │ ├── seen-m2f-source.wav │ │ ├── seen-m2f-target.wav │ │ ├── seen-m2m-source.wav │ │ ├── seen-m2m-target.wav │ │ ├── unseen-f2f-source.wav │ │ ├── unseen-f2f-target.wav │ │ ├── unseen-f2m-source.wav │ │ ├── unseen-f2m-target.wav │ │ ├── unseen-m2f-source.wav │ │ ├── unseen-m2f-target.wav │ │ ├── unseen-m2m-source.wav │ │ └── unseen-m2m-target.wav ├── styles.css └── index.html ├── data ├── __init__.py ├── preprocess_dataset.py ├── feature_extract.py ├── utils.py └── intra_speaker_dataset.py ├── requirements.txt ├── merger.py ├── preprocess.py ├── info.yaml ├── convert_batch.py ├── README.md └── train.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import S2VC 2 | from .utils import * 3 | -------------------------------------------------------------------------------- /docs/imgs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/imgs/model.png -------------------------------------------------------------------------------- /docs/favicons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/favicon.ico -------------------------------------------------------------------------------- /docs/favicons/apple-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon.png -------------------------------------------------------------------------------- /docs/favicons/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/favicon-16x16.png -------------------------------------------------------------------------------- /docs/favicons/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/favicon-32x32.png -------------------------------------------------------------------------------- /docs/favicons/favicon-96x96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/favicon-96x96.png -------------------------------------------------------------------------------- /docs/favicons/ms-icon-70x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/ms-icon-70x70.png -------------------------------------------------------------------------------- /docs/wavs/apc-apc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/seen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/seen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/seen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/seen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/seen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/seen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/seen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/seen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/seen-m2m.wav -------------------------------------------------------------------------------- /docs/favicons/ms-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/ms-icon-144x144.png -------------------------------------------------------------------------------- /docs/favicons/ms-icon-150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/ms-icon-150x150.png -------------------------------------------------------------------------------- /docs/favicons/ms-icon-310x310.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/ms-icon-310x310.png -------------------------------------------------------------------------------- /docs/wavs/apc-apc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-apc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-apc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-cpc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-cpc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-mel/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-mel/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-ppg/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-ppg/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/apc-w2v/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/apc-w2v/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-apc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-apc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-cpc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-cpc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-mel/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-mel/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-ppg/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-ppg/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/cpc-w2v/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/cpc-w2v/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-apc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-apc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-cpc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-cpc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-mel/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-mel/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-ppg/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-ppg/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/mel-w2v/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/mel-w2v/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-apc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-apc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-cpc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-cpc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-mel/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-mel/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-ppg/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-ppg/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/ppg-w2v/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/ppg-w2v/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-apc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-apc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-cpc/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-cpc/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-mel/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-mel/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-ppg/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-ppg/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/unseen-f2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/unseen-f2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/unseen-f2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/unseen-f2m.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/unseen-m2f.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/unseen-m2f.wav -------------------------------------------------------------------------------- /docs/wavs/w2v-w2v/unseen-m2m.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/w2v-w2v/unseen-m2m.wav -------------------------------------------------------------------------------- /docs/favicons/android-icon-36x36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-36x36.png -------------------------------------------------------------------------------- /docs/favicons/android-icon-48x48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-48x48.png -------------------------------------------------------------------------------- /docs/favicons/android-icon-72x72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-72x72.png -------------------------------------------------------------------------------- /docs/favicons/android-icon-96x96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-96x96.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-114x114.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-114x114.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-120x120.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-144x144.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-152x152.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-180x180.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-57x57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-57x57.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-60x60.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-72x72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-72x72.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-76x76.png -------------------------------------------------------------------------------- /docs/favicons/android-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-144x144.png -------------------------------------------------------------------------------- /docs/favicons/android-icon-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/android-icon-192x192.png -------------------------------------------------------------------------------- /docs/favicons/apple-icon-precomposed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/favicons/apple-icon-precomposed.png -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-f2f-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-f2f-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-f2f-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-f2f-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-f2m-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-f2m-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-f2m-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-f2m-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-m2f-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-m2f-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-m2f-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-m2f-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-m2m-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-m2m-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/seen-m2m-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/seen-m2m-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-f2f-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-f2f-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-f2f-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-f2f-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-f2m-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-f2m-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-f2m-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-f2m-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-m2f-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-m2f-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-m2f-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-m2f-target.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-m2m-source.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-m2m-source.wav -------------------------------------------------------------------------------- /docs/wavs/Ground-truth/unseen-m2m-target.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howard1337/S2VC/HEAD/docs/wavs/Ground-truth/unseen-m2m-target.wav -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .preprocess_dataset import PreprocessDataset 2 | from .intra_speaker_dataset import IntraSpeakerDataset, collate_batch 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sox 2 | gdown 3 | tqdm 4 | librosa 5 | soundfile 6 | torch==1.7.1 7 | torchaudio==0.7.2 8 | argparse 9 | matplotlib 10 | tensorboard 11 | -e git://github.com/pytorch/fairseq.git@1a709b2a401ac8bd6d805c8a6a5f4d7f03b923ff#egg=fairseq 12 | -------------------------------------------------------------------------------- /docs/favicons/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | #ffffff -------------------------------------------------------------------------------- /merger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | 5 | """Merge the metadata.json of different features""" 6 | 7 | dataset_dir = sys.argv[1] 8 | sub_dirs = [i for i in os.listdir(sys.argv[1]) if 'json' not in i] 9 | 10 | 11 | 12 | metas = [] 13 | merged = {} 14 | 15 | for sub_dir in sub_dirs: 16 | metas.append(json.load(open(os.path.join(dataset_dir, sub_dir, 'metadata.json')))) 17 | 18 | for key in metas[0].keys(): 19 | if key == 'feature_name': 20 | continue 21 | merged[key] = [{} for i in range(len(metas[0][key]))] 22 | for subdir, meta in zip(sub_dirs, metas): 23 | for idx, value in enumerate(meta[key]): 24 | merged[key][idx]['audio_path'] = value['audio_path'] 25 | merged[key][idx][meta['feature_name']] = os.path.join(subdir, value['feature_path']) 26 | 27 | json.dump(merged, open(os.path.join(dataset_dir, 'metadata.json'), 'w'), indent=2) 28 | -------------------------------------------------------------------------------- /docs/favicons/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "App", 3 | "icons": [ 4 | { 5 | "src": "\/android-icon-36x36.png", 6 | "sizes": "36x36", 7 | "type": "image\/png", 8 | "density": "0.75" 9 | }, 10 | { 11 | "src": "\/android-icon-48x48.png", 12 | "sizes": "48x48", 13 | "type": "image\/png", 14 | "density": "1.0" 15 | }, 16 | { 17 | "src": "\/android-icon-72x72.png", 18 | "sizes": "72x72", 19 | "type": "image\/png", 20 | "density": "1.5" 21 | }, 22 | { 23 | "src": "\/android-icon-96x96.png", 24 | "sizes": "96x96", 25 | "type": "image\/png", 26 | "density": "2.0" 27 | }, 28 | { 29 | "src": "\/android-icon-144x144.png", 30 | "sizes": "144x144", 31 | "type": "image\/png", 32 | "density": "3.0" 33 | }, 34 | { 35 | "src": "\/android-icon-192x192.png", 36 | "sizes": "192x192", 37 | "type": "image\/png", 38 | "density": "4.0" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /data/preprocess_dataset.py: -------------------------------------------------------------------------------- 1 | """Precompute Wav2Vec features and spectrograms.""" 2 | 3 | from copy import deepcopy 4 | from pathlib import Path 5 | 6 | import torch 7 | from librosa.util import find_files 8 | 9 | import sox 10 | 11 | from .utils import load_wav, log_mel_spectrogram 12 | class PreprocessDataset(torch.utils.data.Dataset): 13 | """Prefetch audio data for preprocessing.""" 14 | 15 | def __init__( 16 | self, 17 | data_dirs, 18 | trim_method, 19 | sample_rate, 20 | ): 21 | 22 | data = [] 23 | 24 | for data_dir in data_dirs: 25 | data_dir_path = Path(data_dir) 26 | speaker_dirs = [x for x in data_dir_path.iterdir() if x.is_dir()] 27 | 28 | for speaker_dir in speaker_dirs: 29 | audio_paths = find_files(speaker_dir) 30 | if len(audio_paths) == 0: 31 | continue 32 | 33 | speaker_name = speaker_dir.name 34 | for audio_path in audio_paths: 35 | data.append((speaker_name, audio_path)) 36 | 37 | self.trim_method = trim_method 38 | self.sample_rate = sample_rate 39 | self.data = data 40 | 41 | if trim_method == "vad": 42 | tfm = sox.Transformer() 43 | tfm.vad(location=1) 44 | tfm.vad(location=-1) 45 | self.sox_transform = tfm 46 | 47 | def __len__(self): 48 | return len(self.data) 49 | 50 | def __getitem__(self, index): 51 | speaker_name, audio_path = self.data[index] 52 | 53 | if self.trim_method == "librosa": 54 | wav = load_wav(audio_path, self.sample_rate, trim=True) 55 | elif self.trim_method == "vad": 56 | wav = load_wav(audio_path, self.sample_rate) 57 | trim_wav = self.sox_transform.build_array( 58 | input_array=wav, sample_rate_in=self.sample_rate 59 | ) 60 | wav = deepcopy(trim_wav if len(trim_wav) > 10 else wav) 61 | return speaker_name, audio_path, torch.FloatTensor(wav) 62 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | """Useful utilities.""" 2 | 3 | import math 4 | 5 | import torch 6 | from torch.optim import Optimizer 7 | from torch.optim.lr_scheduler import LambdaLR 8 | 9 | from fairseq.models.wav2vec import Wav2Vec2Model 10 | 11 | 12 | def load_pretrained_wav2vec(ckpt_path): 13 | """Load pretrained Wav2Vec model.""" 14 | ckpt = torch.load(ckpt_path) 15 | model = Wav2Vec2Model.build_model(ckpt["args"], task=None) 16 | model.load_state_dict(ckpt["model"]) 17 | model.remove_pretraining_modules() 18 | model.eval() 19 | return model 20 | 21 | 22 | def get_cosine_schedule_with_warmup( 23 | optimizer: Optimizer, 24 | num_warmup_steps: int, 25 | num_training_steps: int, 26 | num_cycles: float = 0.5, 27 | last_epoch: int = -1, 28 | ): 29 | """ 30 | Create a schedule with a learning rate that decreases following the values of the cosine function between the 31 | initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the 32 | initial lr set in the optimizer. 33 | 34 | Args: 35 | optimizer (:class:`~torch.optim.Optimizer`): 36 | The optimizer for which to schedule the learning rate. 37 | num_warmup_steps (:obj:`int`): 38 | The number of steps for the warmup phase. 39 | num_training_steps (:obj:`int`): 40 | The total number of training steps. 41 | num_cycles (:obj:`float`, `optional`, defaults to 0.5): 42 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 43 | following a half-cosine). 44 | last_epoch (:obj:`int`, `optional`, defaults to -1): 45 | The index of the last epoch when resuming training. 46 | 47 | Return: 48 | :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 49 | """ 50 | 51 | def lr_lambda(current_step): 52 | if current_step < num_warmup_steps: 53 | return float(current_step) / float(max(1, num_warmup_steps)) 54 | progress = float(current_step - num_warmup_steps) / float( 55 | max(1, num_training_steps - num_warmup_steps) 56 | ) 57 | return max( 58 | 0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) 59 | ) 60 | 61 | return LambdaLR(optimizer, lr_lambda, last_epoch) 62 | -------------------------------------------------------------------------------- /data/feature_extract.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from functools import partial 3 | from multiprocessing import Pool, cpu_count 4 | from models import load_pretrained_wav2vec 5 | from data import log_mel_spectrogram 6 | 7 | 8 | class FeatureExtractor: 9 | def __init__(self, feature_name, wav2vec2_path=None, device=None): 10 | self.device = device 11 | if ( 12 | feature_name == "apc" 13 | or feature_name == "cpc" 14 | or feature_name == "timit_posteriorgram" 15 | or feature_name == "fbank" 16 | ): 17 | self.extractor = ( 18 | torch.hub.load("s3prl/s3prl:f2114342ff9e813e18a580fa41418aee9925414e", feature_name, refresh=True).eval().to(device) 19 | ) 20 | self.mode = 1 21 | elif feature_name == "wav2vec2": 22 | self.extractor = load_pretrained_wav2vec(wav2vec2_path).eval().to(device) 23 | self.mode = 2 24 | elif feature_name == "wav2vec2_mel": 25 | self.extractor = partial( 26 | log_mel_spectrogram, 27 | preemph=0.97, 28 | sample_rate=16000, 29 | n_mels=80, 30 | n_fft=400, 31 | hop_length=320, 32 | win_length=400, 33 | f_min=0, 34 | center=False, 35 | ) 36 | self.mode = 3 37 | elif feature_name == "cpc_mel": 38 | self.extractor = partial( 39 | log_mel_spectrogram, 40 | preemph=0.97, 41 | sample_rate=16000, 42 | n_mels=80, 43 | n_fft=465, 44 | hop_length=160, 45 | win_length=465, 46 | f_min=80, 47 | center=True, 48 | ) 49 | self.mode = 3 50 | else: 51 | print(feature_name) 52 | print( 53 | "Please use timit_posteriorgram, apc, wav2vec2, cpc, wav2vec2_mel, cpc_mel, or fbank" 54 | ) 55 | exit() 56 | 57 | def get_feature(self, wavs): 58 | if self.mode == 1: 59 | return self.extractor(wavs) 60 | elif self.mode == 2: 61 | feats = [] 62 | for wav in wavs: 63 | feat = self.extractor.extract_features(wav.unsqueeze(0), None)[0].squeeze(0) 64 | feats.append(feat) 65 | elif self.mode == 3: 66 | wavs = [wav.cpu().numpy() for wav in wavs] 67 | feats = [self.extractor(wav) for wav in wavs] 68 | feats = [torch.FloatTensor(feat).to(self.device) for feat in feats] 69 | return feats 70 | 71 | return feats 72 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Precompute Wav2Vec features.""" 3 | 4 | import os 5 | import json 6 | from pathlib import Path 7 | from tempfile import mkstemp 8 | from multiprocessing import cpu_count 9 | 10 | import tqdm 11 | import torch 12 | from torch.utils.data import DataLoader 13 | from argparse import ArgumentParser 14 | from copy import deepcopy 15 | 16 | from models import load_pretrained_wav2vec 17 | from data import PreprocessDataset 18 | from data.feature_extract import FeatureExtractor 19 | 20 | def parse_args(): 21 | """Parse command-line arguments.""" 22 | parser = ArgumentParser() 23 | parser.add_argument("data_dirs", type=str, nargs="+") 24 | parser.add_argument("feature_name", type=str) 25 | parser.add_argument("wav2vec_path", type=str) 26 | parser.add_argument("out_dir", type=str) 27 | parser.add_argument("--trim_method", choices=["librosa", "vad"], default="vad") 28 | parser.add_argument("--n_workers", type=int, default=cpu_count()) 29 | 30 | parser.add_argument("--sample_rate", type=int, default=16000) 31 | 32 | return vars(parser.parse_args()) 33 | 34 | 35 | def main( 36 | data_dirs, 37 | feature_name, 38 | wav2vec_path, 39 | out_dir, 40 | trim_method, 41 | n_workers, 42 | sample_rate, 43 | **kwargs, 44 | ): 45 | """Main function.""" 46 | 47 | out_dir_path = Path(out_dir) 48 | 49 | if out_dir_path.exists(): 50 | assert out_dir_path.is_dir() 51 | else: 52 | out_dir_path.mkdir(parents=True) 53 | 54 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 55 | 56 | dataset = PreprocessDataset( 57 | data_dirs, 58 | trim_method, 59 | sample_rate 60 | ) 61 | dataloader = DataLoader( 62 | dataset, batch_size=1, shuffle=False, drop_last=False, num_workers=n_workers 63 | ) 64 | 65 | 66 | speaker_infos = {} 67 | speaker_infos['feature_name'] = feature_name 68 | 69 | pbar = tqdm.tqdm(total=len(dataset), ncols=0) 70 | mapping = {'apc': 'fbank', 'timit_posteriorgram': 'fbank', 'cpc': 'cpc_mel', 'wav2vec2': 'wav2vec2_mel'} 71 | feat_extractor = FeatureExtractor(feature_name, wav2vec_path, device) 72 | mel_extractor = FeatureExtractor(mapping[feature_name], wav2vec_path, device) 73 | for speaker_name, audio_path, wav in dataloader: 74 | if wav.size(-1) < 10: 75 | continue 76 | 77 | wav = wav.to(device) 78 | speaker_name = speaker_name[0] 79 | audio_path = audio_path[0] 80 | 81 | 82 | with torch.no_grad(): 83 | feat = feat_extractor.get_feature(wav)[0] 84 | mel = mel_extractor.get_feature(wav)[0] 85 | fd, temp_file = mkstemp(suffix=".tar", prefix="utterance-", dir=out_dir_path) 86 | torch.save({"feat": feat.detach().cpu(), "mel": mel.detach().cpu()}, temp_file) 87 | os.close(fd) 88 | 89 | if speaker_name not in speaker_infos.keys(): 90 | speaker_infos[speaker_name] = [] 91 | 92 | speaker_infos[speaker_name].append( 93 | { 94 | "feature_path": Path(temp_file).name, 95 | "audio_path": audio_path, 96 | "mel_len": len(mel), 97 | } 98 | ) 99 | 100 | pbar.update(dataloader.batch_size) 101 | 102 | with open(out_dir_path / "metadata.json", "w") as f: 103 | json.dump(speaker_infos, f, indent=2) 104 | 105 | 106 | if __name__ == "__main__": 107 | main(**parse_args()) 108 | -------------------------------------------------------------------------------- /info.yaml: -------------------------------------------------------------------------------- 1 | libri121 -> libri260: 2 | source: /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000001.wav 3 | target: 4 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000003_000001.wav 5 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000004_000001.wav 6 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000005_000000.wav 7 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000006_000000.wav 8 | libri121 -> p227: 9 | source: /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000001.wav 10 | target: 11 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_005.wav 12 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_008.wav 13 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_011.wav 14 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_021.wav 15 | libri260 -> libri121: 16 | source: /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000005_000000.wav 17 | target: 18 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000004_000003.wav 19 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000005_000001.wav 20 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000000.wav 21 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000001.wav 22 | libri260 -> p225: 23 | source: /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000005_000000.wav 24 | target: 25 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_005.wav 26 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_008.wav 27 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_011.wav 28 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_021.wav 29 | p225 -> libri260: 30 | source: /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_006.wav 31 | target: 32 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000003_000001.wav 33 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000004_000001.wav 34 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000005_000000.wav 35 | - /home/storage/Dataset/LibriTTS/test-clean/260/123288/260_123288_000006_000000.wav 36 | p225 -> p227: 37 | source: /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_006.wav 38 | target: 39 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_005.wav 40 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_008.wav 41 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_011.wav 42 | - /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_021.wav 43 | p227 -> libri121: 44 | source: /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_020.wav 45 | target: 46 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000004_000003.wav 47 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000005_000001.wav 48 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000000.wav 49 | - /home/storage/Dataset/LibriTTS/test-clean/121/121726/121_121726_000025_000001.wav 50 | p227 -> p225: 51 | source: /home/storage/Dataset/Survey/VCTK/wav48/p227/p227_020.wav 52 | target: 53 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_005.wav 54 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_008.wav 55 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_011.wav 56 | - /home/storage/Dataset/Survey/VCTK/wav48/p225/p225_021.wav 57 | p228 -> p232: 58 | source: /home/storage/Dataset/Survey/VCTK/wav48/p228/p228_004.wav 59 | target: 60 | - /home/storage/Dataset/Survey/VCTK/wav48/p232/p232_005.wav 61 | - /home/storage/Dataset/Survey/VCTK/wav48/p232/p232_008.wav 62 | - /home/storage/Dataset/Survey/VCTK/wav48/p232/p232_011.wav 63 | - /home/storage/Dataset/Survey/VCTK/wav48/p232/p232_021.wav 64 | p232 -> p228: 65 | source: /home/storage/Dataset/Survey/VCTK/wav48/p232/p232_016.wav 66 | target: 67 | - /home/storage/Dataset/Survey/VCTK/wav48/p228/p228_005.wav 68 | - /home/storage/Dataset/Survey/VCTK/wav48/p228/p228_008.wav 69 | - /home/storage/Dataset/Survey/VCTK/wav48/p228/p228_011.wav 70 | - /home/storage/Dataset/Survey/VCTK/wav48/p228/p228_021.wav 71 | -------------------------------------------------------------------------------- /data/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for data manipulation.""" 2 | 3 | from typing import Union 4 | from pathlib import Path 5 | 6 | import librosa 7 | from librosa.effects import pitch_shift 8 | 9 | import numpy as np 10 | import matplotlib 11 | from matplotlib import pyplot as plt 12 | from scipy.signal import lfilter 13 | 14 | matplotlib.use("Agg") 15 | 16 | 17 | def trim_func(wav, sample_rate): 18 | _, (start_frame, end_frame) = librosa.effects.trim( 19 | wav, top_db=25, frame_length=512, hop_length=128 20 | ) 21 | start_frame = max(0, start_frame - 0.1 * sample_rate) 22 | end_frame = min(len(wav), end_frame + 0.1 * sample_rate) 23 | 24 | start = int(start_frame) 25 | end = int(end_frame) 26 | if end - start > 1000: # prevent empty slice 27 | wav = wav[start:end] 28 | return wav 29 | 30 | 31 | def load_wav( 32 | audio_path: Union[str, Path], 33 | sample_rate: int, 34 | trim: bool = False, 35 | shift: int = None, 36 | ) -> np.ndarray: 37 | """Load and preprocess waveform.""" 38 | wav = librosa.load(audio_path, sr=sample_rate)[0] 39 | 40 | shifted_wavs = None 41 | #if shift is not None: 42 | # shifted_wavs = [] 43 | # for i in [--9, -6, -3, 3, 6, 9]: 44 | # shifted_wav = pitch_shift(wav, sample_rate, i) 45 | # shifted_wav = shifted_wav / (np.abs(shifted_wav).max() + 1e-6) 46 | # shifted_wavs.append(shifted_wav) 47 | 48 | wav = wav / (np.abs(wav).max() + 1e-6) 49 | if trim: 50 | wav = trim_func(wav, sample_rate) 51 | # min_length = 1e100 52 | # for i in range(len(shifted_wavs)): 53 | # shifted_wavs[i] = trim_func(shifted_wavs[i], sample_rate) 54 | # min_length = min(len(shifted_wavs[i]), min_length) 55 | # for i in range(len(shifted_wavs)): 56 | # shifted_wavs[i] = shifted_wavs[i][:min_length] 57 | 58 | if shift is not None: 59 | return wav, shifted_wavs 60 | return wav 61 | 62 | 63 | def log_mel_spectrogram( 64 | x: np.ndarray, 65 | preemph: float, 66 | sample_rate: int, 67 | n_mels: int, 68 | n_fft: int, 69 | hop_length: int, 70 | win_length: int, 71 | f_min: int, 72 | center: bool 73 | ) -> np.ndarray: 74 | """Create a log Mel spectrogram from a raw audio signal.""" 75 | x = lfilter([1, -preemph], [1], x) 76 | magnitude = np.abs( 77 | librosa.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=center) 78 | ) 79 | mel_fb = librosa.filters.mel( 80 | sample_rate, n_fft, n_mels=n_mels, fmin=f_min 81 | ) 82 | mel_spec = np.dot(mel_fb, magnitude) 83 | log_mel_spec = np.log(mel_spec + 1e-9) 84 | return log_mel_spec.T 85 | 86 | 87 | def plot_mel(gt_mel, predicted_mel=None, filename="mel.png"): 88 | if predicted_mel is not None: 89 | fig, axes = plt.subplots(2, 1, squeeze=False, figsize=(10, 10)) 90 | else: 91 | fig, axes = plt.subplots(1, 1, squeeze=False, figsize=(10, 10)) 92 | 93 | axes[0][0].imshow(gt_mel.detach().cpu().numpy().T, origin="lower") 94 | axes[0][0].set_aspect(1, adjustable="box") 95 | axes[0][0].set_ylim(1.0, 80) 96 | axes[0][0].set_title("ground-truth mel-spectrogram", fontsize="medium") 97 | axes[0][0].tick_params(labelsize="x-small", left=False, labelleft=False) 98 | 99 | if predicted_mel is not None: 100 | axes[1][0].imshow(predicted_mel.detach().cpu().numpy(), origin="lower") 101 | axes[1][0].set_aspect(1.0, adjustable="box") 102 | axes[1][0].set_ylim(0, 80) 103 | axes[1][0].set_title("predicted mel-spectrogram", fontsize="medium") 104 | axes[1][0].tick_params(labelsize="x-small", left=False, labelleft=False) 105 | 106 | plt.tight_layout() 107 | plt.savefig(filename) 108 | plt.close() 109 | 110 | 111 | def plot_attn(attn, filename="attn.png", save=True): 112 | fig, axes = plt.subplots(len(attn), 1, squeeze=False, figsize=(10, 10)) 113 | 114 | for i, layer_attn in enumerate(attn): 115 | axes[i][0].imshow(attn[i][0].detach().cpu().numpy(), origin="lower") 116 | axes[i][0].set_title("layer {}".format(i), fontsize="medium") 117 | axes[i][0].tick_params(labelsize="x-small") 118 | axes[i][0].set_xlabel("target") 119 | axes[i][0].set_ylabel("source") 120 | 121 | plt.tight_layout() 122 | if save: 123 | plt.savefig(filename) 124 | if not save: 125 | return fig 126 | -------------------------------------------------------------------------------- /convert_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Convert multiple pairs.""" 3 | 4 | import warnings 5 | from pathlib import Path 6 | from functools import partial 7 | from multiprocessing import Pool, cpu_count 8 | 9 | import yaml 10 | import torch 11 | import numpy as np 12 | import soundfile as sf 13 | from argparse import ArgumentParser 14 | from tqdm import tqdm 15 | 16 | from data import load_wav, log_mel_spectrogram, plot_mel, plot_attn 17 | from data.feature_extract import FeatureExtractor 18 | from models import load_pretrained_wav2vec 19 | 20 | 21 | def parse_args(): 22 | """Parse command-line arguments.""" 23 | parser = ArgumentParser() 24 | parser.add_argument("info_path", type=str) 25 | parser.add_argument("output_dir", type=str, default=".") 26 | parser.add_argument("-c", "--ckpt_path", 27 | default="checkpoints/cpc-cpc.pt") 28 | parser.add_argument("-s", "--src_feat_name", default="cpc") 29 | parser.add_argument("-r", "--ref_feat_name", default="cpc") 30 | parser.add_argument("-w", "--wav2vec_path", 31 | default="checkpoints/wav2vec_small.pt") 32 | parser.add_argument("-v", "--vocoder_path", 33 | default="checkpoints/vocoder.pt") 34 | 35 | parser.add_argument("--sample_rate", type=int, default=16000) 36 | 37 | return vars(parser.parse_args()) 38 | 39 | 40 | def main( 41 | info_path, 42 | output_dir, 43 | ckpt_path, 44 | src_feat_name, 45 | ref_feat_name, 46 | wav2vec_path, 47 | vocoder_path, 48 | sample_rate, 49 | **kwargs, 50 | ): 51 | """Main function.""" 52 | 53 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 54 | src_feat_model = FeatureExtractor(src_feat_name, wav2vec_path, device) 55 | 56 | ref_feat_model = FeatureExtractor(ref_feat_name, wav2vec_path, device) 57 | 58 | print(f"[INFO] {src_feat_name} is loaded") 59 | 60 | model = torch.jit.load(ckpt_path).to(device).eval() 61 | print("[INFO] FragmentVC is loaded from", ckpt_path) 62 | 63 | vocoder = torch.jit.load(vocoder_path).to(device).eval() 64 | print("[INFO] Vocoder is loaded from", vocoder_path) 65 | 66 | path2wav = partial(load_wav, sample_rate=sample_rate, trim=True) 67 | 68 | with open(info_path) as f: 69 | infos = yaml.load(f, Loader=yaml.FullLoader) 70 | 71 | out_mels = [] 72 | attns = [] 73 | with Pool(cpu_count()) as pool: 74 | for pair_name, pair in tqdm(infos.items()): 75 | src_wav = load_wav(pair["source"], sample_rate, trim=True) 76 | src_wav = torch.FloatTensor(src_wav).to(device) 77 | 78 | tgt_wavs = pool.map(path2wav, pair["target"]) 79 | tgt_wavs = [torch.FloatTensor(tgt_wav).to(device) 80 | for tgt_wav in tgt_wavs] 81 | 82 | with torch.no_grad(): 83 | tgt_mels = ref_feat_model.get_feature(tgt_wavs) 84 | src_mel = (ref_feat_model.get_feature([src_wav])[0].transpose( 85 | 0, 1).unsqueeze(0).to(device)) 86 | tgt_mels = [tgt_mel.cpu() for tgt_mel in tgt_mels] 87 | tgt_mel = np.concatenate(tgt_mels, axis=0) 88 | tgt_mel = torch.FloatTensor(tgt_mel.T).unsqueeze(0).to(device) 89 | src_feat = src_feat_model.get_feature([src_wav])[ 90 | 0].unsqueeze(0) 91 | out_mel, attn = model(src_feat, tgt_mel) 92 | 93 | out_mel = out_mel.transpose(1, 2).squeeze(0) 94 | out_mels.append(out_mel) 95 | attns.append(attn) 96 | 97 | # print(f"[INFO] Pair {pair_name} converted") 98 | # out_mel: batch_size, time_stamp, mel_dim 99 | del model 100 | del src_feat_model 101 | del ref_feat_model 102 | print("[INFO] Generating waveforms...") 103 | batch_size = 10 104 | total = len(out_mels) 105 | out_wavs = [] 106 | pbar = tqdm(total=len(out_mels), ncols=0, unit="wavs") 107 | with torch.no_grad(): 108 | for i in range(0, total, batch_size): 109 | out_wavs.extend(vocoder.generate(out_mels[i:i+batch_size])) 110 | pbar.update(min(batch_size, total)) 111 | if total % batch_size != 0 and total > batch_size: 112 | out_wavs.extend(vocoder.generate( 113 | out_mels[total - total % batch_size:])) 114 | pbar.update(total % batch_size) 115 | pbar.close() 116 | 117 | print("[INFO] Waveforms generated") 118 | 119 | out_dir = Path(output_dir) 120 | out_dir.mkdir(parents=True, exist_ok=True) 121 | 122 | for pair_name, out_mel, out_wav, attn in tqdm(zip( 123 | infos.keys(), out_mels, out_wavs, attns 124 | )): 125 | out_wav = out_wav.cpu().numpy() 126 | out_path = Path(out_dir, pair_name) 127 | 128 | plot_mel(out_mel, filename=out_path.with_suffix(".mel.png")) 129 | plot_attn(attn, filename=out_path.with_suffix(".attn.png")) 130 | sf.write(out_path.with_suffix(".wav"), out_wav, sample_rate) 131 | 132 | 133 | if __name__ == "__main__": 134 | warnings.filterwarnings("ignore") 135 | main(**parse_args()) 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S2VC 2 | 3 | Here is the implementation of our paper [S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations](https://arxiv.org/abs/2104.02901). In this paper, we proposed S2VC which utilizes Self-Supervised pretrained representation to provide the latent phonetic structure of the utterance from the source speaker and the spectral features of the utterance from the target speaker. 4 | 5 | The following is the overall model architecture. 6 | 7 | ![Model architecture](docs/imgs/model.png) 8 | 9 | For the audio samples, please refer to our [demo page](https://howard1337.github.io/S2VC/). 10 | 11 | ## Usage 12 | 13 | You can download the pretrained model as well as the vocoder following the link under **Releases** section on the sidebar. 14 | 15 | The whole project was developed using Python 3.8, torch 1.7.1, and the pretrained model, as well as the vocoder, were turned to [TorchScript](https://pytorch.org/docs/stable/jit.html), so it's not guaranteed to be backward compatible. 16 | You can install the dependencies with 17 | 18 | ```bash 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | If you encounter any problems while installing *fairseq*, please refer to [pytorch/fairseq](https://github.com/pytorch/fairseq) for the installation instruction. 23 | 24 | ### Self-Supervised representations 25 | #### Wav2vec2 26 | In our implementation, we're using Wav2Vec 2.0 Base w/o finetuning which is trained on LibriSpeech. 27 | You can download the checkpoint [wav2vec_small.pt](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt) from [pytorch/fairseq](https://github.com/pytorch/fairseq). 28 | #### APC(Autoregressive Predictive Coding), CPC(Contrastive Predictive Coding) 29 | These two representations are extracted using this speech toolkit [S3PRL](https://github.com/s3prl/s3prl). 30 | You can check how to extract various representations from that repo. 31 | 32 | ### Vocoder 33 | 34 | The WaveRNN-based neural vocoder is from [yistLin/universal-vocoder](https://github.com/yistLin/universal-vocoder) which is based on the paper, [Towards achieving robust universal neural vocoding](https://arxiv.org/abs/1811.06292). 35 | 36 | ## Voice conversion with pretrained models 37 | 38 | You can convert an utterance from the source speaker with multiple utterances from the target speaker by preparing a conversion pairs information file in YAML format, like 39 | ```YAML 40 | # pairs_info.yaml 41 | pair1: 42 | source: VCTK-Corpus/wav48/p225/p225_001.wav 43 | target: 44 | - VCTK-Corpus/wav48/p227/p227_001.wav 45 | pair2: 46 | source: VCTK-Corpus/wav48/p225/p225_001.wav 47 | target: 48 | - VCTK-Corpus/wav48/p227/p227_002.wav 49 | - VCTK-Corpus/wav48/p227/p227_003.wav 50 | - VCTK-Corpus/wav48/p227/p227_004.wav 51 | ``` 52 | 53 | And convert multiple pairs at the same time, e.g. 54 | ```bash 55 | python convert_batch.py \ 56 | -w \ 57 | -v \ 58 | -c \ 59 | -s \ 60 | -r \ 61 | pairs_info.yaml \ 62 | outputs # the output directory of conversion results 63 | ``` 64 | 65 | After the conversion, the output directory, `outputs`, will be containing 66 | ```text 67 | pair1.wav 68 | pair1.mel.png 69 | pair1.attn.png 70 | pair2.wav 71 | pair2.mel.png 72 | pair2.attn.png 73 | ``` 74 | 75 | ## Train from scratch 76 | 77 | ### Preprocessing 78 | You can preprocess multiple corpora by passing multiple paths. 79 | But each path should be the directory that directly contains the speaker directories. 80 | And you have to specify the feature you want to extract. 81 | Currently, we support apc, cpc, wav2vec2, and timit_posteriorgram. 82 | i.e. 83 | ```bash 84 | python3 preprocess.py 85 | VCTK-Corpus/wav48 \ 86 | \ # more corpus if you want 87 | \ 88 | \ 89 | processed/ # the output directory of preprocessed features 90 | ``` 91 | After preprocessing, the output directory will be containing: 92 | ```text 93 | metadata.json 94 | utterance-000x7gsj.tar 95 | utterance-00wq7b0f.tar 96 | utterance-01lpqlnr.tar 97 | ... 98 | ``` 99 | 100 | You may need to preprocess multiple times for different features. 101 | i.e. 102 | ```bash 103 | python3 preprocess.py 104 | VCTK-Corpus/wav48 apc processed/apc 105 | python3 preprocess.py 106 | VCTK-Corpus/wav48 cpc processed/cpc 107 | ... 108 | ``` 109 | 110 | Then merge the metadata of different features. 111 | 112 | i.e. 113 | ```bash 114 | python3 merger.py processed 115 | ``` 116 | 117 | 118 | ### Training 119 | 120 | ```bash 121 | python train.py processed 122 | --save_dir ./ckpts \ 123 | -s \ 124 | -r 125 | ``` 126 | 127 | 128 | You can further specify `--preload` for preloading all training data into RAM to boost training speed. 129 | If `--comment ` is specified, e.g. `--comment CPC-CPC`, the training logs will be placed under a newly created directory like, `logs/2020-02-02_12:34:56_CPC-CPC`, otherwise there won't be any logging. 130 | For more details, you can refer to the usage by `python train.py -h`. 131 | -------------------------------------------------------------------------------- /data/intra_speaker_dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for reconstruction scheme.""" 2 | 3 | import json 4 | import random 5 | from pathlib import Path 6 | from copy import deepcopy 7 | from concurrent.futures import ThreadPoolExecutor 8 | 9 | import torch 10 | from tqdm import tqdm 11 | from torch.utils.data import Dataset 12 | from torch.nn.utils.rnn import pad_sequence 13 | 14 | import sox 15 | 16 | from .utils import load_wav, log_mel_spectrogram 17 | 18 | 19 | 20 | class IntraSpeakerDataset(Dataset): 21 | """Dataset for reconstruction scheme. 22 | 23 | Returns: 24 | speaker_id: speaker id number. 25 | feat: Wav2Vec feature tensor. 26 | mel: log mel spectrogram tensor. 27 | """ 28 | 29 | def __init__(self, data_dir, metadata_path, src_feat, ref_feat, n_samples=5, pre_load=False, training=True): 30 | with open(metadata_path, "r") as f: 31 | metadata = json.load(f) 32 | 33 | executor = ThreadPoolExecutor(max_workers=4) 34 | futures = [] 35 | for speaker_name, utterances in metadata.items(): 36 | for utterance in utterances: 37 | futures.append( 38 | executor.submit( 39 | _process_data, 40 | speaker_name, 41 | data_dir, 42 | utterance, 43 | pre_load, 44 | src_feat, 45 | ref_feat, 46 | ) 47 | ) 48 | 49 | self.data = [] 50 | self.speaker_to_indices = {} 51 | for i, future in enumerate(tqdm(futures, ncols=0)): 52 | result = future.result() 53 | speaker_name = result[0] 54 | self.data.append(result) 55 | if speaker_name not in self.speaker_to_indices: 56 | self.speaker_to_indices[speaker_name] = [i] 57 | else: 58 | self.speaker_to_indices[speaker_name].append(i) 59 | 60 | 61 | self.data_dir = Path(data_dir) 62 | self.n_samples = n_samples 63 | self.pre_load = pre_load 64 | self.training = training 65 | self.src_feat = src_feat 66 | self.ref_feat = ref_feat 67 | self.src_dim = -1 68 | self.ref_dim = -1 69 | self.tgt_dim = -1 70 | 71 | def __len__(self): 72 | return len(self.data) 73 | 74 | def _get_data(self, index): 75 | if self.pre_load: 76 | speaker_name, content_emb, target_emb, target_mel = self.data[index] 77 | else: 78 | speaker_name, content_emb, target_emb, target_mel = _load_data(*self.data[index]) 79 | self.src_dim = content_emb.shape[1] 80 | self.ref_dim = target_emb.shape[1] 81 | self.tgt_dim = target_mel.shape[1] 82 | 83 | return speaker_name, content_emb, target_emb, target_mel 84 | 85 | def __getitem__(self, index): 86 | speaker_name, content_emb, target_emb, target_mel = self._get_data(index) 87 | return content_emb, target_emb, target_mel 88 | 89 | def get_feat_dim(self): 90 | self._get_data(0) 91 | return self.src_dim, self.ref_dim, self.tgt_dim 92 | 93 | 94 | def _process_data(speaker_name, data_dir, feature, load, src_feat, ref_feat): 95 | _, src_feature_path, ref_feature_path = feature["audio_path"], feature[src_feat], feature[ref_feat] 96 | if load: 97 | return _load_data(speaker_name, data_dir, src_feature_path, ref_feature_path) 98 | else: 99 | return speaker_name, data_dir, src_feature_path, ref_feature_path 100 | 101 | 102 | def _load_data(speaker_name, data_dir, src_feature_path, ref_feature_path): 103 | src_feature = torch.load(Path(data_dir, src_feature_path), 'cpu') 104 | ref_feature = torch.load(Path(data_dir, ref_feature_path), 'cpu') 105 | content_emb = src_feature["feat"].detach().cpu() 106 | target_emb = ref_feature["feat"].detach().cpu() 107 | target_mel = src_feature["mel"].detach().cpu() 108 | return speaker_name, content_emb, target_emb, target_mel 109 | 110 | 111 | def collate_batch(batch): 112 | """Collate a batch of data.""" 113 | srcs, tgts, tgt_mels = zip(*batch) 114 | 115 | src_lens = [len(src) for src in srcs] 116 | tgt_lens = [len(tgt) for tgt in tgts] 117 | tgt_mel_lens = [len(tgt_mel) for tgt_mel in tgt_mels] 118 | 119 | overlap_lens = [ 120 | min(src_len, tgt_mel_len) for src_len, tgt_mel_len in zip(src_lens, tgt_mel_lens) 121 | ] 122 | 123 | srcs = pad_sequence(srcs, batch_first=True) 124 | 125 | src_masks = [torch.arange(srcs.size(1)) >= src_len for src_len in src_lens] 126 | src_masks = torch.stack(src_masks) 127 | 128 | tgts = pad_sequence(tgts, batch_first=True, padding_value=-20) 129 | tgts = tgts.transpose(1, 2) # (batch, mel_dim, max_tgt_len) 130 | 131 | tgt_masks = [torch.arange(tgts.size(2)) >= tgt_len for tgt_len in tgt_lens] 132 | tgt_masks = torch.stack(tgt_masks) # (batch, max_tgt_len) 133 | 134 | tgt_mels = pad_sequence(tgt_mels, batch_first=True, padding_value=-20) 135 | tgt_mels = tgt_mels.transpose(1, 2) # (batch, mel_dim, max_tgt_len) 136 | 137 | return srcs, src_masks, tgts, tgt_masks, tgt_mels, overlap_lens 138 | -------------------------------------------------------------------------------- /models/convolutional_transformer.py: -------------------------------------------------------------------------------- 1 | """Convolutional transsformer""" 2 | 3 | from typing import Optional, Tuple 4 | 5 | import torch.nn.functional as F 6 | from torch import Tensor, bmm 7 | from torch.nn import ( 8 | Module, 9 | Dropout, 10 | LayerNorm, 11 | Conv1d, 12 | MultiheadAttention, 13 | Sequential, 14 | Linear, 15 | ReLU, 16 | Sigmoid, 17 | InstanceNorm1d, 18 | ) 19 | from torch.nn.modules.linear import _LinearWithBias 20 | 21 | 22 | class Smoother(Module): 23 | """Convolutional Transformer Encoder Layer""" 24 | 25 | def __init__(self, d_model: int, nhead: int, d_hid: int, dropout=0.1): 26 | super(Smoother, self).__init__() 27 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) 28 | 29 | self.conv1 = Conv1d(d_model, d_hid, 9, padding=4) 30 | self.conv2 = Conv1d(d_hid, d_model, 1, padding=0) 31 | 32 | self.norm1 = LayerNorm(d_model) 33 | self.norm2 = LayerNorm(d_model) 34 | self.dropout1 = Dropout(dropout) 35 | self.dropout2 = Dropout(dropout) 36 | 37 | def forward( 38 | self, 39 | src: Tensor, 40 | src_mask: Optional[Tensor] = None, 41 | src_key_padding_mask: Optional[Tensor] = None, 42 | ) -> Tensor: 43 | # multi-head self attention 44 | src2 = self.self_attn( 45 | src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask 46 | )[0] 47 | 48 | # add & norm 49 | src = src + self.dropout1(src2) 50 | src = self.norm1(src) 51 | 52 | # conv1d 53 | src2 = src.transpose(0, 1).transpose(1, 2) 54 | src2 = self.conv2(F.relu(self.conv1(src2))) 55 | src2 = src2.transpose(1, 2).transpose(0, 1) 56 | 57 | # add & norm 58 | src = src + self.dropout2(src2) 59 | src = self.norm2(src) 60 | return src 61 | 62 | 63 | class Extractor(Module): 64 | """Convolutional Transformer Decoder Layer""" 65 | 66 | def __init__( 67 | self, 68 | d_model: int, 69 | nhead: int, 70 | d_hid: int, 71 | bottleneck_dim: int, 72 | dropout=0.1, 73 | no_residual=False, 74 | bottleneck=False, 75 | ): 76 | super(Extractor, self).__init__() 77 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) 78 | self.cross_attn = MultiheadAttention(bottleneck_dim, nhead, dropout=dropout) 79 | self.out_proj = _LinearWithBias(d_model, d_model) 80 | 81 | self.conv1 = Conv1d(d_model, d_hid, 9, padding=4) 82 | self.conv2 = Conv1d(d_hid, d_model, 1, padding=0) 83 | 84 | self.bottleneck = bottleneck 85 | self.tgt_bottleneck = Sequential( 86 | Linear(d_model, d_model), 87 | ReLU(), 88 | # InstanceNorm1d(d_model), 89 | Linear(d_model, bottleneck_dim), 90 | ) 91 | 92 | self.memory_bottleneck = Sequential( 93 | Linear(d_model, d_model), 94 | ReLU(), 95 | # InstanceNorm1d(d_model), 96 | Linear(d_model, bottleneck_dim), 97 | ) 98 | 99 | self.norm1 = LayerNorm(d_model) 100 | self.norm2 = LayerNorm(d_model) 101 | self.norm3 = LayerNorm(d_model) 102 | self.dropout1 = Dropout(dropout) 103 | self.dropout2 = Dropout(dropout) 104 | self.dropout3 = Dropout(dropout) 105 | 106 | self.no_residual = no_residual 107 | 108 | def forward( 109 | self, 110 | tgt: Tensor, 111 | memory: Tensor, 112 | tgt_mask: Optional[Tensor] = None, 113 | memory_mask: Optional[Tensor] = None, 114 | tgt_key_padding_mask: Optional[Tensor] = None, 115 | memory_key_padding_mask: Optional[Tensor] = None, 116 | ) -> Tuple[Tensor, Optional[Tensor]]: 117 | # multi-head self attention 118 | tgt2 = self.self_attn( 119 | tgt, tgt, tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask 120 | )[0] 121 | 122 | # add & norm 123 | tgt = tgt + self.dropout1(tgt2) 124 | tgt = self.norm1(tgt) 125 | 126 | # bottleneck feature of target and references 127 | if self.bottleneck: 128 | tgt_compat = self.tgt_bottleneck(tgt) 129 | memory_compact = self.memory_bottleneck(memory) 130 | else: 131 | tgt_compat = tgt 132 | memory_compact = memory 133 | 134 | # multi-head cross attention 135 | tgt2, attn = self.cross_attn( 136 | tgt_compat, 137 | memory_compact, 138 | memory_compact, 139 | attn_mask=memory_mask, 140 | key_padding_mask=memory_key_padding_mask, 141 | ) 142 | 143 | if self.bottleneck and attn is not None: 144 | memory = ( 145 | memory.contiguous() 146 | .view(memory.size(0), -1, memory.size(-1)) 147 | .transpose(0, 1) 148 | ) 149 | tgt2 = bmm(attn, memory) 150 | tgt2 = ( 151 | tgt2.transpose(0, 1) 152 | .contiguous() 153 | .view(-1, memory.size(0), memory.size(2)) 154 | ) 155 | tgt2 = F.linear(tgt2, self.out_proj.weight, self.out_proj.bias) 156 | # add & norm 157 | if self.no_residual: 158 | tgt = self.dropout2(tgt2) 159 | else: 160 | tgt = tgt + self.dropout2(tgt2) 161 | tgt = self.norm2(tgt) 162 | 163 | # conv1d 164 | tgt2 = tgt.transpose(0, 1).transpose(1, 2) 165 | tgt2 = self.conv2(F.relu(self.conv1(tgt2))) 166 | tgt2 = tgt2.transpose(1, 2).transpose(0, 1) 167 | 168 | # add & norm 169 | tgt = tgt + self.dropout3(tgt2) 170 | tgt = self.norm3(tgt) 171 | 172 | return tgt, attn 173 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | """FragmentVC model architecture.""" 2 | 3 | from typing import Tuple, List, Optional 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch import Tensor 9 | 10 | from .convolutional_transformer import Smoother, Extractor 11 | 12 | class S2VC(nn.Module): 13 | """ 14 | FragmentVC uses Wav2Vec feature of the source speaker to query and attend 15 | on mel spectrogram of the target speaker. 16 | """ 17 | 18 | def __init__(self, input_dim, ref_dim, d_model=512): 19 | super().__init__() 20 | self.unet = UnetBlock(d_model, input_dim, ref_dim) 21 | 22 | self.smoothers = nn.TransformerEncoder(Smoother(d_model, 2, 1024), num_layers=3) 23 | 24 | self.mel_linear = nn.Linear(d_model, 80) 25 | 26 | self.post_net = nn.Sequential( 27 | nn.Conv1d(80, 512, kernel_size=5, padding=2), 28 | nn.BatchNorm1d(512), 29 | nn.Tanh(), 30 | nn.Dropout(0.5), 31 | nn.Conv1d(512, 512, kernel_size=5, padding=2), 32 | nn.BatchNorm1d(512), 33 | nn.Tanh(), 34 | nn.Dropout(0.5), 35 | nn.Conv1d(512, 512, kernel_size=5, padding=2), 36 | nn.BatchNorm1d(512), 37 | nn.Tanh(), 38 | nn.Dropout(0.5), 39 | nn.Conv1d(512, 512, kernel_size=5, padding=2), 40 | nn.BatchNorm1d(512), 41 | nn.Tanh(), 42 | nn.Dropout(0.5), 43 | nn.Conv1d(512, 80, kernel_size=5, padding=2), 44 | nn.BatchNorm1d(80), 45 | nn.Dropout(0.5), 46 | ) 47 | 48 | def forward( 49 | self, 50 | srcs: Tensor, 51 | refs: Tensor, 52 | src_masks: Optional[Tensor] = None, 53 | ref_masks: Optional[Tensor] = None, 54 | ) -> Tuple[Tensor, List[Optional[Tensor]]]: 55 | """Forward function. 56 | 57 | Args: 58 | srcs: (batch, src_len, 768) 59 | src_masks: (batch, src_len) 60 | refs: (batch, 80, ref_len) 61 | ref_masks: (batch, ref_len) 62 | """ 63 | # out: (src_len, batch, d_model) 64 | out, attns = self.unet(srcs, refs, src_masks=src_masks, ref_masks=ref_masks) 65 | 66 | # out: (src_len, batch, d_model) 67 | out = self.smoothers(out, src_key_padding_mask=src_masks) 68 | 69 | # out: (src_len, batch, 80) 70 | out = self.mel_linear(out) 71 | 72 | # out: (batch, 80, src_len) 73 | out = out.transpose(1, 0).transpose(2, 1) 74 | refined = self.post_net(out) 75 | out = out + refined 76 | 77 | # out: (batch, 80, src_len) 78 | return out, attns 79 | 80 | 81 | 82 | class SelfAttentionPooling(nn.Module): 83 | """ 84 | Implementation of SelfAttentionPooling from https://gist.github.com/pohanchi/c77f6dbfbcbc21c5215acde4f62e4362 85 | Original Paper: Self-Attention Encoding and Pooling for Speaker Recognition 86 | https://arxiv.org/pdf/2008.01077v1.pdf 87 | """ 88 | def __init__(self, input_dim: int): 89 | super(SelfAttentionPooling, self).__init__() 90 | self.W = nn.Linear(input_dim, 1) 91 | self.softmax = nn.functional.softmax 92 | 93 | def forward(self, batch_rep: Tensor, att_mask: Optional[Tensor] = None): 94 | """ 95 | N: batch size, T: sequence length, H: Hidden dimension 96 | input: 97 | batch_rep : size (N, T, H) 98 | attention_weight: 99 | att_w : size (N, T, 1) 100 | return: 101 | utter_rep: size (N, H) 102 | """ 103 | att_logits = self.W(batch_rep).squeeze(-1) 104 | if att_mask is not None: 105 | att_logits = att_logits.masked_fill(att_mask, 1e-20) 106 | att_w = self.softmax(att_logits, dim=-1).unsqueeze(-1) 107 | utter_rep = torch.sum(batch_rep * att_w, dim=1) 108 | 109 | return utter_rep 110 | 111 | class SourceEncoder(nn.Module): 112 | def __init__(self, d_model: int, input_dim: int): 113 | super(SourceEncoder, self).__init__() 114 | # encoder_layer = nn.TransformerEncoderLayer(d_model, 2, 1024, 0.1) 115 | # self.encoder = nn.TransformerEncoder(encoder_layer, 6) 116 | 117 | self.lin1 = nn.Linear(input_dim, input_dim) 118 | self.lin2 = nn.Linear(input_dim, d_model) 119 | self.lin3 = nn.Linear(d_model, d_model) 120 | self.lin4 = nn.Linear(d_model, d_model) 121 | 122 | self.bn1 = nn.BatchNorm1d(input_dim) 123 | self.bn2 = nn.BatchNorm1d(d_model) 124 | self.bn3 = nn.BatchNorm1d(d_model) 125 | self.bn4 = nn.BatchNorm1d(d_model) 126 | 127 | self.dropout1 = nn.Dropout(0.0) 128 | self.dropout2 = nn.Dropout(0.0) 129 | self.dropout3 = nn.Dropout(0.0) 130 | self.dropout4 = nn.Dropout(0.0) 131 | 132 | self.SAP = SelfAttentionPooling(d_model) 133 | self.proj = nn.Linear(d_model, d_model) 134 | torch.nn.init.xavier_uniform_( 135 | self.proj.weight, gain=torch.nn.init.calculate_gain('linear') 136 | ) 137 | 138 | def forward(self, srcs: Tensor, refs: Tensor, src_masks: Optional[Tensor] = None, ref_masks: Optional[Tensor] = None): 139 | tgt = F.relu(self.lin1(srcs)).transpose(1, 2) 140 | tgt = self.dropout1(self.bn1(tgt)).transpose(1, 2) 141 | 142 | tgt = F.relu(self.lin2(tgt)).transpose(1, 2) 143 | tgt = self.dropout2(self.bn2(tgt)).transpose(1, 2) 144 | 145 | tgt = F.relu(self.lin3(tgt)).transpose(1, 2) 146 | tgt = self.dropout3(self.bn3(tgt)).transpose(1, 2) 147 | 148 | tgt = F.relu(self.lin4(tgt)).transpose(1, 2) 149 | tgt = self.dropout4(self.bn4(tgt)).transpose(1, 2) 150 | 151 | spk_embed = F.relu(self.proj(self.SAP(refs.transpose(1, 2), ref_masks))).unsqueeze(1) 152 | tgt *= spk_embed 153 | 154 | # tgt = self.encoder(tgt, src_masks) 155 | return tgt 156 | 157 | 158 | class UnetBlock(nn.Module): 159 | """Hierarchically attend on references.""" 160 | 161 | def __init__(self, d_model: int, input_dim: int, ref_dim: int): 162 | super(UnetBlock, self).__init__() 163 | self.conv1 = nn.Conv1d(ref_dim, d_model, 3, padding=1, padding_mode="replicate") 164 | self.conv2 = nn.Conv1d(d_model, d_model, 3, padding=1, padding_mode="replicate") 165 | self.conv3 = nn.Conv1d(d_model, d_model, 3, padding=1, padding_mode="replicate") 166 | 167 | use_bottleneck = True 168 | bottleneck_dim = 4 169 | n_head = 2 170 | self.extractor1 = Extractor( 171 | d_model, n_head, 1024, bottleneck_dim, no_residual=True, bottleneck=use_bottleneck, 172 | ) 173 | 174 | self.src_encoder = SourceEncoder(d_model, input_dim) 175 | def forward( 176 | self, 177 | srcs: Tensor, 178 | refs: Tensor, 179 | src_masks: Optional[Tensor] = None, 180 | ref_masks: Optional[Tensor] = None, 181 | ) -> Tuple[Tensor, List[Optional[Tensor]]]: 182 | """Forward function. 183 | 184 | Args: 185 | srcs: (batch, 80, src_len) 186 | src_masks: (batch, src_len) 187 | refs: (batch, 80, ref_len) 188 | ref_masks: (batch, ref_len) 189 | """ 190 | 191 | # tgt: (batch, mel_len, bottleneck_dim) 192 | 193 | # tgt: (tgt_len, batch, bottleneck_dim) 194 | 195 | # ref*: (batch, d_model, mel_len) 196 | ref1 = self.conv1(refs) 197 | ref2 = self.conv2(F.relu(ref1)) 198 | ref3 = self.conv3(F.relu(ref2)) 199 | 200 | tgt = self.src_encoder(srcs, ref3, src_masks, ref_masks) 201 | tgt = tgt.transpose(0, 1) 202 | 203 | # out*: (tgt_len, batch, d_model) 204 | out, attn1 = self.extractor1( 205 | tgt, 206 | ref3.transpose(1, 2).transpose(0, 1), 207 | tgt_key_padding_mask=src_masks, 208 | memory_key_padding_mask=ref_masks, 209 | ) 210 | return out, [attn1] 211 | 212 | -------------------------------------------------------------------------------- /docs/styles.css: -------------------------------------------------------------------------------- 1 | html { 2 | background-color: lightgrey; 3 | font-family: sans-serif; 4 | -webkit-text-size-adjust: 100%; 5 | -ms-text-size-adjust: 100%; 6 | margin: 0; 7 | padding: 0; 8 | } 9 | 10 | body { 11 | background-color : lightgrey; 12 | margin: auto; 13 | width: 100%; 14 | min-width: 1200px; 15 | max-width: 2000px; 16 | height: 100%; 17 | padding: 0; 18 | } 19 | 20 | .container{ 21 | position: relative; 22 | /* background: rgb(22, 38, 67); For browsers that do not support gradients */ 23 | background: -webkit-linear-gradient(color1, color2); /* For Safari 5.1 to 6.0 /* 24 | /* background: -o-linear-gradient(color1, color2); /* For Opera 11.1 to 12.0 */ 25 | /* background: -moz-linear-gradient(color1, color2); /* For Firefox 3.6 to 15 */ 26 | /* background: linear-gradient(color1, color2); /* Standard syntax */ 27 | background-size: cover; 28 | height: auto; 29 | padding: 2%; 30 | } 31 | 32 | .footer-container{ 33 | position: relative; 34 | background-image: url("../img/pattern2.png") ; 35 | background-size: cover; 36 | height: auto; 37 | padding: 30px 30px; 38 | } 39 | 40 | a { 41 | color: white; 42 | } 43 | 44 | #img1{ 45 | z-index: 100; 46 | position: absolute; 47 | left: 10%; 48 | top: 15%; 49 | border-radius: 50%; 50 | height: 70%; 51 | width: auto; 52 | box-shadow: 10px 5px 5px gray; 53 | } 54 | 55 | #img3{ 56 | width: 90%; 57 | align-content: center; 58 | height: auto; 59 | padding: 2%; 60 | } 61 | 62 | #cat{ 63 | display: inline-block; 64 | float: left; 65 | width: 42%; 66 | height: auto; 67 | padding-left: 5%; 68 | } 69 | 70 | #cat_2{ 71 | display: inline-block; 72 | float: right; 73 | width: 42%; 74 | height: auto; 75 | padding-right: 5%; 76 | } 77 | 78 | #text1{ 79 | z-index:100; 80 | position: relative; 81 | color: white; 82 | font-size: 40px; 83 | font-weight: bold; 84 | text-align: center; 85 | margin: 2%; 86 | } 87 | 88 | #intro{ 89 | z-index:100; 90 | color: white; 91 | font-size: 22px; 92 | text-align: center; 93 | } 94 | 95 | #text2{ 96 | color: white; 97 | font-size: 30px; 98 | font-weight: bold; 99 | text-align: center; 100 | padding: 20px; 101 | } 102 | 103 | #footnote{ 104 | color: white; 105 | font-size: 20px; 106 | text-align: center; 107 | padding-bottom: 20px; 108 | } 109 | 110 | #area1{ 111 | width:100%; 112 | height:100px; 113 | } 114 | 115 | .clear{ 116 | clear:both; 117 | } 118 | 119 | .img-circle { 120 | border-radius: 50%; 121 | } 122 | 123 | .content-container{ 124 | background-color: white; 125 | padding: 40px 40px; 126 | text-align: left; 127 | font-size: 20px; 128 | margin-bottom: 30px; 129 | display: block; 130 | } 131 | 132 | .content-title{ 133 | font-size: 30px; 134 | color: rgb(22, 38, 67); 135 | text-align: center; 136 | padding-bottom: 20px; 137 | font-weight: bold; 138 | } 139 | 140 | nav{ 141 | margin-bottom: 76px; 142 | } 143 | 144 | .nav-button { 145 | background-color: #999999; 146 | width: 50%; 147 | padding: 10px 0; 148 | text-decoration: none; 149 | text-align: center; 150 | font-size: 20px; 151 | color: white; 152 | float: left; 153 | cursor: pointer; 154 | } 155 | 156 | .nav-button:hover { 157 | background-color: #666666; 158 | } 159 | 160 | .icon { 161 | background-color: white; 162 | height: 40px; 163 | width: 40px; 164 | padding: 0 0; 165 | margin-right: 5px; 166 | display: inline-block; 167 | position: center; 168 | cursor: pointer; 169 | border-radius: 50%; 170 | } 171 | 172 | .icon-container{ 173 | text-align:center; 174 | margin: 10px 10px; 175 | } 176 | 177 | /* The following is for projects.php*/ 178 | .project-container{ 179 | display: block; 180 | height: auto; 181 | width: 100%; 182 | margin-bottom: 30px; 183 | } 184 | .project-image-container{ 185 | display: inline-block; 186 | vertical-align: top; 187 | width: 30%; 188 | height: auto; 189 | margin-right: 3%; 190 | } 191 | .project-logo-container{ 192 | display: inline-block; 193 | vertical-align: top; 194 | width: 4%; 195 | height: auto; 196 | margin-right: 2%; 197 | } 198 | .project-text-container{ 199 | display: inline-block; 200 | vertical-align: top; 201 | width: 60%; 202 | height: auto; 203 | margin-left: 3%; 204 | } 205 | .project-full-text-container{ 206 | display: inline-block; 207 | vertical-align: top; 208 | width: 90%; 209 | height: auto; 210 | margin-left: 2%; 211 | } 212 | .project-video-container{ 213 | width: 70%; 214 | height: auto; 215 | margin-left: 15%; 216 | margin-right: 15%; 217 | margin-top: 30px; 218 | margin-bottom: 30px; 219 | display: block; 220 | } 221 | .project-year-container{ 222 | background-color: rgb(22, 38, 67); /*#FFC064*/ 223 | padding: 5px; 224 | color: white; 225 | font-size: 36px; 226 | font-weight: bold; 227 | text-align: center; 228 | } 229 | 230 | table { 231 | text-align: center; 232 | align-content: center; 233 | vertical-align: middle; 234 | border-collapse: collapse; 235 | width: 100%; 236 | } 237 | 238 | th, td{ 239 | width: 65px; 240 | padding: 1%; 241 | text-align: center; 242 | align-content: center; 243 | vertical-align: middle; 244 | border-collapse: collapse; 245 | } 246 | 247 | .play-button{ 248 | width: 50px; 249 | height: auto; 250 | display: block; 251 | } 252 | 253 | .play-button-demo{ 254 | width: 200px; 255 | height: auto; 256 | display: block; 257 | } 258 | 259 | /* End here */ 260 | 261 | h1 { 262 | text-align: center; 263 | color: rgb(22, 38, 67); 264 | } 265 | 266 | .icon-container > a { 267 | color: transparent; 268 | } 269 | 270 | img { 271 | display: block; 272 | width: 100%; 273 | height: 100%; 274 | margin-left: auto; 275 | margin-right: auto; 276 | } 277 | 278 | hr { 279 | color: rgb(22, 38, 67); 280 | size: 10px; 281 | } 282 | 283 | .option-div{ 284 | font-size: 24px; 285 | color: rgb(22, 38, 67); 286 | font-weight: bold; 287 | } 288 | 289 | .option-div option{ 290 | font-size: 20px; 291 | } 292 | 293 | .fa-play:before { 294 | content: "\f04b" 295 | } 296 | 297 | .fa-pause:before { 298 | content: "\f04c" 299 | } 300 | 301 | .fa-stop:before { 302 | content: "\f04d" 303 | } 304 | 305 | .fa { 306 | display: inline-block; 307 | font: normal normal normal 14px / 1 FontAwesome; 308 | font-size: inherit; 309 | text-rendering: auto; 310 | -webkit-font-smoothing: antialiased; 311 | -moz-osx-font-smoothing: grayscale 312 | } 313 | 314 | .btn { 315 | display: inline-block; 316 | font-weight: 400; 317 | text-align: center; 318 | white-space: nowrap; 319 | vertical-align: middle; 320 | -webkit-user-select: none; 321 | -moz-user-select: none; 322 | -ms-user-select: none; 323 | user-select: none; 324 | border: 1px solid transparent; 325 | padding: .375rem .75rem; 326 | font-size: 1rem; 327 | line-height: 1.5; 328 | border-radius: .25rem; 329 | transition: color .15s ease-in-out, background-color .15s ease-in-out, border-color .15s ease-in-out, box-shadow .15s ease-in-out 330 | } 331 | 332 | @media screen and (prefers-reduced-motion:reduce) { 333 | .btn { 334 | transition: none 335 | } 336 | } 337 | 338 | .btn:focus, .btn:hover { 339 | text-decoration: none 340 | } 341 | 342 | .btn.focus, .btn:focus { 343 | outline: 0; 344 | box-shadow: 0 0 0 .2rem rgba(0, 123, 255, .25) 345 | } 346 | 347 | .btn.disabled, .btn:disabled { 348 | opacity: .65 349 | } 350 | 351 | .btn:not(:disabled):not(.disabled) { 352 | cursor: pointer 353 | } 354 | 355 | a.btn.disabled, fieldset:disabled a.btn { 356 | pointer-events: none 357 | } 358 | 359 | .btn-primary { 360 | color: #fff; 361 | background-color: rgb(22, 38, 67); 362 | border-color: rgb(22, 38, 67) 363 | } 364 | 365 | .btn-primary:hover { 366 | color: #fff; 367 | background-color: rgb(22, 38, 67); 368 | border-color: rgb(22, 38, 67) 369 | } 370 | 371 | .btn-primary.focus, .btn-primary:focus { 372 | box-shadow: 0 0 0 .2rem rgba(22, 38, 67, .5) 373 | } 374 | 375 | .btn-primary.disabled, .btn-primary:disabled { 376 | color: #fff; 377 | background-color: rgb(22, 38, 67); 378 | border-color: rgb(22, 38, 67) 379 | } 380 | 381 | .btn-primary:not(:disabled):not(.disabled).active, .btn-primary:not(:disabled):not(.disabled):active, 382 | .show > .btn-primary.dropdown-toggle { 383 | color: #fff; 384 | background-color: rgb(22, 38, 67); 385 | border-color: rgb(22, 38, 67) 386 | } 387 | 388 | .btn-primary:not(:disabled):not(.disabled).active:focus, .btn-primary:not(:disabled):not(.disabled):active:focus, 389 | .show > .btn-primary.dropdown-toggle:focus { 390 | box-shadow: 0 0 0 .2rem rgba(22, 38, 67, .5) 391 | } 392 | 393 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Train S2VC model.""" 3 | 4 | import argparse 5 | import datetime 6 | import random 7 | from pathlib import Path 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch.optim import AdamW 12 | from torch.utils.data import DataLoader, random_split 13 | from torch.utils.tensorboard import SummaryWriter 14 | from tqdm import tqdm 15 | import numpy as np 16 | 17 | from data import IntraSpeakerDataset, collate_batch, plot_attn 18 | from models import S2VC, get_cosine_schedule_with_warmup 19 | 20 | random.seed(42) 21 | torch.manual_seed(42) 22 | torch.cuda.manual_seed(42) 23 | torch.cuda.manual_seed_all(42) 24 | np.random.seed(42) 25 | 26 | 27 | def parse_args(): 28 | """Parse command-line arguments.""" 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("data_dir", type=str) 31 | parser.add_argument("--save_dir", type=str, default=".") 32 | parser.add_argument("--total_steps", type=int, default=250000) 33 | parser.add_argument("--warmup_steps", type=int, default=100) 34 | parser.add_argument("--valid_steps", type=int, default=1000) 35 | parser.add_argument("--log_steps", type=int, default=100) 36 | parser.add_argument("--save_steps", type=int, default=10000) 37 | parser.add_argument("--n_samples", type=int, default=10) 38 | parser.add_argument("--accu_steps", type=int, default=2) 39 | parser.add_argument("--batch_size", type=int, default=6) 40 | parser.add_argument("--n_workers", type=int, default=8) 41 | parser.add_argument('-s', "--src_feat", type=str, default='cpc') 42 | parser.add_argument('-r', "--ref_feat", type=str, default='cpc') 43 | parser.add_argument("--preload", action="store_true") 44 | parser.add_argument("--lr_reduction", action="store_true") 45 | parser.add_argument("--comment", type=str) 46 | 47 | 48 | return vars(parser.parse_args()) 49 | 50 | 51 | def model_fn(batch, model, criterion, device): 52 | """Forward a batch through model.""" 53 | 54 | srcs, src_masks, tgts, tgt_masks, tgt_mels, overlap_lens = batch 55 | 56 | srcs = srcs.to(device) 57 | src_masks = src_masks.to(device) 58 | tgts = tgts.to(device) 59 | tgt_masks = tgt_masks.to(device) 60 | tgt_mels = tgt_mels.to(device) 61 | 62 | refs = tgts 63 | ref_masks = tgt_masks 64 | 65 | outs, attns = model(srcs, refs, src_masks=src_masks, ref_masks=ref_masks) 66 | 67 | losses = [] 68 | for out, tgt_mel, attn, overlap_len in zip(outs.unbind(), tgt_mels.unbind(), attns[-1], overlap_lens): 69 | loss = criterion(out[:, :overlap_len], tgt_mel[:, :overlap_len]) 70 | losses.append(loss) 71 | try: 72 | attns_plot = [] 73 | for i in range(len(attns)): 74 | attns_plot.append(attns[i][0][:overlap_lens[0], :overlap_lens[0]]) 75 | except: 76 | pass 77 | 78 | 79 | return sum(losses) / len(losses), attns_plot 80 | 81 | 82 | def valid(dataloader, model, criterion, device): 83 | """Validate on validation set.""" 84 | 85 | model.eval() 86 | running_loss = 0.0 87 | pbar = tqdm(total=len(dataloader.dataset), ncols=0, desc="Valid", unit=" uttr") 88 | 89 | for i, batch in enumerate(dataloader): 90 | with torch.no_grad(): 91 | loss, attns = model_fn(batch, model, criterion, device) 92 | running_loss += loss.item() 93 | 94 | pbar.update(dataloader.batch_size) 95 | pbar.set_postfix(loss=f"{running_loss / (i+1):.2f}") 96 | 97 | pbar.close() 98 | model.train() 99 | 100 | return running_loss / len(dataloader), attns 101 | 102 | 103 | def main( 104 | data_dir, 105 | save_dir, 106 | total_steps, 107 | warmup_steps, 108 | valid_steps, 109 | log_steps, 110 | save_steps, 111 | n_samples, 112 | accu_steps, 113 | batch_size, 114 | n_workers, 115 | src_feat, 116 | ref_feat, 117 | preload, 118 | lr_reduction, 119 | comment, 120 | ): 121 | """Main function.""" 122 | 123 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 124 | 125 | metadata_path = Path(data_dir) / "metadata.json" 126 | 127 | dataset = IntraSpeakerDataset( 128 | data_dir, metadata_path, src_feat, ref_feat, n_samples, preload 129 | ) 130 | input_dim, ref_dim, tgt_dim = dataset.get_feat_dim() 131 | lengths = [trainlen := int(0.9 * len(dataset)), len(dataset) - trainlen] 132 | trainset, validset = random_split(dataset, lengths) 133 | print(f'Input dim: {input_dim}, Reference dim: {ref_dim}, Target dim: {tgt_dim}') 134 | model = S2VC(input_dim, ref_dim).to(device) 135 | model = torch.jit.script(model) 136 | 137 | train_loader = DataLoader( 138 | trainset, 139 | batch_size=batch_size, 140 | shuffle=True, 141 | drop_last=True, 142 | num_workers=n_workers, 143 | pin_memory=True, 144 | collate_fn=collate_batch, 145 | ) 146 | valid_loader = DataLoader( 147 | validset, 148 | batch_size=batch_size * accu_steps, 149 | num_workers=n_workers, 150 | drop_last=True, 151 | pin_memory=True, 152 | # shuffle to make the plot on tensorboard differenct 153 | shuffle=True, 154 | collate_fn=collate_batch, 155 | ) 156 | train_iterator = iter(train_loader) 157 | 158 | if comment is not None: 159 | log_dir = "logs/" 160 | log_dir += datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") 161 | log_dir += "_" + comment 162 | writer = SummaryWriter(log_dir) 163 | 164 | save_dir_path = Path(save_dir) 165 | save_dir_path.mkdir(parents=True, exist_ok=True) 166 | 167 | 168 | learning_rate = 5e-5 169 | criterion = nn.L1Loss() 170 | optimizer = AdamW(model.parameters(), lr=learning_rate) 171 | scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps) 172 | 173 | best_loss = float("inf") 174 | best_state_dict = None 175 | 176 | pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step") 177 | 178 | for step in range(total_steps): 179 | if step == 40002: 180 | file = open('completed.txt', 'a') 181 | print(f'{comment} completed', file=file) 182 | break 183 | batch_loss = 0.0 184 | 185 | for _ in range(accu_steps): 186 | try: 187 | batch = next(train_iterator) 188 | except StopIteration: 189 | train_iterator = iter(train_loader) 190 | batch = next(train_iterator) 191 | 192 | loss, attns = model_fn(batch, model, criterion, device) 193 | loss = loss / accu_steps 194 | batch_loss += loss.item() 195 | loss.backward() 196 | 197 | optimizer.step() 198 | scheduler.step() 199 | optimizer.zero_grad() 200 | 201 | pbar.update() 202 | pbar.set_postfix(loss=f"{batch_loss:.2f}", step=step + 1) 203 | 204 | if step % log_steps == 0 and comment is not None: 205 | writer.add_scalar("Loss/train", batch_loss, step) 206 | try: 207 | attn = [attns[i].unsqueeze(0) for i in range(len(attns))] 208 | figure = plot_attn(attn, save=False) 209 | writer.add_figure(f"Image/Train-Attentions.png", figure, step + 1) 210 | except: 211 | pass 212 | 213 | if (step + 1) % valid_steps == 0: 214 | pbar.close() 215 | 216 | valid_loss, attns = valid(valid_loader, model, criterion, device) 217 | 218 | if comment is not None: 219 | writer.add_scalar("Loss/valid", valid_loss, step + 1) 220 | try: 221 | attn = [attns[i].unsqueeze(0) for i in range(len(attns))] 222 | figure = plot_attn(attn, save=False) 223 | writer.add_figure(f"Image/Valid-Attentions.png", figure, step + 1) 224 | except: 225 | pass 226 | 227 | if valid_loss < best_loss: 228 | best_loss = valid_loss 229 | best_state_dict = model.state_dict() 230 | 231 | pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step") 232 | 233 | if (step + 1) % save_steps == 0 and best_state_dict is not None: 234 | loss_str = f"{best_loss:.4f}".replace(".", "dot") 235 | best_ckpt_name = f"retriever-best-loss{loss_str}.pt" 236 | 237 | loss_str = f"{valid_loss:.4f}".replace(".", "dot") 238 | curr_ckpt_name = f"retriever-step{step+1}-loss{loss_str}.pt" 239 | 240 | current_state_dict = model.state_dict() 241 | model.cpu() 242 | 243 | model.load_state_dict(best_state_dict) 244 | model.save(str(save_dir_path / best_ckpt_name)) 245 | 246 | model.load_state_dict(current_state_dict) 247 | model.save(str(save_dir_path / curr_ckpt_name)) 248 | 249 | model.to(device) 250 | pbar.write(f"Step {step + 1}, best model saved. (loss={best_loss:.4f})") 251 | 252 | 253 | pbar.close() 254 | 255 | 256 | if __name__ == "__main__": 257 | main(**parse_args()) 258 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations 7 | 8 | 9 | 10 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 |

Audio Demo

35 |

S2VC: A Framework for Any-to-Any Voice Conversion with Self-SupervisedPretrained 36 | Representations

37 |
38 |

39 | Abstract: 40 | Any-to-any voice conversion (VC) aims to convert the timbre of utterances from and to any speakers seen 41 | or unseen during training. Various any-to-any VC approaches have been proposed like AUTOVC, AdaINVC, and 42 | FragmentVC. AUTOVC, and AdaINVC utilize source and target encoders to disentangle the content and 43 | speaker information of the features. FragmentVC utilizes two encoders to encode source and target 44 | information and adopts cross attention to align the source and target features with similar phonetic 45 | content. Moreover, pre-trained features are adopted. AUTOVC used dvector to extract speaker information, 46 | and self-supervised learning (SSL) features like wav2vec 2.0 is used in FragmentVC to extract the 47 | phonetic content information. Different from previous works, we proposed S2VC that utilizes 48 | Self-Supervised features as both source and target features for VC model. Supervised phoneme 49 | posteriororgram (PPG), which is believed to be speaker-independent and widely used in VC to extract 50 | content information, is chosen as a strong baseline for SSL features. The objective evaluation and 51 | subjective evaluation both show models taking SSL feature CPC as both source and target features 52 | outperforms that taking PPG as source feature, suggesting that SSL features have great potential in 53 | improving VC. 54 |

55 |

56 | 57 | arXiv (Preprint) 58 |

59 |
60 |
61 | 62 | 63 | 64 | 65 | 66 |
67 |
Samples
68 |
69 |
70 |
71 |
    72 |
73 |
74 |
75 | 76 | 80 |
81 | 82 |
83 | 84 | 90 |
91 | 92 |
93 | 94 | 98 |
99 |
100 | 101 | 108 |
109 |
110 |
111 | 112 |
Loading......
113 | 114 | 121 | 122 | 463 | 464 | 465 |
466 |

© 台大語音實驗室 NTU Speech Lab

467 |
468 | 469 | 470 | 471 | --------------------------------------------------------------------------------