├── losses ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── enh_loss.cpython-39.pyc │ ├── basic_loss.cpython-37.pyc │ ├── basic_loss.cpython-38.pyc │ ├── basic_loss.cpython-39.pyc │ ├── generator_loss.cpython-37.pyc │ ├── generator_loss.cpython-38.pyc │ ├── generator_loss.cpython-39.pyc │ ├── discriminator_loss.cpython-37.pyc │ ├── discriminator_loss.cpython-38.pyc │ └── discriminator_loss.cpython-39.pyc ├── discriminator_loss.py ├── generator_loss.py └── basic_loss.py ├── models ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── soundstream.cpython-37.pyc │ ├── soundstream.cpython-38.pyc │ ├── soundstream.cpython-39.pyc │ ├── soundstream_semantic.cpython-38.pyc │ └── soundstream_semantic.cpython-39.pyc ├── soundstream.py ├── soundstream2.py ├── soundstream_semantic.py └── msstftd.py ├── utils ├── __init__.py ├── __pycache__ │ ├── utils.cpython-37.pyc │ ├── utils.cpython-38.pyc │ ├── utils.cpython-39.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── ddp_utils.cpython-38.pyc │ ├── ddp_utils.cpython-39.pyc │ ├── hifigan_mel.cpython-37.pyc │ ├── hifigan_mel.cpython-38.pyc │ └── hifigan_mel.cpython-39.pyc └── hifigan_mel.py ├── dataloaders ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── base_dataloader.cpython-37.pyc │ ├── base_dataloader.cpython-38.pyc │ └── base_dataloader.cpython-39.pyc └── base_dataloader.py ├── distributed ├── __init__.py ├── __pycache__ │ ├── launch.cpython-37.pyc │ ├── launch.cpython-38.pyc │ ├── launch.cpython-39.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── distributed.cpython-37.pyc │ ├── distributed.cpython-38.pyc │ └── distributed.cpython-39.pyc ├── launch.py └── distributed.py ├── modules ├── commons │ ├── 
__init__.py │ ├── __pycache__ │ │ ├── ops.cpython-37.pyc │ │ ├── ops.cpython-38.pyc │ │ ├── ops.cpython-39.pyc │ │ ├── pqmf.cpython-37.pyc │ │ ├── pqmf.cpython-38.pyc │ │ ├── pqmf.cpython-39.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── torch_stft.cpython-37.pyc │ │ ├── torch_stft.cpython-38.pyc │ │ ├── torch_stft.cpython-39.pyc │ │ ├── base_layers.cpython-37.pyc │ │ ├── base_layers.cpython-38.pyc │ │ └── base_layers.cpython-39.pyc │ └── pqmf.py ├── discriminators │ ├── __init__.py │ ├── __pycache__ │ │ ├── mrd.cpython-37.pyc │ │ ├── mrd.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── combd_sbd.cpython-37.pyc │ │ ├── combd_sbd.cpython-38.pyc │ │ ├── period_discriminator.cpython-37.pyc │ │ ├── period_discriminator.cpython-38.pyc │ │ ├── period_discriminator.cpython-39.pyc │ │ ├── scale_discriminator.cpython-37.pyc │ │ ├── scale_discriminator.cpython-38.pyc │ │ ├── scale_discriminator.cpython-39.pyc │ │ ├── frequency_discriminator.cpython-37.pyc │ │ ├── frequency_discriminator.cpython-38.pyc │ │ └── frequency_discriminator.cpython-39.pyc │ └── frequency_discriminator.py ├── __pycache__ │ ├── conv.cpython-37.pyc │ ├── conv.cpython-38.pyc │ ├── conv.cpython-39.pyc │ ├── lstm.cpython-37.pyc │ ├── lstm.cpython-38.pyc │ ├── lstm.cpython-39.pyc │ ├── norm.cpython-37.pyc │ ├── norm.cpython-38.pyc │ ├── norm.cpython-39.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── seanet.cpython-37.pyc │ ├── seanet.cpython-38.pyc │ ├── seanet.cpython-39.pyc │ ├── transformer.cpython-37.pyc │ ├── transformer.cpython-38.pyc │ ├── transformer.cpython-39.pyc │ ├── semantic_module.cpython-38.pyc │ └── semantic_module.cpython-39.pyc ├── __init__.py ├── lstm.py ├── norm.py ├── transformer.py └── loss.py ├── descriptaudiocodec ├── .gitattributes ├── tests │ ├── __init__.py │ ├── test_cli.py │ └── test_train.py ├── 
dac │ ├── compare │ │ ├── __init__.py │ │ └── encodec.py │ ├── nn │ │ ├── __init__.py │ │ └── layers.py │ ├── model │ │ ├── __init__.py │ │ └── discriminator.py │ ├── __init__.py │ ├── __main__.py │ └── utils │ │ ├── decode.py │ │ ├── encode.py │ │ └── __init__.py ├── .dockerignore ├── conf │ ├── ablations │ │ ├── baseline.yml │ │ ├── no-adv.yml │ │ ├── no-mb.yml │ │ ├── no-mpd.yml │ │ ├── no-low-hop.yml │ │ ├── no-mpd-msd.yml │ │ ├── diff-mb.yml │ │ ├── equal-mb.yml │ │ ├── only-speech.yml │ │ └── no-data-balance.yml │ ├── quantizer │ │ ├── 2d.yml │ │ ├── 4d.yml │ │ ├── 24kbps.yml │ │ ├── 256d.yml │ │ ├── 32d.yml │ │ ├── 512d.yml │ │ ├── dropout-0.0.yml │ │ ├── dropout-0.5.yml │ │ └── dropout-0.25.yml │ ├── size │ │ ├── medium.yml │ │ └── small.yml │ ├── 1gpu.yml │ ├── downsampling │ │ ├── 128x.yml │ │ ├── 1024x.yml │ │ ├── 1536x.yml │ │ └── 768x.yml │ ├── base.yml │ └── final │ │ ├── 16khz.yml │ │ ├── 24khz.yml │ │ ├── 44khz.yml │ │ └── 44khz-16kbps.yml ├── requirements.txt ├── Dockerfile ├── Dockerfile.dev ├── .pre-commit-config.yaml ├── docker-compose.yml ├── LICENSE ├── scripts │ ├── compute_entropy.py │ ├── save_test_set.py │ ├── organize_daps.py │ ├── get_samples.py │ ├── evaluate.py │ └── mushra.py ├── setup.py ├── .gitignore └── README.md ├── exp.png ├── fig1.png ├── test_audio ├── music.wav ├── speech_cn.wav └── speech_en.flac ├── quantization ├── __pycache__ │ ├── vq.cpython-37.pyc │ ├── vq.cpython-38.pyc │ ├── vq.cpython-39.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── core_vq.cpython-37.pyc │ ├── core_vq.cpython-38.pyc │ ├── core_vq.cpython-39.pyc │ ├── distrib.cpython-37.pyc │ ├── distrib.cpython-38.pyc │ ├── distrib.cpython-39.pyc │ ├── core_vq_lsx_version.cpython-38.pyc │ └── core_vq_lsx_version.cpython-39.pyc ├── __init__.py ├── distrib.py └── vq.py ├── test_audio_reconstruction └── speech_en_nq_1.wav ├── requirements.txt ├── LICENSE ├── inference.py ├── config └── codec_16k_6kbps_v3_vqdp.yaml 
└── README.md /losses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /distributed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/commons/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /descriptaudiocodec/.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /descriptaudiocodec/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/compare/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /exp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/exp.png 
-------------------------------------------------------------------------------- /fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/fig1.png -------------------------------------------------------------------------------- /descriptaudiocodec/.dockerignore: -------------------------------------------------------------------------------- 1 | *.pth 2 | *.wav 3 | *.dac 4 | tests/ 5 | runs/ 6 | -------------------------------------------------------------------------------- /test_audio/music.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/test_audio/music.wav -------------------------------------------------------------------------------- /modules/discriminators/__init__.py: -------------------------------------------------------------------------------- 1 | from .frequency_discriminator import MultiFrequencyDiscriminator -------------------------------------------------------------------------------- /test_audio/speech_cn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/test_audio/speech_cn.wav -------------------------------------------------------------------------------- /test_audio/speech_en.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/test_audio/speech_en.flac -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/baseline.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/nn/__init__.py: 
-------------------------------------------------------------------------------- 1 | from . import layers 2 | from . import loss 3 | from . import quantize 4 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/2d.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.codebook_dim: 2 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/4d.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.codebook_dim: 4 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/size/medium.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.decoder_dim: 1024 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/size/small.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.decoder_dim: 512 6 | -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/utils.cpython-38.pyc 
-------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/24kbps.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.n_codebooks: 28 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/256d.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.codebook_dim: 256 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/32d.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.codebook_dim: 32 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/512d.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.codebook_dim: 512 6 | -------------------------------------------------------------------------------- /modules/__pycache__/conv.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/conv.cpython-37.pyc -------------------------------------------------------------------------------- /modules/__pycache__/conv.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/conv.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/conv.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/conv.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/lstm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/lstm.cpython-37.pyc -------------------------------------------------------------------------------- /modules/__pycache__/lstm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/lstm.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/lstm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/lstm.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/norm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/norm.cpython-37.pyc -------------------------------------------------------------------------------- /modules/__pycache__/norm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/norm.cpython-38.pyc -------------------------------------------------------------------------------- 
/modules/__pycache__/norm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/norm.cpython-39.pyc -------------------------------------------------------------------------------- /descriptaudiocodec/conf/1gpu.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | 4 | batch_size: 12 5 | val_batch_size: 12 6 | num_workers: 4 7 | -------------------------------------------------------------------------------- /losses/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /losses/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /losses/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /losses/__pycache__/enh_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/enh_loss.cpython-39.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /modules/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/seanet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/seanet.cpython-37.pyc 
-------------------------------------------------------------------------------- /modules/__pycache__/seanet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/seanet.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/seanet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/seanet.cpython-39.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/vq.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/vq.cpython-37.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/vq.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/vq.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/vq.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/vq.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/ddp_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/ddp_utils.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/ddp_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/ddp_utils.cpython-39.pyc -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/dropout-0.0.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.quantizer_dropout: 0.0 6 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/dropout-0.5.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.quantizer_dropout: 0.5 6 | -------------------------------------------------------------------------------- /distributed/__pycache__/launch.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/launch.cpython-37.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/launch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/launch.cpython-38.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/launch.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/launch.cpython-39.pyc -------------------------------------------------------------------------------- /losses/__pycache__/basic_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/basic_loss.cpython-37.pyc -------------------------------------------------------------------------------- /losses/__pycache__/basic_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/basic_loss.cpython-38.pyc -------------------------------------------------------------------------------- /losses/__pycache__/basic_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/basic_loss.cpython-39.pyc -------------------------------------------------------------------------------- /models/__pycache__/soundstream.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/soundstream.cpython-37.pyc 
-------------------------------------------------------------------------------- /models/__pycache__/soundstream.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/soundstream.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/soundstream.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/soundstream.cpython-39.pyc -------------------------------------------------------------------------------- /test_audio_reconstruction/speech_en_nq_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/test_audio_reconstruction/speech_en_nq_1.wav -------------------------------------------------------------------------------- /utils/__pycache__/hifigan_mel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/hifigan_mel.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/hifigan_mel.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/hifigan_mel.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/hifigan_mel.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/utils/__pycache__/hifigan_mel.cpython-39.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/__init__.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /descriptaudiocodec/conf/quantizer/dropout-0.25.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | DAC.quantizer_dropout: 0.25 6 | -------------------------------------------------------------------------------- /distributed/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /losses/__pycache__/generator_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/generator_loss.cpython-37.pyc -------------------------------------------------------------------------------- /losses/__pycache__/generator_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/generator_loss.cpython-38.pyc -------------------------------------------------------------------------------- /losses/__pycache__/generator_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/generator_loss.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /modules/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/transformer.cpython-39.pyc 
-------------------------------------------------------------------------------- /modules/commons/__pycache__/ops.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/ops.cpython-37.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/ops.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/ops.cpython-38.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/ops.cpython-39.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/pqmf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/pqmf.cpython-37.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/pqmf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/pqmf.cpython-38.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/pqmf.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/pqmf.cpython-39.pyc -------------------------------------------------------------------------------- 
/quantization/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/core_vq.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/core_vq.cpython-37.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/core_vq.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/core_vq.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/core_vq.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/core_vq.cpython-39.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/distrib.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/distrib.cpython-37.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/distrib.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/distrib.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/distrib.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/distrib.cpython-39.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/distributed.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/distributed.cpython-37.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/distributed.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/distributed.cpython-38.pyc -------------------------------------------------------------------------------- /distributed/__pycache__/distributed.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/distributed/__pycache__/distributed.cpython-39.pyc -------------------------------------------------------------------------------- /modules/__pycache__/semantic_module.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/semantic_module.cpython-38.pyc -------------------------------------------------------------------------------- /modules/__pycache__/semantic_module.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/__pycache__/semantic_module.cpython-39.pyc -------------------------------------------------------------------------------- /losses/__pycache__/discriminator_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/discriminator_loss.cpython-37.pyc -------------------------------------------------------------------------------- /losses/__pycache__/discriminator_loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/discriminator_loss.cpython-38.pyc -------------------------------------------------------------------------------- /losses/__pycache__/discriminator_loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/losses/__pycache__/discriminator_loss.cpython-39.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/torch_stft.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/torch_stft.cpython-37.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/torch_stft.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/torch_stft.cpython-38.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/torch_stft.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/torch_stft.cpython-39.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/mrd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/mrd.cpython-37.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/mrd.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/mrd.cpython-38.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/base_dataloader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/base_dataloader.cpython-37.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/base_dataloader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/base_dataloader.cpython-38.pyc -------------------------------------------------------------------------------- /dataloaders/__pycache__/base_dataloader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/dataloaders/__pycache__/base_dataloader.cpython-39.pyc -------------------------------------------------------------------------------- /models/__pycache__/soundstream_semantic.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/soundstream_semantic.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/soundstream_semantic.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/models/__pycache__/soundstream_semantic.cpython-39.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/base_layers.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/base_layers.cpython-37.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/base_layers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/base_layers.cpython-38.pyc -------------------------------------------------------------------------------- /modules/commons/__pycache__/base_layers.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/commons/__pycache__/base_layers.cpython-39.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /descriptaudiocodec/dac/model/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.base import CodecMixin 2 | from .base import DACFile 3 | from .dac import DAC 4 | from .discriminator import Discriminator 5 | -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/combd_sbd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/combd_sbd.cpython-37.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/combd_sbd.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/combd_sbd.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/core_vq_lsx_version.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/core_vq_lsx_version.cpython-38.pyc -------------------------------------------------------------------------------- /quantization/__pycache__/core_vq_lsx_version.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/quantization/__pycache__/core_vq_lsx_version.cpython-39.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/period_discriminator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/period_discriminator.cpython-37.pyc -------------------------------------------------------------------------------- 
/modules/discriminators/__pycache__/period_discriminator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/period_discriminator.cpython-38.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/period_discriminator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/period_discriminator.cpython-39.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/scale_discriminator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/scale_discriminator.cpython-37.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/scale_discriminator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/scale_discriminator.cpython-38.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/scale_discriminator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/scale_discriminator.cpython-39.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/frequency_discriminator.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/frequency_discriminator.cpython-37.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/frequency_discriminator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/frequency_discriminator.cpython-38.pyc -------------------------------------------------------------------------------- /modules/discriminators/__pycache__/frequency_discriminator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhenye234/xcodec/HEAD/modules/discriminators/__pycache__/frequency_discriminator.cpython-39.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | omegaconf 3 | torchaudio 4 | einops 5 | numpy 6 | transformers 7 | tqdm 8 | tensorboard 9 | descript-audiotools>=0.7.2 10 | descript-audio-codec 11 | scipy==1.10.1 -------------------------------------------------------------------------------- /descriptaudiocodec/requirements.txt: -------------------------------------------------------------------------------- 1 | argbind>=0.3.7 2 | descript-audiotools>=0.7.2 3 | einops 4 | numpy 5 | torch 6 | torchaudio 7 | tqdm 8 | tensorboard 9 | numba>=0.5.7 10 | jupyterlab 11 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-adv.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | lambdas: 6 | mel/loss: 1.0 7 | waveform/loss: 1.0 8 | vq/commitment_loss: 0.25 9 | vq/codebook_loss: 1.0 10 | 
-------------------------------------------------------------------------------- /descriptaudiocodec/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime 2 | 3 | COPY . /app 4 | WORKDIR /app 5 | 6 | RUN apt update && apt install -y git 7 | # install the package 8 | RUN pip install . 9 | 10 | # cache the model 11 | RUN python3 -m dac download 12 | -------------------------------------------------------------------------------- /quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # flake8: noqa 8 | from .vq import QuantizedResult, ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /descriptaudiocodec/Dockerfile.dev: -------------------------------------------------------------------------------- 1 | ARG IMAGE=pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime 2 | ARG GITHUB_TOKEN=none 3 | 4 | FROM $IMAGE 5 | 6 | RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc 7 | 8 | COPY requirements.txt /requirements.txt 9 | 10 | RUN apt update && apt install -y git 11 | 12 | # install the package 13 | RUN pip install --upgrade -r /requirements.txt 14 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . import model 14 | from . 
import utils 15 | from .model import DAC 16 | from .model import DACFile 17 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/downsampling/128x.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | # Model setup 6 | DAC.sample_rate: 44100 7 | DAC.encoder_dim: 64 8 | DAC.encoder_rates: [2, 4, 4, 4] 9 | DAC.decoder_dim: 1536 10 | DAC.decoder_rates: [4, 4, 2, 2, 2, 1] 11 | 12 | # Quantization 13 | DAC.n_codebooks: 2 14 | DAC.codebook_size: 1024 15 | DAC.codebook_dim: 8 16 | DAC.quantizer_dropout: 1.0 17 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/downsampling/1024x.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | # Model setup 6 | DAC.sample_rate: 44100 7 | DAC.encoder_dim: 64 8 | DAC.encoder_rates: [2, 8, 8, 8] 9 | DAC.decoder_dim: 1536 10 | DAC.decoder_rates: [8, 4, 4, 2, 2, 2] 11 | 12 | # Quantization 13 | DAC.n_codebooks: 19 14 | DAC.codebook_size: 1024 15 | DAC.codebook_dim: 8 16 | DAC.quantizer_dropout: 1.0 17 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/downsampling/1536x.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | # Model setup 6 | DAC.sample_rate: 44100 7 | DAC.encoder_dim: 96 8 | DAC.encoder_rates: [2, 8, 8, 12] 9 | DAC.decoder_dim: 1536 10 | DAC.decoder_rates: [12, 4, 4, 2, 2, 2] 11 | 12 | # Quantization 13 | DAC.n_codebooks: 28 14 | DAC.codebook_size: 1024 15 | DAC.codebook_dim: 8 16 | DAC.quantizer_dropout: 1.0 17 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/downsampling/768x.yml: 
-------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | # Model setup 6 | DAC.sample_rate: 44100 7 | DAC.encoder_dim: 64 8 | DAC.encoder_rates: [2, 6, 8, 8] 9 | DAC.decoder_dim: 1536 10 | DAC.decoder_rates: [6, 4, 4, 2, 2, 2] 11 | 12 | # Quantization 13 | DAC.n_codebooks: 14 14 | DAC.codebook_size: 1024 15 | DAC.codebook_dim: 8 16 | DAC.quantizer_dropout: 1.0 17 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-mb.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | Discriminator.sample_rate: 44100 6 | Discriminator.fft_sizes: [2048, 1024, 512] 7 | Discriminator.bands: 8 | - [0.0, 1.0] 9 | 10 | # re-weight lambdas to make up for 11 | # lost discriminators vs baseline 12 | lambdas: 13 | mel/loss: 15.0 14 | adv/feat_loss: 5.0 15 | adv/gen_loss: 1.0 16 | vq/commitment_loss: 0.25 17 | vq/codebook_loss: 1.0 18 | -------------------------------------------------------------------------------- /descriptaudiocodec/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/reorder_python_imports 3 | rev: v2.5.0 4 | hooks: 5 | - id: reorder-python-imports 6 | - repo: https://github.com/psf/black 7 | rev: 23.1.0 8 | hooks: 9 | - id: black 10 | language_version: python3 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v4.0.1 13 | hooks: 14 | - id: end-of-file-fixer 15 | - id: trailing-whitespace 16 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-mpd.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | Discriminator.sample_rate: 44100 6 | 
Discriminator.rates: [1] 7 | Discriminator.periods: [] 8 | Discriminator.fft_sizes: [2048, 1024, 512] 9 | Discriminator.bands: 10 | - [0.0, 0.1] 11 | - [0.1, 0.25] 12 | - [0.25, 0.5] 13 | - [0.5, 0.75] 14 | - [0.75, 1.0] 15 | 16 | lambdas: 17 | mel/loss: 15.0 18 | adv/feat_loss: 2.5 19 | adv/gen_loss: 1.0 20 | vq/commitment_loss: 0.25 21 | vq/codebook_loss: 1.0 22 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-low-hop.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | MelSpectrogramLoss.n_mels: [80] 6 | MelSpectrogramLoss.window_lengths: [512] 7 | MelSpectrogramLoss.mel_fmin: [0] 8 | MelSpectrogramLoss.mel_fmax: [null] 9 | MelSpectrogramLoss.pow: 1.0 10 | MelSpectrogramLoss.clamp_eps: 1.0e-5 11 | MelSpectrogramLoss.mag_weight: 0.0 12 | 13 | lambdas: 14 | mel/loss: 100.0 15 | adv/feat_loss: 2.0 16 | adv/gen_loss: 1.0 17 | vq/commitment_loss: 0.25 18 | vq/codebook_loss: 1.0 19 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-mpd-msd.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | Discriminator.sample_rate: 44100 6 | Discriminator.rates: [] 7 | Discriminator.periods: [] 8 | Discriminator.fft_sizes: [2048, 1024, 512] 9 | Discriminator.bands: 10 | - [0.0, 0.1] 11 | - [0.1, 0.25] 12 | - [0.25, 0.5] 13 | - [0.5, 0.75] 14 | - [0.75, 1.0] 15 | 16 | lambdas: 17 | mel/loss: 15.0 18 | adv/feat_loss: 2.66 19 | adv/gen_loss: 1.0 20 | vq/commitment_loss: 0.25 21 | vq/codebook_loss: 1.0 22 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/diff-mb.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - 
conf/1gpu.yml 4 | 5 | Discriminator.sample_rate: 44100 6 | Discriminator.fft_sizes: [2048, 1024, 512] 7 | Discriminator.bands: 8 | - [0.0, 0.05] 9 | - [0.05, 0.1] 10 | - [0.1, 0.25] 11 | - [0.25, 0.5] 12 | - [0.5, 1.0] 13 | 14 | 15 | # re-weight lambdas to make up for 16 | # lost discriminators vs baseline 17 | lambdas: 18 | mel/loss: 15.0 19 | adv/feat_loss: 5.0 20 | adv/gen_loss: 1.0 21 | vq/commitment_loss: 0.25 22 | vq/codebook_loss: 1.0 23 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/equal-mb.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | Discriminator.sample_rate: 44100 6 | Discriminator.fft_sizes: [2048, 1024, 512] 7 | Discriminator.bands: 8 | - [0.0, 0.2] 9 | - [0.2, 0.4] 10 | - [0.4, 0.6] 11 | - [0.6, 0.8] 12 | - [0.8, 1.0] 13 | 14 | 15 | # re-weight lambdas to make up for 16 | # lost discriminators vs baseline 17 | lambdas: 18 | mel/loss: 15.0 19 | adv/feat_loss: 5.0 20 | adv/gen_loss: 1.0 21 | vq/commitment_loss: 0.25 22 | vq/codebook_loss: 1.0 23 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/only-speech.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | train/build_dataset.folders: 6 | speech_fb: 7 | - /data/daps/train 8 | speech_hq: 9 | - /data/vctk 10 | - /data/vocalset 11 | - /data/read_speech 12 | - /data/french_speech 13 | speech_uq: 14 | - /data/emotional_speech/ 15 | - /data/common_voice/ 16 | - /data/german_speech/ 17 | - /data/russian_speech/ 18 | - /data/spanish_speech/ 19 | 20 | val/build_dataset.folders: 21 | speech_hq: 22 | - /data/daps/val 23 | -------------------------------------------------------------------------------- /modules/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch modules.""" 8 | 9 | # flake8: noqa 10 | from .conv import ( 11 | pad1d, 12 | unpad1d, 13 | NormConv1d, 14 | NormConvTranspose1d, 15 | NormConv2d, 16 | NormConvTranspose2d, 17 | SConv1d, 18 | SConvTranspose1d, 19 | ) 20 | from .lstm import SLSTM 21 | from .seanet import SEANetEncoder, SEANetDecoder 22 | from .transformer import StreamingTransformerEncoder 23 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/ablations/no-data-balance.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/base.yml 3 | - conf/1gpu.yml 4 | 5 | train/build_dataset.folders: 6 | speech: 7 | - /data/daps/train 8 | - /data/vctk 9 | - /data/vocalset 10 | - /data/read_speech 11 | - /data/french_speech 12 | - /data/emotional_speech/ 13 | - /data/common_voice/ 14 | - /data/german_speech/ 15 | - /data/russian_speech/ 16 | - /data/spanish_speech/ 17 | music: 18 | - /data/musdb/train 19 | - /data/jamendo 20 | general: 21 | - /data/audioset/data/unbalanced_train_segments/ 22 | - /data/audioset/data/balanced_train_segments/ 23 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import argbind 4 | 5 | from dac.utils import download 6 | from dac.utils.decode import decode 7 | from dac.utils.encode import encode 8 | 9 | STAGES = ["encode", "decode", "download"] 10 | 11 | 12 | def run(stage: str): 13 | """Run stages. 
14 | 15 | Parameters 16 | ---------- 17 | stage : str 18 | Stage to run 19 | """ 20 | if stage not in STAGES: 21 | raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}") 22 | stage_fn = globals()[stage] 23 | 24 | if stage == "download": 25 | stage_fn() 26 | return 27 | 28 | stage_fn() 29 | 30 | 31 | if __name__ == "__main__": 32 | group = sys.argv.pop(1) 33 | args = argbind.parse_args(group=group) 34 | 35 | with argbind.scope(args): 36 | run(group) 37 | -------------------------------------------------------------------------------- /modules/lstm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """LSTM layers module.""" 8 | 9 | from torch import nn 10 | 11 | 12 | class SLSTM(nn.Module): 13 | """ 14 | LSTM without worrying about the hidden state, nor the layout of the data. 15 | Expects input as convolutional layout. 16 | """ 17 | def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True): 18 | super().__init__() 19 | self.skip = skip 20 | self.lstm = nn.LSTM(dimension, dimension, num_layers) 21 | 22 | def forward(self, x): 23 | x = x.permute(2, 0, 1) 24 | y, _ = self.lstm(x) 25 | if self.skip: 26 | y = y + x 27 | y = y.permute(1, 2, 0) 28 | return y 29 | -------------------------------------------------------------------------------- /modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | """Normalization modules.""" 8 | 9 | import typing as tp 10 | 11 | import einops 12 | import torch 13 | from torch import nn 14 | 15 | 16 | class ConvLayerNorm(nn.LayerNorm): 17 | """ 18 | Convolution-friendly LayerNorm that moves channels to last dimensions 19 | before running the normalization and moves them back to original position right after. 20 | """ 21 | def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs): 22 | super().__init__(normalized_shape, **kwargs) 23 | 24 | def forward(self, x): 25 | x = einops.rearrange(x, 'b ... t -> b t ...') 26 | x = super().forward(x) 27 | x = einops.rearrange(x, 'b t ... -> b ... t') 28 | return 29 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from einops import rearrange 6 | from torch.nn.utils import weight_norm 7 | 8 | 9 | def WNConv1d(*args, **kwargs): 10 | return weight_norm(nn.Conv1d(*args, **kwargs)) 11 | 12 | 13 | def WNConvTranspose1d(*args, **kwargs): 14 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 15 | 16 | 17 | # Scripting this brings model speed up 1.4x 18 | @torch.jit.script 19 | def snake(x, alpha): 20 | shape = x.shape 21 | x = x.reshape(shape[0], shape[1], -1) 22 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 23 | x = x.reshape(shape) 24 | return x 25 | 26 | 27 | class Snake1d(nn.Module): 28 | def __init__(self, channels): 29 | super().__init__() 30 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 31 | 32 | def forward(self, x): 33 | return snake(x, self.alpha) 34 | -------------------------------------------------------------------------------- /descriptaudiocodec/docker-compose.yml: -------------------------------------------------------------------------------- 1 | 
version: "3.5" 2 | services: 3 | base: 4 | build: 5 | context: . 6 | dockerfile: ./Dockerfile.dev 7 | args: 8 | GITHUB_TOKEN: ${GITHUB_TOKEN} 9 | IMAGE: ${IMAGE} 10 | volumes: 11 | - .:/u/home/src 12 | - ${PATH_TO_DATA}:/data 13 | - ${PATH_TO_RUNS}:/runs 14 | - ~/.config/gcloud:/u/home/.config/gcloud 15 | - ~/.zsh_history:/u/home/.zsh_history 16 | environment: 17 | - GITHUB_TOKEN 18 | - HOST_USER_ID 19 | - HOST_USER_GID 20 | - JUPYTER_TOKEN=password 21 | - PATH_TO_DATA=/data 22 | - PATH_TO_RUNS=/runs 23 | - MPLCONFIGDIR=/u/home/.mplconfig 24 | shm_size: 32G 25 | working_dir: /u/home/src 26 | deploy: 27 | resources: 28 | reservations: 29 | devices: 30 | - driver: nvidia 31 | capabilities: [gpu] 32 | dev: 33 | extends: base 34 | profiles: 35 | - interactive 36 | stdin_open: true 37 | tty: true 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 YE Zhen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /descriptaudiocodec/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-present, Descript 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /descriptaudiocodec/scripts/compute_entropy.py: -------------------------------------------------------------------------------- 1 | import argbind 2 | import audiotools as at 3 | import numpy as np 4 | import torch 5 | import tqdm 6 | 7 | import dac 8 | 9 | 10 | @argbind.bind(without_prefix=True, positional=True) 11 | def main( 12 | folder: str, 13 | model_path: str, 14 | n_samples: int = 1024, 15 | device: str = "cuda", 16 | ): 17 | files = at.util.find_audio(folder)[:n_samples] 18 | signals = [ 19 | at.AudioSignal.salient_excerpt(f, loudness_cutoff=-20, duration=1.0) 20 | for f in files 21 | ] 22 | 23 | with torch.no_grad(): 24 | model = dac.model.DAC.load(model_path).to(device) 25 | model.eval() 26 | 27 | codes = [] 28 | for x in tqdm.tqdm(signals): 29 | x = x.to(model.device) 30 | o = model.encode(x.audio_data, x.sample_rate) 31 | codes.append(o["codes"].cpu()) 32 | 33 | codes = torch.cat(codes, dim=-1) 34 | entropy = [] 35 | 36 | for i in range(codes.shape[1]): 37 | codes_ = codes[0, i, :] 38 | counts = torch.bincount(codes_) 39 | counts = (counts / counts.sum()).clamp(1e-10) 40 | entropy.append(-(counts * counts.log()).sum().item() * np.log2(np.e)) 41 | 42 | pct = sum(entropy) / (10 * len(entropy)) 43 | print(f"Entropy for each codebook: {entropy}") 44 | print(f"Effective percentage: {pct * 100}%") 45 | 46 | 47 | if __name__ == "__main__": 48 | args = argbind.parse_args() 49 | with argbind.scope(args): 50 | main() 51 | -------------------------------------------------------------------------------- /descriptaudiocodec/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | with open("README.md") as f: 5 | long_description = f.read() 6 | 7 | setup( 8 | name="descript-audio-codec", 9 | version="1.0.0", 10 | classifiers=[ 11 | "Intended Audience :: Developers", 
12 | "Natural Language :: English", 13 | "Programming Language :: Python :: 3.7", 14 | "Topic :: Artistic Software", 15 | "Topic :: Multimedia", 16 | "Topic :: Multimedia :: Sound/Audio", 17 | "Topic :: Multimedia :: Sound/Audio :: Editors", 18 | "Topic :: Software Development :: Libraries", 19 | ], 20 | description="A high-quality general neural audio codec.", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | author="Prem Seetharaman, Rithesh Kumar", 24 | author_email="prem@descript.com", 25 | url="https://github.com/descriptinc/descript-audio-codec", 26 | license="MIT", 27 | packages=find_packages(), 28 | keywords=["audio", "compression", "machine learning"], 29 | install_requires=[ 30 | "argbind>=0.3.7", 31 | "descript-audiotools>=0.7.2", 32 | "einops", 33 | "numpy", 34 | "torch", 35 | "torchaudio", 36 | "tqdm", 37 | ], 38 | extras_require={ 39 | "dev": [ 40 | "pytest", 41 | "pytest-cov", 42 | "pynvml", 43 | "psutil", 44 | "pandas", 45 | "onnx", 46 | "onnx-simplifier", 47 | "seaborn", 48 | "jupyterlab", 49 | "pandas", 50 | "watchdog", 51 | "pesq", 52 | "tabulate", 53 | "encodec", 54 | ], 55 | }, 56 | ) 57 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/compare/encodec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from audiotools import AudioSignal 3 | from audiotools.ml import BaseModel 4 | from encodec import EncodecModel 5 | 6 | 7 | class Encodec(BaseModel): 8 | def __init__(self, sample_rate: int = 24000, bandwidth: float = 24.0): 9 | super().__init__() 10 | 11 | if sample_rate == 24000: 12 | self.model = EncodecModel.encodec_model_24khz() 13 | else: 14 | self.model = EncodecModel.encodec_model_48khz() 15 | self.model.set_target_bandwidth(bandwidth) 16 | self.sample_rate = 44100 17 | 18 | def forward( 19 | self, 20 | audio_data: torch.Tensor, 21 | sample_rate: int = 44100, 22 | n_quantizers: int 
= None, 23 | ): 24 | signal = AudioSignal(audio_data, sample_rate) 25 | signal.resample(self.model.sample_rate) 26 | recons = self.model(signal.audio_data) 27 | recons = AudioSignal(recons, self.model.sample_rate) 28 | recons.resample(sample_rate) 29 | return {"audio": recons.audio_data} 30 | 31 | 32 | if __name__ == "__main__": 33 | import numpy as np 34 | from functools import partial 35 | 36 | model = Encodec() 37 | 38 | for n, m in model.named_modules(): 39 | o = m.extra_repr() 40 | p = sum([np.prod(p.size()) for p in m.parameters()]) 41 | fn = lambda o, p: o + f" {p/1e6:<.3f}M params." 42 | setattr(m, "extra_repr", partial(fn, o=o, p=p)) 43 | print(model) 44 | print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) 45 | 46 | length = 88200 * 2 47 | x = torch.randn(1, 1, length).to(model.device) 48 | x.requires_grad_(True) 49 | x.retain_grad() 50 | 51 | # Make a forward pass 52 | out = model(x)["audio"] 53 | 54 | print(x.shape, out.shape) 55 | -------------------------------------------------------------------------------- /descriptaudiocodec/scripts/save_test_set.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from pathlib import Path 3 | 4 | import argbind 5 | import torch 6 | from audiotools.core import util 7 | from audiotools.ml.decorators import Tracker 8 | from train import Accelerator 9 | 10 | import scripts.train as train 11 | 12 | 13 | @torch.no_grad() 14 | def process(batch, accel, test_data): 15 | batch = util.prepare_batch(batch, accel.device) 16 | signal = test_data.transform(batch["signal"].clone(), **batch["transform_args"]) 17 | return signal.cpu() 18 | 19 | 20 | @argbind.bind(without_prefix=True) 21 | @torch.no_grad() 22 | def save_test_set(args, accel, sample_rate: int = 44100, output: str = "samples/input"): 23 | tracker = Tracker() 24 | with argbind.scope(args, "test"): 25 | test_data = train.build_dataset(sample_rate) 26 | 27 | global process 28 | process = 
tracker.track("process", len(test_data))(process) 29 | 30 | output = Path(output) 31 | output.mkdir(parents=True, exist_ok=True) 32 | (output.parent / "input").mkdir(parents=True, exist_ok=True) 33 | with open(output / "metadata.csv", "w") as csvfile: 34 | keys = ["path", "original"] 35 | writer = csv.DictWriter(csvfile, fieldnames=keys) 36 | writer.writeheader() 37 | 38 | with tracker.live: 39 | for i in range(len(test_data)): 40 | signal = process(test_data[i], accel, test_data) 41 | input_path = output.parent / "input" / f"sample_{i}.wav" 42 | metadata = { 43 | "path": str(input_path), 44 | "original": str(signal.path_to_input_file), 45 | } 46 | writer.writerow(metadata) 47 | signal.write(input_path) 48 | tracker.done("test", f"N={len(test_data)}") 49 | 50 | 51 | if __name__ == "__main__": 52 | args = argbind.parse_args() 53 | with argbind.scope(args): 54 | with Accelerator() as accel: 55 | save_test_set(args, accel) 56 | -------------------------------------------------------------------------------- /dataloaders/base_dataloader.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | from torch.utils.data import Dataset 5 | import torchaudio 6 | from torchaudio.transforms import Resample 7 | 8 | 9 | class WaveDataset(Dataset): 10 | def __init__( 11 | self, 12 | flist_file, 13 | segment_size, 14 | sampling_rate, 15 | split=True, # whether or not to get a segment of an audio sample to form the batch 16 | shuffle=False, 17 | audio_norm_scale: float = 1.0, 18 | ): 19 | self.file_list = self.get_filelist(flist_file) 20 | if shuffle: 21 | random.shuffle(self.file_list) 22 | self.segment_size = segment_size 23 | self.sampling_rate = sampling_rate 24 | self.split = split 25 | self.audio_norm_scale = audio_norm_scale 26 | 27 | def get_filelist(self, fpath): 28 | with open(fpath, 'r') as f: 29 | flist = [l.strip() for l in f if l.strip()] 30 | return flist 31 | 32 | def __getitem__(self, index): 33 | 
fname = self.file_list[index] 34 | audio, sr = torchaudio.load(fname) 35 | if sr != self.sampling_rate: 36 | audio = Resample(sr, self.sampling_rate)(audio) 37 | if self.audio_norm_scale < 1.0: 38 | audio = audio * self.audio_norm_scale 39 | 40 | if self.split: 41 | if audio.size(1) >= self.segment_size: 42 | max_audio_start = audio.size(1) - self.segment_size 43 | audio_start = random.randint(0, max_audio_start) 44 | audio = audio[:, audio_start:audio_start+self.segment_size] 45 | else: 46 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 47 | # in case, audio clip is too short in validation set 48 | if audio.size(1) < self.segment_size: 49 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 50 | 51 | return audio 52 | 53 | def __len__(self): 54 | return len(self.file_list) 55 | -------------------------------------------------------------------------------- /utils/hifigan_mel.py: -------------------------------------------------------------------------------- 1 | """Adapted from https://github.com/jik876/hifi-gan""" 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.io.wavfile import read 6 | from librosa.filters import mel as librosa_mel_fn 7 | 8 | 9 | def load_wav(full_path): 10 | sampling_rate, data = read(full_path) 11 | return data, sampling_rate 12 | 13 | 14 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 15 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 16 | 17 | 18 | def dynamic_range_decompression(x, C=1): 19 | return np.exp(x) / C 20 | 21 | 22 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 23 | return torch.log(torch.clamp(x, min=clip_val) * C) 24 | 25 | 26 | def dynamic_range_decompression_torch(x, C=1): 27 | return torch.exp(x) / C 28 | 29 | 30 | def spectral_normalize_torch(magnitudes): 31 | output = dynamic_range_compression_torch(magnitudes) 32 | return output 33 | 34 | 35 | def spectral_de_normalize_torch(magnitudes): 36 
| output = dynamic_range_decompression_torch(magnitudes) 37 | return output 38 | 39 | 40 | mel_basis = {} 41 | hann_window = {} 42 | 43 | def mel_spectrogram( 44 | y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax=None, center=False): 45 | if torch.min(y) < -1.: 46 | print('min value is ', torch.min(y)) 47 | if torch.max(y) > 1.: 48 | print('max value is ', torch.max(y)) 49 | 50 | global mel_basis, hann_window 51 | if fmax not in mel_basis: 52 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 53 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 54 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 55 | 56 | y = torch.nn.functional.pad( 57 | y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 58 | y = y.squeeze(1) 59 | 60 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 61 | center=center, pad_mode='reflect', normalized=False, onesided=True,return_complex=False) 62 | 63 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 64 | 65 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 66 | spec = spectral_normalize_torch(spec) 67 | 68 | return spec 69 | -------------------------------------------------------------------------------- /losses/discriminator_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from losses.basic_loss import MSEDLoss 5 | 6 | 7 | class BasicDiscriminatorLoss(nn.Module): 8 | """Least-square GAN loss.""" 9 | 10 | def __init__(self, config=None): 11 | super(BasicDiscriminatorLoss, self).__init__() 12 | 13 | def forward(self, real_outputs, fake_outputs): 14 | loss = 0 15 | real_losses = [] 16 | fake_losses = [] 17 | for dr, dg in zip(real_outputs, fake_outputs): 18 | dr = dr.float() 19 | dg = dg.float() 20 | real_loss = torch.mean((1-dr)**2) 21 | fake_loss = 
torch.mean(dg**2) 22 | loss += (real_loss + fake_loss) 23 | real_losses.append(real_loss.item()) 24 | fake_losses.append(fake_loss.item()) 25 | 26 | return loss 27 | 28 | 29 | class MSEDiscriminatorLoss(BasicDiscriminatorLoss): 30 | def __init__(self, config=None): 31 | super().__init__(config) 32 | self.mse_loss = MSEDLoss() 33 | 34 | def apply_d_loss(self, scores_fake, scores_real, loss_func): 35 | total_loss = 0 36 | total_real_loss = 0 37 | total_fake_loss = 0 38 | if isinstance(scores_fake, list): 39 | # multi-scale loss 40 | for score_fake, score_real in zip(scores_fake, scores_real): 41 | loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) 42 | total_loss = total_loss + loss 43 | total_real_loss = total_real_loss + real_loss 44 | total_fake_loss = total_fake_loss + fake_loss 45 | # normalize loss values with number of scales 46 | total_loss /= len(scores_fake) 47 | total_real_loss /= len(scores_real) 48 | total_fake_loss /= len(scores_fake) 49 | else: 50 | # single scale loss 51 | total_loss, total_real_loss, total_fake_loss = loss_func(scores_fake, scores_real) 52 | return total_loss, total_real_loss, total_fake_loss 53 | 54 | def forward(self, real_scores, fake_scores): 55 | mse_D_loss, mse_D_real_loss, mse_D_fake_loss = self.apply_d_loss( 56 | scores_fake=fake_scores, 57 | scores_real=real_scores, 58 | loss_func=self.mse_loss) 59 | return mse_D_loss 60 | -------------------------------------------------------------------------------- /descriptaudiocodec/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for CLI. 
3 | """ 4 | import subprocess 5 | from pathlib import Path 6 | 7 | import argbind 8 | import numpy as np 9 | import pytest 10 | import torch 11 | from audiotools import AudioSignal 12 | 13 | from dac.__main__ import run 14 | 15 | 16 | def setup_module(module): 17 | data_dir = Path(__file__).parent / "assets" 18 | data_dir.mkdir(exist_ok=True, parents=True) 19 | input_dir = data_dir / "input" 20 | input_dir.mkdir(exist_ok=True, parents=True) 21 | 22 | for i in range(5): 23 | signal = AudioSignal(np.random.randn(1000), 44_100) 24 | signal.write(input_dir / f"sample_{i}.wav") 25 | return input_dir 26 | 27 | 28 | def teardown_module(module): 29 | repo_root = Path(__file__).parent.parent 30 | subprocess.check_output(["rm", "-rf", f"{repo_root}/tests/assets"]) 31 | 32 | 33 | @pytest.mark.parametrize("model_type", ["44khz", "24khz", "16khz"]) 34 | def test_reconstruction(model_type): 35 | # Test encoding 36 | input_dir = Path(__file__).parent / "assets" / "input" 37 | output_dir = input_dir.parent / model_type / "encoded_output" 38 | args = { 39 | "input": str(input_dir), 40 | "output": str(output_dir), 41 | "device": "cuda" if torch.cuda.is_available() else "cpu", 42 | "model_type": model_type, 43 | } 44 | with argbind.scope(args): 45 | run("encode") 46 | 47 | # Test decoding 48 | input_dir = output_dir 49 | output_dir = input_dir.parent / model_type / "decoded_output" 50 | args = { 51 | "input": str(input_dir), 52 | "output": str(output_dir), 53 | "model_type": model_type, 54 | } 55 | with argbind.scope(args): 56 | run("decode") 57 | 58 | 59 | def test_compression(): 60 | # Test encoding 61 | input_dir = Path(__file__).parent / "assets" / "input" 62 | output_dir = input_dir.parent / "encoded_output_quantizers" 63 | args = { 64 | "input": str(input_dir), 65 | "output": str(output_dir), 66 | "n_quantizers": 3, 67 | "device": "cuda" if torch.cuda.is_available() else "cpu", 68 | } 69 | with argbind.scope(args): 70 | run("encode") 71 | 72 | # Open .dac file 73 | dac_file = 
def assign(val_split, test_split):
    """Build a lookup that maps a value to its dataset split name.

    The returned callable yields "val" or "test" for members of the
    corresponding collection (checked in that order), and "train" otherwise.
    """

    def _assign(value):
        for split_name, members in (("val", val_split), ("test", test_split)):
            if value in members:
                return split_name
        return "train"

    return _assign
dataset_path = data_path / dataset 62 | audio_files = util.find_audio(dataset_path) 63 | 64 | if dataset == "daps": 65 | get_split = assign(DAPS_VAL, DAPS_TEST) 66 | get_value = lambda path: (str(path).split("/")[-1].split("_", maxsplit=4)[0]) 67 | audio_files = [ 68 | x 69 | for x in util.find_audio(dataset_path) 70 | if daps_subset in str(x) and "breaths" not in str(x) 71 | ] 72 | 73 | if get_split is None: 74 | _, val, test = split(audio_files) 75 | get_split = assign(val, test) 76 | 77 | splits = defaultdict(list) 78 | for x in audio_files: 79 | _split = get_split(get_value(x)) 80 | splits[_split].append(x) 81 | 82 | with util.chdir(dataset_path): 83 | for k, v in splits.items(): 84 | v = sorted(v) 85 | print(f"Processing {k} in {dataset_path} of length {len(v)}") 86 | for _v in tqdm.tqdm(v): 87 | tgt_path = pathlib.Path( 88 | str(_v).replace(str(dataset_path), str(dataset_path / k)) 89 | ) 90 | tgt_path.parent.mkdir(parents=True, exist_ok=True) 91 | shutil.copyfile(_v, tgt_path) 92 | 93 | 94 | if __name__ == "__main__": 95 | args = argbind.parse_args() 96 | with argbind.scope(args): 97 | process() 98 | -------------------------------------------------------------------------------- /descriptaudiocodec/scripts/get_samples.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import argbind 4 | import torch 5 | from audiotools import AudioSignal 6 | from audiotools.core import util 7 | from audiotools.ml.decorators import Tracker 8 | from train import Accelerator 9 | from train import DAC 10 | 11 | from dac.compare.encodec import Encodec 12 | 13 | Encodec = argbind.bind(Encodec) 14 | 15 | 16 | def load_state( 17 | accel: Accelerator, 18 | tracker: Tracker, 19 | save_path: str, 20 | tag: str = "latest", 21 | load_weights: bool = False, 22 | model_type: str = "dac", 23 | bandwidth: float = 24.0, 24 | ): 25 | kwargs = { 26 | "folder": f"{save_path}/{tag}", 27 | "map_location": "cpu", 28 | 
"package": not load_weights, 29 | } 30 | tracker.print(f"Resuming from {str(Path('.').absolute())}/{kwargs['folder']}") 31 | 32 | if model_type == "dac": 33 | generator, _ = DAC.load_from_folder(**kwargs) 34 | elif model_type == "encodec": 35 | generator = Encodec(bandwidth=bandwidth) 36 | 37 | generator = accel.prepare_model(generator) 38 | return generator 39 | 40 | 41 | @torch.no_grad() 42 | def process(signal, accel, generator, **kwargs): 43 | signal = signal.to(accel.device) 44 | recons = generator(signal.audio_data, signal.sample_rate, **kwargs)["audio"] 45 | recons = AudioSignal(recons, signal.sample_rate) 46 | recons = recons.normalize(signal.loudness()) 47 | return recons.cpu() 48 | 49 | 50 | @argbind.bind(without_prefix=True) 51 | @torch.no_grad() 52 | def get_samples( 53 | accel, 54 | path: str = "ckpt", 55 | input: str = "samples/input", 56 | output: str = "samples/output", 57 | model_type: str = "dac", 58 | model_tag: str = "latest", 59 | bandwidth: float = 24.0, 60 | n_quantizers: int = None, 61 | ): 62 | tracker = Tracker(log_file=f"{path}/eval.txt", rank=accel.local_rank) 63 | generator = load_state( 64 | accel, 65 | tracker, 66 | save_path=path, 67 | model_type=model_type, 68 | bandwidth=bandwidth, 69 | tag=model_tag, 70 | ) 71 | generator.eval() 72 | kwargs = {"n_quantizers": n_quantizers} if model_type == "dac" else {} 73 | 74 | audio_files = util.find_audio(input) 75 | 76 | global process 77 | process = tracker.track("process", len(audio_files))(process) 78 | 79 | output = Path(output) 80 | output.mkdir(parents=True, exist_ok=True) 81 | 82 | with tracker.live: 83 | for i in range(len(audio_files)): 84 | signal = AudioSignal(audio_files[i]) 85 | recons = process(signal, accel, generator, **kwargs) 86 | recons.write(output / audio_files[i].name) 87 | 88 | tracker.done("test", f"N={len(audio_files)}") 89 | 90 | 91 | if __name__ == "__main__": 92 | args = argbind.parse_args() 93 | with argbind.scope(args): 94 | with Accelerator() as accel: 95 | 
def make_fake_data_tree():
    """Populate a train/val/test directory tree with synthetic audio and
    return the per-split folder mapping expected by the dataset builder."""
    data_dir = Path(__file__).parent / "assets"
    splits = ["train", "val", "test"]
    keys = ["speech", "music", "env"]

    for split in splits:
        for key in keys:
            leaf_dir = data_dir / split / key
            leaf_dir.mkdir(exist_ok=True, parents=True)
            make_fake_data(leaf_dir)

    return {
        split: {key: [str(data_dir / f"{split}/{key}")] for key in keys}
        for split in splits
    }
def teardown_module(module):
    """Remove the fixture and run directories created by this test module.

    Uses ``shutil.rmtree`` instead of shelling out to ``rm -rf``: it is
    portable (works without a POSIX shell) and avoids spawning a subprocess.
    ``ignore_errors=True`` matches ``rm -rf`` semantics, so a missing
    directory is not an error.
    """
    import shutil  # local import: keeps this fix self-contained

    tests_dir = Path(__file__).parent
    shutil.rmtree(tests_dir / "assets", ignore_errors=True)
    shutil.rmtree(tests_dir / "runs", ignore_errors=True)
def process_audio(input_file, output_file, rescale, args, config, soundstream):
    """Round-trip ``input_file`` through the codec and write the reconstruction.

    The audio is downmixed to mono, resampled to the model's sample rate,
    optionally scaled by ``config.audio_norm_scale``, then encoded at target
    bandwidth ``args.bw`` and decoded back to a waveform.
    """
    # Load audio and normalize its channel/rate layout for the model.
    wav, sr = torchaudio.load(input_file)
    if wav.size(0) > 1:
        wav = wav.mean(0, keepdim=True)  # downmix multi-channel input to mono
    if sr != soundstream.sample_rate:
        wav = torchaudio.transforms.Resample(sr, soundstream.sample_rate)(wav)
    if config.audio_norm_scale < 1.0:
        wav = wav * config.audio_norm_scale

    wav = wav.unsqueeze(1).cuda()
    compressed = soundstream.encode(wav, target_bw=args.bw)
    print(f"Compressed shape: {compressed.shape}")

    # Decode and save.
    out = soundstream.decode(compressed)
    out = out.detach().cpu().squeeze(0)

    # Save at the model's sample rate rather than a hard-coded 16000, so the
    # output stays correct if a codec with a different rate is loaded (the
    # input was resampled to soundstream.sample_rate above).
    save_audio(out, output_file, soundstream.sample_rate, rescale=rescale)
    print(f"Processed and saved: {output_file}")
os.path.isfile(config_path): 71 | sys.exit(f"{config_path} file does not exist.") 72 | 73 | config = OmegaConf.load(config_path) 74 | soundstream = build_codec_model(config) 75 | parameter_dict = torch.load(args.resume_path) 76 | soundstream.load_state_dict(parameter_dict ) # Load model 77 | soundstream = soundstream.cuda() 78 | soundstream.eval() 79 | 80 | process_audio(args.input, args.output, args.rescale, args, config, soundstream) 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /descriptaudiocodec/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/env.sh 108 | venv/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # Files created by experiments 131 | output/ 132 | snapshot/ 133 | *.m4a 134 | *.wav 135 | notebooks/scratch.ipynb 136 | notebooks/inspect.ipynb 137 | notebooks/effects.ipynb 138 | notebooks/*.ipynb 139 | notebooks/*.gif 140 | notebooks/*.wav 141 | notebooks/*.mp4 142 | *runs/ 143 | boards/ 144 | samples/ 145 | *.ipynb 146 | 147 | results.json 148 | metrics.csv 149 | mprofile_* 150 | mem.png 151 | 152 | results/ 153 | mprofile* 154 | *.png 155 | # do not ignore the test wav file 156 | !tests/audio/short_test_audio.wav 157 | !tests/audio/output.wav 158 | */.DS_Store 159 | .DS_Store 160 | env.sh 161 | _codebraid/ 162 | **/*.html 163 | **/*.exec.md 164 | flagged/ 165 | log.txt 166 | ckpt/ 167 | .syncthing* 168 | tests/assets/ 169 | archived/ 170 | 171 | *_remote_module_* 172 | *.zip 173 | *.pth 174 | encoded_out/ 175 | recon/ 176 | recons/ 177 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/base.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC.sample_rate: 44100 3 | DAC.encoder_dim: 64 4 | DAC.encoder_rates: [2, 4, 8, 8] 5 | DAC.decoder_dim: 1536 6 | DAC.decoder_rates: [8, 8, 4, 2] 7 | 8 | # Quantization 9 | DAC.n_codebooks: 9 10 | DAC.codebook_size: 1024 11 | DAC.codebook_dim: 8 12 | DAC.quantizer_dropout: 1.0 13 | 14 | # Discriminator 15 | 
Discriminator.sample_rate: 44100 16 | Discriminator.rates: [] 17 | Discriminator.periods: [2, 3, 5, 7, 11] 18 | Discriminator.fft_sizes: [2048, 1024, 512] 19 | Discriminator.bands: 20 | - [0.0, 0.1] 21 | - [0.1, 0.25] 22 | - [0.25, 0.5] 23 | - [0.5, 0.75] 24 | - [0.75, 1.0] 25 | 26 | # Optimization 27 | AdamW.betas: [0.8, 0.99] 28 | AdamW.lr: 0.0001 29 | ExponentialLR.gamma: 0.999996 30 | 31 | amp: false 32 | val_batch_size: 100 33 | device: cuda 34 | num_iters: 250000 35 | save_iters: [10000, 50000, 100000, 200000] 36 | valid_freq: 1000 37 | sample_freq: 10000 38 | num_workers: 32 39 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 40 | seed: 0 41 | lambdas: 42 | mel/loss: 15.0 43 | adv/feat_loss: 2.0 44 | adv/gen_loss: 1.0 45 | vq/commitment_loss: 0.25 46 | vq/codebook_loss: 1.0 47 | 48 | VolumeNorm.db: [const, -16] 49 | 50 | # Transforms 51 | build_transform.preprocess: 52 | - Identity 53 | build_transform.augment_prob: 0.0 54 | build_transform.augment: 55 | - Identity 56 | build_transform.postprocess: 57 | - VolumeNorm 58 | - RescaleAudio 59 | - ShiftPhase 60 | 61 | # Loss setup 62 | MultiScaleSTFTLoss.window_lengths: [2048, 512] 63 | MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] 64 | MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 65 | MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] 66 | MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] 67 | MelSpectrogramLoss.pow: 1.0 68 | MelSpectrogramLoss.clamp_eps: 1.0e-5 69 | MelSpectrogramLoss.mag_weight: 0.0 70 | 71 | # Data 72 | batch_size: 72 73 | train/AudioDataset.duration: 0.38 74 | train/AudioDataset.n_examples: 10000000 75 | 76 | val/AudioDataset.duration: 5.0 77 | val/build_transform.augment_prob: 1.0 78 | val/AudioDataset.n_examples: 250 79 | 80 | test/AudioDataset.duration: 10.0 81 | test/build_transform.augment_prob: 1.0 82 | test/AudioDataset.n_examples: 1000 83 | 84 | AudioLoader.shuffle: true 85 | AudioDataset.without_replacement: true 86 | 87 | 
train/build_dataset.folders: 88 | speech_fb: 89 | - /data/daps/train 90 | speech_hq: 91 | - /data/vctk 92 | - /data/vocalset 93 | - /data/read_speech 94 | - /data/french_speech 95 | speech_uq: 96 | - /data/emotional_speech/ 97 | - /data/common_voice/ 98 | - /data/german_speech/ 99 | - /data/russian_speech/ 100 | - /data/spanish_speech/ 101 | music_hq: 102 | - /data/musdb/train 103 | music_uq: 104 | - /data/jamendo 105 | general: 106 | - /data/audioset/data/unbalanced_train_segments/ 107 | - /data/audioset/data/balanced_train_segments/ 108 | 109 | val/build_dataset.folders: 110 | speech_hq: 111 | - /data/daps/val 112 | music_hq: 113 | - /data/musdb/test 114 | general: 115 | - /data/audioset/data/eval_segments/ 116 | 117 | test/build_dataset.folders: 118 | speech_hq: 119 | - /data/daps/test 120 | music_hq: 121 | - /data/musdb/test 122 | general: 123 | - /data/audioset/data/eval_segments/ 124 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/final/16khz.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC.sample_rate: 16000 3 | DAC.encoder_dim: 64 4 | DAC.encoder_rates: [2, 4, 5, 8] 5 | DAC.decoder_dim: 1536 6 | DAC.decoder_rates: [8, 5, 4, 2] 7 | 8 | # Quantization 9 | DAC.n_codebooks: 12 10 | DAC.codebook_size: 1024 11 | DAC.codebook_dim: 8 12 | DAC.quantizer_dropout: 0.5 13 | 14 | # Discriminator 15 | Discriminator.sample_rate: 16000 16 | Discriminator.rates: [] 17 | Discriminator.periods: [2, 3, 5, 7, 11] 18 | Discriminator.fft_sizes: [2048, 1024, 512] 19 | Discriminator.bands: 20 | - [0.0, 0.1] 21 | - [0.1, 0.25] 22 | - [0.25, 0.5] 23 | - [0.5, 0.75] 24 | - [0.75, 1.0] 25 | 26 | # Optimization 27 | AdamW.betas: [0.8, 0.99] 28 | AdamW.lr: 0.0001 29 | ExponentialLR.gamma: 0.999996 30 | 31 | amp: false 32 | val_batch_size: 100 33 | device: cuda 34 | num_iters: 400000 35 | save_iters: [10000, 50000, 100000, 200000] 36 | valid_freq: 1000 37 | 
sample_freq: 10000 38 | num_workers: 32 39 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 40 | seed: 0 41 | lambdas: 42 | mel/loss: 15.0 43 | adv/feat_loss: 2.0 44 | adv/gen_loss: 1.0 45 | vq/commitment_loss: 0.25 46 | vq/codebook_loss: 1.0 47 | 48 | VolumeNorm.db: [const, -16] 49 | 50 | # Transforms 51 | build_transform.preprocess: 52 | - Identity 53 | build_transform.augment_prob: 0.0 54 | build_transform.augment: 55 | - Identity 56 | build_transform.postprocess: 57 | - VolumeNorm 58 | - RescaleAudio 59 | - ShiftPhase 60 | 61 | # Loss setup 62 | MultiScaleSTFTLoss.window_lengths: [2048, 512] 63 | MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] 64 | MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 65 | MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] 66 | MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] 67 | MelSpectrogramLoss.pow: 1.0 68 | MelSpectrogramLoss.clamp_eps: 1.0e-5 69 | MelSpectrogramLoss.mag_weight: 0.0 70 | 71 | # Data 72 | batch_size: 72 73 | train/AudioDataset.duration: 0.38 74 | train/AudioDataset.n_examples: 10000000 75 | 76 | val/AudioDataset.duration: 5.0 77 | val/build_transform.augment_prob: 1.0 78 | val/AudioDataset.n_examples: 250 79 | 80 | test/AudioDataset.duration: 10.0 81 | test/build_transform.augment_prob: 1.0 82 | test/AudioDataset.n_examples: 1000 83 | 84 | AudioLoader.shuffle: true 85 | AudioDataset.without_replacement: true 86 | 87 | train/build_dataset.folders: 88 | speech_fb: 89 | - /data/daps/train 90 | speech_hq: 91 | - /data/vctk 92 | - /data/vocalset 93 | - /data/read_speech 94 | - /data/french_speech 95 | speech_uq: 96 | - /data/emotional_speech/ 97 | - /data/common_voice/ 98 | - /data/german_speech/ 99 | - /data/russian_speech/ 100 | - /data/spanish_speech/ 101 | music_hq: 102 | - /data/musdb/train 103 | music_uq: 104 | - /data/jamendo 105 | general: 106 | - /data/audioset/data/unbalanced_train_segments/ 107 | - /data/audioset/data/balanced_train_segments/ 108 | 109 | 
val/build_dataset.folders: 110 | speech_hq: 111 | - /data/daps/val 112 | music_hq: 113 | - /data/musdb/test 114 | general: 115 | - /data/audioset/data/eval_segments/ 116 | 117 | test/build_dataset.folders: 118 | speech_hq: 119 | - /data/daps/test 120 | music_hq: 121 | - /data/musdb/test 122 | general: 123 | - /data/audioset/data/eval_segments/ 124 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/final/24khz.yml: -------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC.sample_rate: 24000 3 | DAC.encoder_dim: 64 4 | DAC.encoder_rates: [2, 4, 5, 8] 5 | DAC.decoder_dim: 1536 6 | DAC.decoder_rates: [8, 5, 4, 2] 7 | 8 | # Quantization 9 | DAC.n_codebooks: 32 10 | DAC.codebook_size: 1024 11 | DAC.codebook_dim: 8 12 | DAC.quantizer_dropout: 0.5 13 | 14 | # Discriminator 15 | Discriminator.sample_rate: 24000 16 | Discriminator.rates: [] 17 | Discriminator.periods: [2, 3, 5, 7, 11] 18 | Discriminator.fft_sizes: [2048, 1024, 512] 19 | Discriminator.bands: 20 | - [0.0, 0.1] 21 | - [0.1, 0.25] 22 | - [0.25, 0.5] 23 | - [0.5, 0.75] 24 | - [0.75, 1.0] 25 | 26 | # Optimization 27 | AdamW.betas: [0.8, 0.99] 28 | AdamW.lr: 0.0001 29 | ExponentialLR.gamma: 0.999996 30 | 31 | amp: false 32 | val_batch_size: 100 33 | device: cuda 34 | num_iters: 400000 35 | save_iters: [10000, 50000, 100000, 200000] 36 | valid_freq: 1000 37 | sample_freq: 10000 38 | num_workers: 32 39 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 40 | seed: 0 41 | lambdas: 42 | mel/loss: 15.0 43 | adv/feat_loss: 2.0 44 | adv/gen_loss: 1.0 45 | vq/commitment_loss: 0.25 46 | vq/codebook_loss: 1.0 47 | 48 | VolumeNorm.db: [const, -16] 49 | 50 | # Transforms 51 | build_transform.preprocess: 52 | - Identity 53 | build_transform.augment_prob: 0.0 54 | build_transform.augment: 55 | - Identity 56 | build_transform.postprocess: 57 | - VolumeNorm 58 | - RescaleAudio 59 | - ShiftPhase 60 | 61 | # Loss setup 62 | 
MultiScaleSTFTLoss.window_lengths: [2048, 512] 63 | MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] 64 | MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 65 | MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] 66 | MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] 67 | MelSpectrogramLoss.pow: 1.0 68 | MelSpectrogramLoss.clamp_eps: 1.0e-5 69 | MelSpectrogramLoss.mag_weight: 0.0 70 | 71 | # Data 72 | batch_size: 72 73 | train/AudioDataset.duration: 0.38 74 | train/AudioDataset.n_examples: 10000000 75 | 76 | val/AudioDataset.duration: 5.0 77 | val/build_transform.augment_prob: 1.0 78 | val/AudioDataset.n_examples: 250 79 | 80 | test/AudioDataset.duration: 10.0 81 | test/build_transform.augment_prob: 1.0 82 | test/AudioDataset.n_examples: 1000 83 | 84 | AudioLoader.shuffle: true 85 | AudioDataset.without_replacement: true 86 | 87 | train/build_dataset.folders: 88 | speech_fb: 89 | - /data/daps/train 90 | speech_hq: 91 | - /data/vctk 92 | - /data/vocalset 93 | - /data/read_speech 94 | - /data/french_speech 95 | speech_uq: 96 | - /data/emotional_speech/ 97 | - /data/common_voice/ 98 | - /data/german_speech/ 99 | - /data/russian_speech/ 100 | - /data/spanish_speech/ 101 | music_hq: 102 | - /data/musdb/train 103 | music_uq: 104 | - /data/jamendo 105 | general: 106 | - /data/audioset/data/unbalanced_train_segments/ 107 | - /data/audioset/data/balanced_train_segments/ 108 | 109 | val/build_dataset.folders: 110 | speech_hq: 111 | - /data/daps/val 112 | music_hq: 113 | - /data/musdb/test 114 | general: 115 | - /data/audioset/data/eval_segments/ 116 | 117 | test/build_dataset.folders: 118 | speech_hq: 119 | - /data/daps/test 120 | music_hq: 121 | - /data/musdb/test 122 | general: 123 | - /data/audioset/data/eval_segments/ 124 | -------------------------------------------------------------------------------- /descriptaudiocodec/conf/final/44khz.yml: 
-------------------------------------------------------------------------------- 1 | # Model setup 2 | DAC.sample_rate: 44100 3 | DAC.encoder_dim: 64 4 | DAC.encoder_rates: [2, 4, 8, 8] 5 | DAC.decoder_dim: 1536 6 | DAC.decoder_rates: [8, 8, 4, 2] 7 | 8 | # Quantization 9 | DAC.n_codebooks: 9 10 | DAC.codebook_size: 1024 11 | DAC.codebook_dim: 8 12 | DAC.quantizer_dropout: 0.5 13 | 14 | # Discriminator 15 | Discriminator.sample_rate: 44100 16 | Discriminator.rates: [] 17 | Discriminator.periods: [2, 3, 5, 7, 11] 18 | Discriminator.fft_sizes: [2048, 1024, 512] 19 | Discriminator.bands: 20 | - [0.0, 0.1] 21 | - [0.1, 0.25] 22 | - [0.25, 0.5] 23 | - [0.5, 0.75] 24 | - [0.75, 1.0] 25 | 26 | # Optimization 27 | AdamW.betas: [0.8, 0.99] 28 | AdamW.lr: 0.0001 29 | ExponentialLR.gamma: 0.999996 30 | 31 | amp: false 32 | val_batch_size: 100 33 | device: cuda 34 | num_iters: 400000 35 | save_iters: [10000, 50000, 100000, 200000] 36 | valid_freq: 1000 37 | sample_freq: 10000 38 | num_workers: 32 39 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 40 | seed: 0 41 | lambdas: 42 | mel/loss: 15.0 43 | adv/feat_loss: 2.0 44 | adv/gen_loss: 1.0 45 | vq/commitment_loss: 0.25 46 | vq/codebook_loss: 1.0 47 | 48 | VolumeNorm.db: [const, -16] 49 | 50 | # Transforms 51 | build_transform.preprocess: 52 | - Identity 53 | build_transform.augment_prob: 0.0 54 | build_transform.augment: 55 | - Identity 56 | build_transform.postprocess: 57 | - VolumeNorm 58 | - RescaleAudio 59 | - ShiftPhase 60 | 61 | # Loss setup 62 | MultiScaleSTFTLoss.window_lengths: [2048, 512] 63 | MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] 64 | MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 65 | MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] 66 | MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] 67 | MelSpectrogramLoss.pow: 1.0 68 | MelSpectrogramLoss.clamp_eps: 1.0e-5 69 | MelSpectrogramLoss.mag_weight: 0.0 70 | 71 | # Data 72 | batch_size: 72 73 | 
@argbind.bind(group="decode", positional=True, without_prefix=True)
@torch.inference_mode()
@torch.no_grad()
def decode(
    input: str,
    output: str = "",
    weights_path: str = "",
    model_tag: str = "latest",
    model_bitrate: str = "8kbps",
    device: str = "cuda",
    model_type: str = "44khz",
    verbose: bool = False,
):
    """Decode audio from codes.

    Parameters
    ----------
    input : str
        Path to input directory or file
    output : str, optional
        Path to output directory, by default "".
        If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`.
    weights_path : str, optional
        Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the
        model_tag and model_type.
    model_tag : str, optional
        Tag of the model to use, by default "latest". Ignored if `weights_path` is specified.
    model_bitrate: str
        Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
    device : str, optional
        Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU.
    model_type : str, optional
        The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
    """
    generator = load_model(
        model_type=model_type,
        model_bitrate=model_bitrate,
        tag=model_tag,
        load_path=weights_path,
    )
    generator.to(device)
    generator.eval()

    # Find all .dac files in input directory
    _input = Path(input)
    input_files = list(_input.glob("**/*.dac"))

    # If input is a .dac file, add it to the list
    if _input.suffix == ".dac":
        input_files.append(_input)

    # Create output directory
    output = Path(output)
    output.mkdir(parents=True, exist_ok=True)

    # Iterate the files directly (no index bookkeeping); plain string desc —
    # the previous f-string had no placeholders (ruff F541).
    for input_file in tqdm(input_files, desc="Decoding files"):
        # Load file
        artifact = DACFile.load(input_file)

        # Reconstruct audio from codes
        recons = generator.decompress(artifact, verbose=verbose)

        # Compute output path, mirroring the input sub-tree under `output`.
        relative_path = input_file.relative_to(input)
        output_dir = output / relative_path.parent
        if not relative_path.name:
            # `input` was itself a .dac file: write directly into `output`.
            output_dir = output
            relative_path = input_file
        output_name = relative_path.with_suffix(".wav").name
        output_path = output_dir / output_name
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to file
        recons.write(output_path)
Discriminator.rates: [] 18 | Discriminator.periods: [2, 3, 5, 7, 11] 19 | Discriminator.fft_sizes: [2048, 1024, 512] 20 | Discriminator.bands: 21 | - [0.0, 0.1] 22 | - [0.1, 0.25] 23 | - [0.25, 0.5] 24 | - [0.5, 0.75] 25 | - [0.75, 1.0] 26 | 27 | # Optimization 28 | AdamW.betas: [0.8, 0.99] 29 | AdamW.lr: 0.0001 30 | ExponentialLR.gamma: 0.999996 31 | 32 | amp: false 33 | val_batch_size: 100 34 | device: cuda 35 | num_iters: 400000 36 | save_iters: [10000, 50000, 100000, 200000] 37 | valid_freq: 1000 38 | sample_freq: 10000 39 | num_workers: 32 40 | val_idx: [0, 1, 2, 3, 4, 5, 6, 7] 41 | seed: 0 42 | lambdas: 43 | mel/loss: 15.0 44 | adv/feat_loss: 2.0 45 | adv/gen_loss: 1.0 46 | vq/commitment_loss: 0.25 47 | vq/codebook_loss: 1.0 48 | 49 | VolumeNorm.db: [const, -16] 50 | 51 | # Transforms 52 | build_transform.preprocess: 53 | - Identity 54 | build_transform.augment_prob: 0.0 55 | build_transform.augment: 56 | - Identity 57 | build_transform.postprocess: 58 | - VolumeNorm 59 | - RescaleAudio 60 | - ShiftPhase 61 | 62 | # Loss setup 63 | MultiScaleSTFTLoss.window_lengths: [2048, 512] 64 | MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] 65 | MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] 66 | MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] 67 | MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] 68 | MelSpectrogramLoss.pow: 1.0 69 | MelSpectrogramLoss.clamp_eps: 1.0e-5 70 | MelSpectrogramLoss.mag_weight: 0.0 71 | 72 | # Data 73 | batch_size: 72 74 | train/AudioDataset.duration: 0.38 75 | train/AudioDataset.n_examples: 10000000 76 | 77 | val/AudioDataset.duration: 5.0 78 | val/build_transform.augment_prob: 1.0 79 | val/AudioDataset.n_examples: 250 80 | 81 | test/AudioDataset.duration: 10.0 82 | test/build_transform.augment_prob: 1.0 83 | test/AudioDataset.n_examples: 1000 84 | 85 | AudioLoader.shuffle: true 86 | AudioDataset.without_replacement: true 87 | 88 | train/build_dataset.folders: 89 | speech_fb: 90 | - 
/data/daps/train 91 | speech_hq: 92 | - /data/vctk 93 | - /data/vocalset 94 | - /data/read_speech 95 | - /data/french_speech 96 | speech_uq: 97 | - /data/emotional_speech/ 98 | - /data/common_voice/ 99 | - /data/german_speech/ 100 | - /data/russian_speech/ 101 | - /data/spanish_speech/ 102 | music_hq: 103 | - /data/musdb/train 104 | music_uq: 105 | - /data/jamendo 106 | general: 107 | - /data/audioset/data/unbalanced_train_segments/ 108 | - /data/audioset/data/balanced_train_segments/ 109 | 110 | val/build_dataset.folders: 111 | speech_hq: 112 | - /data/daps/val 113 | music_hq: 114 | - /data/musdb/test 115 | general: 116 | - /data/audioset/data/eval_segments/ 117 | 118 | test/build_dataset.folders: 119 | speech_hq: 120 | - /data/daps/test 121 | music_hq: 122 | - /data/musdb/test 123 | general: 124 | - /data/audioset/data/eval_segments/ 125 | -------------------------------------------------------------------------------- /descriptaudiocodec/dac/utils/encode.py: -------------------------------------------------------------------------------- 1 | import math 2 | import warnings 3 | from pathlib import Path 4 | 5 | import argbind 6 | import numpy as np 7 | import torch 8 | from audiotools import AudioSignal 9 | from audiotools.core import util 10 | from tqdm import tqdm 11 | 12 | from dac.utils import load_model 13 | 14 | warnings.filterwarnings("ignore", category=UserWarning) 15 | 16 | 17 | @argbind.bind(group="encode", positional=True, without_prefix=True) 18 | @torch.inference_mode() 19 | @torch.no_grad() 20 | def encode( 21 | input: str, 22 | output: str = "", 23 | weights_path: str = "", 24 | model_tag: str = "latest", 25 | model_bitrate: str = "8kbps", 26 | n_quantizers: int = None, 27 | device: str = "cuda", 28 | model_type: str = "44khz", 29 | win_duration: float = 5.0, 30 | verbose: bool = False, 31 | ): 32 | """Encode audio files in input path to .dac format. 
33 | 34 | Parameters 35 | ---------- 36 | input : str 37 | Path to input audio file or directory 38 | output : str, optional 39 | Path to output directory, by default "". If `input` is a directory, the directory sub-tree relative to `input` is re-created in `output`. 40 | weights_path : str, optional 41 | Path to weights file, by default "". If not specified, the weights file will be downloaded from the internet using the 42 | model_tag and model_type. 43 | model_tag : str, optional 44 | Tag of the model to use, by default "latest". Ignored if `weights_path` is specified. 45 | model_bitrate: str 46 | Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps". 47 | n_quantizers : int, optional 48 | Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate. 49 | device : str, optional 50 | Device to use, by default "cuda" 51 | model_type : str, optional 52 | The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified. 
53 | """ 54 | generator = load_model( 55 | model_type=model_type, 56 | model_bitrate=model_bitrate, 57 | tag=model_tag, 58 | load_path=weights_path, 59 | ) 60 | generator.to(device) 61 | generator.eval() 62 | kwargs = {"n_quantizers": n_quantizers} 63 | 64 | # Find all audio files in input path 65 | input = Path(input) 66 | audio_files = util.find_audio(input) 67 | 68 | output = Path(output) 69 | output.mkdir(parents=True, exist_ok=True) 70 | 71 | for i in tqdm(range(len(audio_files)), desc="Encoding files"): 72 | # Load file 73 | signal = AudioSignal(audio_files[i]) 74 | 75 | # Encode audio to .dac format 76 | artifact = generator.compress(signal, win_duration, verbose=verbose, **kwargs) 77 | 78 | # Compute output path 79 | relative_path = audio_files[i].relative_to(input) 80 | output_dir = output / relative_path.parent 81 | if not relative_path.name: 82 | output_dir = output 83 | relative_path = audio_files[i] 84 | output_name = relative_path.with_suffix(".dac").name 85 | output_path = output_dir / output_name 86 | output_path.parent.mkdir(parents=True, exist_ok=True) 87 | 88 | artifact.save(output_path) 89 | 90 | 91 | if __name__ == "__main__": 92 | args = argbind.parse_args() 93 | with argbind.scope(args): 94 | encode() 95 | -------------------------------------------------------------------------------- /descriptaudiocodec/scripts/evaluate.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import multiprocessing as mp 3 | from concurrent.futures import ProcessPoolExecutor 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | 7 | import argbind 8 | import torch 9 | from audiotools import AudioSignal 10 | from audiotools import metrics 11 | from audiotools.core import util 12 | from audiotools.ml.decorators import Tracker 13 | from train import losses 14 | 15 | 16 | @dataclass 17 | class State: 18 | stft_loss: losses.MultiScaleSTFTLoss 19 | mel_loss: losses.MelSpectrogramLoss 20 | 
waveform_loss: losses.L1Loss 21 | sisdr_loss: losses.SISDRLoss 22 | 23 | 24 | def get_metrics(signal_path, recons_path, state): 25 | output = {} 26 | signal = AudioSignal(signal_path) 27 | recons = AudioSignal(recons_path) 28 | for sr in [22050, 44100]: 29 | x = signal.clone().resample(sr) 30 | y = recons.clone().resample(sr) 31 | k = "22k" if sr == 22050 else "44k" 32 | output.update( 33 | { 34 | f"mel-{k}": state.mel_loss(x, y), 35 | f"stft-{k}": state.stft_loss(x, y), 36 | f"waveform-{k}": state.waveform_loss(x, y), 37 | f"sisdr-{k}": state.sisdr_loss(x, y), 38 | f"visqol-audio-{k}": metrics.quality.visqol(x, y), 39 | f"visqol-speech-{k}": metrics.quality.visqol(x, y, "speech"), 40 | } 41 | ) 42 | output["path"] = signal.path_to_file 43 | output.update(signal.metadata) 44 | return output 45 | 46 | 47 | @argbind.bind(without_prefix=True) 48 | @torch.no_grad() 49 | def evaluate( 50 | input: str = "samples/input", 51 | output: str = "samples/output", 52 | n_proc: int = 50, 53 | ): 54 | tracker = Tracker() 55 | 56 | waveform_loss = losses.L1Loss() 57 | stft_loss = losses.MultiScaleSTFTLoss() 58 | mel_loss = losses.MelSpectrogramLoss() 59 | sisdr_loss = losses.SISDRLoss() 60 | 61 | state = State( 62 | waveform_loss=waveform_loss, 63 | stft_loss=stft_loss, 64 | mel_loss=mel_loss, 65 | sisdr_loss=sisdr_loss, 66 | ) 67 | 68 | audio_files = util.find_audio(input) 69 | output = Path(output) 70 | output.mkdir(parents=True, exist_ok=True) 71 | 72 | @tracker.track("metrics", len(audio_files)) 73 | def record(future, writer): 74 | o = future.result() 75 | for k, v in o.items(): 76 | if torch.is_tensor(v): 77 | o[k] = v.item() 78 | writer.writerow(o) 79 | o.pop("path") 80 | return o 81 | 82 | futures = [] 83 | with tracker.live: 84 | with open(output / "metrics.csv", "w") as csvfile: 85 | with ProcessPoolExecutor(n_proc, mp.get_context("fork")) as pool: 86 | for i in range(len(audio_files)): 87 | future = pool.submit( 88 | get_metrics, audio_files[i], output / 
audio_files[i].name, state 89 | ) 90 | futures.append(future) 91 | 92 | keys = list(futures[0].result().keys()) 93 | writer = csv.DictWriter(csvfile, fieldnames=keys) 94 | writer.writeheader() 95 | 96 | for future in futures: 97 | record(future, writer) 98 | 99 | tracker.done("test", f"N={len(audio_files)}") 100 | 101 | 102 | if __name__ == "__main__": 103 | args = argbind.parse_args() 104 | with argbind.scope(args): 105 | evaluate() 106 | -------------------------------------------------------------------------------- /distributed/launch.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------ 2 | # Diffsound 3 | # code based https://github.com/cientgu/VQ-Diffusion 4 | # ------------------------------------------ 5 | import os 6 | 7 | import torch 8 | from torch import distributed as dist 9 | from torch import multiprocessing as mp 10 | 11 | # import distributed as dist_fn 12 | import distributed.distributed as dist_fn 13 | 14 | 15 | def find_free_port(): 16 | import socket 17 | 18 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 19 | 20 | sock.bind(("", 0)) 21 | port = sock.getsockname()[1] 22 | sock.close() 23 | 24 | return port 25 | 26 | 27 | def launch(fn, n_gpu_per_machine, n_machine=1, machine_rank=0, dist_url=None, args=()): 28 | world_size = n_machine * n_gpu_per_machine 29 | 30 | if world_size > 1: 31 | # if "OMP_NUM_THREADS" not in os.environ: 32 | # os.environ["OMP_NUM_THREADS"] = "1" 33 | if dist_url == "auto": 34 | if n_machine != 1: 35 | raise ValueError('dist_url="auto" not supported in multi-machine jobs') 36 | port = find_free_port() 37 | dist_url = f"tcp://127.0.0.1:{port}" 38 | print('dist_url ', dist_url) 39 | print('n_machine ', n_machine) 40 | print('args ', args) 41 | print('world_size ', world_size) 42 | print('machine_rank ', machine_rank) 43 | if n_machine > 1 and dist_url.startswith("file://"): 44 | raise ValueError( 45 | "file:// is not a reliable 
init method in multi-machine jobs. Prefer tcp://" 46 | ) 47 | 48 | mp.spawn( 49 | distributed_worker, 50 | nprocs=n_gpu_per_machine, 51 | args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args), 52 | daemon=False, 53 | ) 54 | # n_machine ? world_size 55 | else: 56 | local_rank = 0 57 | fn(local_rank, *args) 58 | 59 | 60 | def distributed_worker( 61 | local_rank, fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args 62 | ): 63 | if not torch.cuda.is_available(): 64 | raise OSError("CUDA is not available. Please check your environments") 65 | 66 | global_rank = machine_rank * n_gpu_per_machine + local_rank 67 | print('local_rank ',local_rank) 68 | print('global_rank ',global_rank) 69 | try: 70 | dist.init_process_group( 71 | backend="NCCL", 72 | init_method=dist_url, 73 | world_size=world_size, 74 | rank=global_rank, 75 | ) 76 | 77 | except Exception: 78 | raise OSError("failed to initialize NCCL groups") 79 | 80 | # changed 81 | dist_fn.synchronize() 82 | 83 | if n_gpu_per_machine > torch.cuda.device_count(): 84 | raise ValueError( 85 | f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})" 86 | ) 87 | 88 | torch.cuda.set_device(local_rank) 89 | 90 | if dist_fn.LOCAL_PROCESS_GROUP is not None: 91 | raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None") 92 | 93 | # change paert 94 | 95 | n_machine = world_size // n_gpu_per_machine 96 | for i in range(n_machine): 97 | ranks_on_i = list(range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine)) 98 | pg = dist.new_group(ranks_on_i) 99 | 100 | if i == machine_rank: 101 | dist_fn.LOCAL_PROCESS_GROUP = pg 102 | 103 | fn(local_rank, *args) 104 | -------------------------------------------------------------------------------- /config/codec_16k_6kbps_v3_vqdp.yaml: -------------------------------------------------------------------------------- 1 | 2 | ########### model config ########### 3 | generator: 4 | name: SoundStream 5 | config: 6 | n_filters: 
32 7 | D: 256 8 | target_bandwidths: [0.5, 1, 1.5, 2, 4] 9 | ratios: [8, 5, 4, 2] # downsampling by 320 10 | sample_rate: 16000 11 | bins: 1024 12 | semantic_techer: hubert_base 13 | # Discriminator list 14 | #d_list: ['mpd', 'msd', 'mfd'] 15 | d_list: ['mfd'] 16 | 17 | mfd: 18 | name: MultiFrequencyDiscriminator 19 | config: 20 | hop_lengths: [32, 64, 128, 256, 512, 1024] 21 | hidden_channels: [64, 128, 256, 512, 512, 512] 22 | domain: double 23 | mel_scale: true 24 | sample_rate: 16000 25 | 26 | mpd: 27 | name: MultiPeriodDiscriminator 28 | config: 29 | period_sizes: [2, 3, 5, 7, 11] 30 | period_kernel_size: 5 31 | 32 | msd: 33 | name: MultiScaleDiscriminator 34 | config: 35 | num_scales: 3 36 | pool_kernel_size: 4 37 | pool_stride: 2 38 | 39 | ########### optimizer config ########### 40 | optimizer: 41 | g: 42 | name: AdamW 43 | config: 44 | lr: 2e-4 45 | betas: [0.8, 0.99] 46 | eps: 1.0e-6 47 | 48 | d: 49 | name: AdamW 50 | config: 51 | lr: 2e-4 52 | betas: [0.8, 0.99] 53 | eps: 1.0e-6 54 | 55 | lr_scheduler: 56 | g: 57 | name: ExponentialLR 58 | config: 59 | gamma: 0.999 60 | d: 61 | name: ExponentialLR 62 | config: 63 | gamma: 0.999 64 | 65 | ########### criterion config ########### 66 | criterion: 67 | g_criterion: 68 | name: losses.generator_loss.GeneratorSTFTLoss 69 | config: 70 | use_mel_loss: false 71 | #adv_criterion: LeastDLoss 72 | adv_criterion: MSEGLoss 73 | mel_loss_weight: 45 74 | use_feature_match: true 75 | feat_match_loss_weight: 20 76 | use_full_stft_loss: true # Magnitude 77 | use_sub_stft_loss: true # PQMF loss 78 | full_stft_loss_weight: 1 79 | sub_stft_loss_weight: 1 80 | mel_scale_loss: 81 | sampling_rate: 16000 82 | n_fft: 1024 83 | num_mels: 80 84 | hop_size: 160 85 | win_size: 800 86 | fmin: 0 87 | full_multi_scale_stft_loss: # Full-band multi-scale STFT loss. 88 | fft_sizes: [512, 1024, 2048] 89 | win_sizes: [480, 960, 1200] 90 | hop_sizes: [120, 240, 300] 91 | sub_multi_scale_stft_loss: # Sub-band multi-scale STFT loss. 
92 | num_bands: 6 93 | fft_sizes: [128, 256, 256] 94 | win_sizes: [80, 120, 200] 95 | hop_sizes: [20, 40, 50] 96 | 97 | d_criterion: 98 | name: losses.discriminator_loss.MSEDiscriminatorLoss 99 | config: null 100 | 101 | commit_loss_weight: 1. #1000 102 | semantic_loss_weight: 100 103 | ########### training and data config ########### 104 | 105 | training_file: "/aifs4su/data/zheny/fairseq/vae_v2/codec_final/list_librispeech/train.txt" 106 | validation_file: "/aifs4su/data/zheny/fairseq/vae_v2/codec_final/list_librispeech/valid.txt" 107 | 108 | seed: 2333 109 | cudnn_deterministic: false 110 | tensorboard: true # whether to use tensorboard 111 | #checkpoint_interval: 5 112 | #summary_interval: 10 113 | #validation_interval: 10 114 | 115 | checkpoint_interval: 5000 116 | summary_interval: 100 117 | validation_interval: 5000 118 | 119 | num_epoches: 5000 120 | print_freq: 10 121 | discriminator_iter_start: 0 # start step after which we update discriminators 122 | num_ckpt_keep: 10 123 | 124 | segment_size: 48000 125 | audio_norm_scale: 0.95 126 | batch_size: 8 127 | num_workers: 8 128 | num_plots: 8 129 | -------------------------------------------------------------------------------- /descriptaudiocodec/scripts/mushra.py: -------------------------------------------------------------------------------- 1 | import string 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import List 5 | 6 | import argbind 7 | import gradio as gr 8 | from audiotools import preference as pr 9 | 10 | 11 | @argbind.bind(without_prefix=True) 12 | @dataclass 13 | class Config: 14 | folder: str = None 15 | save_path: str = "results.csv" 16 | conditions: List[str] = None 17 | reference: str = None 18 | seed: int = 0 19 | share: bool = False 20 | n_samples: int = 10 21 | 22 | 23 | def get_text(wav_file: str): 24 | txt_file = Path(wav_file).with_suffix(".txt") 25 | if Path(txt_file).exists(): 26 | with open(txt_file, "r") as f: 27 | txt = f.read() 28 | else: 29 | 
txt = "" 30 | return f"""
18 | 
189 | 
18 |
19 | # Experiments on VALL-E
20 |
21 |
22 |
23 |
24 |
27 |
28 | # Highlight
29 |
30 | You can easily apply our approach to enhance any existing acoustic codec:
31 |
32 | For example:
33 |
34 | ```python
35 | class Codec():
36 | def __init__(self):
37 | # Acoustic codec components
38 | self.encoder = Encoder(...) # Acoustic encoder
39 | self.decoder = Decoder(...) # Acoustic decoder
40 | self.quantizer = RVQ(...) # Residual Vector Quantizer (RVQ)
41 |
42 | # Adding the semantic module
43 | self.semantic_model = AutoModel.from_pretrained(...) # e.g., Hubert, WavLM
44 |
45 | # Adding Projector
46 | self.fc_prior = nn.Linear(...)
47 | self.fc_post1 = nn.Linear(...)
48 | self.fc_post2 = nn.Linear(...)
49 |
50 | def forward(self, x, bw):
51 | # Encode the input acoustically and semantically
52 | e_acoustic = self.encoder(x)
53 | e_semantic = self.semantic_model(x)
54 |
55 | # Combine acoustic and semantic features
56 | combined_features = torch.cat([e_acoustic, e_semantic])
57 |
58 | # Apply prior transformation
59 | transformed_features = self.fc_prior(combined_features)
60 |
61 | # Quantize the unified semantic and acoustic features
62 | quantized, codes, bandwidth, commit_loss = self.quantizer(transformed_features, bw)
63 |
64 | # Post-process the quantized features
65 | quantized_semantic = self.fc_post1(quantized)
66 | quantized_acoustic = self.fc_post2(quantized)
67 |
68 | # Decode the quantized acoustic features
69 | output = self.decoder(quantized_acoustic)
70 |
71 |
72 |
73 | def semantic_loss(self,semantic,quantized_semantic):
74 | return F.mse_loss(semantic,quantized_semantic)
75 | ```
76 | For more details, please refer to our code.
77 |
78 | # Available models
79 |
80 | X-Codec is part of Hugging Face's Transformers library (see [model documentation](https://huggingface.co/docs/transformers/en/model_doc/xcodec)).
81 |
82 | Below are the Transformers-compatible checkpoints on the Hugging Face Hub 🤗
83 |
84 | | Model checkpoint | Semantic Model | Domain | Training Data |
85 | |--------------------------------------------|-----------------------------------------------------------------------|---------------|-------------------------------|
86 | | [xcodec-hubert-librispeech](https://huggingface.co/hf-audio/xcodec-hubert-librispeech) | [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) | Speech | Librispeech |
87 | | [xcodec-wavlm-mls](https://huggingface.co/hf-audio/xcodec-wavlm-mls) (not mentioned in paper) | [microsoft/wavlm-base-plus](https://huggingface.co/microsoft/wavlm-base-plus)| Speech | MLS English |
88 | | [xcodec-wavlm-more-data](https://huggingface.co/hf-audio/xcodec-wavlm-more-data) (not mentioned in paper) | [microsoft/wavlm-base-plus](https://huggingface.co/microsoft/wavlm-base-plus)| Speech | MLS English + Internal data |
89 | | [xcodec-hubert-general](https://huggingface.co/hf-audio/xcodec-hubert-general) | [ZhenYe234/hubert_base_general_audio](https://huggingface.co/ZhenYe234/hubert_base_general_audio) | General audio | 200k hours internal data |
90 | | [xcodec-hubert-general-balanced](https://huggingface.co/hf-audio/xcodec-hubert-general-balanced) (not mentioned in paper) | [ZhenYe234/hubert_base_general_audio](https://huggingface.co/ZhenYe234/hubert_base_general_audio) | General audio | More balanced data |
91 |
92 |
93 | Below are the original checkpoints.
94 |
95 | | Model name | Hugging Face | Config | Semantic Model | Domain | Training Data |
96 | |---------------------------------------------|--------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------|---------------|-------------------------------|
97 | | xcodec_hubert_librispeech | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/xcodec_speech_hubert_librispeech.pth) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert.yaml) | [🤗 Hubert-base](https://huggingface.co/facebook/hubert-base-ls960) | Speech | Librispeech |
98 | | xcodec_wavlm_mls (not mentioned in paper) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/xcodec_speech_wavlm_mls.pth) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_wavlm.yaml) | [🤗 Wavlm-base-plus](https://huggingface.co/microsoft/wavlm-base-plus) | Speech | MLS English |
99 | | xcodec_wavlm_more_data (not mentioned in paper) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/xcodec_speech_wavlm_more_data.pth) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_wavlm.yaml) | [🤗 Wavlm-base-plus](https://huggingface.co/microsoft/wavlm-base-plus) | Speech | MLS English + Internal data |
100 | | xcodec_hubert_general_audio | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/xcodec_hubert_general_audio.pth) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert_general.yaml) | [🤗Hubert-base-general-audio](https://huggingface.co/ZhenYe234/hubert_base_general_audio) | General audio | 200k hours internal data |
101 | | xcodec_hubert_general_audio_more_data (not mentioned in paper) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/xcodec_hubert_general_audio_v2.pth) | [🤗](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert_general.yaml) | [🤗Hubert-base-general-audio](https://huggingface.co/ZhenYe234/hubert_base_general_audio) | General audio | More balanced data |
102 |
103 |
104 |
105 |
106 |
107 | # Inference
108 |
109 | To run inference, first download the model and config from Hugging Face.
110 |
111 | ```bash
112 | python inference.py
113 | ```
114 |
115 | # Training
116 | Prepare the training_file and validation_file in the config. Each file should list the paths to your audio files:
117 | ```bash
118 | /path/to/your/xxx.wav
119 | /path/to/your/yyy.wav
120 | ...
121 | ```
122 | Then:
123 |
124 | ```bash
125 | torchrun --nnodes=1 --nproc-per-node=8 main_launch_vqdp.py
126 | ```
127 |
128 | ## Acknowledgement
129 | I would like to extend a special thanks to the authors of UniAudio and DAC, since our codebase is mainly borrowed from [UniAudio](https://github.com/yangdongchao/UniAudio/tree/main/codec) and [DAC](https://github.com/descriptinc/descript-audio-codec).
130 |
131 | ## Citation
132 | If you find this repo helpful, please consider citing in the following format:
133 |
134 | ```bibtex
135 | @article{ye2024codecdoesmatterexploring,
136 | title={Codec Does Matter: Exploring the Semantic Shortcoming of Codec for Audio Language Model},
137 | author={Zhen Ye and Peiwen Sun and Jiahe Lei and Hongzhan Lin and Xu Tan and Zheqi Dai and Qiuqiang Kong and Jianyi Chen and Jiahao Pan and Qifeng Liu and Yike Guo and Wei Xue},
138 | journal={arXiv preprint arXiv:2408.17175},
139 | year={2024},
140 | }
141 | ```
142 |
--------------------------------------------------------------------------------