├── .gitmodules ├── boson_multimodal ├── __init__.py ├── dataset │ └── __init__.py ├── model │ ├── __init__.py │ └── higgs_audio │ │ ├── __init__.py │ │ ├── common.py │ │ ├── cuda_graph_runner.py │ │ ├── audio_head.py │ │ ├── custom_modules.py │ │ └── configuration_higgs_audio.py ├── data_collator │ └── __init__.py ├── audio_processing │ ├── descriptaudiocodec │ │ ├── __init__.py │ │ └── dac │ │ │ ├── nn │ │ │ ├── layers.py │ │ │ └── quantize.py │ │ │ └── model │ │ │ ├── base.py │ │ │ └── dac.py │ ├── quantization │ │ ├── __init__.py │ │ ├── distrib.py │ │ ├── vq.py │ │ ├── ddp_utils.py │ │ └── ac.py │ ├── LICENSE │ ├── semantic_module.py │ └── higgs_audio_tokenizer.py ├── constants.py ├── data_types.py └── serve │ └── utils.py ├── setup.py ├── examples ├── voice_prompts │ ├── zh_man_sichuan.txt │ ├── vex.txt │ ├── vex.wav │ ├── en_man.wav │ ├── fiftyshades_anna.txt │ ├── mabel.wav │ ├── belinda.wav │ ├── chadwick.wav │ ├── en_woman.wav │ ├── mabaoguo.wav │ ├── bigbang_amy.wav │ ├── shrek_fiona.wav │ ├── shrek_shrek.wav │ ├── broom_salesman.wav │ ├── shrek_donkey.wav │ ├── zh_man_sichuan.wav │ ├── bigbang_sheldon.wav │ ├── fiftyshades_anna.wav │ ├── shrek_donkey_es.wav │ ├── chadwick.txt │ ├── en_man.txt │ ├── belinda.txt │ ├── en_woman.txt │ ├── mabaoguo.txt │ ├── shrek_donkey_es.txt │ ├── shrek_shrek.txt │ ├── mabel.txt │ ├── shrek_fiona.txt │ ├── bigbang_amy.txt │ ├── bigbang_sheldon.txt │ ├── broom_salesman.txt │ ├── shrek_donkey.txt │ └── profile.yaml ├── scene_prompts │ ├── quiet_indoor.txt │ └── reading_blog.txt ├── serve_engine │ ├── voice_examples │ │ └── old_man.wav │ ├── README.md │ ├── run_hf_example.py │ └── input_samples.py ├── transcript │ ├── single_speaker │ │ ├── en_basic.txt │ │ ├── experimental │ │ │ ├── en_humming.txt │ │ │ └── en_bgm.txt │ │ ├── zh_ai.txt │ │ ├── en_dl.txt │ │ └── en_higgs_audio_blog.md │ └── multi_speaker │ │ ├── en_argument.txt │ │ └── en_higgs.txt ├── vllm │ ├── README.md │ └── run_chat_completion.py └── README.md ├── figures ├── emergent-tts-emotions-win-rate.png ├── higgs_audio_tokenizer_architecture.png ├── dual_ffn_comparison_seed_tts_en_sim.png ├── dual_ffn_comparison_seed_tts_en_wer.png ├── dual_ffn_comparison_seed_tts_zh_sim.png ├── dual_ffn_comparison_seed_tts_zh_wer.png ├── higgs_audio_v2_architecture_combined.png └── higgs_audio_v2_open_source_delay_pattern.png ├── requirements.txt ├── setup.cfg ├── .github └── workflows │ └── test.yml ├── SUPPORT_GUIDELINES.md ├── tech_blogs ├── ARCHITECTURE_BLOG.md └── TOKENIZER_BLOG.md ├── pyproject.toml ├── .gitignore └── LICENSE /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boson_multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boson_multimodal/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boson_multimodal/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boson_multimodal/data_collator/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup() 5 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/descriptaudiocodec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/voice_prompts/zh_man_sichuan.txt: -------------------------------------------------------------------------------- 1 | 对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。 -------------------------------------------------------------------------------- /examples/scene_prompts/quiet_indoor.txt: -------------------------------------------------------------------------------- 1 | Audio is recorded from a quiet room. 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/vex.txt: -------------------------------------------------------------------------------- 1 | Uhh, this is going to take forever. Why is everything so far? 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/vex.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/vex.wav -------------------------------------------------------------------------------- /examples/voice_prompts/en_man.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/en_man.wav -------------------------------------------------------------------------------- /examples/voice_prompts/fiftyshades_anna.txt: -------------------------------------------------------------------------------- 1 | I'm working at the hardware store till 7. I think I'd like that too. What? 
2 | -------------------------------------------------------------------------------- /examples/voice_prompts/mabel.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/mabel.wav -------------------------------------------------------------------------------- /examples/voice_prompts/belinda.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/belinda.wav -------------------------------------------------------------------------------- /examples/voice_prompts/chadwick.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/chadwick.wav -------------------------------------------------------------------------------- /examples/voice_prompts/en_woman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/en_woman.wav -------------------------------------------------------------------------------- /examples/voice_prompts/mabaoguo.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/mabaoguo.wav -------------------------------------------------------------------------------- /examples/voice_prompts/bigbang_amy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/bigbang_amy.wav -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_fiona.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/shrek_fiona.wav -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_shrek.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/shrek_shrek.wav -------------------------------------------------------------------------------- /boson_multimodal/constants.py: -------------------------------------------------------------------------------- 1 | AUDIO_IN_TOKEN = "<|AUDIO|>" 2 | AUDIO_OUT_TOKEN = "<|AUDIO_OUT|>" 3 | EOS_TOKEN = "<|end_of_text|>" 4 | -------------------------------------------------------------------------------- /examples/voice_prompts/broom_salesman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/broom_salesman.wav -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_donkey.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/shrek_donkey.wav -------------------------------------------------------------------------------- /examples/voice_prompts/zh_man_sichuan.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/zh_man_sichuan.wav -------------------------------------------------------------------------------- /examples/voice_prompts/bigbang_sheldon.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/bigbang_sheldon.wav -------------------------------------------------------------------------------- /examples/voice_prompts/fiftyshades_anna.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/fiftyshades_anna.wav -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_donkey_es.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/voice_prompts/shrek_donkey_es.wav -------------------------------------------------------------------------------- /figures/emergent-tts-emotions-win-rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/emergent-tts-emotions-win-rate.png -------------------------------------------------------------------------------- /figures/higgs_audio_tokenizer_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/higgs_audio_tokenizer_architecture.png -------------------------------------------------------------------------------- /examples/serve_engine/voice_examples/old_man.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/examples/serve_engine/voice_examples/old_man.wav -------------------------------------------------------------------------------- /figures/dual_ffn_comparison_seed_tts_en_sim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/dual_ffn_comparison_seed_tts_en_sim.png -------------------------------------------------------------------------------- /figures/dual_ffn_comparison_seed_tts_en_wer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/dual_ffn_comparison_seed_tts_en_wer.png -------------------------------------------------------------------------------- /figures/dual_ffn_comparison_seed_tts_zh_sim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/dual_ffn_comparison_seed_tts_zh_sim.png -------------------------------------------------------------------------------- /figures/dual_ffn_comparison_seed_tts_zh_wer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/dual_ffn_comparison_seed_tts_zh_wer.png -------------------------------------------------------------------------------- /figures/higgs_audio_v2_architecture_combined.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/higgs_audio_v2_architecture_combined.png -------------------------------------------------------------------------------- /examples/voice_prompts/chadwick.txt: -------------------------------------------------------------------------------- 1 | Oh dear, who left all this junk lying around? Whoops, there it goes! Mind your pointed little pink head, starfish man. 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/en_man.txt: -------------------------------------------------------------------------------- 1 | Maintaining your ability to learn translates into increased marketability, improved career options and higher salaries. 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/belinda.txt: -------------------------------------------------------------------------------- 1 | Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year. 2 | -------------------------------------------------------------------------------- /figures/higgs_audio_v2_open_source_delay_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boson-ai/higgs-audio/HEAD/figures/higgs_audio_v2_open_source_delay_pattern.png -------------------------------------------------------------------------------- /examples/transcript/single_speaker/en_basic.txt: -------------------------------------------------------------------------------- 1 | The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years. 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/en_woman.txt: -------------------------------------------------------------------------------- 1 | The device would work during the day as well, if you took steps to either block direct sunlight or point it away from the sun. 2 | -------------------------------------------------------------------------------- /examples/transcript/single_speaker/experimental/en_humming.txt: -------------------------------------------------------------------------------- 1 | Are you asking if I can hum a tune? Of course I can! [humming start] la la la la la [humming end] See? 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/mabaoguo.txt: -------------------------------------------------------------------------------- 1 | 我是浑元形意太极门掌门人马保国,刚才有个朋友问我:马老师发生什么事啦.我说怎么回事,给我发了几张截图,我一看,哦,原来是昨天,有两个年轻人,三十多岁,一个体重九十多公斤,一个体重八十多公斤.他们说,哎,有一个说是:我在健身房练功,颈椎练坏了,马老师你能不能教教我浑元功法 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_donkey_es.txt: -------------------------------------------------------------------------------- 1 | ¡Uy, guau! Eso sí que asusta. Y si el rugido no funciona, tu mal aliento seguro los desmaya. Necesitas unas pastillitas de menta porque el hocico te apesta. -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_shrek.txt: -------------------------------------------------------------------------------- 1 | Well, it's no wonder you don't have any friends. Listen, little donkey, take a look at me. What am I? 2 | No! I'm an ogre! You know, with a torch and pitchfork. Doesn't that bother you? 
-------------------------------------------------------------------------------- /examples/transcript/single_speaker/zh_ai.txt: -------------------------------------------------------------------------------- 1 | 大家好,欢迎收听本期的跟李沐学AI。今天沐哥在忙着洗数据,所以由我,希格斯主播代替他讲这期视频。 2 | 今天我们要聊的是一个你绝对不能忽视的话题"多模态学习"。 3 | 无论你是开发者,数据科学爱好者,还是只是对人工智能感兴趣的人都一定听说过这个词。它已经成为AI时代的一个研究热点。 4 | 那么,问题来了,你真的了解多模态吗 你知道如何自己动手构建多模态大模型吗。 -------------------------------------------------------------------------------- /examples/voice_prompts/mabel.txt: -------------------------------------------------------------------------------- 1 | You do talk an awful lot about weather, did you know that? Sometimes I wonder if you're actually content to be a wizard or if you're secretly harbouring a desire to become a seer of the clouds. 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_fiona.txt: -------------------------------------------------------------------------------- 1 | Well, when one lives alone, one has to learn these things in case there's a... There's an arrow in your butt! 2 | Calm down. If you want to help Shrek, run into the woods and find me a blue flower with red thorns. -------------------------------------------------------------------------------- /examples/transcript/single_speaker/experimental/en_bgm.txt: -------------------------------------------------------------------------------- 1 | [music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it’s due, so that defeat is not disgrace. And I hope I don’t have to do it often. [music end] 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/bigbang_amy.txt: -------------------------------------------------------------------------------- 1 | If that was slang, I'm unfamiliar with it. [Laughter] If it was literal, I share your aversion to soiled hosiery. [Laughter] In any case, I'm here because my mother and I have agreed that I will date at least once a year.''' 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/bigbang_sheldon.txt: -------------------------------------------------------------------------------- 1 | Hello, Amy Farrah Fowler. I'm sorry to inform you that you have been taken in by unsupportable mathematics designed to prey on the gullible and the lonely. Additionally, I'm being blackmailed with a hidden dirty sock. [Laughter] 2 | -------------------------------------------------------------------------------- /examples/voice_prompts/broom_salesman.txt: -------------------------------------------------------------------------------- 1 | I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic. And the bond between you and your wand should only grow stronger. Do not be surprised at your new wand's ability to perceive your intentions - particularly in a moment of need. 2 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # flake8: noqa 8 | from .vq import QuantizedResult, ResidualVectorQuantizer 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | descript-audio-codec 2 | torch 3 | transformers>=4.45.1,<4.47.0 4 | librosa 5 | dacite 6 | boto3==1.35.36 7 | s3fs 8 | torchvision 9 | torchaudio 10 | json_repair 11 | pandas 12 | pydantic 13 | vector_quantize_pytorch 14 | loguru 15 | pydub 16 | ruff==0.12.2 17 | omegaconf 18 | click 19 | langid 20 | jieba 21 | accelerate>=0.26.0 22 | -------------------------------------------------------------------------------- /examples/voice_prompts/shrek_donkey.txt: -------------------------------------------------------------------------------- 1 | And I've got a great idea, I'll stick with you. You're a mean green fighting machine, together we'll scare the spit out of anybody that crosses us. 2 | Oh, Wow, that was really scary. And if you don't mind me saying, if that don't work, your breath certainly will get the job done, 'cause you definitely need some Tic Tacs or something, 'cause your breath stinks! -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = boson_multimodal 3 | author = Boson AI 4 | version = 0.1.0 5 | url = https://github.com/boson-ai/higgs-audio 6 | description = Higgs Audio 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | 10 | [options] 11 | packages = find: 12 | 13 | [options.packages.find] 14 | exclude = 15 | tests* 16 | training* 17 | -------------------------------------------------------------------------------- /examples/transcript/multi_speaker/en_argument.txt: -------------------------------------------------------------------------------- 1 | [SPEAKER0] I can't believe you did that without even asking me first! 2 | [SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this. 3 | [SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion! 4 | [SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act. 5 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/__init__.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig, AutoModel 2 | 3 | from .configuration_higgs_audio import HiggsAudioConfig, HiggsAudioEncoderConfig 4 | from .modeling_higgs_audio import HiggsAudioModel 5 | 6 | 7 | AutoConfig.register("higgs_audio_encoder", HiggsAudioEncoderConfig) 8 | AutoConfig.register("higgs_audio", HiggsAudioConfig) 9 | AutoModel.register(HiggsAudioConfig, HiggsAudioModel) 10 | -------------------------------------------------------------------------------- /examples/scene_prompts/reading_blog.txt: -------------------------------------------------------------------------------- 1 | In this audio, the person is reading a blog post aloud. The content is informative and engaging, with the speaker using a clear, conversational tone to make the material feel more approachable. The pacing is moderate, allowing listeners to absorb the information, and the tone shifts slightly to emphasize key points. 
The speaker occasionally pauses for effect, ensuring each section flows smoothly, as they guide the listener through the post's main ideas. 2 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Test 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | branches: [ main ] 7 | 8 | jobs: 9 | lint: 10 | name: Lint 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | 16 | - name: Check Code Formatting with Ruff 17 | run: | 18 | echo "python version: $(python --version)" 19 | pip install ruff==0.12.2 # Ensure ruff is installed 20 | ruff format --check . 21 | -------------------------------------------------------------------------------- /examples/voice_prompts/profile.yaml: -------------------------------------------------------------------------------- 1 | profiles: 2 | male_en: Male, American accent, modern speaking rate, moderate-pitch, friendly tone, and very clear audio. 3 | female_en_story: She speaks with a calm, gentle, and informative tone at a measured pace, with excellent articulation and very clear audio. She naturally brings storytelling to life with an articulate, genuine, and personable vocal style. 4 | male_en_british: He speaks with a clear British accent and a conversational, inquisitive tone. His delivery is articulate and at a moderate pace, and very clear audio. 5 | female_en_british: A female voice with a clear British accent speaking at a modern rate with a moderate-pitch in an expressive and friendly tone and very clear audio. 6 | -------------------------------------------------------------------------------- /examples/transcript/multi_speaker/en_higgs.txt: -------------------------------------------------------------------------------- 1 | [SPEAKER0] You're training HiggsAudio again? Aren't you tired of staring at it all day? 2 | [SPEAKER1] Ha! This time, I'm trying to get it to generate multi-speaker dialogues. 3 | [SPEAKER0] Oh, so you want it to sound like a real conversation with multiple people? That sounds… tricky. 4 | [SPEAKER1] It is. The biggest challenge is making sure it understands who's speaking and when. We need a solid dataset with real conversations, including interruptions and natural flow. 5 | [SPEAKER0] Right, because real conversations aren't just people taking turns like robots. There are overlaps, hesitations, and sudden topic changes. 6 | [SPEAKER1] Exactly! That's why we need speaker diarization — so the model knows when one speaker stops and another starts, even if they overlap. 7 | -------------------------------------------------------------------------------- /examples/transcript/single_speaker/en_dl.txt: -------------------------------------------------------------------------------- 1 | Hey, everyone! Welcome back to Tech Talk Tuesdays. 2 | It’s your host, Alex, and today, we’re diving into a topic that’s become absolutely crucial in the tech world — deep learning. 3 | And let’s be honest, if you’ve been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere. 4 | 5 | So here’s the big question: Do you want to understand how deep learning works? 6 | How to use it to build powerful models that can predict, automate, and transform industries? 7 | Well, today, I’ve got some exciting news for you. 
8 | 9 | We’re going to talk about a course that I highly recommend: Dive into Deep Learning. 10 | It’s not just another course; it’s an entire experience that will take you from a beginner to someone who is well-versed in deep learning techniques. 11 | -------------------------------------------------------------------------------- /examples/serve_engine/README.md: -------------------------------------------------------------------------------- 1 | # Examples to use HiggsAudioServeEngine 2 | 3 | The `run_hf_example.py` script provides three different examples for using the `HiggsAudioServeEngine`. 4 | Each example will generate an audio file (`output_{example}.wav`) in the current directory. 5 | 6 | ### Zero-Shot Voice Generation 7 | Generate audio with specific voice characteristics (e.g., accents). 8 | 9 | ```bash 10 | python run_hf_example.py zero_shot 11 | ``` 12 | 13 | ### Voice Cloning 14 | Clone a voice from a reference audio sample. 15 | 16 | ```bash 17 | python run_hf_example.py voice_clone 18 | ``` 19 | 20 | ### (Experimental) Interleaved Dialogue Generation 21 | Higgs Audio v2 is also able to generate text. Here's an example that shows it is able to generate multi-speaker conversations with interleaved transcript and audio from scene descriptions. 22 | 23 | ```bash 24 | python run_hf_example.py interleaved_dialogue 25 | ``` 26 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/descriptaudiocodec/dac/nn/layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from einops import rearrange 6 | from torch.nn.utils import weight_norm 7 | 8 | 9 | def WNConv1d(*args, **kwargs): 10 | return weight_norm(nn.Conv1d(*args, **kwargs)) 11 | 12 | 13 | def WNConvTranspose1d(*args, **kwargs): 14 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 15 | 16 | 17 | # Scripting this brings model speed up 1.4x 18 | @torch.jit.script 19 | def snake(x, alpha): 20 | shape = x.shape 21 | x = x.reshape(shape[0], shape[1], -1) 22 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 23 | x = x.reshape(shape) 24 | return x 25 | 26 | 27 | class Snake1d(nn.Module): 28 | def __init__(self, channels): 29 | super().__init__() 30 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 31 | 32 | def forward(self, x): 33 | return snake(x, self.alpha) 34 | -------------------------------------------------------------------------------- /boson_multimodal/data_types.py: -------------------------------------------------------------------------------- 1 | """Basic data types for multimodal ChatML format.""" 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Optional, Union 5 | 6 | 7 | @dataclass 8 | class AudioContent: 9 | audio_url: str 10 | # Base64 encoded audio bytes 11 | raw_audio: Optional[str] = None 12 | offset: Optional[float] = None 13 | duration: Optional[float] = None 14 | row_id: Optional[int] = None 15 | type: str = "audio" 16 | 17 | 18 | @dataclass 19 | class TextContent: 20 | text: str 21 | type: str = "text" 22 | 23 | 24 | @dataclass 25 | class Message: 26 | role: str 27 | content: Union[str, AudioContent, TextContent, List[Union[str, AudioContent, TextContent]]] 28 | recipient: Optional[str] = None 29 | 30 | 31 | @dataclass 32 | class ChatMLSample: 33 | """Dataclass to hold multimodal ChatML data.""" 34 | 35 | messages: List[Message] 36 | start_index: 
Optional[int] = None # We will mask the messages[:start_index] when finetuning the LLM. 37 | misc: Optional[Dict] = None 38 | speaker: Optional[str] = None 39 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/common.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from transformers.modeling_utils import PreTrainedModel 4 | 5 | from .configuration_higgs_audio import HiggsAudioConfig 6 | 7 | 8 | class HiggsAudioPreTrainedModel(PreTrainedModel): 9 | config_class = HiggsAudioConfig 10 | base_model_prefix = "model" 11 | supports_gradient_checkpointing = True 12 | _no_split_modules = [] 13 | _skip_keys_device_placement = "past_key_values" 14 | _supports_flash_attn_2 = True 15 | _supports_sdpa = True 16 | 17 | def _init_weights(self, module): 18 | std = self.config.init_std if hasattr(self.config, "init_std") else self.config.audio_encoder_config.init_std 19 | 20 | if isinstance(module, (nn.Linear, nn.Conv1d)): 21 | module.weight.data.normal_(mean=0.0, std=std) 22 | if module.bias is not None: 23 | module.bias.data.zero_() 24 | elif isinstance(module, nn.Embedding): 25 | module.weight.data.normal_(mean=0.0, std=std) 26 | if module.padding_idx is not None: 27 | module.weight.data[module.padding_idx].zero_() 28 | -------------------------------------------------------------------------------- /SUPPORT_GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Contribution & Support Guidelines 2 | 3 | Thank you for your interest in this project! Before opening an issue, please take a moment to read the following guidelines: 4 | 5 | ## Self-Check First 6 | - Write your question in **English** or include an English translation so the community can understand and assist you better. 7 | - Verify that you have **installed the correct version** of the package. 8 | - Check the GitHub [README](README.md), [Hugging Face Space](https://huggingface.co/spaces/smola/higgs_audio_v2), [Model Card](https://huggingface.co/bosonai/higgs-audio-v2-generation-3B-base) and existing issues — many questions already have answers. 9 | - Ensure your problem can be **reproduced** and is directly related to this project. 10 | 11 | ## Asking Properly 12 | - Provide **clear reproduction steps / minimal code examples / error logs**. 13 | - Keep the issue title **concise and descriptive**, and include enough context in the body. 14 | - Avoid vague questions like *“It doesn’t work, what should I do?”* or *“Can you debug this for me?”*. 15 | 16 | ## About Support 17 | - This is a **community-driven open source project**. Maintainers will respond when time allows. 18 | - There is **no obligation** to answer every request — please be patient and understanding. 19 | - For more reliable or timely support, consider: 20 | - Submitting a **Pull Request** to improve code or documentation. 21 | - Providing detailed context so that the community can help. 22 | 23 | ## Code of Conduct 24 | - Be **respectful and polite**. 25 | - Do not spam or repeatedly demand responses. 26 | - Off-topic, vague, or inappropriate questions may be closed. 
27 | -------------------------------------------------------------------------------- /examples/serve_engine/run_hf_example.py: -------------------------------------------------------------------------------- 1 | """Example for using HiggsAudio for generating both the transcript and audio in an interleaved manner.""" 2 | 3 | from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse 4 | import torch 5 | import torchaudio 6 | import time 7 | from loguru import logger 8 | import click 9 | 10 | from input_samples import INPUT_SAMPLES 11 | 12 | MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base" 13 | AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer" 14 | 15 | 16 | @click.command() 17 | @click.argument("example", type=click.Choice(list(INPUT_SAMPLES.keys()))) 18 | def main(example: str): 19 | input_sample = INPUT_SAMPLES[example]() 20 | device = "cuda" if torch.cuda.is_available() else "cpu" 21 | logger.info(f"Using device: {device}") 22 | 23 | serve_engine = HiggsAudioServeEngine( 24 | MODEL_PATH, 25 | AUDIO_TOKENIZER_PATH, 26 | device=device, 27 | ) 28 | 29 | logger.info("Starting generation...") 30 | start_time = time.time() 31 | output: HiggsAudioResponse = serve_engine.generate( 32 | chat_ml_sample=input_sample, 33 | max_new_tokens=1024, 34 | temperature=1.0, 35 | top_p=0.95, 36 | top_k=50, 37 | stop_strings=["<|end_of_text|>", "<|eot_id|>"], 38 | ) 39 | elapsed_time = time.time() - start_time 40 | logger.info(f"Generation time: {elapsed_time:.2f} seconds") 41 | 42 | torchaudio.save(f"output_{example}.wav", torch.from_numpy(output.audio)[None, :], output.sampling_rate) 43 | logger.info(f"Generated text:\n{output.generated_text}") 44 | logger.info(f"Saved audio to output_{example}.wav") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /tech_blogs/ARCHITECTURE_BLOG.md: -------------------------------------------------------------------------------- 1 | # HiggsAudio-V2 Model Architecture 2 | 3 | 4 | 5 | Our model is built on top of [Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B). To enhance the model’s ability to process audio tokens, we incorporate the "DualFFN" architecture as an audio adapter. DualFFN acts as an audio-specific expert, boosting the LLM's performance with minimal computational overhead. Our implementation preserves 91% of the original LLM’s training speed with the inclusion of DualFFN. 6 | 7 | Since our audio tokenizer is based on Residual Vector-Quantization (RVQ) and contains multiple codebooks, we adopt the [delay pattern](https://proceedings.neurips.cc/paper_files/paper/2023/file/94b472a1842cd7c56dcb125fb2765fbd-Paper-Conference.pdf) to enable simultaneous code generation across codebooks while supporting streaming. 8 | 9 | 10 | 11 | 12 | ## DualFFN Performance Ablation Study 13 | 14 | To assess the effectiveness of DualFFN, we trained two smaller models based on LLaMA-3.1-1B: one incorporating DualFFN and one without. Both models were trained for 250K steps with a learning rate of 5e-4 on a subset of the AudioVerse dataset. We evaluated their performance on SeedTTS-Eval, with the results presented in the figures below. The model equipped with DualFFN consistently outperforms its counterpart in terms of word error rate (WER) and speaker similarity. 
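To make the DualFFN idea more concrete before looking at the results, here is a minimal PyTorch sketch of the routing scheme described above: audio positions are sent through an audio-specific FFN while text positions keep using the original FFN. The module name and its plain two-layer FFNs are illustrative simplifications, not the actual HiggsAudio implementation.

```python
import torch
import torch.nn as nn


class DualFFNSketch(nn.Module):
    """Illustrative DualFFN routing: text tokens use the original FFN, audio tokens use an audio FFN."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.text_ffn = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size), nn.SiLU(), nn.Linear(intermediate_size, hidden_size)
        )
        self.audio_ffn = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size), nn.SiLU(), nn.Linear(intermediate_size, hidden_size)
        )

    def forward(self, hidden_states: torch.Tensor, audio_mask: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch, seq_len, hidden); audio_mask: (batch, seq_len) bool, True at audio positions.
        # For clarity both FFNs run on every position and the result is selected per position;
        # a real implementation would gather/scatter to avoid the redundant compute.
        text_out = self.text_ffn(hidden_states)
        audio_out = self.audio_ffn(hidden_states)
        return torch.where(audio_mask.unsqueeze(-1), audio_out, text_out)


layer = DualFFNSketch(hidden_size=64, intermediate_size=256)
x = torch.randn(2, 10, 64)
audio_mask = torch.zeros(2, 10, dtype=torch.bool)
audio_mask[:, 5:] = True  # pretend the last 5 positions are audio tokens
print(layer(x, audio_mask).shape)  # torch.Size([2, 10, 64])
```

Because the text pathway is unchanged whenever `audio_mask` is all `False`, the extra parameters only act on audio tokens, which is consistent with the small training-speed overhead reported above. The ablation results follow below.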
15 | 16 | - SeedTTS-EN 17 | 18 | 19 | 20 | - SeedTTS-ZH 21 | 22 | 23 | 24 | We may notice that the model with DualFFN consistently outperforms the model without DualFFN in terms of word-error-rate (WER) and speaker similarity. 25 | -------------------------------------------------------------------------------- /examples/transcript/single_speaker/en_higgs_audio_blog.md: -------------------------------------------------------------------------------- 1 | At Boson AI, we work on making communication with AI as easy, natural and fun as talking to a human. Today, we are excited to introduce Higgs Audio Understanding and Higgs Audio Generation — two powerful tools designed to build customized AI agents tailored for diverse audio understanding and generation needs. 2 | 3 | # Higgs Audio Generation 4 | To communicate with humans in a delightful and natural manner, we need to be able to generate realistic, emotionally competent and well-accentuated speech. We need a system that is capable of pronouncing words correctly, even if they derive from a foreign language, particularly for people’s names and places. We need a system that can generate conversations between multiple speakers, particularly when multiple characters in games are involved, or when reading books or screenplays. 5 | 6 | Pure TTS (text to speech) systems struggle at these tasks, since they typically do not understand the meaning of what they’re generating, or any sense of urgency, hesitation, or other intonations that would be plainly obvious to a human speaker. They also struggle to adopt the natural character of a speaker, for example, whether they’re naturally enthusiastic or more deliberate and thoughtful. 7 | 8 | The way to address this problem is to build a TTS system using a Large Language Model (LLM) as a backbone. This endows the TTS system with the understanding needed to generate competent speech. Higgs Audio Generation enhances the underlying LLM to process audio by treating raw audio as tokens. This approach enables the model to be trained end-to-end on extensive text-audio datasets. 9 | 10 | The base model we are introducing today demonstrates impressive performance on benchmark tests. Additionally, it showcases emerging capabilities, including generating speech with emotional tone based on text semantics and producing multi-speaker dialogues from written transcripts, all due to the improved understanding. Before diving into technical details, let’s listen to two examples of audio generated by our model. 11 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/LICENSE: -------------------------------------------------------------------------------- 1 | Third-Party License Attribution for Audio Processing Module 2 | =========================================================== 3 | 4 | This directory contains code derived from multiple open-source projects. 5 | The following sections detail the licenses and attributions for third-party code. 6 | 7 | ## XCodec Repository 8 | The code in this directory is derived from: 9 | https://github.com/zhenye234/xcodec 10 | 11 | ## Individual File Attributions 12 | 13 | ### Quantization Module (quantization/) 14 | - Several files contain code derived from Meta Platforms, Inc. 
and the vector-quantize-pytorch repository 15 | - Individual files contain their own license headers where applicable 16 | - The vector-quantize-pytorch portions are licensed under the MIT License 17 | 18 | ## License Terms 19 | 20 | ### MIT License (for applicable portions) 21 | Permission is hereby granted, free of charge, to any person obtaining a copy 22 | of this software and associated documentation files (the "Software"), to deal 23 | in the Software without restriction, including without limitation the rights 24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 25 | copies of the Software, and to permit persons to whom the Software is 26 | furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all 29 | copies or substantial portions of the Software. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 | SOFTWARE. 38 | 39 | ## Attribution Requirements 40 | When using this code, please ensure proper attribution to: 41 | 1. The original xcodec repository: https://github.com/zhenye234/xcodec 42 | 2. Any other repositories mentioned in individual file headers 43 | 3. This derivative work and its modifications 44 | 45 | ## Disclaimer 46 | This directory contains modified versions of the original code. Please refer to 47 | the original repositories for the canonical implementations and their specific 48 | license terms. 49 | 50 | For any questions about licensing or attribution, please check the individual 51 | file headers and the original source repositories. 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | line-length = 119 7 | target-version = "py310" 8 | indent-width = 4 9 | exclude = [ 10 | ".bzr", 11 | ".direnv", 12 | ".eggs", 13 | ".git", 14 | ".git-rewrite", 15 | ".hg", 16 | ".ipynb_checkpoints", 17 | ".mypy_cache", 18 | ".nox", 19 | ".pants.d", 20 | ".pyenv", 21 | ".pytest_cache", 22 | ".pytype", 23 | ".ruff_cache", 24 | ".svn", 25 | ".tox", 26 | ".venv", 27 | ".vscode", 28 | "__pypackages__", 29 | "_build", 30 | "buck-out", 31 | "build", 32 | "dist", 33 | "node_modules", 34 | "site-packages", 35 | "venv", 36 | "external", 37 | "third_party", 38 | ] 39 | 40 | [tool.ruff.lint] 41 | preview = true 42 | ignore-init-module-imports = true 43 | extend-select = [ 44 | "B009", # static getattr 45 | "B010", # static setattr 46 | "CPY", # Copyright 47 | "E", # PEP8 errors 48 | "F", # PEP8 formatting 49 | "I", # Import sorting 50 | "TID251", # Banned API 51 | "UP", # Pyupgrade 52 | "W", # PEP8 warnings 53 | ] 54 | ignore = [ 55 | "E501", # Line length (handled by ruff-format) 56 | "E741", # Ambiguous variable name 57 | "W605", # Invalid escape sequence 58 | "UP007", # X | Y type annotations 59 | ] 60 | 61 | [tool.ruff.lint.per-file-ignores] 62 | "__init__.py" = [ 63 | "F401", # Ignore seemingly unused imports (they're meant for re-export) 64 | ] 65 | 66 | [tool.ruff.lint.isort] 67 | lines-after-imports = 2 68 | known-first-party = ["character_tuning"] 69 | 70 | [tool.ruff.format] 71 | # Like Black, use double quotes for strings. 72 | quote-style = "double" 73 | 74 | # Like Black, indent with spaces, rather than tabs. 75 | indent-style = "space" 76 | 77 | # Like Black, respect magic trailing commas. 78 | skip-magic-trailing-comma = false 79 | 80 | # Like Black, automatically detect the appropriate line ending. 81 | line-ending = "auto" 82 | 83 | # Enable auto-formatting of code examples in docstrings. Markdown, 84 | # reStructuredText code/literal blocks and doctests are all supported. 85 | # 86 | # This is currently disabled by default, but it is planned for this 87 | # to be opt-out in the future. 88 | docstring-code-format = false 89 | 90 | # Set the line length limit used when formatting code snippets in 91 | # docstrings. 92 | # 93 | # This only has an effect when the `docstring-code-format` setting is 94 | # enabled. 95 | docstring-code-line-length = "dynamic" 96 | 97 | [tool.ruff.lint.flake8-tidy-imports.banned-api] 98 | "os.getenv".msg = "Use os.environ instead" 99 | "os.putenv".msg = "Use os.environ instead" 100 | "os.unsetenv".msg = "Use os.environ instead" 101 | 102 | -------------------------------------------------------------------------------- /examples/vllm/README.md: -------------------------------------------------------------------------------- 1 | # Serve Higgs Audio with vLLM 2 | 3 | We provided both OpenAI compatible chat completion and audio speech server backed by vLLM engine. 
To start the server, you can use the following command: 4 | 5 | ```bash 6 | docker run --gpus all --ipc=host --shm-size=20gb --network=host \ 7 | bosonai/higgs-audio-vllm:latest \ 8 | --served-model-name "higgs-audio-v2-generation-3B-base" \ 9 | --model "bosonai/higgs-audio-v2-generation-3B-base" \ 10 | --audio-tokenizer-type "bosonai/higgs-audio-v2-tokenizer" \ 11 | --limit-mm-per-prompt audio=50 \ 12 | --max-model-len 8192 \ 13 | --port 8000 \ 14 | --gpu-memory-utilization 0.8 \ 15 | --disable-mm-preprocessor-cache 16 | ``` 17 | 18 | The audio speech API provides the same voices as the [voice_prompts](../voice_prompts) folder. In addition, if you want to use your own custom voices, you can add a voice presets directory to the docker run command: 19 | 20 | ```bash 21 | --voice-presets-dir YOUR_VOICE_PRESETS_PATH 22 | ``` 23 | 24 | In the voice presets directory, you need to add a `config.json` file that describes each voice in the following format: 25 | ```json 26 | { 27 | "belinda": { 28 | "transcript": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.", 29 | "audio_file": "belinda.wav" 30 | }, 31 | "broom_salesman": { 32 | "transcript": "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic. And the bond between you and your wand should only grow stronger. Do not be surprised at your new wand's ability to perceive your intentions - particularly in a moment of need.", 33 | "audio_file": "broom_salesman.wav" 34 | } 35 | } 36 | ``` 37 | 38 | We tested on an A100 GPU with 40GB memory, which achieves about 1500 tokens/s of audio-generation throughput, translating to roughly 60 seconds of generated audio per second with the higgs-audio tokenizer. 39 | We also tested on an RTX 4090 GPU with 24GB memory, which achieves about 600 tokens/s, translating to roughly 24 seconds of generated audio per second. 40 | 41 | ### cURL Example 42 | To quickly test the server with curl, you can use the following command to generate audio with the audio speech API: 43 | 44 | ```bash 45 | curl -X POST "http://localhost:8000/v1/audio/speech" \ 46 | -H "Content-Type: application/json" \ 47 | -d '{ 48 | "model": "higgs-audio-v2-generation-3B-base", 49 | "voice": "en_woman", 50 | "input": "Today is a wonderful day to build something people love!", 51 | "response_format": "pcm" 52 | }' \ 53 | --output - | ffmpeg -f s16le -ar 24000 -ac 1 -i - speech.wav 54 | ``` 55 | 56 | 57 | ### Python example 58 | You can also use the Python client code to achieve more complex use cases with the chat completion API.
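If you only need the audio speech API, the request above can also be issued directly from Python without any client script. The sketch below uses `requests` plus the standard-library `wave` module; the endpoint, voice name, and 24 kHz, 16-bit mono PCM format are taken from the cURL/ffmpeg example above, so treat it as an illustration rather than part of the provided tooling. The `run_chat_completion.py` commands for the chat completion API follow after the sketch.

```python
import wave

import requests

resp = requests.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "higgs-audio-v2-generation-3B-base",
        "voice": "en_woman",
        "input": "Today is a wonderful day to build something people love!",
        "response_format": "pcm",
    },
)
resp.raise_for_status()

# The "pcm" response is raw signed 16-bit little-endian mono audio at 24 kHz,
# so wrap it in a WAV container instead of piping it through ffmpeg.
with wave.open("speech.wav", "wb") as wav_file:
    wav_file.setnchannels(1)      # mono
    wav_file.setsampwidth(2)      # 16-bit samples
    wav_file.setframerate(24000)  # 24 kHz
    wav_file.writeframes(resp.content)
```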
59 | 60 | Voice clone 61 | ```bash 62 | python run_chat_completion.py --api-base http://localhost:8000/v1 --task voice_clone 63 | ``` 64 | 65 | Smart voice 66 | ```bash 67 | python run_chat_completion.py --api-base http://localhost:8000/v1 --task smart_voice 68 | ``` 69 | 70 | Multispeaker 71 | ```bash 72 | python run_chat_completion.py --api-base http://localhost:8000/v1 --task multispeaker 73 | ``` 74 | -------------------------------------------------------------------------------- /examples/serve_engine/input_samples.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | from boson_multimodal.data_types import ChatMLSample, Message, AudioContent 4 | 5 | 6 | def encode_base64_content_from_file(file_path: str) -> str: 7 | """Encode a content from a local file to base64 format.""" 8 | # Read the audio file as binary and encode it directly to Base64 9 | with open(file_path, "rb") as audio_file: 10 | audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8") 11 | return audio_base64 12 | 13 | 14 | def get_interleaved_dialogue_input_sample(): 15 | system_prompt = ( 16 | "Generate audio following instruction.\n\n" 17 | "<|scene_desc_start|>\n" 18 | "SPEAKER0: vocal fry;moderate pitch;monotone;masculine;young adult;slightly fast\n" 19 | "SPEAKER1: masculine;moderate;moderate pitch;monotone;mature\n\n" 20 | "In this scene, a group of adventurers is debating whether to investigate a potentially dangerous situation.\n" 21 | "<|scene_desc_end|>" 22 | ) 23 | 24 | messages = [ 25 | Message( 26 | role="system", 27 | content=system_prompt, 28 | ), 29 | Message( 30 | role="user", 31 | content="<|generation_instruction_start|>\nGenerate interleaved transcript and audio that lasts for around 20 seconds.\n<|generation_instruction_end|>", 32 | ), 33 | ] 34 | chat_ml_sample = ChatMLSample(messages=messages) 35 | return chat_ml_sample 36 | 37 | 38 | def get_zero_shot_input_sample(): 39 | system_prompt = ( 40 | "Generate audio following instruction.\n\n<|scene_desc_start|>\nSPEAKER0: british accent\n<|scene_desc_end|>" 41 | ) 42 | 43 | messages = [ 44 | Message( 45 | role="system", 46 | content=system_prompt, 47 | ), 48 | Message( 49 | role="user", 50 | content="Hey, everyone! Welcome back to Tech Talk Tuesdays.\n" 51 | "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n" 52 | "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.", 53 | ), 54 | ] 55 | chat_ml_sample = ChatMLSample(messages=messages) 56 | return chat_ml_sample 57 | 58 | 59 | def get_voice_clone_input_sample(): 60 | reference_text = "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic." 61 | reference_audio = encode_base64_content_from_file( 62 | os.path.join(os.path.dirname(__file__), "voice_examples/old_man.wav") 63 | ) 64 | messages = [ 65 | Message( 66 | role="user", 67 | content=reference_text, 68 | ), 69 | Message( 70 | role="assistant", 71 | content=AudioContent(raw_audio=reference_audio, audio_url="placeholder"), 72 | ), 73 | Message( 74 | role="user", 75 | content="Hey, everyone! 
Welcome back to Tech Talk Tuesdays.\n" 76 | "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n" 77 | "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.", 78 | ), 79 | ] 80 | return ChatMLSample(messages=messages) 81 | 82 | 83 | INPUT_SAMPLES = { 84 | "interleaved_dialogue": get_interleaved_dialogue_input_sample, 85 | "zero_shot": get_zero_shot_input_sample, 86 | "voice_clone": get_voice_clone_input_sample, 87 | } 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary files generated in training 2 | dpo_samples* 3 | scoring_results 4 | results/ 5 | hf_slurm_logs/ 6 | slurm_results/ 7 | enroot_images/ 8 | slurm*.out 9 | cache_* 10 | mlruns/ 11 | local_download_dir/ 12 | audioverse/data 13 | # the folder pattern is sft_{year}. 14 | sft_20* 15 | data/ 16 | audioverse/cache 17 | # vim ipython plugin generated files 18 | .jukit 19 | 20 | # node 21 | node_modules 22 | package.json 23 | package-lock.json 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | share/python-wheels/ 48 | *.egg-info/ 49 | .installed.cfg 50 | *.egg 51 | MANIFEST 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | !tests/* 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | cover/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | 99 | # PyBuilder 100 | .pybuilder/ 101 | target/ 102 | 103 | # Jupyter Notebook 104 | .ipynb_checkpoints 105 | 106 | # IPython 107 | profile_default/ 108 | ipython_config.py 109 | 110 | # pyenv 111 | # For a library or package, you might want to ignore these files since the code is 112 | # intended to run in multiple environments; otherwise, check them in: 113 | # .python-version 114 | 115 | # pipenv 116 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 117 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 118 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 119 | # install all needed dependencies. 120 | #Pipfile.lock 121 | 122 | # poetry 123 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 124 | # This is especially recommended for binary packages to ensure reproducibility, and is more 125 | # commonly ignored for libraries. 
126 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 127 | #poetry.lock 128 | 129 | # pdm 130 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 131 | #pdm.lock 132 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 133 | # in version control. 134 | # https://pdm.fming.dev/#use-with-ide 135 | .pdm.toml 136 | 137 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 138 | __pypackages__/ 139 | 140 | # Celery stuff 141 | celerybeat-schedule 142 | celerybeat.pid 143 | 144 | # SageMath parsed files 145 | *.sage.py 146 | 147 | # Environments 148 | /.conda_env* 149 | /.env* 150 | /.higgs_audio_env* 151 | /.venv* 152 | /conda_env* 153 | /env* 154 | /ENV* 155 | /higgs_audio_env* 156 | /venv* 157 | 158 | # Spyder project settings 159 | .spyderproject 160 | .spyproject 161 | 162 | # Rope project settings 163 | .ropeproject 164 | 165 | # mkdocs documentation 166 | /site 167 | 168 | # mypy 169 | .mypy_cache/ 170 | .dmypy.json 171 | dmypy.json 172 | 173 | # Pyre type checker 174 | .pyre/ 175 | 176 | # pytype static type analyzer 177 | .pytype/ 178 | 179 | # Cython debug symbols 180 | cython_debug/ 181 | 182 | # PyCharm 183 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 184 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 185 | # and can be added to the global gitignore or merged into this file. For a more nuclear 186 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 187 | #.idea/ 188 | 189 | *.jsonl 190 | download 191 | .DS_Store 192 | *entry.py 193 | 194 | # Pytorch 195 | torch_compile_debug/ 196 | 197 | # Out Dir 198 | result/ 199 | 200 | # Ruff 201 | .ruff_cache/ -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/quantization/distrib.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Torch distributed utilities.""" 8 | 9 | import typing as tp 10 | 11 | import torch 12 | 13 | 14 | def rank(): 15 | if torch.distributed.is_initialized(): 16 | return torch.distributed.get_rank() 17 | else: 18 | return 0 19 | 20 | 21 | def world_size(): 22 | if torch.distributed.is_initialized(): 23 | return torch.distributed.get_world_size() 24 | else: 25 | return 1 26 | 27 | 28 | def is_distributed(): 29 | return world_size() > 1 30 | 31 | 32 | def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM): 33 | if is_distributed(): 34 | return torch.distributed.all_reduce(tensor, op) 35 | 36 | 37 | def _is_complex_or_float(tensor): 38 | return torch.is_floating_point(tensor) or torch.is_complex(tensor) 39 | 40 | 41 | def _check_number_of_params(params: tp.List[torch.Tensor]): 42 | # utility function to check that the number of params in all workers is the same, 43 | # and thus avoid a deadlock with distributed all reduce. 
44 | if not is_distributed() or not params: 45 | return 46 | # print('params[0].device ', params[0].device) 47 | tensor = torch.tensor([len(params)], device=params[0].device, dtype=torch.long) 48 | all_reduce(tensor) 49 | if tensor.item() != len(params) * world_size(): 50 | # If not all the workers have the same number of params, this inequality 51 | # will hold for at least one of them. 52 | raise RuntimeError( 53 | f"Mismatch in number of params: ours is {len(params)}, at least one worker has a different one." 54 | ) 55 | 56 | 57 | def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int = 0): 58 | """Broadcast the tensors from the given parameters to all workers. 59 | This can be used to ensure that all workers have the same model to start with. 60 | """ 61 | if not is_distributed(): 62 | return 63 | tensors = [tensor for tensor in tensors if _is_complex_or_float(tensor)] 64 | _check_number_of_params(tensors) 65 | handles = [] 66 | for tensor in tensors: 67 | handle = torch.distributed.broadcast(tensor.data, src=src, async_op=True) 68 | handles.append(handle) 69 | for handle in handles: 70 | handle.wait() 71 | 72 | 73 | def sync_buffer(buffers, average=True): 74 | """ 75 | Sync buffers across workers. If average is False, broadcast from rank 0 instead of averaging. 76 | """ 77 | if not is_distributed(): 78 | return 79 | handles = [] 80 | for buffer in buffers: 81 | if torch.is_floating_point(buffer.data): 82 | if average: 83 | handle = torch.distributed.all_reduce(buffer.data, op=torch.distributed.ReduceOp.SUM, async_op=True) 84 | else: 85 | handle = torch.distributed.broadcast(buffer.data, src=0, async_op=True) 86 | handles.append((buffer, handle)) 87 | for buffer, handle in handles: 88 | handle.wait() 89 | if average: 90 | buffer.data /= world_size() 91 | 92 | 93 | def sync_grad(params): 94 | """ 95 | Simpler alternative to DistributedDataParallel, that doesn't rely 96 | on any black magic. For simple models it can also be as fast. 97 | Just call this on your model parameters after the call to backward! 98 | """ 99 | if not is_distributed(): 100 | return 101 | handles = [] 102 | for p in params: 103 | if p.grad is not None: 104 | handle = torch.distributed.all_reduce(p.grad.data, op=torch.distributed.ReduceOp.SUM, async_op=True) 105 | handles.append((p, handle)) 106 | for p, handle in handles: 107 | handle.wait() 108 | p.grad.data /= world_size() 109 | 110 | 111 | def average_metrics(metrics: tp.Dict[str, float], count=1.0): 112 | """Average a dictionary of metrics across all workers, using the optional 113 | `count` as unnormalized weight. 114 | """ 115 | if not is_distributed(): 116 | return metrics 117 | keys, values = zip(*metrics.items()) 118 | device = "cuda" if torch.cuda.is_available() else "cpu" 119 | tensor = torch.tensor(list(values) + [1], device=device, dtype=torch.float32) 120 | tensor *= count 121 | all_reduce(tensor) 122 | averaged = (tensor[:-1] / tensor[-1]).cpu().tolist() 123 | return dict(zip(keys, averaged)) 124 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/quantization/vq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | """Residual vector quantizer implementation.""" 8 | 9 | from dataclasses import dataclass, field 10 | import math 11 | import typing as tp 12 | 13 | import torch 14 | from torch import nn 15 | 16 | # from .core_vq import ResidualVectorQuantization 17 | from .core_vq_lsx_version import ResidualVectorQuantization 18 | 19 | 20 | @dataclass 21 | class QuantizedResult: 22 | quantized: torch.Tensor 23 | codes: torch.Tensor 24 | bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item. 25 | penalty: tp.Optional[torch.Tensor] = None 26 | metrics: dict = field(default_factory=dict) 27 | 28 | 29 | class ResidualVectorQuantizer(nn.Module): 30 | """Residual Vector Quantizer. 31 | Args: 32 | dimension (int): Dimension of the codebooks. 33 | n_q (int): Number of residual vector quantizers used. 34 | bins (int): Codebook size. 35 | decay (float): Decay for exponential moving average over the codebooks. 36 | kmeans_init (bool): Whether to use kmeans to initialize the codebooks. 37 | kmeans_iters (int): Number of iterations used for kmeans initialization. 38 | threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes 39 | that have an exponential moving average cluster size less than the specified threshold with 40 | randomly selected vector from the current batch. 41 | """ 42 | 43 | def __init__( 44 | self, 45 | dimension: int = 256, 46 | codebook_dim: int = None, 47 | n_q: int = 8, 48 | bins: int = 1024, 49 | decay: float = 0.99, 50 | kmeans_init: bool = True, 51 | kmeans_iters: int = 50, 52 | threshold_ema_dead_code: int = 2, 53 | ): 54 | super().__init__() 55 | self.n_q = n_q 56 | self.dimension = dimension 57 | self.codebook_dim = codebook_dim 58 | self.bins = bins 59 | self.decay = decay 60 | self.kmeans_init = kmeans_init 61 | self.kmeans_iters = kmeans_iters 62 | self.threshold_ema_dead_code = threshold_ema_dead_code 63 | self.vq = ResidualVectorQuantization( 64 | dim=self.dimension, 65 | codebook_dim=self.codebook_dim, 66 | codebook_size=self.bins, 67 | num_quantizers=self.n_q, 68 | decay=self.decay, 69 | kmeans_init=self.kmeans_init, 70 | kmeans_iters=self.kmeans_iters, 71 | threshold_ema_dead_code=self.threshold_ema_dead_code, 72 | ) 73 | 74 | def forward(self, x: torch.Tensor, sample_rate: int, bandwidth: tp.Optional[float] = None): # -> QuantizedResult: 75 | """Residual vector quantization on the given input tensor. 76 | Args: 77 | x (torch.Tensor): Input tensor. 78 | sample_rate (int): Sample rate of the input tensor. 79 | bandwidth (float): Target bandwidth. 80 | Returns: 81 | QuantizedResult: 82 | The quantized (or approximately quantized) representation with 83 | the associated bandwidth and any penalty term for the loss. 
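        Example (an illustrative sketch, not from the original file; it assumes the EnCodec-style convention where `x` has shape `(batch, dimension, frames)` and `sample_rate` is the frame rate of the quantized representation):

            rvq = ResidualVectorQuantizer(dimension=256, n_q=8, bins=1024)
            x = torch.randn(2, 256, 50)
            quantized, codes, bw, commit_loss = rvq(x, sample_rate=50)

        Note that this implementation returns the `(quantized, codes, bandwidth, commit_loss)` tuple directly instead of wrapping it in `QuantizedResult` (see the commented-out return statement below).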
84 | """ 85 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 86 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 87 | quantized, codes, commit_loss = self.vq(x, n_q=n_q) 88 | bw = torch.tensor(n_q * bw_per_q).to(x) 89 | return quantized, codes, bw, torch.mean(commit_loss) 90 | # return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss)) 91 | 92 | def get_num_quantizers_for_bandwidth(self, sample_rate: int, bandwidth: tp.Optional[float] = None) -> int: 93 | """Return n_q based on specified target bandwidth.""" 94 | bw_per_q = self.get_bandwidth_per_quantizer(sample_rate) 95 | n_q = self.n_q 96 | if bandwidth and bandwidth > 0.0: 97 | n_q = int(max(1, math.floor(bandwidth / bw_per_q))) 98 | return n_q 99 | 100 | def get_bandwidth_per_quantizer(self, sample_rate: int): 101 | """Return bandwidth per quantizer for a given input sample rate.""" 102 | return math.log2(self.bins) * sample_rate / 1000 103 | 104 | def encode(self, x: torch.Tensor, sample_rate: int, bandwidth: tp.Optional[float] = None) -> torch.Tensor: 105 | """Encode a given input tensor with the specified sample rate at the given bandwidth. 106 | The RVQ encode method sets the appropriate number of quantizer to use 107 | and returns indices for each quantizer. 108 | """ 109 | n_q = self.get_num_quantizers_for_bandwidth(sample_rate, bandwidth) 110 | codes = self.vq.encode(x, n_q=n_q) 111 | return codes 112 | 113 | def decode(self, codes: torch.Tensor) -> torch.Tensor: 114 | """Decode the given codes to the quantized representation.""" 115 | quantized = self.vq.decode(codes) 116 | return quantized 117 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/cuda_graph_runner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import Optional, List, Dict, Tuple, Union 4 | import gc 5 | 6 | from transformers.cache_utils import Cache 7 | 8 | 9 | _NUM_WARMUP_ITERS = 2 10 | 11 | 12 | class CUDAGraphRunner(nn.Module): 13 | def __init__(self, model): 14 | super().__init__() 15 | self.model = model 16 | 17 | self.input_buffers: Dict[str, torch.Tensor] = {} 18 | self.output_buffers: Dict[str, torch.Tensor] = {} 19 | 20 | self._graph: Optional[torch.cuda.CUDAGraph] = None 21 | 22 | @property 23 | def graph(self): 24 | assert self._graph is not None 25 | return self._graph 26 | 27 | def capture( 28 | self, 29 | hidden_states: torch.Tensor, 30 | causal_mask: torch.Tensor, 31 | position_ids: torch.Tensor, 32 | audio_discrete_codes_mask: torch.Tensor, 33 | cache_position: torch.Tensor, 34 | past_key_values: Union[Cache, List[torch.FloatTensor]], 35 | use_cache: bool, 36 | audio_attention_mask: torch.Tensor, 37 | fast_forward_attention_mask: torch.Tensor, 38 | output_attentions: bool, 39 | output_hidden_states: bool, 40 | is_decoding_audio_token: Optional[bool] = None, 41 | is_using_cuda_graph: Optional[bool] = False, 42 | stream: torch.cuda.Stream = None, 43 | memory_pool: Optional[Tuple[int, int]] = None, 44 | ): 45 | assert self._graph is None 46 | # Run warmup iterations 47 | for _ in range(_NUM_WARMUP_ITERS): 48 | self.model( 49 | hidden_states=hidden_states, 50 | causal_mask=causal_mask, 51 | position_ids=position_ids, 52 | audio_discrete_codes_mask=audio_discrete_codes_mask, 53 | cache_position=cache_position, 54 | past_key_values=past_key_values, 55 | use_cache=use_cache, 56 | audio_attention_mask=audio_attention_mask, 57 | 
fast_forward_attention_mask=fast_forward_attention_mask, 58 | output_attentions=output_attentions, 59 | output_hidden_states=output_hidden_states, 60 | is_decoding_audio_token=is_decoding_audio_token, 61 | is_using_cuda_graph=is_using_cuda_graph, 62 | ) 63 | 64 | torch.cuda.synchronize() 65 | 66 | # Capture the graph 67 | self._graph = torch.cuda.CUDAGraph() 68 | with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): 69 | out_hidden_states, all_hidden_states, all_self_attns = self.model( 70 | hidden_states=hidden_states, 71 | causal_mask=causal_mask, 72 | position_ids=position_ids, 73 | audio_discrete_codes_mask=audio_discrete_codes_mask, 74 | cache_position=cache_position, 75 | past_key_values=past_key_values, 76 | use_cache=use_cache, 77 | audio_attention_mask=audio_attention_mask, 78 | fast_forward_attention_mask=fast_forward_attention_mask, 79 | output_attentions=output_attentions, 80 | output_hidden_states=output_hidden_states, 81 | is_decoding_audio_token=is_decoding_audio_token, 82 | is_using_cuda_graph=is_using_cuda_graph, 83 | ) 84 | # hidden_states_out = torch.ops._C.weak_ref_tensor(outputs[0]) 85 | # del outputs 86 | gc.collect() 87 | torch.cuda.synchronize() 88 | 89 | # Save input and output buffers 90 | self.input_buffers = { 91 | "hidden_states": hidden_states, 92 | "causal_mask": causal_mask, 93 | "position_ids": position_ids, 94 | "audio_discrete_codes_mask": audio_discrete_codes_mask, 95 | "cache_position": cache_position, 96 | "past_key_values": past_key_values, 97 | "audio_attention_mask": audio_attention_mask, 98 | "fast_forward_attention_mask": fast_forward_attention_mask, 99 | } 100 | self.output_buffers = { 101 | "hidden_states": out_hidden_states, 102 | "all_hidden_states": all_hidden_states, 103 | "all_self_attns": all_self_attns, 104 | } 105 | 106 | def forward( 107 | self, 108 | hidden_states: torch.Tensor, 109 | causal_mask: torch.Tensor, 110 | position_ids: torch.Tensor, 111 | audio_discrete_codes_mask: torch.Tensor, 112 | cache_position: torch.Tensor, 113 | audio_attention_mask: torch.Tensor, 114 | fast_forward_attention_mask: torch.Tensor, 115 | **kwargs, 116 | ) -> torch.Tensor: 117 | # Copy input tensors to buffers 118 | self.input_buffers["hidden_states"].copy_(hidden_states, non_blocking=True) 119 | self.input_buffers["causal_mask"].copy_(causal_mask, non_blocking=True) 120 | self.input_buffers["position_ids"].copy_(position_ids, non_blocking=True) 121 | self.input_buffers["audio_discrete_codes_mask"].copy_(audio_discrete_codes_mask, non_blocking=True) 122 | self.input_buffers["cache_position"].copy_(cache_position, non_blocking=True) 123 | self.input_buffers["audio_attention_mask"].copy_(audio_attention_mask, non_blocking=True) 124 | self.input_buffers["fast_forward_attention_mask"].copy_(fast_forward_attention_mask, non_blocking=True) 125 | 126 | # Run the captured graph 127 | self.graph.replay() 128 | 129 | return self.output_buffers["hidden_states"], None, None 130 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/audio_head.py: -------------------------------------------------------------------------------- 1 | """Projector that maps hidden states from the LLM component to multimodal logits.""" 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from dataclasses import dataclass 7 | from typing import Optional, Tuple 8 | 9 | from .common import HiggsAudioPreTrainedModel 10 | from .configuration_higgs_audio import HiggsAudioConfig 11 | 12 | 13 | @dataclass 14 | class 
HiggsAudioDecoderLayerOutput: 15 | logits: torch.FloatTensor 16 | audio_logits: torch.FloatTensor 17 | attentions: Optional[Tuple[torch.FloatTensor, ...]] = None 18 | past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None 19 | 20 | 21 | class HiggsAudioDecoderProjector(HiggsAudioPreTrainedModel): 22 | """Projection layers that map hidden states from the LLM component to audio / text logits. 23 | 24 | We support two type of audio head: 25 | - Basic Audio Head: 26 | Directly map the hidden states to audio logits for all the codebooks. 27 | """ 28 | 29 | def __init__(self, config: HiggsAudioConfig, layer_idx: Optional[int] = None): 30 | super().__init__(config) 31 | self.text_lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) 32 | self.audio_lm_head = nn.Linear( 33 | config.text_config.hidden_size, config.audio_num_codebooks * (config.audio_codebook_size + 2), bias=False 34 | ) 35 | 36 | # Initialize weights and apply final processing 37 | self.post_init() 38 | 39 | def forward( 40 | self, 41 | hidden_states, 42 | audio_out_mask, 43 | label_audio_ids=None, 44 | attention_mask=None, 45 | position_ids=None, 46 | past_key_values=None, 47 | use_cache=None, 48 | output_attentions=None, 49 | output_hidden_states=None, 50 | output_audio_hidden_states=False, 51 | cache_position=None, 52 | ): 53 | """ 54 | Args: 55 | hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`): 56 | Hidden states from the LLM component 57 | audio_out_mask (`torch.Tensor` of shape `(batch_size, seq_len)`): 58 | Mask for identifying the audio out tokens. 59 | label_audio_ids (`torch.Tensor` of shape `(num_codebooks, num_audio_out_tokens)`): 60 | Label tokens for the audio-out part. This is used for calculating the logits if RQ-Transformer is used. 61 | attention_mask (`torch.Tensor` of shape `(batch_size, seq_len)`): 62 | Mask to avoid performing attention on padding token indices 63 | position_ids (`torch.Tensor` of shape `(batch_size, seq_len)`): 64 | Position ids for the input tokens 65 | 66 | Returns: 67 | logits (`torch.Tensor` of shape `(batch_size, seq_len, vocab_size)`): 68 | Logits for text tokens 69 | audio_logits (`torch.Tensor` of shape `(num_audio_out_tokens, audio_num_codebooks * audio_codebook_size)`): 70 | Logits for audio tokens. 
We ensure `num_text_tokens + num_audio_tokens == batch_size * seq_len` 71 | """ 72 | logits = self.text_lm_head(hidden_states) 73 | 74 | all_hidden_states = () if output_hidden_states else None 75 | all_self_attns = () if output_attentions else None 76 | next_decoder_cache = None 77 | 78 | if self.config.audio_decoder_proj_num_layers > 0: 79 | # create position embeddings to be shared across the decoder layers 80 | position_embeddings = self.rotary_emb(hidden_states, position_ids) 81 | for decoder_layer in self.transformer_layers: 82 | if output_hidden_states: 83 | all_hidden_states += (hidden_states,) 84 | 85 | if self.gradient_checkpointing and self.training: 86 | layer_outputs = self._gradient_checkpointing_func( 87 | decoder_layer.__call__, 88 | hidden_states, 89 | attention_mask, 90 | position_ids, 91 | past_key_values, 92 | output_attentions, 93 | use_cache, 94 | cache_position, 95 | position_embeddings, 96 | ) 97 | else: 98 | layer_outputs = decoder_layer( 99 | hidden_states, 100 | attention_mask=attention_mask, 101 | position_ids=position_ids, 102 | past_key_value=past_key_values, 103 | output_attentions=output_attentions, 104 | use_cache=use_cache, 105 | cache_position=cache_position, 106 | position_embeddings=position_embeddings, 107 | ) 108 | hidden_states = layer_outputs[0] 109 | hidden_states = self.norm(hidden_states) 110 | 111 | if output_hidden_states: 112 | all_hidden_states += (hidden_states,) 113 | 114 | if output_attentions: 115 | all_self_attns += (layer_outputs[1],) 116 | 117 | if use_cache: 118 | next_decoder_cache = layer_outputs[2 if output_attentions else 1] 119 | 120 | next_cache = next_decoder_cache if use_cache else None 121 | 122 | audio_logits = self.audio_lm_head(hidden_states[audio_out_mask]) 123 | 124 | if output_audio_hidden_states: 125 | audio_hidden_states = hidden_states[audio_out_mask] 126 | else: 127 | audio_hidden_states = None 128 | 129 | return logits, audio_logits, all_self_attns, all_hidden_states, audio_hidden_states, next_cache 130 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | > [!NOTE] 4 | > If you do not like the audio you get, you can generate multiple times with different seeds. In addition, you may need to apply text normalization to get the best performance, e.g. converting 70 °F to "seventy degrees Fahrenheit", and converting "1 2 3 4" to "one two three four". The model also performs better on longer sentences. The model has not been post-trained yet; we will release a post-trained model in the future. 5 | 6 | ## Single-speaker Audio Generation 7 | 8 | ### Voice clone 9 | 10 | ```bash 11 | python3 generation.py \ 12 | --transcript transcript/single_speaker/en_dl.txt \ 13 | --ref_audio broom_salesman \ 14 | --seed 12345 \ 15 | --out_path generation.wav 16 | ``` 17 | 18 | The model will read the transcript with the same voice as in the [reference audio](./voice_prompts/broom_salesman.wav). This technique is also known as shallow voice cloning. 19 | 20 | We have some example audio prompts stored in [voice_prompts](./voice_prompts/). Feel free to pick one in the folder and try out the model. You can also add your own favorite voice to the folder and clone it, as sketched below.
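A voice prompt is simply a `.wav` recording plus a `.txt` transcript with the same base name. The snippet below is a minimal sketch of adding one; the names and the source path are placeholders, and it assumes you run it from the `examples` directory:

```python
# Register a custom voice prompt (names and paths below are placeholders).
import shutil
from pathlib import Path

voice_dir = Path("voice_prompts")
shutil.copy("/path/to/my_recording.wav", voice_dir / "my_voice.wav")  # reference audio
(voice_dir / "my_voice.txt").write_text(
    "The exact transcript of what is spoken in my_voice.wav."
)  # matching transcript

# You can then pass `--ref_audio my_voice` to generation.py.
```
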
21 | Here's another example that uses the voice of `belinda`: 22 | ```bash 23 | python3 generation.py \ 24 | --transcript transcript/single_speaker/en_dl.txt \ 25 | --ref_audio belinda \ 26 | --seed 12345 \ 27 | --out_path generation.wav 28 | ``` 29 | 30 | #### (Experimental) Cross-lingual voice clone 31 | 32 | This example demonstrates voice cloning with a Chinese prompt, where the synthesized speech is in English. 33 | 34 | ```bash 35 | python3 generation.py \ 36 | --transcript transcript/single_speaker/en_dl.txt \ 37 | --scene_prompt empty \ 38 | --ref_audio zh_man_sichuan \ 39 | --temperature 0.3 \ 40 | --seed 12345 \ 41 | --out_path generation.wav 42 | ``` 43 | 44 | ### Smart voice 45 | 46 | The model supports reading the transcript with a random voice. 47 | 48 | ```bash 49 | python3 generation.py \ 50 | --transcript transcript/single_speaker/en_dl.txt \ 51 | --seed 12345 \ 52 | --out_path generation.wav 53 | ``` 54 | 55 | It also works for other languages like Chinese. 56 | 57 | ```bash 58 | python3 generation.py \ 59 | --transcript transcript/single_speaker/zh_ai.txt \ 60 | --seed 12345 \ 61 | --out_path generation.wav 62 | ``` 63 | 64 | ### Describe speaker characteristics with text 65 | 66 | The model allows you to describe the speaker via text. See [voice_prompts/profile.yaml](voice_prompts/profile.yaml) for examples. The following two examples specify a male and a female British accent for the speaker. Also, try removing the `--seed 12345` flag to see how the model generates different voices. 67 | 68 | ```bash 69 | # Male British Accent 70 | python3 generation.py \ 71 | --transcript transcript/single_speaker/en_dl.txt \ 72 | --ref_audio profile:male_en_british \ 73 | --seed 12345 \ 74 | --out_path generation.wav 75 | 76 | # Female British Accent 77 | python3 generation.py \ 78 | --transcript transcript/single_speaker/en_dl.txt \ 79 | --ref_audio profile:female_en_british \ 80 | --seed 12345 \ 81 | --out_path generation.wav 82 | ``` 83 | 84 | ### Chunking for long-form audio generation 85 | 86 | To generate long-form audio, you can chunk the text and render each chunk one by one while putting the previously generated audio and the reference audio in the prompt. Here's an example that generates the first five paragraphs of the Higgs Audio v1 release blog. See the [text](./transcript/single_speaker/en_higgs_audio_blog.md). 87 | 88 | ```bash 89 | python3 generation.py \ 90 | --scene_prompt scene_prompts/reading_blog.txt \ 91 | --transcript transcript/single_speaker/en_higgs_audio_blog.md \ 92 | --ref_audio en_man \ 93 | --chunk_method word \ 94 | --temperature 0.3 \ 95 | --generation_chunk_buffer_size 2 \ 96 | --seed 12345 \ 97 | --out_path generation.wav 98 | ``` 99 | 100 | ### Experimental and Emergent Capabilities 101 | 102 | As shown in our demo, the pretrained model demonstrates emergent capabilities. We have prepared some samples to help you explore these experimental prompts, and we will improve their stability in future versions of HiggsAudio. 103 | 104 | #### (Experimental) Hum a tune with the cloned voice 105 | The model is able to hum a tune with the cloned voice.
106 | 107 | ```bash 108 | python3 generation.py \ 109 | --transcript transcript/single_speaker/experimental/en_humming.txt \ 110 | --ref_audio en_woman \ 111 | --ras_win_len 0 \ 112 | --seed 12345 \ 113 | --out_path generation.wav 114 | ``` 115 | 116 | #### (Experimental) Read the sentence while adding background music (BGM) 117 | 118 | ```bash 119 | python3 generation.py \ 120 | --transcript transcript/single_speaker/experimental/en_bgm.txt \ 121 | --ref_audio en_woman \ 122 | --ras_win_len 0 \ 123 | --ref_audio_in_system_message \ 124 | --seed 123456 \ 125 | --out_path generation.wav 126 | ``` 127 | 128 | ## Multi-speaker Audio Generation 129 | 130 | 131 | ### Smart voice 132 | 133 | To start exploring HiggsAudio's capability to generate multi-speaker audio, let's generate a multi-speaker dialog from a transcript in zero-shot fashion. See the transcript in [transcript/multi_speaker/en_argument.txt](transcript/multi_speaker/en_argument.txt). The speakers are annotated with `[SPEAKER0]` and `[SPEAKER1]`. 134 | 135 | ```bash 136 | python3 generation.py \ 137 | --transcript transcript/multi_speaker/en_argument.txt \ 138 | --seed 12345 \ 139 | --out_path generation.wav 140 | ``` 141 | 142 | ### Multi-voice clone 143 | You can also clone the voices of multiple people simultaneously and generate audio for the transcript. Here's an example that puts the reference audios in the system message and prompts the model iteratively. You can hear "Belinda" arguing with "Broom Salesman". 144 | 145 | ```bash 146 | python3 generation.py \ 147 | --transcript transcript/multi_speaker/en_argument.txt \ 148 | --ref_audio belinda,broom_salesman \ 149 | --ref_audio_in_system_message \ 150 | --chunk_method speaker \ 151 | --seed 12345 \ 152 | --out_path generation.wav 153 | ``` 154 | 155 | You can also let "Broom Salesman" talk to "Belinda", who recently trained HiggsAudio. 156 | 157 | ```bash 158 | python3 generation.py \ 159 | --transcript transcript/multi_speaker/en_higgs.txt \ 160 | --ref_audio broom_salesman,belinda \ 161 | --ref_audio_in_system_message \ 162 | --chunk_method speaker \ 163 | --chunk_max_num_turns 2 \ 164 | --seed 12345 \ 165 | --out_path generation.wav 166 | ``` 167 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/custom_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class PartiallyFrozenEmbedding(nn.Module): 6 | """Wrap an existing `nn.Embedding` module, splitting the embedding into: 7 | 8 | - A frozen embedding for indices [0..freeze_until_idx-1]. 9 | - A trainable embedding for indices [freeze_until_idx..vocab_size-1]. 10 | 11 | This should work with both ZeRO-2 and ZeRO-3 seamlessly. 12 | """ 13 | 14 | def __init__(self, original_embedding: nn.Embedding, freeze_until_idx: int): 15 | """ 16 | :param original_embedding: An instance of nn.Embedding (the original embedding layer). 17 | :param freeze_until_idx: The index up to which embeddings are frozen (exclusive); the row at freeze_until_idx itself stays trainable.
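        Example (illustrative only; the sizes are arbitrary and not from the original code):

            base = nn.Embedding(1000, 16)
            wrapped = PartiallyFrozenEmbedding(base, freeze_until_idx=800)
            out = wrapped(torch.randint(0, 1000, (2, 5)))  # (2, 5, 16); only rows 800-999 receive gradients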
18 | """ 19 | super().__init__() 20 | self.freeze_until_idx = freeze_until_idx 21 | self.original_vocab_size = original_embedding.num_embeddings 22 | self.embedding_dim = original_embedding.embedding_dim 23 | 24 | # Split the original embedding into frozen and trainable parts 25 | self.embedding_frozen = nn.Embedding( 26 | freeze_until_idx, 27 | self.embedding_dim, 28 | dtype=original_embedding.weight.dtype, 29 | device=original_embedding.weight.device, 30 | ) 31 | self.embedding_trainable = nn.Embedding( 32 | self.original_vocab_size - freeze_until_idx, 33 | self.embedding_dim, 34 | dtype=original_embedding.weight.dtype, 35 | device=original_embedding.weight.device, 36 | ) 37 | 38 | # Copy weights from the original embedding into the frozen and trainable parts 39 | with torch.no_grad(): 40 | self.embedding_frozen.weight.copy_(original_embedding.weight[:freeze_until_idx]) 41 | self.embedding_trainable.weight.copy_(original_embedding.weight[freeze_until_idx:]) 42 | 43 | # Freeze the frozen embedding 44 | self.embedding_frozen.weight.requires_grad = False 45 | 46 | def forward(self, input_ids: torch.Tensor) -> torch.Tensor: 47 | """ 48 | Forward pass for the split embedding wrapper. 49 | :param input_ids: Tensor of shape [batch_size, seq_len] with indices in [0..original_vocab_size-1]. 50 | """ 51 | # Masks to separate frozen and trainable indices 52 | # (bsz, seq_len) 53 | mask_frozen = input_ids < self.freeze_until_idx 54 | mask_trainable = ~mask_frozen 55 | 56 | # Output tensor for embedding results 57 | batch_size, seq_len = input_ids.shape 58 | embeddings = torch.zeros( 59 | batch_size, 60 | seq_len, 61 | self.embedding_dim, 62 | device=input_ids.device, 63 | dtype=self.embedding_frozen.weight.dtype, 64 | ) 65 | 66 | # Handle frozen embedding 67 | if mask_frozen.any(): 68 | frozen_ids = input_ids[mask_frozen] 69 | frozen_emb = self.embedding_frozen(frozen_ids) 70 | embeddings[mask_frozen] = frozen_emb 71 | 72 | # Handle trainable embedding 73 | if mask_trainable.any(): 74 | # Adjust trainable IDs to the local index space of the trainable embedding 75 | trainable_ids = input_ids[mask_trainable] - (self.freeze_until_idx) 76 | trainable_emb = self.embedding_trainable(trainable_ids) 77 | embeddings[mask_trainable] = trainable_emb 78 | 79 | return embeddings 80 | 81 | def to_unsplit(self) -> nn.Embedding: 82 | unsplit_embedding = nn.Embedding( 83 | self.original_vocab_size, 84 | self.embedding_dim, 85 | dtype=self.embedding_frozen.weight.dtype, 86 | device=self.embedding_frozen.weight.device, 87 | ) 88 | 89 | with torch.no_grad(): 90 | unsplit_embedding.weight[: self.freeze_until_idx].copy_(self.embedding_frozen.weight) 91 | unsplit_embedding.weight[self.freeze_until_idx :].copy_(self.embedding_trainable.weight) 92 | 93 | return unsplit_embedding 94 | 95 | 96 | class PartiallyFrozenLinear(nn.Module): 97 | """A wrapper around nn.Linear to partially freeze part of the weight matrix.""" 98 | 99 | def __init__(self, original_linear: nn.Linear, freeze_until_idx: int): 100 | """ 101 | :param original_linear: The original nn.Linear layer. 102 | :param freeze_until_idx: The index up to which the rows of the weight matrix are frozen. 
103 | """ 104 | super().__init__() 105 | assert original_linear.bias is None, "Currently only support linear module without bias" 106 | 107 | self.freeze_until_idx = freeze_until_idx 108 | self.input_dim = original_linear.in_features 109 | self.output_dim = original_linear.out_features 110 | 111 | # Create frozen and trainable linear layers 112 | self.linear_frozen = nn.Linear( 113 | self.input_dim, 114 | freeze_until_idx, 115 | bias=False, 116 | dtype=original_linear.weight.dtype, 117 | device=original_linear.weight.device, 118 | ) 119 | self.linear_trainable = nn.Linear( 120 | self.input_dim, 121 | self.output_dim - freeze_until_idx, 122 | bias=False, 123 | dtype=original_linear.weight.dtype, 124 | device=original_linear.weight.device, 125 | ) 126 | 127 | # Copy weights from the original linear layer 128 | with torch.no_grad(): 129 | self.linear_frozen.weight.copy_(original_linear.weight[:freeze_until_idx]) 130 | self.linear_trainable.weight.copy_(original_linear.weight[freeze_until_idx:]) 131 | 132 | # Freeze the frozen linear layer 133 | self.linear_frozen.weight.requires_grad = False 134 | 135 | def forward(self, input_tensor): 136 | # input_tensor: (bsz, seq_len, hidden_state_dim) 137 | frozen_output = self.linear_frozen(input_tensor) 138 | trainable_output = self.linear_trainable(input_tensor) 139 | return torch.cat((frozen_output, trainable_output), dim=-1) 140 | 141 | def to_unsplit(self) -> nn.Linear: 142 | unsplit_linear = nn.Linear( 143 | self.input_dim, 144 | self.output_dim, 145 | bias=False, 146 | dtype=self.linear_frozen.weight.dtype, 147 | device=self.linear_frozen.weight.device, 148 | ) 149 | 150 | # Copy weights from the frozen and trainable layers into the unsplit linear layer 151 | with torch.no_grad(): 152 | unsplit_linear.weight[: self.freeze_until_idx].copy_(self.linear_frozen.weight) 153 | unsplit_linear.weight[self.freeze_until_idx :].copy_(self.linear_trainable.weight) 154 | 155 | return unsplit_linear 156 | -------------------------------------------------------------------------------- /boson_multimodal/serve/utils.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import base64 3 | import re 4 | import regex 5 | from typing import AsyncGenerator, Union 6 | import io 7 | from pydub import AudioSegment 8 | import torch 9 | import numpy as np 10 | from functools import lru_cache 11 | 12 | from ..audio_processing.higgs_audio_tokenizer import HiggsAudioTokenizer 13 | 14 | 15 | def random_uuid() -> str: 16 | return str(uuid.uuid4().hex) 17 | 18 | 19 | async def async_generator_wrap(first_element, gen: AsyncGenerator): 20 | """Wrap an async generator with the first element.""" 21 | yield first_element 22 | async for item in gen: 23 | yield item 24 | 25 | 26 | @lru_cache(maxsize=50) 27 | def encode_base64_content_from_file(file_path: str) -> str: 28 | """Encode a content from a local file to base64 format.""" 29 | # Read the MP3 file as binary and encode it directly to Base64 30 | with open(file_path, "rb") as audio_file: 31 | audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8") 32 | return audio_base64 33 | 34 | 35 | def pcm16_to_target_format( 36 | np_audio: np.ndarray, 37 | sample_rate: int, 38 | bit_depth: int, 39 | channels: int, 40 | format: str, 41 | target_rate: int, 42 | ): 43 | wav_audio = AudioSegment( 44 | np_audio.tobytes(), 45 | frame_rate=sample_rate, 46 | sample_width=bit_depth // 8, 47 | channels=channels, 48 | ) 49 | if target_rate is not None and target_rate != sample_rate: 50 | 
wav_audio = wav_audio.set_frame_rate(target_rate) 51 | 52 | # Convert WAV to MP3 53 | target_io = io.BytesIO() 54 | wav_audio.export(target_io, format=format) 55 | target_io.seek(0) 56 | 57 | return target_io 58 | 59 | 60 | chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+") 61 | 62 | 63 | def contains_chinese(text: str): 64 | return bool(chinese_char_pattern.search(text)) 65 | 66 | 67 | # remove blank between chinese character 68 | def replace_blank(text: str): 69 | out_str = [] 70 | for i, c in enumerate(text): 71 | if c == " ": 72 | if (text[i + 1].isascii() and text[i + 1] != " ") and (text[i - 1].isascii() and text[i - 1] != " "): 73 | out_str.append(c) 74 | else: 75 | out_str.append(c) 76 | return "".join(out_str) 77 | 78 | 79 | def replace_corner_mark(text: str): 80 | text = text.replace("²", "平方") 81 | text = text.replace("³", "立方") 82 | return text 83 | 84 | 85 | # remove meaningless symbol 86 | def remove_bracket(text: str): 87 | text = text.replace("(", "").replace(")", "") 88 | text = text.replace("【", "").replace("】", "") 89 | text = text.replace("`", "").replace("`", "") 90 | text = text.replace("——", " ") 91 | return text 92 | 93 | 94 | # split paragrah logic: 95 | # 1. per sentence max len token_max_n, min len token_min_n, merge if last sentence len less than merge_len 96 | # 2. cal sentence len according to lang 97 | # 3. split sentence according to puncatation 98 | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, token_min_n=60, merge_len=20, comma_split=False): 99 | def calc_utt_length(_text: str): 100 | if lang == "zh": 101 | return len(_text) 102 | else: 103 | return len(tokenize(_text)) 104 | 105 | def should_merge(_text: str): 106 | if lang == "zh": 107 | return len(_text) < merge_len 108 | else: 109 | return len(tokenize(_text)) < merge_len 110 | 111 | if lang == "zh": 112 | pounc = ["。", "?", "!", ";", ":", "、", ".", "?", "!", ";"] 113 | else: 114 | pounc = [".", "?", "!", ";", ":"] 115 | if comma_split: 116 | pounc.extend([",", ","]) 117 | 118 | if text[-1] not in pounc: 119 | if lang == "zh": 120 | text += "。" 121 | else: 122 | text += "." 123 | 124 | st = 0 125 | utts = [] 126 | for i, c in enumerate(text): 127 | if c in pounc: 128 | if len(text[st:i]) > 0: 129 | utts.append(text[st:i] + c) 130 | if i + 1 < len(text) and text[i + 1] in ['"', "”"]: 131 | tmp = utts.pop(-1) 132 | utts.append(tmp + text[i + 1]) 133 | st = i + 2 134 | else: 135 | st = i + 1 136 | 137 | final_utts = [] 138 | cur_utt = "" 139 | for utt in utts: 140 | if calc_utt_length(cur_utt + utt) > token_max_n and calc_utt_length(cur_utt) > token_min_n: 141 | final_utts.append(cur_utt) 142 | cur_utt = "" 143 | cur_utt = cur_utt + utt 144 | if len(cur_utt) > 0: 145 | if should_merge(cur_utt) and len(final_utts) != 0: 146 | final_utts[-1] = final_utts[-1] + cur_utt 147 | else: 148 | final_utts.append(cur_utt) 149 | 150 | return final_utts 151 | 152 | 153 | def is_only_punctuation(text: str): 154 | # Regular expression: Match strings that consist only of punctuation marks or are empty. 
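    # Uses the third-party `regex` module (imported above) rather than the built-in `re`,
    # because `re` does not support Unicode property classes such as \p{P} / \p{S}.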
155 | punctuation_pattern = r"^[\p{P}\p{S}]*$" 156 | return bool(regex.fullmatch(punctuation_pattern, text)) 157 | 158 | 159 | # spell Arabic numerals 160 | def spell_out_number(text: str, inflect_parser): 161 | new_text = [] 162 | st = None 163 | for i, c in enumerate(text): 164 | if not c.isdigit(): 165 | if st is not None: 166 | num_str = inflect_parser.number_to_words(text[st:i]) 167 | new_text.append(num_str) 168 | st = None 169 | new_text.append(c) 170 | else: 171 | if st is None: 172 | st = i 173 | if st is not None and st < len(text): 174 | num_str = inflect_parser.number_to_words(text[st:]) 175 | new_text.append(num_str) 176 | return "".join(new_text) 177 | 178 | 179 | def remove_emoji(text: str): 180 | # Pattern to match emojis and their modifiers 181 | # - Standard emoji range 182 | # - Zero-width joiners (U+200D) 183 | # - Variation selectors (U+FE0F, U+FE0E) 184 | # - Skin tone modifiers (U+1F3FB to U+1F3FF) 185 | emoji_pattern = re.compile( 186 | r"[" 187 | r"\U00010000-\U0010FFFF" # Standard emoji range 188 | r"\u200D" # Zero-width joiner 189 | r"\uFE0F\uFE0E" # Variation selectors 190 | r"\U0001F3FB-\U0001F3FF" # Skin tone modifiers 191 | r"]+", 192 | flags=re.UNICODE, 193 | ) 194 | return emoji_pattern.sub(r"", text) 195 | 196 | 197 | def remove_repeated_punctuations(text, punctuations): 198 | if len(punctuations) == 0: 199 | return text 200 | pattern = f"[{re.escape(''.join(punctuations))}]" # Create regex pattern for given punctuations 201 | return re.sub(rf"({pattern})\1+", r"\1", text) 202 | 203 | 204 | def full_to_half_width(text: str) -> str: 205 | """Convert full-width punctuation to half-width in a given string.""" 206 | full_width = "！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾＿｀｛｜｝～" 207 | half_width = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" 208 | trans_table = str.maketrans(full_width, half_width) 209 | return text.translate(trans_table) 210 | 211 | 212 | def split_interleaved_delayed_audios( 213 | audio_data: Union[list[list[int]], torch.Tensor], 214 | audio_tokenizer: HiggsAudioTokenizer, 215 | audio_stream_eos_id: int, 216 | ) -> list[Union[list[list[int]], torch.Tensor]]: 217 | separator = [audio_stream_eos_id] * audio_tokenizer.num_codebooks 218 | 219 | # Convert separator to a torch tensor if audio_data is a tensor 220 | if isinstance(audio_data, torch.Tensor): 221 | audio_data = audio_data.transpose(1, 0) 222 | separator = torch.tensor(separator) 223 | # Find the indices where the rows equal the separator 224 | split_indices = torch.where(torch.all(audio_data == separator, dim=1))[0] 225 | start = 0 226 | groups = [] 227 | for idx in split_indices: 228 | groups.append(audio_data[start:idx].transpose(1, 0)) 229 | start = idx + 1 230 | if start < len(audio_data): 231 | groups.append(audio_data[start:].transpose(1, 0)) 232 | else: 233 | groups = [] 234 | current = [] 235 | for row in audio_data: 236 | current.append(row) 237 | 238 | if row == separator: 239 | groups.append(current) 240 | current = [] 241 | 242 | # Don't forget the last group if there's no trailing separator 243 | if current: 244 | groups.append(current) 245 | 246 | return groups 247 | -------------------------------------------------------------------------------- /examples/vllm/run_chat_completion.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """An example showing how to use vLLM to serve multimodal models 3 | and run online inference with OpenAI client.
4 | """ 5 | 6 | import argparse 7 | import base64 8 | import os 9 | import time 10 | from io import BytesIO 11 | 12 | import numpy as np 13 | import requests 14 | import soundfile as sf 15 | from openai import OpenAI 16 | 17 | OPENAI_AUDIO_SAMPLE_RATE = 24000 18 | DEFAULT_SYSTEM_PROMPT = ( 19 | "Generate audio following instruction.\n\n" 20 | "<|scene_desc_start|>\n" 21 | "Audio is recorded from a quiet room.\n" 22 | "<|scene_desc_end|>" 23 | ) 24 | 25 | 26 | def encode_base64_content_from_file(file_path: str) -> str: 27 | """Encode a content from a local file to base64 format.""" 28 | # Read the MP3 file as binary and encode it directly to Base64 29 | with open(file_path, "rb") as audio_file: 30 | audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8") 31 | return audio_base64 32 | 33 | 34 | def run_smart_voice() -> None: 35 | chat_completion = client.chat.completions.create( 36 | messages=[ 37 | {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, 38 | { 39 | "role": "user", 40 | "content": ( 41 | "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years." 42 | ), 43 | }, 44 | ], 45 | model=model, 46 | modalities=["text", "audio"], 47 | temperature=1.0, 48 | top_p=0.95, 49 | extra_body={"top_k": 50}, 50 | stop=["<|eot_id|>", "<|end_of_text|>", "<|audio_eos|>"], 51 | ) 52 | 53 | text = chat_completion.choices[0].message.content 54 | audio = chat_completion.choices[0].message.audio.data 55 | # Decode base64 audio string to bytes 56 | audio_bytes = base64.b64decode(audio) 57 | print("Chat completion text output:", text) 58 | print("Saving the audio to file") 59 | with open("output_smart_voice.wav", "wb") as f: 60 | f.write(audio_bytes) 61 | 62 | 63 | def run_voice_clone(stream: bool = False) -> None: 64 | data_dir = os.path.join(os.path.dirname(__file__), "..", "voice_prompts") 65 | audio_path = os.path.join(data_dir, "belinda.wav") 66 | audio_text_path = os.path.join(data_dir, "belinda.txt") 67 | with open(audio_text_path, "r") as f: 68 | audio_text = f.read() 69 | audio_base64 = encode_base64_content_from_file(audio_path) 70 | messages = [ 71 | {"role": "user", "content": audio_text}, 72 | { 73 | "role": "assistant", 74 | "content": [ 75 | { 76 | "type": "input_audio", 77 | "input_audio": { 78 | "data": audio_base64, 79 | "format": "wav", 80 | }, 81 | } 82 | ], 83 | }, 84 | { 85 | "role": "user", 86 | "content": ( 87 | "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life!" 
88 | ), 89 | }, 90 | ] 91 | start_time = time.time() 92 | chat_completion = client.chat.completions.create( 93 | messages=messages, 94 | model=model, 95 | max_completion_tokens=500, 96 | stream=stream, 97 | modalities=["text", "audio"], 98 | temperature=1.0, 99 | top_p=0.95, 100 | extra_body={"top_k": 50}, 101 | stop=["<|eot_id|>", "<|end_of_text|>", "<|audio_eos|>"], 102 | ) 103 | if stream: 104 | audio_bytes_io = BytesIO() 105 | i = 0 106 | first_audio_latency = None 107 | for chunk in chat_completion: 108 | if chunk.choices and hasattr(chunk.choices[0].delta, "audio") and chunk.choices[0].delta.audio: 109 | if first_audio_latency is None: 110 | first_audio_latency = time.time() - start_time 111 | audio_bytes = base64.b64decode(chunk.choices[0].delta.audio["data"]) 112 | audio_bytes_io.write(audio_bytes) 113 | audio_data = np.frombuffer(audio_bytes, dtype=np.int16) 114 | i += 1 115 | audio_bytes_io.seek(0) 116 | audio_data = np.frombuffer(audio_bytes_io.getvalue(), dtype=np.int16) 117 | print("Saving the audio to file") 118 | print(f"First audio latency: {first_audio_latency * 1000} ms") 119 | print(f"Total audio latency: {(time.time() - start_time) * 1000} ms") 120 | sf.write("output_voice_clone.wav", audio_data, OPENAI_AUDIO_SAMPLE_RATE) 121 | else: 122 | text = chat_completion.choices[0].message.content 123 | audio = chat_completion.choices[0].message.audio.data 124 | audio_bytes = base64.b64decode(audio) 125 | print("Chat completion text output:", text) 126 | print("Saving the audio to file") 127 | with open("output_voice_clone.wav", "wb") as f: 128 | f.write(audio_bytes) 129 | 130 | 131 | def run_generate_multispeaker(stream: bool = False) -> None: 132 | MULTI_SPEAKER_SYSTEM_PROMPT = ( 133 | "You are an AI assistant designed to convert text into speech.\n" 134 | "If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.\n" 135 | "If no speaker tag is present, select a suitable voice on your own.\n\n" 136 | "<|scene_desc_start|>\n" 137 | "SPEAKER0: feminine\n" 138 | "SPEAKER1: masculine\n" 139 | "<|scene_desc_end|>" 140 | ) 141 | transcript_path = os.path.join(os.path.dirname(__file__), "..", "transcript", "multi_speaker", "en_argument.txt") 142 | with open(transcript_path, "r") as f: 143 | transcript = f.read() 144 | 145 | messages = [{"role": "system", "content": MULTI_SPEAKER_SYSTEM_PROMPT}, {"role": "user", "content": transcript}] 146 | chat_completion = client.chat.completions.create( 147 | messages=messages, 148 | model=model, 149 | stream=stream, 150 | stream_options={"include_usage": True}, 151 | stop=["<|end_of_text|>", "<|eot_id|>", "<|audio_eos|>"], 152 | modalities=["text", "audio"], 153 | temperature=1.0, 154 | top_p=0.95, 155 | extra_body={"top_k": 50}, 156 | ) 157 | 158 | if stream: 159 | audio_bytes_io = BytesIO() 160 | i = 0 161 | for chunk in chat_completion: 162 | if chunk.choices and hasattr(chunk.choices[0].delta, "audio") and chunk.choices[0].delta.audio: 163 | audio_bytes = base64.b64decode(chunk.choices[0].delta.audio["data"]) 164 | audio_bytes_io.write(audio_bytes) 165 | audio_data = np.frombuffer(audio_bytes, dtype=np.int16) 166 | # sf.write(f"output_tts_{i}.wav", audio_data, target_rate) 167 | i += 1 168 | else: 169 | print(chunk) 170 | audio_bytes_io.seek(0) 171 | audio_data = np.frombuffer(audio_bytes_io.getvalue(), dtype=np.int16) 172 | print("Saving the audio to file") 173 | sf.write("output_multispeaker.wav", audio_data, OPENAI_AUDIO_SAMPLE_RATE) 174 | else: 175 | 
text = chat_completion.choices[0].message.content 176 | audio = chat_completion.choices[0].message.audio.data 177 | audio_bytes = base64.b64decode(audio) 178 | print("Chat completion text output:", text) 179 | print("Saving the audio to file") 180 | with open("output_multispeaker.wav", "wb") as f: 181 | f.write(audio_bytes) 182 | 183 | 184 | def main(args) -> None: 185 | if args.task == "voice_clone": 186 | run_voice_clone(args.stream) 187 | elif args.task == "smart_voice": 188 | run_smart_voice() 189 | elif args.task == "multispeaker": 190 | run_generate_multispeaker(args.stream) 191 | else: 192 | raise ValueError(f"Task {args.task} not supported") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser() 197 | parser.add_argument( 198 | "--api-base", 199 | type=str, 200 | default="http://localhost:8000/v1", 201 | help="API base URL for OpenAI client.", 202 | ) 203 | parser.add_argument("--api-key", type=str, default="EMPTY", help="API key for OpenAI client.") 204 | parser.add_argument("--stream", action="store_true", help="Stream the audio.") 205 | parser.add_argument( 206 | "--task", 207 | type=str, 208 | default="voice_clone", 209 | help="Task to run.", 210 | choices=["voice_clone", "smart_voice", "multispeaker"], 211 | ) 212 | parser.add_argument("--model", type=str, default=None, help="Model to use.") 213 | args = parser.parse_args() 214 | 215 | client = OpenAI( 216 | api_key=args.api_key, 217 | base_url=args.api_base, 218 | ) 219 | 220 | if args.model is None: 221 | models = client.models.list() 222 | model = models.data[0].id 223 | else: 224 | model = args.model 225 | 226 | main(args) 227 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/semantic_module.py: -------------------------------------------------------------------------------- 1 | # Based on code from: https://github.com/zhenye234/xcodec 2 | # Licensed under MIT License 3 | # Modifications by BosonAI 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class Conv1d1x1(nn.Conv1d): 10 | """1x1 Conv1d.""" 11 | 12 | def __init__(self, in_channels, out_channels, bias=True): 13 | super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, bias=bias) 14 | 15 | 16 | class Conv1d(nn.Module): 17 | def __init__( 18 | self, 19 | in_channels: int, 20 | out_channels: int, 21 | kernel_size: int, 22 | stride: int = 1, 23 | padding: int = -1, 24 | dilation: int = 1, 25 | groups: int = 1, 26 | bias: bool = True, 27 | ): 28 | super().__init__() 29 | self.in_channels = in_channels 30 | self.out_channels = out_channels 31 | self.kernel_size = kernel_size 32 | if padding < 0: 33 | padding = (kernel_size - 1) // 2 * dilation 34 | self.dilation = dilation 35 | self.conv = nn.Conv1d( 36 | in_channels=in_channels, 37 | out_channels=out_channels, 38 | kernel_size=kernel_size, 39 | stride=stride, 40 | padding=padding, 41 | dilation=dilation, 42 | groups=groups, 43 | bias=bias, 44 | ) 45 | 46 | def forward(self, x): 47 | """ 48 | Args: 49 | x (Tensor): Float tensor variable with the shape (B, C, T). 50 | Returns: 51 | Tensor: Float tensor variable with the shape (B, C, T). 
52 | """ 53 | x = self.conv(x) 54 | return x 55 | 56 | 57 | class ResidualUnit(nn.Module): 58 | def __init__( 59 | self, 60 | in_channels: int, 61 | out_channels: int, 62 | kernel_size=3, 63 | dilation=1, 64 | bias=False, 65 | nonlinear_activation="ELU", 66 | nonlinear_activation_params={}, 67 | ): 68 | super().__init__() 69 | self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 70 | self.conv1 = Conv1d( 71 | in_channels=in_channels, 72 | out_channels=out_channels, 73 | kernel_size=kernel_size, 74 | stride=1, 75 | dilation=dilation, 76 | bias=bias, 77 | ) 78 | self.conv2 = Conv1d1x1(out_channels, out_channels, bias) 79 | 80 | def forward(self, x): 81 | y = self.conv1(self.activation(x)) 82 | y = self.conv2(self.activation(y)) 83 | return x + y 84 | 85 | 86 | class ConvTranspose1d(nn.Module): 87 | def __init__( 88 | self, 89 | in_channels: int, 90 | out_channels: int, 91 | kernel_size: int, 92 | stride: int, 93 | padding=-1, 94 | output_padding=-1, 95 | groups=1, 96 | bias=True, 97 | ): 98 | super().__init__() 99 | if padding < 0: 100 | padding = (stride + 1) // 2 101 | if output_padding < 0: 102 | output_padding = 1 if stride % 2 else 0 103 | self.deconv = nn.ConvTranspose1d( 104 | in_channels=in_channels, 105 | out_channels=out_channels, 106 | kernel_size=kernel_size, 107 | stride=stride, 108 | padding=padding, 109 | output_padding=output_padding, 110 | groups=groups, 111 | bias=bias, 112 | ) 113 | 114 | def forward(self, x): 115 | """ 116 | Args: 117 | x (Tensor): Float tensor variable with the shape (B, C, T). 118 | Returns: 119 | Tensor: Float tensor variable with the shape (B, C', T'). 120 | """ 121 | x = self.deconv(x) 122 | return x 123 | 124 | 125 | class EncoderBlock(nn.Module): 126 | def __init__( 127 | self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True 128 | ): 129 | super().__init__() 130 | self.res_units = torch.nn.ModuleList() 131 | for dilation in dilations: 132 | self.res_units += [ResidualUnit(in_channels, in_channels, kernel_size=unit_kernel_size, dilation=dilation)] 133 | self.num_res = len(self.res_units) 134 | 135 | self.conv = Conv1d( 136 | in_channels=in_channels, 137 | out_channels=out_channels, 138 | kernel_size=3 if stride == 1 else (2 * stride), # special case: stride=1, do not use kernel=2 139 | stride=stride, 140 | bias=bias, 141 | ) 142 | 143 | def forward(self, x): 144 | for idx in range(self.num_res): 145 | x = self.res_units[idx](x) 146 | x = self.conv(x) 147 | return x 148 | 149 | 150 | class Encoder(nn.Module): 151 | def __init__( 152 | self, 153 | input_channels: int, 154 | encode_channels: int, 155 | channel_ratios=(1, 1), 156 | strides=(1, 1), 157 | kernel_size=3, 158 | bias=True, 159 | block_dilations=(1, 1), 160 | unit_kernel_size=3, 161 | ): 162 | super().__init__() 163 | assert len(channel_ratios) == len(strides) 164 | 165 | self.conv = Conv1d( 166 | in_channels=input_channels, out_channels=encode_channels, kernel_size=kernel_size, stride=1, bias=False 167 | ) 168 | self.conv_blocks = torch.nn.ModuleList() 169 | in_channels = encode_channels 170 | for idx, stride in enumerate(strides): 171 | out_channels = int(encode_channels * channel_ratios[idx]) # could be float 172 | self.conv_blocks += [ 173 | EncoderBlock( 174 | in_channels, 175 | out_channels, 176 | stride, 177 | dilations=block_dilations, 178 | unit_kernel_size=unit_kernel_size, 179 | bias=bias, 180 | ) 181 | ] 182 | in_channels = out_channels 183 | self.num_blocks = len(self.conv_blocks) 184 | 
self.out_channels = out_channels 185 | 186 | def forward(self, x): 187 | x = self.conv(x) 188 | for i in range(self.num_blocks): 189 | x = self.conv_blocks[i](x) 190 | return x 191 | 192 | 193 | class DecoderBlock(nn.Module): 194 | """Decoder block (no up-sampling)""" 195 | 196 | def __init__( 197 | self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True 198 | ): 199 | super().__init__() 200 | 201 | if stride == 1: 202 | self.conv = Conv1d( 203 | in_channels=in_channels, 204 | out_channels=out_channels, 205 | kernel_size=3, # fix kernel=3 when stride=1 for unchanged shape 206 | stride=stride, 207 | bias=bias, 208 | ) 209 | else: 210 | self.conv = ConvTranspose1d( 211 | in_channels=in_channels, 212 | out_channels=out_channels, 213 | kernel_size=(2 * stride), 214 | stride=stride, 215 | bias=bias, 216 | ) 217 | 218 | self.res_units = torch.nn.ModuleList() 219 | for idx, dilation in enumerate(dilations): 220 | self.res_units += [ 221 | ResidualUnit(out_channels, out_channels, kernel_size=unit_kernel_size, dilation=dilation) 222 | ] 223 | self.num_res = len(self.res_units) 224 | 225 | def forward(self, x): 226 | x = self.conv(x) 227 | for idx in range(self.num_res): 228 | x = self.res_units[idx](x) 229 | return x 230 | 231 | 232 | class Decoder(nn.Module): 233 | def __init__( 234 | self, 235 | code_dim: int, 236 | output_channels: int, 237 | decode_channels: int, 238 | channel_ratios=(1, 1), 239 | strides=(1, 1), 240 | kernel_size=3, 241 | bias=True, 242 | block_dilations=(1, 1), 243 | unit_kernel_size=3, 244 | ): 245 | super().__init__() 246 | assert len(channel_ratios) == len(strides) 247 | 248 | self.conv1 = Conv1d( 249 | in_channels=code_dim, 250 | out_channels=int(decode_channels * channel_ratios[0]), 251 | kernel_size=kernel_size, 252 | stride=1, 253 | bias=False, 254 | ) 255 | 256 | self.conv_blocks = torch.nn.ModuleList() 257 | for idx, stride in enumerate(strides): 258 | in_channels = int(decode_channels * channel_ratios[idx]) 259 | if idx < (len(channel_ratios) - 1): 260 | out_channels = int(decode_channels * channel_ratios[idx + 1]) 261 | else: 262 | out_channels = decode_channels 263 | self.conv_blocks += [ 264 | DecoderBlock( 265 | in_channels, 266 | out_channels, 267 | stride, 268 | dilations=block_dilations, 269 | unit_kernel_size=unit_kernel_size, 270 | bias=bias, 271 | ) 272 | ] 273 | self.num_blocks = len(self.conv_blocks) 274 | 275 | self.conv2 = Conv1d(out_channels, output_channels, kernel_size, 1, bias=False) 276 | 277 | def forward(self, z): 278 | x = self.conv1(z) 279 | for i in range(self.num_blocks): 280 | x = self.conv_blocks[i](x) 281 | x = self.conv2(x) 282 | return x 283 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/quantization/ddp_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import subprocess 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import torch 8 | import torch.distributed as dist 9 | from torch.nn.parallel import DistributedDataParallel 10 | from torch.nn.parallel.distributed import _find_tensors 11 | import torch.optim 12 | import torch.utils.data 13 | from packaging import version 14 | from omegaconf import OmegaConf 15 | 16 | 17 | def set_random_seed(seed): 18 | random.seed(seed) 19 | np.random.seed(seed) 20 | torch.manual_seed(seed) 21 | torch.cuda.manual_seed_all(seed) 22 | 23 | 24 | def is_logging_process(): 25 | return not 
dist.is_initialized() or dist.get_rank() == 0 26 | 27 | 28 | def get_logger(cfg, name=None): 29 | # log_file_path is used when unit testing 30 | if is_logging_process(): 31 | logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_config, resolve=True)) 32 | return logging.getLogger(name) 33 | 34 | 35 | # from https://github.com/Lightning-AI/lightning-bolts/blob/5d61197cd2f491f69e238137a5edabe80ae14ad9/pl_bolts/models/self_supervised/simclr/simclr_module.py#L20 36 | class SyncFunction(torch.autograd.Function): 37 | @staticmethod 38 | # @torch.no_grad() 39 | def forward(ctx, tensor): 40 | ctx.batch_size = tensor.shape[0] 41 | 42 | gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] 43 | 44 | torch.distributed.all_gather(gathered_tensor, tensor) 45 | gathered_tensor = torch.cat(gathered_tensor, 0) 46 | 47 | return gathered_tensor 48 | 49 | @staticmethod 50 | def backward(ctx, grad_output): 51 | grad_input = grad_output.clone() 52 | torch.distributed.all_reduce(grad_input, op=torch.distributed.ReduceOp.SUM, async_op=False) 53 | 54 | idx_from = torch.distributed.get_rank() * ctx.batch_size 55 | idx_to = (torch.distributed.get_rank() + 1) * ctx.batch_size 56 | return grad_input[idx_from:idx_to] 57 | 58 | 59 | def get_timestamp(): 60 | return datetime.now().strftime("%y%m%d-%H%M%S") 61 | 62 | 63 | def get_commit_hash(): 64 | message = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) 65 | return message.strip().decode("utf-8") 66 | 67 | 68 | class DDP(DistributedDataParallel): 69 | """ 70 | Override the forward call in lightning so it goes to training and validation step respectively 71 | """ 72 | 73 | def forward(self, *inputs, **kwargs): # pragma: no cover 74 | if version.parse(torch.__version__[:6]) < version.parse("1.11"): 75 | self._sync_params() 76 | inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) 77 | assert len(self.device_ids) == 1 78 | if self.module.training: 79 | output = self.module.training_step(*inputs[0], **kwargs[0]) 80 | elif self.module.testing: 81 | output = self.module.test_step(*inputs[0], **kwargs[0]) 82 | else: 83 | output = self.module.validation_step(*inputs[0], **kwargs[0]) 84 | if torch.is_grad_enabled(): 85 | # We'll return the output object verbatim since it is a freeform 86 | # object. We need to find any tensors in this object, though, 87 | # because we need to figure out which parameters were used during 88 | # this forward pass, to ensure we short circuit reduction for any 89 | # unused parameters. Only if `find_unused_parameters` is set. 
90 | if self.find_unused_parameters: 91 | self.reducer.prepare_for_backward(list(_find_tensors(output))) 92 | else: 93 | self.reducer.prepare_for_backward([]) 94 | else: 95 | from torch.nn.parallel.distributed import ( 96 | logging, 97 | Join, 98 | _DDPSink, 99 | _tree_flatten_with_rref, 100 | _tree_unflatten_with_rref, 101 | ) 102 | 103 | with torch.autograd.profiler.record_function("DistributedDataParallel.forward"): 104 | if torch.is_grad_enabled() and self.require_backward_grad_sync: 105 | self.logger.set_runtime_stats_and_log() 106 | self.num_iterations += 1 107 | self.reducer.prepare_for_forward() 108 | 109 | # Notify the join context that this process has not joined, if 110 | # needed 111 | work = Join.notify_join_context(self) 112 | if work: 113 | self.reducer._set_forward_pass_work_handle(work, self._divide_by_initial_world_size) 114 | 115 | # Calling _rebuild_buckets before forward compuation, 116 | # It may allocate new buckets before deallocating old buckets 117 | # inside _rebuild_buckets. To save peak memory usage, 118 | # call _rebuild_buckets before the peak memory usage increases 119 | # during forward computation. 120 | # This should be called only once during whole training period. 121 | if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): 122 | logging.info("Reducer buckets have been rebuilt in this iteration.") 123 | self._has_rebuilt_buckets = True 124 | 125 | # sync params according to location (before/after forward) user 126 | # specified as part of hook, if hook was specified. 127 | buffer_hook_registered = hasattr(self, "buffer_hook") 128 | if self._check_sync_bufs_pre_fwd(): 129 | self._sync_buffers() 130 | 131 | if self._join_config.enable: 132 | # Notify joined ranks whether they should sync in backwards pass or not. 133 | self._check_global_requires_backward_grad_sync(is_joined_rank=False) 134 | 135 | inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) 136 | if self.module.training: 137 | output = self.module.training_step(*inputs[0], **kwargs[0]) 138 | elif self.module.testing: 139 | output = self.module.test_step(*inputs[0], **kwargs[0]) 140 | else: 141 | output = self.module.validation_step(*inputs[0], **kwargs[0]) 142 | 143 | # sync params according to location (before/after forward) user 144 | # specified as part of hook, if hook was specified. 145 | if self._check_sync_bufs_post_fwd(): 146 | self._sync_buffers() 147 | 148 | if torch.is_grad_enabled() and self.require_backward_grad_sync: 149 | self.require_forward_param_sync = True 150 | # We'll return the output object verbatim since it is a freeform 151 | # object. We need to find any tensors in this object, though, 152 | # because we need to figure out which parameters were used during 153 | # this forward pass, to ensure we short circuit reduction for any 154 | # unused parameters. Only if `find_unused_parameters` is set. 155 | if self.find_unused_parameters and not self.static_graph: 156 | # Do not need to populate this for static graph. 157 | self.reducer.prepare_for_backward(list(_find_tensors(output))) 158 | else: 159 | self.reducer.prepare_for_backward([]) 160 | else: 161 | self.require_forward_param_sync = False 162 | 163 | # TODO: DDPSink is currently enabled for unused parameter detection and 164 | # static graph training for first iteration. 
165 | if (self.find_unused_parameters and not self.static_graph) or ( 166 | self.static_graph and self.num_iterations == 1 167 | ): 168 | state_dict = { 169 | "static_graph": self.static_graph, 170 | "num_iterations": self.num_iterations, 171 | } 172 | 173 | output_tensor_list, treespec, output_is_rref = _tree_flatten_with_rref(output) 174 | output_placeholders = [None for _ in range(len(output_tensor_list))] 175 | # Do not touch tensors that have no grad_fn, which can cause issues 176 | # such as https://github.com/pytorch/pytorch/issues/60733 177 | for i, output in enumerate(output_tensor_list): 178 | if torch.is_tensor(output) and output.grad_fn is None: 179 | output_placeholders[i] = output 180 | 181 | # When find_unused_parameters=True, makes tensors which require grad 182 | # run through the DDPSink backward pass. When not all outputs are 183 | # used in loss, this makes those corresponding tensors receive 184 | # undefined gradient which the reducer then handles to ensure 185 | # param.grad field is not touched and we don't error out. 186 | passthrough_tensor_list = _DDPSink.apply( 187 | self.reducer, 188 | state_dict, 189 | *output_tensor_list, 190 | ) 191 | for i in range(len(output_placeholders)): 192 | if output_placeholders[i] is None: 193 | output_placeholders[i] = passthrough_tensor_list[i] 194 | 195 | # Reconstruct output data structure. 196 | output = _tree_unflatten_with_rref(output_placeholders, treespec, output_is_rref) 197 | return output 198 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/descriptaudiocodec/dac/nn/quantize.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from einops import rearrange 8 | from torch.nn.utils import weight_norm 9 | 10 | from dac.nn.layers import WNConv1d 11 | 12 | 13 | class VectorQuantize(nn.Module): 14 | """ 15 | Implementation of VQ similar to Karpathy's repo: 16 | https://github.com/karpathy/deep-vector-quantization 17 | Additionally uses following tricks from Improved VQGAN 18 | (https://arxiv.org/pdf/2110.04627.pdf): 19 | 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space 20 | for improved codebook usage 21 | 2. 
l2-normalized codes: Converts euclidean distance to cosine similarity which 22 | improves training stability 23 | """ 24 | 25 | def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int): 26 | super().__init__() 27 | self.codebook_size = codebook_size 28 | self.codebook_dim = codebook_dim 29 | 30 | self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1) 31 | self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1) 32 | self.codebook = nn.Embedding(codebook_size, codebook_dim) 33 | 34 | def forward(self, z): 35 | """Quantized the input tensor using a fixed codebook and returns 36 | the corresponding codebook vectors 37 | 38 | Parameters 39 | ---------- 40 | z : Tensor[B x D x T] 41 | 42 | Returns 43 | ------- 44 | Tensor[B x D x T] 45 | Quantized continuous representation of input 46 | Tensor[1] 47 | Commitment loss to train encoder to predict vectors closer to codebook 48 | entries 49 | Tensor[1] 50 | Codebook loss to update the codebook 51 | Tensor[B x T] 52 | Codebook indices (quantized discrete representation of input) 53 | Tensor[B x D x T] 54 | Projected latents (continuous representation of input before quantization) 55 | """ 56 | 57 | # Factorized codes (ViT-VQGAN) Project input into low-dimensional space 58 | z_e = self.in_proj(z) # z_e : (B x D x T) 59 | z_q, indices = self.decode_latents(z_e) 60 | 61 | commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2]) 62 | codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2]) 63 | 64 | z_q = z_e + (z_q - z_e).detach() # noop in forward pass, straight-through gradient estimator in backward pass 65 | 66 | z_q = self.out_proj(z_q) 67 | 68 | return z_q, commitment_loss, codebook_loss, indices, z_e 69 | 70 | def embed_code(self, embed_id): 71 | return F.embedding(embed_id, self.codebook.weight) 72 | 73 | def decode_code(self, embed_id): 74 | return self.embed_code(embed_id).transpose(1, 2) 75 | 76 | def decode_latents(self, latents): 77 | encodings = rearrange(latents, "b d t -> (b t) d") 78 | codebook = self.codebook.weight # codebook: (N x D) 79 | 80 | # L2 normalize encodings and codebook (ViT-VQGAN) 81 | encodings = F.normalize(encodings) 82 | codebook = F.normalize(codebook) 83 | 84 | # Compute euclidean distance with codebook 85 | dist = ( 86 | encodings.pow(2).sum(1, keepdim=True) 87 | - 2 * encodings @ codebook.t() 88 | + codebook.pow(2).sum(1, keepdim=True).t() 89 | ) 90 | indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0)) 91 | z_q = self.decode_code(indices) 92 | return z_q, indices 93 | 94 | 95 | class ResidualVectorQuantize(nn.Module): 96 | """ 97 | Introduced in SoundStream: An end2end neural audio codec 98 | https://arxiv.org/abs/2107.03312 99 | """ 100 | 101 | def __init__( 102 | self, 103 | input_dim: int = 512, 104 | n_codebooks: int = 9, 105 | codebook_size: int = 1024, 106 | codebook_dim: Union[int, list] = 8, 107 | quantizer_dropout: float = 0.0, 108 | ): 109 | super().__init__() 110 | if isinstance(codebook_dim, int): 111 | codebook_dim = [codebook_dim for _ in range(n_codebooks)] 112 | 113 | self.n_codebooks = n_codebooks 114 | self.codebook_dim = codebook_dim 115 | self.codebook_size = codebook_size 116 | 117 | self.quantizers = nn.ModuleList( 118 | [VectorQuantize(input_dim, codebook_size, codebook_dim[i]) for i in range(n_codebooks)] 119 | ) 120 | self.quantizer_dropout = quantizer_dropout 121 | 122 | def forward(self, z, n_quantizers: int = None): 123 | """Quantized the input tensor using a fixed set of `n` codebooks and 
returns 124 | the corresponding codebook vectors 125 | Parameters 126 | ---------- 127 | z : Tensor[B x D x T] 128 | n_quantizers : int, optional 129 | No. of quantizers to use 130 | (n_quantizers < self.n_codebooks ex: for quantizer dropout) 131 | Note: if `self.quantizer_dropout` is True, this argument is ignored 132 | when in training mode, and a random number of quantizers is used. 133 | Returns 134 | ------- 135 | dict 136 | A dictionary with the following keys: 137 | 138 | "z" : Tensor[B x D x T] 139 | Quantized continuous representation of input 140 | "codes" : Tensor[B x N x T] 141 | Codebook indices for each codebook 142 | (quantized discrete representation of input) 143 | "latents" : Tensor[B x N*D x T] 144 | Projected latents (continuous representation of input before quantization) 145 | "vq/commitment_loss" : Tensor[1] 146 | Commitment loss to train encoder to predict vectors closer to codebook 147 | entries 148 | "vq/codebook_loss" : Tensor[1] 149 | Codebook loss to update the codebook 150 | """ 151 | z_q = 0 152 | residual = z 153 | commitment_loss = 0 154 | codebook_loss = 0 155 | 156 | codebook_indices = [] 157 | latents = [] 158 | 159 | if n_quantizers is None: 160 | n_quantizers = self.n_codebooks 161 | if self.training: 162 | n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1 163 | dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],)) 164 | n_dropout = int(z.shape[0] * self.quantizer_dropout) 165 | n_quantizers[:n_dropout] = dropout[:n_dropout] 166 | n_quantizers = n_quantizers.to(z.device) 167 | 168 | for i, quantizer in enumerate(self.quantizers): 169 | if self.training is False and i >= n_quantizers: 170 | break 171 | 172 | z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(residual) 173 | 174 | # Create mask to apply quantizer dropout 175 | mask = torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers 176 | z_q = z_q + z_q_i * mask[:, None, None] 177 | residual = residual - z_q_i 178 | 179 | # Sum losses 180 | commitment_loss += (commitment_loss_i * mask).mean() 181 | codebook_loss += (codebook_loss_i * mask).mean() 182 | 183 | codebook_indices.append(indices_i) 184 | latents.append(z_e_i) 185 | 186 | codes = torch.stack(codebook_indices, dim=1) 187 | latents = torch.cat(latents, dim=1) 188 | 189 | return z_q, codes, latents, commitment_loss, codebook_loss 190 | 191 | def from_codes(self, codes: torch.Tensor): 192 | """Given the quantized codes, reconstruct the continuous representation 193 | Parameters 194 | ---------- 195 | codes : Tensor[B x N x T] 196 | Quantized discrete representation of input 197 | Returns 198 | ------- 199 | Tensor[B x D x T] 200 | Quantized continuous representation of input 201 | """ 202 | z_q = 0.0 203 | z_p = [] 204 | n_codebooks = codes.shape[1] 205 | for i in range(n_codebooks): 206 | z_p_i = self.quantizers[i].decode_code(codes[:, i, :]) 207 | z_p.append(z_p_i) 208 | 209 | z_q_i = self.quantizers[i].out_proj(z_p_i) 210 | z_q = z_q + z_q_i 211 | return z_q, torch.cat(z_p, dim=1), codes 212 | 213 | def from_latents(self, latents: torch.Tensor): 214 | """Given the unquantized latents, reconstruct the 215 | continuous representation after quantization. 
216 | 217 | Parameters 218 | ---------- 219 | latents : Tensor[B x N*D x T] 220 | Continuous representation of input after projection 221 | 222 | Returns 223 | ------- 224 | Tensor[B x D x T] 225 | Quantized representation of full-projected space 226 | Tensor[B x D x T] 227 | Quantized representation of latent space 228 | """ 229 | z_q = 0 230 | z_p = [] 231 | codes = [] 232 | dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers]) 233 | 234 | n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0] 235 | for i in range(n_codebooks): 236 | j, k = dims[i], dims[i + 1] 237 | z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :]) 238 | z_p.append(z_p_i) 239 | codes.append(codes_i) 240 | 241 | z_q_i = self.quantizers[i].out_proj(z_p_i) 242 | z_q = z_q + z_q_i 243 | 244 | return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1) 245 | 246 | 247 | if __name__ == "__main__": 248 | rvq = ResidualVectorQuantize(quantizer_dropout=True) 249 | x = torch.randn(16, 512, 80) 250 | z_q, codes, latents, commitment_loss, codebook_loss = rvq(x)  # forward() returns a tuple, not a dict 251 | print(latents.shape) 252 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/descriptaudiocodec/dac/model/base.py: -------------------------------------------------------------------------------- 1 | import math 2 | from dataclasses import dataclass 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import numpy as np 7 | import torch 8 | import tqdm 9 | from audiotools import AudioSignal 10 | from torch import nn 11 | 12 | SUPPORTED_VERSIONS = ["1.0.0"] 13 | 14 | 15 | @dataclass 16 | class DACFile: 17 | codes: torch.Tensor 18 | 19 | # Metadata 20 | chunk_length: int 21 | original_length: int 22 | input_db: float 23 | channels: int 24 | sample_rate: int 25 | padding: bool 26 | dac_version: str 27 | 28 | def save(self, path): 29 | artifacts = { 30 | "codes": self.codes.numpy().astype(np.uint16), 31 | "metadata": { 32 | "input_db": self.input_db.numpy().astype(np.float32), 33 | "original_length": self.original_length, 34 | "sample_rate": self.sample_rate, 35 | "chunk_length": self.chunk_length, 36 | "channels": self.channels, 37 | "padding": self.padding, 38 | "dac_version": SUPPORTED_VERSIONS[-1], 39 | }, 40 | } 41 | path = Path(path).with_suffix(".dac") 42 | with open(path, "wb") as f: 43 | np.save(f, artifacts) 44 | return path 45 | 46 | @classmethod 47 | def load(cls, path): 48 | artifacts = np.load(path, allow_pickle=True)[()] 49 | codes = torch.from_numpy(artifacts["codes"].astype(int)) 50 | if artifacts["metadata"].get("dac_version", None) not in SUPPORTED_VERSIONS: 51 | raise RuntimeError(f"Given file {path} can't be loaded with this version of descript-audio-codec.") 52 | return cls(codes=codes, **artifacts["metadata"]) 53 | 54 | 55 | class CodecMixin: 56 | @property 57 | def padding(self): 58 | if not hasattr(self, "_padding"): 59 | self._padding = True 60 | return self._padding 61 | 62 | @padding.setter 63 | def padding(self, value): 64 | assert isinstance(value, bool) 65 | 66 | layers = [l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))] 67 | 68 | for layer in layers: 69 | if value: 70 | if hasattr(layer, "original_padding"): 71 | layer.padding = layer.original_padding 72 | else: 73 | layer.original_padding = layer.padding 74 | layer.padding = tuple(0 for _ in range(len(layer.padding))) 75 | 76 | self._padding = value 77 | 78 | def get_delay(self): 79 | # Any number works here, delay is invariant to input length 80 | l_out = 
self.get_output_length(0) 81 | L = l_out 82 | 83 | layers = [] 84 | for layer in self.modules(): 85 | if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)): 86 | layers.append(layer) 87 | 88 | for layer in reversed(layers): 89 | d = layer.dilation[0] 90 | k = layer.kernel_size[0] 91 | s = layer.stride[0] 92 | 93 | if isinstance(layer, nn.ConvTranspose1d): 94 | L = ((L - d * (k - 1) - 1) / s) + 1 95 | elif isinstance(layer, nn.Conv1d): 96 | L = (L - 1) * s + d * (k - 1) + 1 97 | 98 | L = math.ceil(L) 99 | 100 | l_in = L 101 | 102 | return (l_in - l_out) // 2 103 | 104 | def get_output_length(self, input_length): 105 | L = input_length 106 | # Calculate output length 107 | for layer in self.modules(): 108 | if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)): 109 | d = layer.dilation[0] 110 | k = layer.kernel_size[0] 111 | s = layer.stride[0] 112 | 113 | if isinstance(layer, nn.Conv1d): 114 | L = ((L - d * (k - 1) - 1) / s) + 1 115 | elif isinstance(layer, nn.ConvTranspose1d): 116 | L = (L - 1) * s + d * (k - 1) + 1 117 | 118 | L = math.floor(L) 119 | return L 120 | 121 | @torch.no_grad() 122 | def compress( 123 | self, 124 | audio_path_or_signal: Union[str, Path, AudioSignal], 125 | win_duration: float = 1.0, 126 | verbose: bool = False, 127 | normalize_db: float = -16, 128 | n_quantizers: int = None, 129 | ) -> DACFile: 130 | """Processes an audio signal from a file or AudioSignal object into 131 | discrete codes. This function processes the signal in short windows, 132 | using constant GPU memory. 133 | 134 | Parameters 135 | ---------- 136 | audio_path_or_signal : Union[str, Path, AudioSignal] 137 | audio signal to compress 138 | win_duration : float, optional 139 | window duration in seconds, by default 1.0 140 | verbose : bool, optional 141 | Prints progress if True, by default False 142 | normalize_db : float, optional 143 | Loudness (in dB) to normalize the audio to, by default -16 144 | 145 | Returns 146 | ------- 147 | DACFile 148 | Object containing compressed codes and metadata 149 | required for decompression 150 | """ 151 | audio_signal = audio_path_or_signal 152 | if isinstance(audio_signal, (str, Path)): 153 | audio_signal = AudioSignal.load_from_file_with_ffmpeg(str(audio_signal)) 154 | 155 | self.eval() 156 | original_padding = self.padding 157 | original_device = audio_signal.device 158 | 159 | audio_signal = audio_signal.clone() 160 | original_sr = audio_signal.sample_rate 161 | 162 | resample_fn = audio_signal.resample 163 | loudness_fn = audio_signal.loudness 164 | 165 | # If audio is >= 10 hours long, use the ffmpeg versions 166 | if audio_signal.signal_duration >= 10 * 60 * 60: 167 | resample_fn = audio_signal.ffmpeg_resample 168 | loudness_fn = audio_signal.ffmpeg_loudness 169 | 170 | original_length = audio_signal.signal_length 171 | resample_fn(self.sample_rate) 172 | input_db = loudness_fn() 173 | 174 | if normalize_db is not None: 175 | audio_signal.normalize(normalize_db) 176 | audio_signal.ensure_max_of_audio() 177 | 178 | nb, nac, nt = audio_signal.audio_data.shape 179 | audio_signal.audio_data = audio_signal.audio_data.reshape(nb * nac, 1, nt) 180 | win_duration = audio_signal.signal_duration if win_duration is None else win_duration 181 | 182 | if audio_signal.signal_duration <= win_duration: 183 | # Unchunked compression (used if signal length < win duration) 184 | self.padding = True 185 | n_samples = nt 186 | hop = nt 187 | else: 188 | # Chunked inference 189 | self.padding = False 190 | # Zero-pad signal on either side by the delay 191 | audio_signal.zero_pad(self.delay, self.delay) 192 | n_samples = 
int(win_duration * self.sample_rate) 193 | # Round n_samples to nearest hop length multiple 194 | n_samples = int(math.ceil(n_samples / self.hop_length) * self.hop_length) 195 | hop = self.get_output_length(n_samples) 196 | 197 | codes = [] 198 | range_fn = range if not verbose else tqdm.trange 199 | 200 | for i in range_fn(0, nt, hop): 201 | x = audio_signal[..., i : i + n_samples] 202 | x = x.zero_pad(0, max(0, n_samples - x.shape[-1])) 203 | 204 | audio_data = x.audio_data.to(self.device) 205 | audio_data = self.preprocess(audio_data, self.sample_rate) 206 | _, c, _, _, _ = self.encode(audio_data, n_quantizers) 207 | codes.append(c.to(original_device)) 208 | chunk_length = c.shape[-1] 209 | 210 | codes = torch.cat(codes, dim=-1) 211 | 212 | dac_file = DACFile( 213 | codes=codes, 214 | chunk_length=chunk_length, 215 | original_length=original_length, 216 | input_db=input_db, 217 | channels=nac, 218 | sample_rate=original_sr, 219 | padding=self.padding, 220 | dac_version=SUPPORTED_VERSIONS[-1], 221 | ) 222 | 223 | if n_quantizers is not None: 224 | codes = codes[:, :n_quantizers, :] 225 | 226 | self.padding = original_padding 227 | return dac_file 228 | 229 | @torch.no_grad() 230 | def decompress( 231 | self, 232 | obj: Union[str, Path, DACFile], 233 | verbose: bool = False, 234 | ) -> AudioSignal: 235 | """Reconstruct audio from a given .dac file 236 | 237 | Parameters 238 | ---------- 239 | obj : Union[str, Path, DACFile] 240 | .dac file location or corresponding DACFile object. 241 | verbose : bool, optional 242 | Prints progress if True, by default False 243 | 244 | Returns 245 | ------- 246 | AudioSignal 247 | Object with the reconstructed audio 248 | """ 249 | self.eval() 250 | if isinstance(obj, (str, Path)): 251 | obj = DACFile.load(obj) 252 | 253 | original_padding = self.padding 254 | self.padding = obj.padding 255 | 256 | range_fn = range if not verbose else tqdm.trange 257 | codes = obj.codes 258 | original_device = codes.device 259 | chunk_length = obj.chunk_length 260 | recons = [] 261 | 262 | for i in range_fn(0, codes.shape[-1], chunk_length): 263 | c = codes[..., i : i + chunk_length].to(self.device) 264 | z = self.quantizer.from_codes(c)[0] 265 | r = self.decode(z) 266 | recons.append(r.to(original_device)) 267 | 268 | recons = torch.cat(recons, dim=-1) 269 | recons = AudioSignal(recons, self.sample_rate) 270 | 271 | resample_fn = recons.resample 272 | loudness_fn = recons.loudness 273 | 274 | # If audio is > 10 minutes long, use the ffmpeg versions 275 | if recons.signal_duration >= 10 * 60 * 60: 276 | resample_fn = recons.ffmpeg_resample 277 | loudness_fn = recons.ffmpeg_loudness 278 | 279 | recons.normalize(obj.input_db) 280 | resample_fn(obj.sample_rate) 281 | recons = recons[..., : obj.original_length] 282 | loudness_fn() 283 | recons.audio_data = recons.audio_data.reshape(-1, obj.channels, obj.original_length) 284 | 285 | self.padding = original_padding 286 | return recons 287 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
-------------------------------------------------------------------------------- /boson_multimodal/audio_processing/descriptaudiocodec/dac/model/dac.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import List 3 | from typing import Union 4 | 5 | import numpy as np 6 | import torch 7 | from audiotools import AudioSignal 8 | from audiotools.ml import BaseModel 9 | from torch import nn 10 | 11 | from .base import CodecMixin 12 | from dac.nn.layers import Snake1d 13 | from dac.nn.layers import WNConv1d 14 | from dac.nn.layers import WNConvTranspose1d 15 | from dac.nn.quantize import ResidualVectorQuantize 16 | 17 | 18 | def init_weights(m): 19 | if isinstance(m, nn.Conv1d): 20 | nn.init.trunc_normal_(m.weight, std=0.02) 21 | nn.init.constant_(m.bias, 0) 22 | 23 | 24 | class ResidualUnit(nn.Module): 25 | def __init__(self, dim: int = 16, dilation: int = 1): 26 | super().__init__() 27 | pad = ((7 - 1) * dilation) // 2 28 | self.block = nn.Sequential( 29 | Snake1d(dim), 30 | WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad), 31 | Snake1d(dim), 32 | WNConv1d(dim, dim, kernel_size=1), 33 | ) 34 | 35 | def forward(self, x): 36 | y = self.block(x) 37 | pad = (x.shape[-1] - y.shape[-1]) // 2 38 | if pad > 0: 39 | x = x[..., pad:-pad] 40 | return x + y 41 | 42 | 43 | class EncoderBlock(nn.Module): 44 | def __init__(self, dim: int = 16, stride: int = 1): 45 | super().__init__() 46 | self.block = nn.Sequential( 47 | ResidualUnit(dim // 2, dilation=1), 48 | ResidualUnit(dim // 2, dilation=3), 49 | ResidualUnit(dim // 2, dilation=9), 50 | Snake1d(dim // 2), 51 | WNConv1d( 52 | dim // 2, 53 | dim, 54 | kernel_size=2 * stride, 55 | stride=stride, 56 | padding=math.ceil(stride / 2), 57 | ), 58 | ) 59 | 60 | def forward(self, x): 61 | return self.block(x) 62 | 63 | 64 | class Encoder(nn.Module): 65 | def __init__( 66 | self, 67 | d_model: int = 64, 68 | strides: list = [2, 4, 8, 8], 69 | d_latent: int = 256, 70 | ): 71 | super().__init__() 72 | # Create first convolution 73 | self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)] 74 | 75 | # Create EncoderBlocks that double channels as they downsample by `stride` 76 | for stride in strides: 77 | d_model *= 2 78 | self.block += [EncoderBlock(d_model, stride=stride)] 79 | 80 | # Create last convolution 81 | self.block += [ 82 | Snake1d(d_model), 83 | WNConv1d(d_model, d_latent, kernel_size=3, padding=1), 84 | ] 85 | 86 | # Wrap black into nn.Sequential 87 | self.block = nn.Sequential(*self.block) 88 | self.enc_dim = d_model 89 | 90 | def forward(self, x): 91 | return self.block(x) 92 | 93 | 94 | class DecoderBlock(nn.Module): 95 | def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1, out_pad=0): 96 | super().__init__() 97 | self.block = nn.Sequential( 98 | Snake1d(input_dim), 99 | WNConvTranspose1d( 100 | input_dim, 101 | output_dim, 102 | kernel_size=2 * stride, 103 | stride=stride, 104 | padding=math.ceil(stride / 2), 105 | output_padding=stride % 2, # out_pad, 106 | ), 107 | ResidualUnit(output_dim, dilation=1), 108 | ResidualUnit(output_dim, dilation=3), 109 | ResidualUnit(output_dim, dilation=9), 110 | ) 111 | 112 | def forward(self, x): 113 | return self.block(x) 114 | 115 | 116 | class Decoder(nn.Module): 117 | def __init__( 118 | self, 119 | input_channel, 120 | channels, 121 | rates, 122 | d_out: int = 1, 123 | ): 124 | super().__init__() 125 | 126 | # Add first conv layer 127 | layers = [WNConv1d(input_channel, channels, kernel_size=7, 
padding=3)] 128 | 129 | # Add upsampling + MRF blocks 130 | for i, stride in enumerate(rates): 131 | input_dim = channels // 2**i 132 | output_dim = channels // 2 ** (i + 1) 133 | if i == 1: 134 | out_pad = 1 135 | else: 136 | out_pad = 0 137 | layers += [DecoderBlock(input_dim, output_dim, stride, out_pad)] 138 | 139 | # Add final conv layer 140 | layers += [ 141 | Snake1d(output_dim), 142 | WNConv1d(output_dim, d_out, kernel_size=7, padding=3), 143 | # nn.Tanh(), 144 | ] 145 | 146 | self.model = nn.Sequential(*layers) 147 | 148 | def forward(self, x): 149 | return self.model(x) 150 | 151 | 152 | class DAC(BaseModel, CodecMixin): 153 | def __init__( 154 | self, 155 | encoder_dim: int = 64, 156 | encoder_rates: List[int] = [2, 4, 8, 8], 157 | latent_dim: int = None, 158 | decoder_dim: int = 1536, 159 | decoder_rates: List[int] = [8, 8, 4, 2], 160 | n_codebooks: int = 9, 161 | codebook_size: int = 1024, 162 | codebook_dim: Union[int, list] = 8, 163 | quantizer_dropout: bool = False, 164 | sample_rate: int = 44100, 165 | ): 166 | super().__init__() 167 | 168 | self.encoder_dim = encoder_dim 169 | self.encoder_rates = encoder_rates 170 | self.decoder_dim = decoder_dim 171 | self.decoder_rates = decoder_rates 172 | self.sample_rate = sample_rate 173 | 174 | if latent_dim is None: 175 | latent_dim = encoder_dim * (2 ** len(encoder_rates)) 176 | 177 | self.latent_dim = latent_dim 178 | 179 | self.hop_length = np.prod(encoder_rates) 180 | self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim) 181 | 182 | self.n_codebooks = n_codebooks 183 | self.codebook_size = codebook_size 184 | self.codebook_dim = codebook_dim 185 | self.quantizer = ResidualVectorQuantize( 186 | input_dim=latent_dim, 187 | n_codebooks=n_codebooks, 188 | codebook_size=codebook_size, 189 | codebook_dim=codebook_dim, 190 | quantizer_dropout=quantizer_dropout, 191 | ) 192 | 193 | self.decoder = Decoder( 194 | latent_dim, 195 | decoder_dim, 196 | decoder_rates, 197 | ) 198 | self.sample_rate = sample_rate 199 | self.apply(init_weights) 200 | 201 | self.delay = self.get_delay() 202 | 203 | def preprocess(self, audio_data, sample_rate): 204 | if sample_rate is None: 205 | sample_rate = self.sample_rate 206 | assert sample_rate == self.sample_rate 207 | 208 | length = audio_data.shape[-1] 209 | right_pad = math.ceil(length / self.hop_length) * self.hop_length - length 210 | audio_data = nn.functional.pad(audio_data, (0, right_pad)) 211 | 212 | return audio_data 213 | 214 | def encode( 215 | self, 216 | audio_data: torch.Tensor, 217 | n_quantizers: int = None, 218 | ): 219 | """Encode given audio data and return quantized latent codes 220 | 221 | Parameters 222 | ---------- 223 | audio_data : Tensor[B x 1 x T] 224 | Audio data to encode 225 | n_quantizers : int, optional 226 | Number of quantizers to use, by default None 227 | If None, all quantizers are used. 
228 | 229 | Returns 230 | ------- 231 | dict 232 | A dictionary with the following keys: 233 | "z" : Tensor[B x D x T] 234 | Quantized continuous representation of input 235 | "codes" : Tensor[B x N x T] 236 | Codebook indices for each codebook 237 | (quantized discrete representation of input) 238 | "latents" : Tensor[B x N*D x T] 239 | Projected latents (continuous representation of input before quantization) 240 | "vq/commitment_loss" : Tensor[1] 241 | Commitment loss to train encoder to predict vectors closer to codebook 242 | entries 243 | "vq/codebook_loss" : Tensor[1] 244 | Codebook loss to update the codebook 245 | "length" : int 246 | Number of samples in input audio 247 | """ 248 | z = self.encoder(audio_data) 249 | z, codes, latents, commitment_loss, codebook_loss = self.quantizer(z, n_quantizers) 250 | return z, codes, latents, commitment_loss, codebook_loss 251 | 252 | def decode(self, z: torch.Tensor): 253 | """Decode given latent codes and return audio data 254 | 255 | Parameters 256 | ---------- 257 | z : Tensor[B x D x T] 258 | Quantized continuous representation of input 259 | length : int, optional 260 | Number of samples in output audio, by default None 261 | 262 | Returns 263 | ------- 264 | dict 265 | A dictionary with the following keys: 266 | "audio" : Tensor[B x 1 x length] 267 | Decoded audio data. 268 | """ 269 | return self.decoder(z) 270 | 271 | def forward( 272 | self, 273 | audio_data: torch.Tensor, 274 | sample_rate: int = None, 275 | n_quantizers: int = None, 276 | ): 277 | """Model forward pass 278 | 279 | Parameters 280 | ---------- 281 | audio_data : Tensor[B x 1 x T] 282 | Audio data to encode 283 | sample_rate : int, optional 284 | Sample rate of audio data in Hz, by default None 285 | If None, defaults to `self.sample_rate` 286 | n_quantizers : int, optional 287 | Number of quantizers to use, by default None. 288 | If None, all quantizers are used. 289 | 290 | Returns 291 | ------- 292 | dict 293 | A dictionary with the following keys: 294 | "z" : Tensor[B x D x T] 295 | Quantized continuous representation of input 296 | "codes" : Tensor[B x N x T] 297 | Codebook indices for each codebook 298 | (quantized discrete representation of input) 299 | "latents" : Tensor[B x N*D x T] 300 | Projected latents (continuous representation of input before quantization) 301 | "vq/commitment_loss" : Tensor[1] 302 | Commitment loss to train encoder to predict vectors closer to codebook 303 | entries 304 | "vq/codebook_loss" : Tensor[1] 305 | Codebook loss to update the codebook 306 | "length" : int 307 | Number of samples in input audio 308 | "audio" : Tensor[B x 1 x length] 309 | Decoded audio data. 310 | """ 311 | length = audio_data.shape[-1] 312 | audio_data = self.preprocess(audio_data, sample_rate) 313 | z, codes, latents, commitment_loss, codebook_loss = self.encode(audio_data, n_quantizers) 314 | 315 | x = self.decode(z) 316 | return { 317 | "audio": x[..., :length], 318 | "z": z, 319 | "codes": codes, 320 | "latents": latents, 321 | "vq/commitment_loss": commitment_loss, 322 | "vq/codebook_loss": codebook_loss, 323 | } 324 | 325 | 326 | if __name__ == "__main__": 327 | import numpy as np 328 | from functools import partial 329 | 330 | model = DAC().to("cpu") 331 | 332 | for n, m in model.named_modules(): 333 | o = m.extra_repr() 334 | p = sum([np.prod(p.size()) for p in m.parameters()]) 335 | fn = lambda o, p: o + f" {p / 1e6:<.3f}M params." 
336 | setattr(m, "extra_repr", partial(fn, o=o, p=p)) 337 | print(model) 338 | print("Total # of params: ", sum([np.prod(p.size()) for p in model.parameters()])) 339 | 340 | length = 88200 * 2 341 | x = torch.randn(1, 1, length).to(model.device) 342 | x.requires_grad_(True) 343 | x.retain_grad() 344 | 345 | # Make a forward pass 346 | out = model(x)["audio"] 347 | print("Input shape:", x.shape) 348 | print("Output shape:", out.shape) 349 | 350 | # Create gradient variable 351 | grad = torch.zeros_like(out) 352 | grad[:, :, grad.shape[-1] // 2] = 1 353 | 354 | # Make a backward pass 355 | out.backward(grad) 356 | 357 | # Check non-zero values 358 | gradmap = x.grad.squeeze(0) 359 | gradmap = (gradmap != 0).sum(0) # sum across features 360 | rf = (gradmap != 0).sum() 361 | 362 | print(f"Receptive field: {rf.item()}") 363 | 364 | x = AudioSignal(torch.randn(1, 1, 44100 * 60), 44100) 365 | model.decompress(model.compress(x, verbose=True), verbose=True) 366 | -------------------------------------------------------------------------------- /boson_multimodal/model/higgs_audio/configuration_higgs_audio.py: -------------------------------------------------------------------------------- 1 | from transformers.configuration_utils import PretrainedConfig 2 | from transformers.models.auto import CONFIG_MAPPING 3 | 4 | 5 | class HiggsAudioEncoderConfig(PretrainedConfig): 6 | """Configuration of the Audio encoder in Higgs-Audio.""" 7 | 8 | model_type = "higgs_audio_encoder" 9 | 10 | def __init__( 11 | self, 12 | num_mel_bins=128, 13 | encoder_layers=32, 14 | encoder_attention_heads=20, 15 | encoder_ffn_dim=5120, 16 | encoder_layerdrop=0.0, 17 | d_model=1280, 18 | dropout=0.0, 19 | attention_dropout=0.0, 20 | activation_function="gelu", 21 | activation_dropout=0.0, 22 | scale_embedding=False, 23 | init_std=0.02, 24 | max_source_positions=1500, 25 | pad_token_id=128001, 26 | **kwargs, 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | self.num_mel_bins = num_mel_bins 31 | self.d_model = d_model 32 | self.encoder_layers = encoder_layers 33 | self.encoder_attention_heads = encoder_attention_heads 34 | self.encoder_ffn_dim = encoder_ffn_dim 35 | self.dropout = dropout 36 | self.attention_dropout = attention_dropout 37 | self.activation_function = activation_function 38 | self.activation_dropout = activation_dropout 39 | self.encoder_layerdrop = encoder_layerdrop 40 | self.num_hidden_layers = encoder_layers 41 | self.init_std = init_std 42 | self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True 43 | self.max_source_positions = max_source_positions 44 | self.pad_token_id = pad_token_id 45 | 46 | 47 | class HiggsAudioConfig(PretrainedConfig): 48 | r""" 49 | This is the configuration class for the HiggsAudioModel. 50 | 51 | Args: 52 | text_config (`Union[AutoConfig, dict]`): 53 | The config object or dictionary of the text backbone. 54 | audio_encoder_config (`Union[AutoConfig, dict]`): 55 | The config object or dictionary of the whisper encoder. 56 | The audio encoder will be bidirectional and will be only available for audio understanding. 57 | audio_tokenizer_config 58 | The config object or dictionary of the audio tokenizer. 59 | audio_adapter_type 60 | The type of audio adapter to use. We support two types of adapter: 61 | - stack: 62 | We stack additional Transformer layers after the main LLM backbone for audio generation. 63 | - dual_ffn: 64 | For selected part of the LLM backbone, we replace the text FFN with a dual FFN architecture 65 | that contains an additional audio FFN. 
The audio FFN will be triggered when the location is marked for audio tokens. 66 | - dual_ffn_fast_forward: 67 | We pick a few layers in the LLM backbone to plug in the audio FFN. For the remaining layers, 68 | the audio hidden states will be directly fast-forwarded to the next layer. 69 | This reduces the computational cost for audio generation. 70 | audio_embed_avg (`bool`, *optional*, defaults to False): 71 | Whether to average the audio embeddings before sending them to the text attention layer. 72 | audio_ffn_hidden_size 73 | The hidden size of the audio feedforward network in dual-path FFN 74 | audio_ffn_intermediate_size 75 | The intermediate size of the audio feedforward network in dual-path FFN 76 | audio_dual_ffn_layers 77 | The layers in the LLM backbone to plug in the dual FFN layer (mixture of audio FFN and text FFN). 78 | audio_decoder_proj_num_layers (`int`, *optional*, defaults to 0): 79 | The number of layers in the audio decoder projection module. 80 | use_delay_pattern (`bool`, *optional*, defaults to False): 81 | Whether to use the delay pattern in the audio decoder. 82 | skip_audio_tower (`bool`, *optional*, defaults to False): 83 | Whether to skip the audio tower in the audio encoder. 84 | use_audio_out_embed_projector (`bool`, *optional*, defaults to False): 85 | Whether to use an embedding projector to map audio out embeddings. 86 | use_audio_out_self_attention (`bool`, *optional*, defaults to False): 87 | Whether to use self-attention to aggregate information from audio tokens before sending to the text attention layer. 88 | audio_num_codebooks (`int`, *optional*, defaults to 12): 89 | The number of codebooks in RVQGAN. 90 | audio_codebook_size (`int`, *optional*, defaults to 1024): 91 | The size of each codebook in RVQGAN. 92 | audio_stream_bos_id 93 | The id of the BOS token in the audio stream 94 | audio_stream_eos_id 95 | The id of the EOS token in the audio stream 96 | audio_bos_token (`str`, *optional*, defaults to "<|audio_bos|>"): 97 | The special `<|audio_bos|>` token. In Higgs-Audio, it is mapped to 128011, 98 | which is the index of `<|reserved_special_token_3|>` in Llama-3.1-8B-Instruct's tokenizer. 99 | audio_eos_token (`str`, *optional*, defaults to "<|audio_eos|>"): 100 | The special `<|audio_eos|>` token. We use 128012 as the default value, 101 | which is the index of `<|reserved_special_token_4|>` in Llama-3.1-8B-Instruct's tokenizer. 102 | audio_out_bos_token (`str`, *optional*, defaults to "<|audio_out_bos|>"): 103 | The special `<|audio_out_bos|>` token. We use 128013 as the default value, 104 | which is the index of `<|reserved_special_token_5|>` in Llama-3.1-8B-Instruct's tokenizer. 105 | audio_in_token (`str`, *optional*, defaults to "<|AUDIO|>"): 106 | The special `<|AUDIO|>` token. We use 128015 as the default value, 107 | which is the index of `<|reserved_special_token_7|>` in Llama-3.1-8B-Instruct's tokenizer. 108 | This token indicates that the location should be filled in with whisper features. 109 | audio_out_token (`str`, *optional*, defaults to "<|AUDIO_OUT|>"): 110 | The special `<|AUDIO_OUT|>` token. We use 128016 as the default value, 111 | which is the index of `<|reserved_special_token_8|>` in Llama-3.1-8B-Instruct's tokenizer. 112 | This token indicates that the location should be filled in with audio tokens extracted via audio tokenizer. 
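    Example (a minimal, illustrative sketch; the dual-FFN layer indices below are placeholders chosen for demonstration, not the released checkpoint's settings):

        >>> from boson_multimodal.model.higgs_audio.configuration_higgs_audio import HiggsAudioConfig
        >>> config = HiggsAudioConfig(
        ...     audio_adapter_type="dual_ffn",
        ...     audio_dual_ffn_layers=[8, 16, 24],
        ...     use_delay_pattern=True,
        ... )
        >>> config.audio_num_codebooks
        12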
113 | """ 114 | 115 | model_type = "higgs_audio" 116 | is_composition = True 117 | 118 | def __init__( 119 | self, 120 | text_config=None, 121 | audio_encoder_config=None, 122 | audio_tokenizer_config=None, 123 | audio_adapter_type="stack", 124 | audio_embed_avg=False, 125 | audio_ffn_hidden_size=4096, 126 | audio_ffn_intermediate_size=14336, 127 | audio_dual_ffn_layers=None, 128 | audio_decoder_proj_num_layers=0, 129 | encode_whisper_embed=True, 130 | encode_audio_in_tokens=False, 131 | use_delay_pattern=False, 132 | skip_audio_tower=False, 133 | use_audio_out_embed_projector=False, 134 | use_audio_out_self_attention=False, 135 | use_rq_transformer=False, 136 | rq_transformer_hidden_size=None, 137 | rq_transformer_intermediate_size=None, 138 | rq_transformer_num_attention_heads=None, 139 | rq_transformer_num_key_value_heads=None, 140 | rq_transformer_num_hidden_layers=3, 141 | audio_num_codebooks=12, 142 | audio_codebook_size=1024, 143 | audio_stream_bos_id=1024, 144 | audio_stream_eos_id=1025, 145 | audio_bos_token="<|audio_bos|>", 146 | audio_eos_token="<|audio_eos|>", 147 | audio_out_bos_token="<|audio_out_bos|>", 148 | audio_in_token="<|AUDIO|>", 149 | audio_out_token="<|AUDIO_OUT|>", 150 | audio_in_token_idx=128015, 151 | audio_out_token_idx=128016, 152 | pad_token_id=128001, 153 | audio_out_bos_token_id=128013, 154 | audio_eos_token_id=128012, 155 | **kwargs, 156 | ): 157 | if isinstance(audio_encoder_config, dict): 158 | audio_encoder_config["model_type"] = ( 159 | audio_encoder_config["model_type"] if "model_type" in audio_encoder_config else "higgs_audio_encoder" 160 | ) 161 | audio_encoder_config = CONFIG_MAPPING[audio_encoder_config["model_type"]](**audio_encoder_config) 162 | elif audio_encoder_config is None: 163 | audio_encoder_config = HiggsAudioEncoderConfig() 164 | 165 | if isinstance(text_config, dict): 166 | text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama" 167 | text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) 168 | elif text_config is None: 169 | text_config = CONFIG_MAPPING["llama"]() 170 | 171 | assert audio_adapter_type in [ 172 | "stack", 173 | "dual_ffn", 174 | "dual_ffn_fast_forward", 175 | ], f"Invalid audio adapter type: {audio_adapter_type}" 176 | if audio_adapter_type.startswith("dual_ffn"): 177 | assert audio_dual_ffn_layers is not None, ( 178 | "audio_dual_ffn_layers must be specified when using dual_ffn adapter." 179 | ) 180 | self.text_config = text_config 181 | self.audio_encoder_config = audio_encoder_config 182 | self.audio_tokenizer_config = audio_tokenizer_config 183 | self.audio_adapter_type = audio_adapter_type 184 | self.audio_embed_avg = audio_embed_avg 185 | self.audio_ffn_hidden_size = audio_ffn_hidden_size 186 | self.audio_ffn_intermediate_size = audio_ffn_intermediate_size 187 | self.audio_dual_ffn_layers = audio_dual_ffn_layers 188 | self.audio_decoder_proj_num_layers = audio_decoder_proj_num_layers 189 | self.encode_whisper_embed = encode_whisper_embed 190 | self.encode_audio_in_tokens = encode_audio_in_tokens 191 | self.use_delay_pattern = use_delay_pattern 192 | self.skip_audio_tower = skip_audio_tower 193 | self.use_audio_out_embed_projector = use_audio_out_embed_projector 194 | self.use_audio_out_self_attention = use_audio_out_self_attention 195 | 196 | self.use_rq_transformer = use_rq_transformer 197 | 198 | if self.use_rq_transformer: 199 | assert not self.use_delay_pattern, "Delay pattern is not supported if you turned on RQ-Transformer!" 
200 | self.rq_transformer_hidden_size = rq_transformer_hidden_size 201 | self.rq_transformer_intermediate_size = rq_transformer_intermediate_size 202 | self.rq_transformer_num_attention_heads = rq_transformer_num_attention_heads 203 | self.rq_transformer_num_key_value_heads = rq_transformer_num_key_value_heads 204 | self.rq_transformer_num_hidden_layers = rq_transformer_num_hidden_layers 205 | 206 | if use_rq_transformer: 207 | # For RQ-Transformer, we set the hidden_size to the same as the text model's hidden size if it is not specified. 208 | if self.rq_transformer_hidden_size is None: 209 | self.rq_transformer_hidden_size = text_config.hidden_size 210 | assert self.rq_transformer_hidden_size % 128 == 0 211 | if self.rq_transformer_intermediate_size is None: 212 | self.rq_transformer_intermediate_size = text_config.intermediate_size 213 | if self.rq_transformer_num_attention_heads is None: 214 | self.rq_transformer_num_attention_heads = self.rq_transformer_hidden_size // 128 215 | if self.rq_transformer_num_key_value_heads is None: 216 | self.rq_transformer_num_key_value_heads = self.rq_transformer_hidden_size // 128 // 4 217 | assert self.rq_transformer_hidden_size % self.rq_transformer_num_attention_heads == 0 218 | assert self.rq_transformer_hidden_size % self.rq_transformer_num_key_value_heads == 0 219 | 220 | self.audio_num_codebooks = audio_num_codebooks 221 | self.audio_codebook_size = audio_codebook_size 222 | self.audio_bos_token = audio_bos_token 223 | self.audio_eos_token = audio_eos_token 224 | self.audio_out_bos_token = audio_out_bos_token 225 | self.audio_in_token = audio_in_token 226 | self.audio_out_token = audio_out_token 227 | self.audio_in_token_idx = audio_in_token_idx 228 | self.audio_out_token_idx = audio_out_token_idx 229 | self.audio_stream_bos_id = audio_stream_bos_id 230 | self.audio_stream_eos_id = audio_stream_eos_id 231 | self.audio_out_bos_token_id = audio_out_bos_token_id 232 | self.audio_eos_token_id = audio_eos_token_id 233 | 234 | super().__init__(**kwargs) 235 | self.pad_token_id = pad_token_id 236 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/higgs_audio_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Based on code from: https://github.com/zhenye234/xcodec 2 | # Licensed under MIT License 3 | # Modifications by BosonAI 4 | 5 | import math 6 | import os 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from typing import Optional, Union, Sequence 11 | import numpy as np 12 | from transformers import AutoModel 13 | import torchaudio 14 | import json 15 | import librosa 16 | from huggingface_hub import snapshot_download 17 | 18 | from vector_quantize_pytorch import ResidualFSQ 19 | from .descriptaudiocodec.dac.model import dac as dac2 20 | from .quantization.vq import ResidualVectorQuantizer 21 | from .semantic_module import Encoder, Decoder 22 | 23 | 24 | class EncodedResult: 25 | def __init__(self, audio_codes): 26 | self.audio_codes = audio_codes 27 | 28 | 29 | class HiggsAudioFeatureExtractor(nn.Module): 30 | def __init__(self, sampling_rate=16000): 31 | super().__init__() 32 | self.sampling_rate = sampling_rate 33 | 34 | def forward(self, raw_audio, sampling_rate=16000, return_tensors="pt"): 35 | # Convert from librosa to torch 36 | audio_signal = torch.tensor(raw_audio) 37 | audio_signal = audio_signal.unsqueeze(0) 38 | if len(audio_signal.shape) < 3: 39 | audio_signal = 
audio_signal.unsqueeze(0) 40 | return {"input_values": audio_signal} 41 | 42 | 43 | class HiggsAudioTokenizer(nn.Module): 44 | def __init__( 45 | self, 46 | n_filters: int = 32, 47 | D: int = 128, 48 | target_bandwidths: Sequence[Union[int, float]] = [1, 1.5, 2, 4, 6], 49 | ratios: Sequence[int] = [8, 5, 4, 2], # downsampling by 320 50 | sample_rate: int = 16000, 51 | bins: int = 1024, 52 | n_q: int = 8, 53 | codebook_dim: int = None, 54 | normalize: bool = False, 55 | causal: bool = False, 56 | semantic_techer: str = "hubert_base_general", 57 | last_layer_semantic: bool = True, 58 | merge_mode: str = "concat", 59 | downsample_mode: str = "step_down", 60 | semantic_mode: str = "classic", 61 | vq_scale: int = 1, 62 | semantic_sample_rate: int = None, 63 | device: str = "cuda", 64 | ): 65 | super().__init__() 66 | self.hop_length = np.prod(ratios) 67 | self.semantic_techer = semantic_techer 68 | 69 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 50 Hz 70 | 71 | self.target_bandwidths = target_bandwidths 72 | self.n_q = n_q 73 | self.sample_rate = sample_rate 74 | self.encoder = dac2.Encoder(64, ratios, D) 75 | 76 | self.decoder_2 = dac2.Decoder(D, 1024, ratios) 77 | self.last_layer_semantic = last_layer_semantic 78 | self.device = device 79 | if semantic_techer == "hubert_base": 80 | self.semantic_model = AutoModel.from_pretrained("facebook/hubert-base-ls960") 81 | self.semantic_sample_rate = 16000 82 | self.semantic_dim = 768 83 | self.encoder_semantic_dim = 768 84 | 85 | elif semantic_techer == "wavlm_base_plus": 86 | self.semantic_model = AutoModel.from_pretrained("microsoft/wavlm-base-plus") 87 | self.semantic_sample_rate = 16000 88 | self.semantic_dim = 768 89 | self.encoder_semantic_dim = 768 90 | 91 | elif semantic_techer == "hubert_base_general": 92 | self.semantic_model = AutoModel.from_pretrained("bosonai/hubert_base", trust_remote_code=True) 93 | self.semantic_sample_rate = 16000 94 | self.semantic_dim = 768 95 | self.encoder_semantic_dim = 768 96 | 97 | # Overwrite semantic model sr to ensure semantic_downsample_factor is an integer 98 | if semantic_sample_rate is not None: 99 | self.semantic_sample_rate = semantic_sample_rate 100 | 101 | self.semantic_model.eval() 102 | 103 | # make the semantic model parameters do not need gradient 104 | for param in self.semantic_model.parameters(): 105 | param.requires_grad = False 106 | 107 | self.semantic_downsample_factor = int(self.hop_length / (self.sample_rate / self.semantic_sample_rate) / 320) 108 | 109 | self.quantizer_dim = int((D + self.encoder_semantic_dim) // vq_scale) 110 | self.encoder_semantic = Encoder(input_channels=self.semantic_dim, encode_channels=self.encoder_semantic_dim) 111 | self.decoder_semantic = Decoder( 112 | code_dim=self.encoder_semantic_dim, output_channels=self.semantic_dim, decode_channels=self.semantic_dim 113 | ) 114 | 115 | # out_D=D+768 116 | if isinstance(bins, int): # RVQ 117 | self.quantizer = ResidualVectorQuantizer( 118 | dimension=self.quantizer_dim, codebook_dim=codebook_dim, n_q=n_q, bins=bins 119 | ) 120 | self.quantizer_type = "RVQ" 121 | else: # RFSQ 122 | self.quantizer = ResidualFSQ(dim=self.quantizer_dim, levels=bins, num_quantizers=n_q) 123 | self.quantizer_type = "RFSQ" 124 | 125 | self.fc_prior = nn.Linear(D + self.encoder_semantic_dim, self.quantizer_dim) 126 | self.fc_post1 = nn.Linear(self.quantizer_dim, self.encoder_semantic_dim) 127 | self.fc_post2 = nn.Linear(self.quantizer_dim, D) 128 | 129 | self.downsample_mode = downsample_mode 130 | if downsample_mode == "avg": 
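            # Average-pool the semantic features by `semantic_downsample_factor` so their frame rate matches the acoustic encoder's.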
131 | self.semantic_pooling = nn.AvgPool1d( 132 | kernel_size=self.semantic_downsample_factor, stride=self.semantic_downsample_factor 133 | ) 134 | 135 | self.audio_tokenizer_feature_extractor = HiggsAudioFeatureExtractor(sampling_rate=self.sample_rate) 136 | 137 | @property 138 | def tps(self): 139 | return self.frame_rate 140 | 141 | @property 142 | def sampling_rate(self): 143 | return self.sample_rate 144 | 145 | @property 146 | def num_codebooks(self): 147 | return self.n_q 148 | 149 | @property 150 | def codebook_size(self): 151 | return self.quantizer_dim 152 | 153 | def get_last_layer(self): 154 | return self.decoder.layers[-1].weight 155 | 156 | def calculate_rec_loss(self, rec, target): 157 | target = target / target.norm(dim=-1, keepdim=True) 158 | rec = rec / rec.norm(dim=-1, keepdim=True) 159 | rec_loss = (1 - (target * rec).sum(-1)).mean() 160 | 161 | return rec_loss 162 | 163 | @torch.no_grad() 164 | def get_regress_target(self, x): 165 | x = torchaudio.functional.resample(x, self.sample_rate, self.semantic_sample_rate) 166 | 167 | if ( 168 | self.semantic_techer == "hubert_base" 169 | or self.semantic_techer == "hubert_base_general" 170 | or self.semantic_techer == "wavlm_base_plus" 171 | ): 172 | x = x[:, 0, :] 173 | x = F.pad(x, (160, 160)) 174 | target = self.semantic_model(x, output_hidden_states=True).hidden_states 175 | target = torch.stack(target, dim=1) # .transpose(-1, -2)#.flatten(start_dim=1, end_dim=2) 176 | 177 | # average for all layers 178 | target = target.mean(1) 179 | # target = target[9] 180 | # if self.hop_length > 320: 181 | # target = self.semantic_pooling(target.transpose(1, 2)).transpose(1, 2) 182 | 183 | elif self.semantic_techer == "w2v_bert2": 184 | target = self.semantic_model(x) 185 | 186 | elif self.semantic_techer.startswith("whisper"): 187 | if self.last_layer_semantic: 188 | target = self.semantic_model(x, avg_layers=False) 189 | else: 190 | target = self.semantic_model(x, avg_layers=True) 191 | 192 | elif self.semantic_techer.startswith("mert_music"): 193 | if self.last_layer_semantic: 194 | target = self.semantic_model(x, avg_layers=False) 195 | else: 196 | target = self.semantic_model(x, avg_layers=True) 197 | 198 | elif self.semantic_techer.startswith("qwen_audio_omni"): 199 | target = self.semantic_model(x) 200 | 201 | if self.downsample_mode == "step_down": 202 | if self.semantic_downsample_factor > 1: 203 | target = target[:, :: self.semantic_downsample_factor, :] 204 | 205 | elif self.downsample_mode == "avg": 206 | target = self.semantic_pooling(target.transpose(1, 2)).transpose(1, 2) 207 | return target 208 | 209 | def forward(self, x: torch.Tensor, bw: int): 210 | e_semantic_input = self.get_regress_target(x).detach() 211 | 212 | e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2)) 213 | e_acoustic = self.encoder(x) 214 | 215 | e = torch.cat([e_acoustic, e_semantic], dim=1) 216 | 217 | e = self.fc_prior(e.transpose(1, 2)) 218 | 219 | if self.quantizer_type == "RVQ": 220 | e = e.transpose(1, 2) 221 | quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw) 222 | quantized = quantized.transpose(1, 2) 223 | else: 224 | quantized, codes = self.quantizer(e) 225 | commit_loss = torch.tensor(0.0) 226 | 227 | quantized_semantic = self.fc_post1(quantized).transpose(1, 2) 228 | quantized_acoustic = self.fc_post2(quantized).transpose(1, 2) 229 | 230 | o = self.decoder_2(quantized_acoustic) 231 | 232 | o_semantic = self.decoder_semantic(quantized_semantic) 233 | semantic_recon_loss = 
F.mse_loss(e_semantic_input.transpose(1, 2).detach(), o_semantic) 234 | 235 | return o, commit_loss, semantic_recon_loss, None 236 | 237 | def encode(self, audio_path_or_wv, sr=None, loudness_normalize=False, loudness_threshold=-23.0): 238 | if isinstance(audio_path_or_wv, str): 239 | wv, sr = librosa.load(audio_path_or_wv, mono=True, sr=None) 240 | else: 241 | wv = audio_path_or_wv 242 | assert sr is not None 243 | if loudness_normalize: 244 | import pyloudnorm as pyln 245 | 246 | meter = pyln.Meter(sr) 247 | l = meter.integrated_loudness(wv) 248 | wv = pyln.normalize.loudness(wv, l, loudness_threshold) 249 | if sr != self.sampling_rate: 250 | wv = librosa.resample(wv, orig_sr=sr, target_sr=self.sampling_rate) 251 | if self.audio_tokenizer_feature_extractor is not None: 252 | inputs = self.audio_tokenizer_feature_extractor( 253 | raw_audio=wv, sampling_rate=self.audio_tokenizer_feature_extractor.sampling_rate, return_tensors="pt" 254 | ) 255 | input_values = inputs["input_values"].to(self.device) 256 | else: 257 | input_values = torch.from_numpy(wv).float().unsqueeze(0) 258 | with torch.no_grad(): 259 | encoder_outputs = self._xcodec_encode(input_values) 260 | vq_code = encoder_outputs.audio_codes[0] 261 | return vq_code 262 | 263 | def _xcodec_encode(self, x: torch.Tensor, target_bw: Optional[int] = None) -> torch.Tensor: 264 | bw = target_bw 265 | 266 | e_semantic_input = self.get_regress_target(x).detach() 267 | 268 | e_semantic = self.encoder_semantic(e_semantic_input.transpose(1, 2)) 269 | e_acoustic = self.encoder(x) 270 | 271 | if e_acoustic.shape[2] != e_semantic.shape[2]: 272 | pad_size = 160 * self.semantic_downsample_factor 273 | e_acoustic = self.encoder(F.pad(x[:, 0, :], (pad_size, pad_size)).unsqueeze(0)) 274 | 275 | if e_acoustic.shape[2] != e_semantic.shape[2]: 276 | if e_acoustic.shape[2] > e_semantic.shape[2]: 277 | e_acoustic = e_acoustic[:, :, : e_semantic.shape[2]] 278 | else: 279 | e_semantic = e_semantic[:, :, : e_acoustic.shape[2]] 280 | 281 | e = torch.cat([e_acoustic, e_semantic], dim=1) 282 | 283 | e = self.fc_prior(e.transpose(1, 2)) 284 | 285 | if self.quantizer_type == "RVQ": 286 | e = e.transpose(1, 2) 287 | quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw) 288 | codes = codes.permute(1, 0, 2) 289 | else: 290 | quantized, codes = self.quantizer(e) 291 | codes = codes.permute(0, 2, 1) 292 | 293 | # return codes 294 | return EncodedResult(codes) 295 | 296 | def decode(self, vq_code: torch.Tensor) -> torch.Tensor: 297 | vq_code = vq_code.to(self.device) 298 | 299 | if self.quantizer_type == "RVQ": 300 | vq_code = vq_code.permute(1, 0, 2) 301 | quantized = self.quantizer.decode(vq_code) 302 | quantized = quantized.transpose(1, 2) 303 | else: 304 | vq_code = vq_code.permute(0, 2, 1) 305 | quantized = self.quantizer.get_output_from_indices(vq_code) 306 | quantized_acoustic = self.fc_post2(quantized).transpose(1, 2) 307 | 308 | o = self.decoder_2(quantized_acoustic) 309 | return o.detach().cpu().numpy() 310 | 311 | 312 | def load_higgs_audio_tokenizer(tokenizer_name_or_path, device="cuda"): 313 | is_local = os.path.exists(tokenizer_name_or_path) 314 | if not is_local: 315 | tokenizer_path = snapshot_download(tokenizer_name_or_path) 316 | else: 317 | tokenizer_path = tokenizer_name_or_path 318 | config_path = os.path.join(tokenizer_path, "config.json") 319 | model_path = os.path.join(tokenizer_path, "model.pth") 320 | config = json.load(open(config_path)) 321 | model = HiggsAudioTokenizer( 322 | **config, 323 | device=device, 324 | ) 
325 | parameter_dict = torch.load(model_path, map_location=device) 326 | model.load_state_dict(parameter_dict, strict=False) 327 | model.to(device) 328 | model.eval() 329 | return model 330 | -------------------------------------------------------------------------------- /boson_multimodal/audio_processing/quantization/ac.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | """Arithmetic coder.""" 8 | 9 | import io 10 | import math 11 | import random 12 | import typing as tp 13 | import torch 14 | 15 | from ..binary import BitPacker, BitUnpacker 16 | 17 | 18 | def build_stable_quantized_cdf( 19 | pdf: torch.Tensor, total_range_bits: int, roundoff: float = 1e-8, min_range: int = 2, check: bool = True 20 | ) -> torch.Tensor: 21 | """Turn the given PDF into a quantized CDF that splits 22 | [0, 2 ** self.total_range_bits - 1] into chunks of size roughly proportional 23 | to the PDF. 24 | 25 | Args: 26 | pdf (torch.Tensor): probability distribution, shape should be `[N]`. 27 | total_range_bits (int): see `ArithmeticCoder`, the typical range we expect 28 | during the coding process is `[0, 2 ** total_range_bits - 1]`. 29 | roundoff (float): will round the pdf up to that level to remove difference coming 30 | from e.g. evaluating the Language Model on different architectures. 31 | min_range (int): minimum range width. Should always be at least 2 for numerical 32 | stability. Use this to avoid pathological behavior is a value 33 | that is expected to be rare actually happens in real life. 34 | check (bool): if True, checks that nothing bad happened, can be deactivated for speed. 35 | """ 36 | pdf = pdf.detach() 37 | if roundoff: 38 | pdf = (pdf / roundoff).floor() * roundoff 39 | # interpolate with uniform distribution to achieve desired minimum probability. 40 | total_range = 2**total_range_bits 41 | cardinality = len(pdf) 42 | alpha = min_range * cardinality / total_range 43 | assert alpha <= 1, "you must reduce min_range" 44 | ranges = (((1 - alpha) * total_range) * pdf).floor().long() 45 | ranges += min_range 46 | quantized_cdf = torch.cumsum(ranges, dim=-1) 47 | if min_range < 2: 48 | raise ValueError("min_range must be at least 2.") 49 | if check: 50 | assert quantized_cdf[-1] <= 2**total_range_bits, quantized_cdf[-1] 51 | if ((quantized_cdf[1:] - quantized_cdf[:-1]) < min_range).any() or quantized_cdf[0] < min_range: 52 | raise ValueError("You must increase your total_range_bits.") 53 | return quantized_cdf 54 | 55 | 56 | class ArithmeticCoder: 57 | """ArithmeticCoder, 58 | Let us take a distribution `p` over `N` symbols, and assume we have a stream 59 | of random variables `s_t` sampled from `p`. Let us assume that we have a budget 60 | of `B` bits that we can afford to write on device. There are `2**B` possible numbers, 61 | corresponding to the range `[0, 2 ** B - 1]`. We can map each of those number to a single 62 | sequence `(s_t)` by doing the following: 63 | 64 | 1) Initialize the current range to` [0 ** 2 B - 1]`. 65 | 2) For each time step t, split the current range into contiguous chunks, 66 | one for each possible outcome, with size roughly proportional to `p`. 67 | For instance, if `p = [0.75, 0.25]`, and the range is `[0, 3]`, the chunks 68 | would be `{[0, 2], [3, 3]}`. 
69 | 3) Select the chunk corresponding to `s_t`, and replace the current range with this. 70 | 4) When done encoding all the values, just select any value remaining in the range. 71 | 72 | You will notice that this procedure can fail: for instance if at any point in time 73 | the range is smaller than `N`, then we can no longer assign a non-empty chunk to each 74 | possible outcome. Intuitively, the more likely a value is, the less the range width 75 | will reduce, and the longer we can go on encoding values. This makes sense: for any efficient 76 | coding scheme, likely outcomes would take less bits, and more of them can be coded 77 | with a fixed budget. 78 | 79 | In practice, we do not know `B` ahead of time, but we have a way to inject new bits 80 | when the current range decreases below a given limit (given by `total_range_bits`), without 81 | having to redo all the computations. If we encode mostly likely values, we will seldom 82 | need to inject new bits, but a single rare value can deplete our stock of entropy! 83 | 84 | In this explanation, we assumed that the distribution `p` was constant. In fact, the present 85 | code works for any sequence `(p_t)` possibly different for each timestep. 86 | We also assume that `s_t ~ p_t`, but that doesn't need to be true, although the smaller 87 | the KL between the true distribution and `p_t`, the most efficient the coding will be. 88 | 89 | Args: 90 | fo (IO[bytes]): file-like object to which the bytes will be written to. 91 | total_range_bits (int): the range `M` described above is `2 ** total_range_bits. 92 | Any time the current range width fall under this limit, new bits will 93 | be injected to rescale the initial range. 94 | """ 95 | 96 | def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24): 97 | assert total_range_bits <= 30 98 | self.total_range_bits = total_range_bits 99 | self.packer = BitPacker(bits=1, fo=fo) # we push single bits at a time. 100 | self.low: int = 0 101 | self.high: int = 0 102 | self.max_bit: int = -1 103 | self._dbg: tp.List[tp.Any] = [] 104 | self._dbg2: tp.List[tp.Any] = [] 105 | 106 | @property 107 | def delta(self) -> int: 108 | """Return the current range width.""" 109 | return self.high - self.low + 1 110 | 111 | def _flush_common_prefix(self): 112 | # If self.low and self.high start with the sames bits, 113 | # those won't change anymore as we always just increase the range 114 | # by powers of 2, and we can flush them out to the bit stream. 115 | assert self.high >= self.low, (self.low, self.high) 116 | assert self.high < 2 ** (self.max_bit + 1) 117 | while self.max_bit >= 0: 118 | b1 = self.low >> self.max_bit 119 | b2 = self.high >> self.max_bit 120 | if b1 == b2: 121 | self.low -= b1 << self.max_bit 122 | self.high -= b1 << self.max_bit 123 | assert self.high >= self.low, (self.high, self.low, self.max_bit) 124 | assert self.low >= 0 125 | self.max_bit -= 1 126 | self.packer.push(b1) 127 | else: 128 | break 129 | 130 | def push(self, symbol: int, quantized_cdf: torch.Tensor): 131 | """Push the given symbol on the stream, flushing out bits 132 | if possible. 133 | 134 | Args: 135 | symbol (int): symbol to encode with the AC. 136 | quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf` 137 | to build this from your pdf estimate. 
138 | """ 139 | while self.delta < 2**self.total_range_bits: 140 | self.low *= 2 141 | self.high = self.high * 2 + 1 142 | self.max_bit += 1 143 | 144 | range_low = 0 if symbol == 0 else quantized_cdf[symbol - 1].item() 145 | range_high = quantized_cdf[symbol].item() - 1 146 | effective_low = int(math.ceil(range_low * (self.delta / (2**self.total_range_bits)))) 147 | effective_high = int(math.floor(range_high * (self.delta / (2**self.total_range_bits)))) 148 | assert self.low <= self.high 149 | self.high = self.low + effective_high 150 | self.low = self.low + effective_low 151 | assert self.low <= self.high, (effective_low, effective_high, range_low, range_high) 152 | self._dbg.append((self.low, self.high)) 153 | self._dbg2.append((self.low, self.high)) 154 | outs = self._flush_common_prefix() 155 | assert self.low <= self.high 156 | assert self.max_bit >= -1 157 | assert self.max_bit <= 61, self.max_bit 158 | return outs 159 | 160 | def flush(self): 161 | """Flush the remaining information to the stream.""" 162 | while self.max_bit >= 0: 163 | b1 = (self.low >> self.max_bit) & 1 164 | self.packer.push(b1) 165 | self.max_bit -= 1 166 | self.packer.flush() 167 | 168 | 169 | class ArithmeticDecoder: 170 | """ArithmeticDecoder, see `ArithmeticCoder` for a detailed explanation. 171 | 172 | Note that this must be called with **exactly** the same parameters and sequence 173 | of quantized cdf as the arithmetic encoder or the wrong values will be decoded. 174 | 175 | If the AC encoder current range is [L, H], with `L` and `H` having the some common 176 | prefix (i.e. the same most significant bits), then this prefix will be flushed to the stream. 177 | For instances, having read 3 bits `b1 b2 b3`, we know that `[L, H]` is contained inside 178 | `[b1 b2 b3 0 ... 0 b1 b3 b3 1 ... 1]`. Now this specific sub-range can only be obtained 179 | for a specific sequence of symbols and a binary-search allows us to decode those symbols. 180 | At some point, the prefix `b1 b2 b3` will no longer be sufficient to decode new symbols, 181 | and we will need to read new bits from the stream and repeat the process. 182 | 183 | """ 184 | 185 | def __init__(self, fo: tp.IO[bytes], total_range_bits: int = 24): 186 | self.total_range_bits = total_range_bits 187 | self.low: int = 0 188 | self.high: int = 0 189 | self.current: int = 0 190 | self.max_bit: int = -1 191 | self.unpacker = BitUnpacker(bits=1, fo=fo) # we pull single bits at a time. 192 | # Following is for debugging 193 | self._dbg: tp.List[tp.Any] = [] 194 | self._dbg2: tp.List[tp.Any] = [] 195 | self._last: tp.Any = None 196 | 197 | @property 198 | def delta(self) -> int: 199 | return self.high - self.low + 1 200 | 201 | def _flush_common_prefix(self): 202 | # Given the current range [L, H], if both have a common prefix, 203 | # we know we can remove it from our representation to avoid handling large numbers. 204 | while self.max_bit >= 0: 205 | b1 = self.low >> self.max_bit 206 | b2 = self.high >> self.max_bit 207 | if b1 == b2: 208 | self.low -= b1 << self.max_bit 209 | self.high -= b1 << self.max_bit 210 | self.current -= b1 << self.max_bit 211 | assert self.high >= self.low 212 | assert self.low >= 0 213 | self.max_bit -= 1 214 | else: 215 | break 216 | 217 | def pull(self, quantized_cdf: torch.Tensor) -> tp.Optional[int]: 218 | """Pull a symbol, reading as many bits from the stream as required. 219 | This returns `None` when the stream has been exhausted. 
220 | 221 | Args: 222 | quantized_cdf (torch.Tensor): use `build_stable_quantized_cdf` 223 | to build this from your pdf estimate. This must be **exatly** 224 | the same cdf as the one used at encoding time. 225 | """ 226 | while self.delta < 2**self.total_range_bits: 227 | bit = self.unpacker.pull() 228 | if bit is None: 229 | return None 230 | self.low *= 2 231 | self.high = self.high * 2 + 1 232 | self.current = self.current * 2 + bit 233 | self.max_bit += 1 234 | 235 | def bin_search(low_idx: int, high_idx: int): 236 | # Binary search is not just for coding interviews :) 237 | if high_idx < low_idx: 238 | raise RuntimeError("Binary search failed") 239 | mid = (low_idx + high_idx) // 2 240 | range_low = quantized_cdf[mid - 1].item() if mid > 0 else 0 241 | range_high = quantized_cdf[mid].item() - 1 242 | effective_low = int(math.ceil(range_low * (self.delta / (2**self.total_range_bits)))) 243 | effective_high = int(math.floor(range_high * (self.delta / (2**self.total_range_bits)))) 244 | low = effective_low + self.low 245 | high = effective_high + self.low 246 | if self.current >= low: 247 | if self.current <= high: 248 | return (mid, low, high, self.current) 249 | else: 250 | return bin_search(mid + 1, high_idx) 251 | else: 252 | return bin_search(low_idx, mid - 1) 253 | 254 | self._last = (self.low, self.high, self.current, self.max_bit) 255 | sym, self.low, self.high, self.current = bin_search(0, len(quantized_cdf) - 1) 256 | self._dbg.append((self.low, self.high, self.current)) 257 | self._flush_common_prefix() 258 | self._dbg2.append((self.low, self.high, self.current)) 259 | 260 | return sym 261 | 262 | 263 | def test(): 264 | torch.manual_seed(1234) 265 | random.seed(1234) 266 | for _ in range(4): 267 | pdfs = [] 268 | cardinality = random.randrange(4000) 269 | steps = random.randrange(100, 500) 270 | fo = io.BytesIO() 271 | encoder = ArithmeticCoder(fo) 272 | symbols = [] 273 | for step in range(steps): 274 | pdf = torch.softmax(torch.randn(cardinality), dim=0) 275 | pdfs.append(pdf) 276 | q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits) 277 | symbol = torch.multinomial(pdf, 1).item() 278 | symbols.append(symbol) 279 | encoder.push(symbol, q_cdf) 280 | encoder.flush() 281 | 282 | fo.seek(0) 283 | decoder = ArithmeticDecoder(fo) 284 | for idx, (pdf, symbol) in enumerate(zip(pdfs, symbols)): 285 | q_cdf = build_stable_quantized_cdf(pdf, encoder.total_range_bits) 286 | decoded_symbol = decoder.pull(q_cdf) 287 | assert decoded_symbol == symbol, idx 288 | assert decoder.pull(torch.zeros(1)) is None 289 | 290 | 291 | if __name__ == "__main__": 292 | test() 293 | -------------------------------------------------------------------------------- /tech_blogs/TOKENIZER_BLOG.md: -------------------------------------------------------------------------------- 1 | # Higgs Audio Tokenizer 2 | 3 | In this work, we introduce a new discretized audio tokenizer that runs at just **25 frames per second** while keeping—or even improving—audio quality compared to tokenizers with twice the bitrate. Our model is the first to train on **24 kHz data** covering speech, music, and sound events in one unified system. It also uses a simple non-diffusion encoder/decoder for fast, batch inference. 
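For readers who want to try the tokenizer directly, the repository exposes it through a small Python API in `boson_multimodal/audio_processing/higgs_audio_tokenizer.py` (`load_higgs_audio_tokenizer`, `encode`, `decode`). The sketch below illustrates the intended round trip; the checkpoint identifier is a placeholder, and the exact output shapes may differ slightly from the comments.

```python
import torch
from boson_multimodal.audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer

# Placeholder checkpoint: substitute a local directory or Hugging Face repo id
# that contains the released config.json and model.pth for the tokenizer.
tokenizer = load_higgs_audio_tokenizer("<tokenizer-checkpoint>", device="cuda")

# Encode a waveform file into discrete codes, roughly (num_codebooks, num_frames).
codes = tokenizer.encode("examples/voice_prompts/en_woman.wav")
print(codes.shape, tokenizer.num_codebooks, tokenizer.tps, tokenizer.sampling_rate)

# Decode back to audio; `decode` expects a batch dimension and returns a numpy array.
audio = tokenizer.decode(codes.unsqueeze(0))
print(audio.shape)  # roughly (batch, channels, num_samples)
```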
4 | 5 | ![XCodec Architecture](../figures/higgs_audio_tokenizer_architecture.png) 6 | 7 | ## Basics of Audio Quantization 8 | 9 | An audio signal sampled at $f_s$ Hz is first split into frames by an encoder with hop size $M$, giving a frame rate $f_r = \frac{f_s}{M}\quad\text{(frames/s)}.$ 10 | Two common quantizers are: 11 | 12 | - **Residual Vector Quantization (RVQ)**: $N_q$ cascaded layers with codebook size $N_{cb}$ each. When $N_q=1$, it reduces to single-vector quantization. 13 | - **Finite Scalar Quantization (FSQ)**: A single layer ($N_q=1$) with codebook size $N_{cb}$. 14 | 15 | If every combination of codewords is a token, the vocabulary size is $N_{cb}^{N_q}$, and each token needs $N_q\log_2 N_{cb}$ bits. The overall bitrate (bits/s, BPS) is simply $f_r \times N_q \log_2 N_{cb}.$ For instance, at $f_r = 25$ frames/s with $N_q = 8$ codebooks of size $N_{cb} = 1024$, the bitrate is $25 \times 8 \times 10 = 2000$ bits/s (2 kbps). 16 | We aim to push this bitrate as low as possible without hurting audio fidelity. 17 | 18 | ## What Makes Ours Better 19 | 20 | - **Low Frame Rate**: At 25 fps, our tokenizer halves the frame rate of many baselines while still maintaining high audio quality. 21 | - **Unified 24 kHz Training**: We mix speech, music, and sound-event clips in one model, capturing both semantic and acoustic details, which greatly facilitates the training of audio language models. 22 | - **Fast Inference**: By avoiding diffusion steps, our encoder/decoder processes batches quickly, making it practical for real-time or large-scale tasks. 23 | 24 | 25 | ## Data and Evaluation Metrics 26 | 27 | We test on four subsets, available [here](https://huggingface.co/datasets/bosonai/AudioTokenBench): 28 | 29 | - **Speech, Music, Sound Event**: Includes 1,000 clips for each category, with each clip lasting 10 seconds. Clips are randomly sampled from [DAPS](https://ccrma.stanford.edu/~gautham/Site/daps.html) (Speech), [MUSDB](https://sigsep.github.io/datasets/musdb.html) (Music), and [AudioSet](https://research.google.com/audioset/index.html) (Sound Event). 30 | 31 | - **Audiophile**: Contains 150 clips, each 30 seconds long, curated from eleven high-fidelity test discs. The clips feature both music and sound events, selected for audio quality evaluation. 32 | 33 | We measure: 34 | 35 | - **Acoustic Quality**: STFT distance between the original and reconstructed audio. 36 | - **Semantic Integrity**: Semantic preservation of the original audio, evaluated on the English and Chinese subsets of the [SeedTTS](https://arxiv.org/abs/2406.02430)[15] dataset. 37 | - **Aesthetics**: State-of-the-art unified model-based quality assessment, [Meta Audiobox Aesthetics](https://github.com/facebookresearch/audiobox-aesthetics)[8], reporting Content Enjoyment (CE) and Content Usefulness (CU). 38 | 39 | 40 | We compare our tokenizer with a wide range of baselines, from tokenizers built mainly for acoustic reconstruction and compression, to those focused on semantic integrity, to tokenizers used in existing large audio language models. We also compare with tokenizers that are pretrained specifically on speech or on music. 41 | 42 | 43 | The tables below summarize the tokenizers evaluated. As shown, our tokenizer achieves a well-rounded balance of efficiency, semantic fidelity, and acoustic quality. 44 | 45 | ### Acoustic Evaluation 46 | 47 | We use the STFT metric here for simplicity. The baselines are ordered chronologically, grouped by whether semantic distillation (SD) is applied. Despite DAC’s top acoustic quality at 12× the bitrate, our tokenizer leads all other baselines.
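As a rough illustration of the acoustic metric, the sketch below computes a simple multi-resolution log-magnitude STFT distance between a reference and a reconstructed waveform. The FFT sizes, weighting, and any additional terms used in our actual evaluation are not spelled out in this post, so treat this only as an indicative implementation.

```python
import torch

def stft_distance(ref: torch.Tensor, rec: torch.Tensor, n_ffts=(512, 1024, 2048)) -> torch.Tensor:
    """Mean log-magnitude STFT distance over a few resolutions (illustrative only)."""
    assert ref.shape == rec.shape  # both 1-D waveforms at the same sample rate
    total = torch.zeros(())
    for n_fft in n_ffts:
        window = torch.hann_window(n_fft)
        spec_ref = torch.stft(ref, n_fft, hop_length=n_fft // 4, window=window, return_complex=True).abs()
        spec_rec = torch.stft(rec, n_fft, hop_length=n_fft // 4, window=window, return_complex=True).abs()
        total = total + (torch.log(spec_ref + 1e-5) - torch.log(spec_rec + 1e-5)).abs().mean()
    return total / len(n_ffts)

# Usage sketch: dist = stft_distance(torch.from_numpy(ref_wav).float(), torch.from_numpy(rec_wav).float())
```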
48 | 49 | 50 | | Tokenizer | 💬 | 🎵 | 🥁 | SD | $f_s$ | $f_r$ | BPS* (k) ↓ | Speech ↓ | Sound Event ↓ | Music ↓ | Audiophile ↓ | 51 | |-----------|----|----|----|----|-------|-------|--------------------------|----------|----------------|--------|--------------| 52 | | [Encodec](https://huggingface.co/facebook/encodec_24khz)[3] | ✓ | ✓ | ✓ | | 24 | 75 | 24 | 1.96 | 2.65 | 2.52 | 2.30 | 53 | | [DAC](https://huggingface.co/hance-ai/descript-audio-codec-24khz)[2] | ✓ | ✓ | ✓ | | 24 | 75 | 24 | **1.13** | **1.45** | **1.34** | **1.62** | 54 | | [SNAC-24k](https://huggingface.co/hubertsiuzdak/snac_24khz)[6] | ✓ | | | | 24 | (12, 23, 47) | 0.98 | 1.92 | 2.69 | 2.54 | 2.52 | 55 | | [SNAC-44.1k](https://huggingface.co/hubertsiuzdak/snac_44khz)[6] | | ✓ | ✓ | | 44.1 | (14, 29, 57, 115) | 2.6 | 1.83 | 2.25 | 2.05 | 2.00 | 56 | | [WavTokenizer](https://huggingface.co/novateur/WavTokenizer-medium-music-audio-75token/blob/main/wavtokenizer_medium_music_audio_320_24k_v2.ckpt)[7] | | ✓ | ✓ | | 24 | 75 | 0.9 | 1.93 | 2.44 | 2.17 | 2.15 | 57 | | [WavTokenizer (Speech)](https://huggingface.co/novateur/WavTokenizer-large-speech-75token/tree/main)[7] | ✓ | | | | 24 | 75 | 0.9 | 1.78 | 2.47 | 2.42 | 2.47 | 58 | | [MuCodec](https://huggingface.co/haoheliu/audioldm_48k/tree/main)[11] | | ✓ | | | 48 | 25 | 0.35 | 2.87 | 3.69 | 3.36 | 2.97 | 59 | | [FlowDec-75m](https://github.com/facebookresearch/FlowDec?tab=readme-ov-file)[12] | ✓ | ✓ | ✓ | | 48 | 75 | 7.5 | 1.73 | 2.14 | 2.01 | 2.03 | 60 | | [FlowDec-25s](https://github.com/facebookresearch/FlowDec?tab=readme-ov-file)[12] | ✓ | ✓ | ✓ | | 48 | 25 | 4 | 1.94 | 2.42 | 2.25 | 2.33 | 61 | | [SpeechTokenizer](https://huggingface.co/fnlp/SpeechTokenizer/tree/main/speechtokenizer_hubert_avg)[14] | ✓ | | | ✓ | 16 | 50 | 4 | 3.21 | 3.58 | 3.65 | 3.69 | 62 | | [SemantiCodec](https://huggingface.co/haoheliu/SemantiCodec/tree/main/semanticodec_tokenrate_100)[5] | ✓ | ✓ | ✓ | ✓ | 16 | 50 | 1.4 | 3.05 | 3.28 | 3.24 | 3.18 | 63 | | [Mimi](https://huggingface.co/docs/transformers/en/model_doc/mimi)[13] | ✓ | | | ✓ | 24 | 12.5 | 4.4 | 1.77 | 2.40 | 2.30 | 2.15 | 64 | | [XCodec](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert_general.yaml)[1] | ✓ | ✓ | ✓ | ✓ | 16 | 50 | 4 | 2.95 | 3.16 | 3.00 | 3.03 | 65 | | [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B)[13] | ✓ | | | ✓ | 16 | 25 | -** | 2.30 | 3.30 | 3.14 | 3.25 | 66 | | [XCodec2](https://huggingface.co/HKUST-Audio/xcodec2/blob/main/ckpt/epoch%3D4-step%3D1400000.ckpt)[9] | ✓ | | | ✓ | 16 | 50 | 0.8 | 3.06 | 3.72 | 3.62 | 3.64 | 67 | | [XY](https://huggingface.co/fnlp/XY_Tokenizer_TTSD_V0/tree/main)[10] | ✓ | | | ✓ | 24 | 12.5 | 1 | 1.89 | 2.51 | 2.40 | 2.26 | 68 | | Ours | ✓ | ✓ | ✓ | ✓ | 24 | 25 | 2 | **1.62** | **2.03** | **1.85** | **1.80** | 69 | 70 | 71 | 72 | \* Bits-per-second is calculated according to the checkpoint the author provided. 73 | 74 | \*\* CosyVoice 2 uses the continuous feature as the conditioning, we include it for completeness. 75 | 76 | 77 | ### Semantic Evaluation 78 | Here we only compare with tokenizers that are trained with semantic distillation. 79 | [SeedTTS](https://github.com/BytedanceSpeech/seed-tts-eval) is a dataset includes prompt/target audio and texts. We reconstructed the target audio, and use the word error rate (WER) and speaker similarity (SIM) metrics to evaluate the semantic integrity. 
SIM is calculated as the similarity between the prompt audio and the reconstructed target audio, using [WavLM-large](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view) as the embedding model. 80 | 81 | The following table shows that our tokenizer achieves performance comparable to tokenizers that use 2.2× the bitrate of our model. 82 | 83 | | Model | BPS (k) | en WER ↓ | en SIM ↑ | zh WER ↓ | zh SIM ↑ | 84 | |------------------|---------|------------|------------|------------|------------| 85 | | [SpeechTokenizer](https://huggingface.co/fnlp/SpeechTokenizer/tree/main/speechtokenizer_hubert_avg) | 4 | 2.82 | 0.63 | 2.04 | 0.65 | 86 | | [SemantiCodec](https://huggingface.co/haoheliu/SemantiCodec/tree/main/semanticodec_tokenrate_100) | 1.4 | 3.46 | 0.56 | 2.18 | 0.60 | 87 | | [Mimi](https://huggingface.co/docs/transformers/en/model_doc/mimi) | 4.4 | **2.35** | **0.70** | **1.48** | **0.72** | 88 | | [XCodec](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert_general.yaml) | 4.0 | 2.68 | 0.63 | 1.66 | 0.66 | 89 | | [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B) | - | 3.17 | 0.65 | 2.11 | 0.70 | 90 | | [XCodec2](https://huggingface.co/HKUST-Audio/xcodec2/blob/main/ckpt/epoch%3D4-step%3D1400000.ckpt) | 0.8 | 2.74 | 0.62 | 1.91 | 0.67 | 91 | | [XY-MOSS-TTSD](https://huggingface.co/fnlp/XY_Tokenizer_TTSD_V0/tree/main) | 1.0 | 2.72 | 0.61 | 1.58 | 0.67 | 92 | | Ours | 2.0 | 2.52 | 0.67 | **1.48** | 0.71 | 93 | 94 | 95 | 96 | ### Audiobox Aesthetics Evaluation 97 | 98 | This model-based evaluation[8] further demonstrates the superiority of our tokenizer. CE is Content Enjoyment and CU is Content Usefulness; each is rated on a scale of 1-10. Notably, our tokenizer performs best on the Audiophile set, demonstrating a clear advantage when the original audio quality is high.
99 | 100 | 101 | | Model | BPS (k) | Music CE ↑ | Music CU ↑ | Sound Event CE ↑ | Sound Event CU ↑ | Speech CE ↑ | Speech CU ↑ | Audiophile CE ↑ | Audiophile CU ↑ | 102 | |------------------|---------|--------------|--------------|--------------------|--------------------|---------------|---------------|--------------------|--------------------| 103 | | Origin | - | 6.20 | 7.10 | 4.47 | 5.64 | 5.03 | 4.87 | 7.17 | 7.65 | 104 | | [SpeechTokenizer](https://huggingface.co/fnlp/SpeechTokenizer/tree/main/speechtokenizer_hubert_avg) | 4.0 | 3.55 | 5.22 | 3.03 | 4.50 | 4.68 | 4.58 | 3.59 | 5.07 | 105 | | [SemantiCodec](https://huggingface.co/haoheliu/SemantiCodec/tree/main/semanticodec_tokenrate_100) | 1.4 | 6.01 | 6.83 | 4.22 | 5.30 | 4.28 | 4.12 | 6.97 | 7.43 | 106 | | [Mimi](https://huggingface.co/docs/transformers/en/model_doc/mimi) | 4.4 | 6.01 | 6.83 | 4.26 | 5.35 | 4.87 | 4.72 | 6.80 | 7.29 | 107 | | [XCodec](https://huggingface.co/ZhenYe234/xcodec/blob/main/config_hubert_general.yaml) | 4.0 | **6.30** | **7.10** | **4.43** | 5.45 | **4.96** | **4.79** | 7.06 | 7.49 | 108 | | [CosyVoice 2](https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B) | - | 5.21 | 6.14 | 4.08 | 4.73 | **4.91** | **4.75** | 5.97 | 6.56 | 109 | | [XCodec2](https://huggingface.co/HKUST-Audio/xcodec2/blob/main/ckpt/epoch%3D4-step%3D1400000.ckpt) | 0.8 | 4.38 | 5.66 | 3.43 | 4.63 | **4.93** | **4.78** | 4.56 | 5.46 | 110 | | [XY-MOSS-TTSD](https://huggingface.co/fnlp/XY_Tokenizer_TTSD_V0/tree/main) | 1.0 | 5.77 | 6.80 | 4.23 | 5.34 | 4.88 | 4.72 | 6.95 | 7.48 | 111 | | Ours | 2.0 | **6.35** | **7.15** | **4.47** | **5.51** | 4.90 | 4.70 | **7.21** | **7.66** | 112 | 113 | 114 | 115 | Note that since some tokenizers are trained on 16 kHz data, we upsample their audio outputs to 24 kHz before computing metrics. Different upsampling methods may cause slight variations (e.g., 4.36 vs. 4.43 for XCodec Sound Event CE). We report the best results we could obtain and highlight any results within 0.05 of the best one. 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 137 | 138 | 139 | 140 | 141 | ## Reference 142 | [1] [Ye, Zhen, et al. "Codec does matter: Exploring the semantic shortcoming of codec for audio language model." Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 39. No. 24. 2025.](https://arxiv.org/abs/2408.17175) 143 | 144 | [2] [Kumar, Rithesh, et al. "High-fidelity audio compression with improved rvqgan." Advances in Neural Information Processing Systems 36 (2023): 27980-27993.](https://dl.acm.org/doi/10.5555/3666122.3667336) 145 | 146 | [3] [Défossez, Alexandre, et al. "High fidelity neural audio compression." arXiv preprint arXiv:2210.13438 (2022).](https://arxiv.org/abs/2210.13438) 147 | 148 | [4] [Défossez, Alexandre, et al. "Moshi: a speech-text foundation model for real-time dialogue." arXiv preprint arXiv:2410.00037 (2024).](https://arxiv.org/abs/2410.00037) 149 | 150 | [5] [Liu, Haohe, et al. "Semanticodec: An ultra low bitrate semantic audio codec for general sound." IEEE Journal of Selected Topics in Signal Processing (2024).](https://ieeexplore.ieee.org/document/10768970) 151 | 152 | [6] [Siuzdak, Hubert, Florian Grötschla, and Luca A. Lanzendörfer. "Snac: Multi-scale neural audio codec." arXiv preprint arXiv:2410.14411 (2024).](https://arxiv.org/abs/2410.14411) 153 | 154 | [7] [Ji, Shengpeng, et al. "Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling." 
arXiv preprint arXiv:2408.16532 (2024).](https://arxiv.org/abs/2408.16532) 155 | 156 | [8] [Tjandra, Andros, et al. "Meta audiobox aesthetics: Unified automatic quality assessment for speech, music, and sound." arXiv preprint arXiv:2502.05139 (2025).](https://arxiv.org/abs/2502.05139) 157 | 158 | [9] [Ye, Zhen, et al. "Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis." arXiv preprint arXiv:2502.04128 (2025).](https://arxiv.org/abs/2502.04128) 159 | 160 | [10] [Gong, Yitian, et al. "XY-Tokenizer: Mitigating the Semantic-Acoustic Conflict in Low-Bitrate Speech Codecs." arXiv preprint arXiv:2506.23325 (2025).](https://arxiv.org/abs/2506.23325) 161 | 162 | [11] [Xu, Yaoxun, et al. "MuCodec: Ultra Low-Bitrate Music Codec." arXiv preprint arXiv:2409.13216 (2024).](https://arxiv.org/abs/2409.13216) 163 | 164 | [12] [Welker, Simon, et al. "FlowDec: A flow-based full-band general audio codec with high perceptual quality." arXiv preprint arXiv:2503.01485 (2025).](https://arxiv.org/abs/2503.01485) 165 | 166 | [13] [Du, Zhihao, et al. "Cosyvoice 2: Scalable streaming speech synthesis with large language models." arXiv preprint arXiv:2412.10117 (2024).](https://arxiv.org/abs/2412.10117) 167 | 168 | [14] [Zhang, Xin, et al. "Speechtokenizer: Unified speech tokenizer for speech large language models." arXiv preprint arXiv:2308.16692 (2023).](https://arxiv.org/abs/2308.16692) 169 | 170 | [15] [Anastassiou, Philip, et al. "Seed-tts: A family of high-quality versatile speech generation models." arXiv preprint arXiv:2406.02430 (2024).](https://arxiv.org/abs/2406.02430) 171 | --------------------------------------------------------------------------------