├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── TRAIN_INSTRUCTION.md ├── ZH_RAP_LORA.md ├── acestep ├── __init__.py ├── apg_guidance.py ├── cpu_offload.py ├── data_sampler.py ├── gui.py ├── language_segmentation │ ├── LangSegment.py │ ├── __init__.py │ ├── language_filters.py │ └── utils │ │ ├── __init__.py │ │ └── num.py ├── models │ ├── __init__.py │ ├── ace_step_transformer.py │ ├── attention.py │ ├── config.json │ ├── customer_attention_processor.py │ └── lyrics_utils │ │ ├── __init__.py │ │ ├── lyric_encoder.py │ │ ├── lyric_normalizer.py │ │ ├── lyric_tokenizer.py │ │ ├── vocab.json │ │ └── zh_num2words.py ├── music_dcae │ ├── __init__.py │ ├── music_dcae_pipeline.py │ ├── music_log_mel.py │ └── music_vocoder.py ├── pipeline_ace_step.py ├── schedulers │ ├── __init__.py │ ├── scheduling_flow_match_euler_discrete.py │ ├── scheduling_flow_match_heun_discrete.py │ └── scheduling_flow_match_pingpong.py ├── text2music_dataset.py └── ui │ ├── __init__.py │ └── components.py ├── assets ├── ACE-Step_framework.png ├── Ace_Step_4x_a2a.json ├── Logo_StepFun.png ├── acestep_tech_report.pdf ├── acestudio_logo.png ├── application_map.png ├── audio2audio_ComfyUI.png ├── audio2audio_demo.gif ├── cpu_offload_performance.png ├── demo_interface.png ├── orgnization_logos.png ├── rap_machine_demo.gif └── train_demo.gif ├── colab_inference.ipynb ├── config └── zh_rap_lora_config.json ├── convert2hf_dataset.py ├── data ├── test_track_001.mp3 ├── test_track_001_lyrics.txt └── test_track_001_prompt.txt ├── docker-compose.yaml ├── examples ├── default │ └── input_params │ │ ├── output_20250426071706_0_input_params.json │ │ ├── output_20250426071812_0_input_params.json │ │ ├── output_20250426072346_0_input_params.json │ │ ├── output_20250426072508_0_input_params.json │ │ ├── output_20250426073829_0_input_params.json │ │ ├── output_20250426074037_0_input_params.json │ │ ├── output_20250426074214_0_input_params.json │ │ ├── output_20250426074413_0_input_params.json │ │ ├── output_20250426075107_0_input_params.json │ │ ├── output_20250426075537_0_input_params.json │ │ ├── output_20250426075843_0_input_params.json │ │ ├── output_20250426080234_0_input_params.json │ │ ├── output_20250426080407_0_input_params.json │ │ ├── output_20250426080601_0_input_params.json │ │ ├── output_20250426081134_0_input_params.json │ │ ├── output_20250426091716_0_input_params.json │ │ ├── output_20250426092025_0_input_params.json │ │ ├── output_20250426093007_0_input_params.json │ │ └── output_20250426093146_0_input_params.json └── zh_rap_lora │ └── input_params │ ├── output_20250512101839_0_input_params.json │ ├── output_20250512114703_0_input_params.json │ ├── output_20250512115409_0_input_params.json │ ├── output_20250512120348_0_input_params.json │ ├── output_20250512143242_0_input_params.json │ ├── output_20250512145057_0_input_params.json │ ├── output_20250512152217_0_input_params.json │ ├── output_20250512153616_0_input_params.json │ ├── output_20250512154907_0_input_params.json │ ├── output_20250512161832_0_input_params.json │ ├── output_20250512164224_0_input_params.json │ ├── output_20250512171227_0_input_params.json │ ├── output_20250512171809_0_input_params.json │ ├── output_20250512172941_0_input_params.json │ ├── output_20250513044511_0_input_params.json │ ├── output_20250513050200_0_input_params.json │ ├── output_20250513055451_0_input_params.json │ └── output_20250513060150_0_input_params.json ├── infer-api.py ├── infer.py ├── inference.ipynb ├── requirements.txt ├── setup.py ├── trainer-api.py ├── trainer.py 
└── zh_lora_dataset ├── data-00000-of-00001.arrow ├── dataset_info.json └── state.json /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.ckpt 3 | *.onnx 4 | t5_g2p_model/ 5 | embeddings/ 6 | checkpoints/ 7 | 8 | val_images/ 9 | val_audios/ 10 | lightning_logs/ 11 | lightning_logs_/ 12 | train_images/ 13 | train_audios/ 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .idea/ 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .nox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | *.py,cover 64 | .hypothesis/ 65 | .pytest_cache/ 66 | cover/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | .pybuilder/ 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | # For a library or package, you might want to ignore these files since the code is 101 | # intended to run in multiple environments; otherwise, check them in: 102 | # .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # poetry 112 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 113 | # This is especially recommended for binary packages to ensure reproducibility, and is more 114 | # commonly ignored for libraries. 115 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 116 | #poetry.lock 117 | 118 | # pdm 119 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 120 | #pdm.lock 121 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 122 | # in version control. 123 | # https://pdm.fming.dev/#use-with-ide 124 | .pdm.toml 125 | 126 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 127 | __pypackages__/ 128 | 129 | # Celery stuff 130 | celerybeat-schedule 131 | celerybeat.pid 132 | 133 | # SageMath parsed files 134 | *.sage.py 135 | 136 | # Environments 137 | .env 138 | .venv 139 | env/ 140 | venv/ 141 | ENV/ 142 | env.bak/ 143 | venv.bak/ 144 | 145 | # Spyder project settings 146 | .spyderproject 147 | .spyproject 148 | 149 | # Rope project settings 150 | .ropeproject 151 | 152 | # mkdocs documentation 153 | /site 154 | 155 | # mypy 156 | .mypy_cache/ 157 | .dmypy.json 158 | dmypy.json 159 | 160 | # Pyre type checker 161 | .pyre/ 162 | 163 | # pytype static type analyzer 164 | .pytype/ 165 | 166 | # Cython debug symbols 167 | cython_debug/ 168 | 169 | # PyCharm 170 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 171 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 172 | # and can be added to the global gitignore or merged into this file. For a more nuclear 173 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 174 | #.idea/ 175 | !requirements.txt 176 | *.log 177 | *.flac 178 | *.wav 179 | minio_config.yaml 180 | .history/* 181 | __pycache__/* 182 | train.log 183 | *.tar.gz 184 | __pycache__/ 185 | demo_examples/ 186 | nohup.out 187 | test_results/* 188 | nohup.out 189 | text_audio_align/* 190 | remote/* 191 | MG2P/* 192 | audio_getter.py 193 | refiner_loss_debug/ 194 | outputs/* 195 | !outputs/ 196 | save_checkpoint.ipynb 197 | repos/* 198 | app_demo.py 199 | ui/components_demo.py 200 | data_sampler_demo.py 201 | pipeline_ace_step_demo.py 202 | *.wav 203 | start.sh 204 | exps/* -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.6.0-runtime-ubuntu22.04 AS base 2 | 3 | # Set environment variables 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | PORT=7865 \ 7 | HF_HUB_ENABLE_HF_TRANSFER=1 \ 8 | DEBIAN_FRONTEND=noninteractive 9 | 10 | # Install Python and system dependencies 11 | RUN apt-get update && apt-get install -y --no-install-recommends \ 12 | python3.10 \ 13 | python3-pip \ 14 | python3-venv \ 15 | python3-dev \ 16 | build-essential \ 17 | git \ 18 | curl \ 19 | wget \ 20 | && apt-get clean \ 21 | && rm -rf /var/lib/apt/lists/* \ 22 | && ln -s /usr/bin/python3 /usr/bin/python 23 | 24 | # Create and activate virtual environment 25 | RUN python -m venv /opt/venv 26 | ENV PATH="/opt/venv/bin:$PATH" 27 | 28 | # Create a non-root user to run the application 29 | RUN useradd -m -u 1001 appuser 30 | 31 | # Set working directory 32 | WORKDIR /app 33 | 34 | # Clone the repository 35 | RUN git clone https://github.com/ace-step/ACE-Step.git . 36 | 37 | # Install specific PyTorch version compatible with CUDA 12.6 38 | RUN pip3 install --no-cache-dir --upgrade pip && \ 39 | pip3 install --no-cache-dir hf_transfer peft && \ 40 | pip3 install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126 41 | RUN pip3 install --no-cache-dir . 
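# Hypothetical usage, not part of the repo (adjust image tag, port, and mounts):
#   docker build -t ace-step .
#   docker run --gpus all -p 7865:7865 -v "$PWD/checkpoints:/app/checkpoints" ace-step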
42 | 43 | # Ensure target directories for volumes exist and have correct initial ownership 44 | RUN mkdir -p /app/outputs /app/checkpoints /app/logs && \ 45 | chown -R appuser:appuser /app/outputs /app/checkpoints /app/logs 46 | 47 | # Change ownership of app files to appuser 48 | RUN chown -R appuser:appuser /app 49 | 50 | # Switch to non-root user 51 | USER appuser 52 | 53 | # Expose the port the app runs on 54 | EXPOSE 7865 55 | 56 | VOLUME [ "/app/checkpoints", "/app/outputs", "/app/logs" ] 57 | 58 | # Set healthcheck 59 | HEALTHCHECK --interval=60s --timeout=10s --start-period=5s --retries=5 \ 60 | CMD curl -f http://localhost:7865/ || exit 1 61 | 62 | # Command to run the application with GPU support 63 | CMD ["python3", "acestep/gui.py", "--server_name", "0.0.0.0", "--bf16", "true"] 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 | 
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 | 
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 | 
177 | END OF TERMS AND CONDITIONS
178 | 
179 | APPENDIX: How to apply the Apache License to your work.
180 | 
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 | 
190 | Copyright [2025] Timedomain Inc. and stepfun
191 | 
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 | 
196 | http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/TRAIN_INSTRUCTION.md:
--------------------------------------------------------------------------------
1 | # Training Instruction
2 | 
3 | ## 1. Data Preparation
4 | 1. First, check the expected data format in the `data` directory at the root of the repository; a sketch of the bundled sample's layout is shown below.
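Judging by the bundled sample, the layout pairs each audio file with a `_lyrics.txt` and a `_prompt.txt` sidecar named after the track (an illustrative sketch inferred from the repository tree, not an authoritative spec):

```
data/
├── test_track_001.mp3            # the audio itself
├── test_track_001_lyrics.txt     # lyrics for the track
└── test_track_001_prompt.txt     # tags/caption describing the track
```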
5 | Prepare your audio. If you already have well-labeled audio, that's great.
6 | If you don't have labels, you can use the following prompt and Qwen Omni to label your audio. The community welcomes contributions of better prompts, as well as annotation tools and UI.
7 | 
8 | How to get a description of your audio?
9 | You can use `Qwen-Omni` (https://chat.qwen.ai/) to describe an audio file.
10 | Here we share the prompt we used.
11 | 
12 | ```python
13 | sys_prompt_without_tag = """Analyze the input audio and generate 6 description variants. Each variant must be <200 characters. Follow these exact definitions:
14 | 
15 | 1. `simplified`: Use only one most representative tag from the valid set.
16 | 2. `expanded`: Broaden valid tags to include related sub-genres/techniques.
17 | 3. `descriptive`: Convert tags into a sensory-rich sentence based *only on the sound*. DO NOT transcribe or reference the lyrics.
18 | 4. `synonyms`: Replace tags with equivalent terms (e.g., 'strings' → 'orchestral').
19 | 5. `use_cases`: Suggest practical applications based on audio characteristics.
20 | 6. `analysis`: Analyze the audio's genre, instruments, tempo, and mood **based strictly on the audible musical elements**. Technical breakdown in specified format.
21 |    * For the `instruments` list: **Only include instruments that are actually heard playing in the audio recording.** **Explicitly ignore any instruments merely mentioned or sung about in the lyrics.** Cover all audibly present instruments.
22 | 7. `lyrical_rap_check`: whether the audio is lyrical rap
23 | **Strictly ignore any information derived solely from the lyrics when performing the analysis, especially for identifying instruments.**
24 | 
25 | **Output Format:**
26 | ```json
27 | {
28 |   "simplified": <str>,
29 |   "expanded": <str>,
30 |   "descriptive": <str>,
31 |   "synonyms": <str>,
32 |   "use_cases": <str>,
33 |   "analysis": {
34 |     "genre": <str>,
35 |     "instruments": <list[str]>,
36 |     "tempo": <str>,
37 |     "mood": <str>
38 |   },
39 |   "lyrical_rap_check": <bool>
40 | }
41 | """
42 | ```
43 | 
44 | ## 2. Convert to Hugging Face Dataset Format
45 | 2. Run `python convert2hf_dataset.py --data_dir "./data" --repeat_count 2000 --output_name "zh_lora_dataset"`. (Since there is only one piece of sample data, it is repeated 2000 times. You can adjust this according to the size of your dataset.)
46 | 
47 | ## 3. Configure LoRA Parameters
48 | Refer to `config/zh_rap_lora_config.json` for configuring the LoRA parameters.
49 | 
50 | If your VRAM is not sufficient, you can reduce the `r` and `lora_alpha` parameters in the configuration file. For example:
51 | ```json
52 | {
53 |   "r": 16,
54 |   "lora_alpha": 32,
55 |   "target_modules": [
56 |     "linear_q",
57 |     "linear_k",
58 |     "linear_v",
59 |     "to_q",
60 |     "to_k",
61 |     "to_v",
62 |     "to_out.0"
63 |   ]
64 | }
65 | ```
66 | 
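For orientation, here is a minimal sketch of how such a JSON file maps onto a `peft.LoraConfig`; this is an assumption about what `trainer.py` does internally when it reads `--lora_config_path`, not code from the repo:

```python
import json

from peft import LoraConfig

with open("config/zh_rap_lora_config.json") as f:
    cfg = json.load(f)

lora_config = LoraConfig(
    r=cfg["r"],                            # rank of the low-rank update matrices
    lora_alpha=cfg["lora_alpha"],          # scaling factor for the LoRA update
    target_modules=cfg["target_modules"],  # attention projections to adapt
)
```

Halving `r` roughly halves the number of trainable LoRA parameters, which is why reducing it helps when VRAM is tight.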
67 | ## 4. Run Training
68 | Run `python trainer.py` with the parameters introduced below, for example: `python trainer.py --dataset_path ./zh_lora_dataset --exp_name chinese_rap_lora --lora_config_path config/zh_rap_lora_config.json`.
69 | 
70 | # Trainer Parameter Interpretation
71 | 
72 | ## 1. General Settings
73 | 1. **`--num_nodes`**: This parameter specifies the number of nodes for the training process. It is an integer value, and the default is set to 1. In scenarios where distributed training across multiple nodes is applicable, this parameter determines how many nodes will be utilized. For example, if you have a cluster of machines and want to distribute the training load, you can increase this number. However, for most single-machine or basic training setups, the default value of 1 is sufficient.
74 | 2. **`--shift`**: It is a floating-point parameter with a default value of 3.0. It sets the timestep (sigma) shift of the flow-matching noise schedule (see `acestep/schedulers/`): larger values bias the sampled timesteps toward higher noise levels. The default of 3.0 matches the inference setting and is usually left unchanged.
75 | 
76 | ## 2. Training Hyperparameters
77 | 1. **`--learning_rate`**: This is a crucial hyperparameter for the training process. It is a floating-point value with a default of 1e-4 (0.0001). The learning rate determines the step size at each iteration while updating the model's weights. A smaller learning rate will make the training process more stable but may require more training steps to converge. On the other hand, a larger learning rate can lead to faster convergence but might cause the model to overshoot the optimal solution and result in unstable training or even divergence.
78 | 2. **`--num_workers`**: This parameter defines the number of worker processes that will be used for data loading. It is an integer with a default value of 8. Having multiple workers can significantly speed up the data loading process, especially when dealing with large datasets. However, it also consumes additional system resources, so you may need to adjust this value based on the available resources of your machine (e.g., CPU cores and memory).
79 | 3. **`--epochs`**: It represents the number of times the entire training dataset will be passed through the model. It is an integer, and the default value is set to -1. When set to -1, the training will continue until another stopping condition (such as reaching the maximum number of steps) is met. If you set a positive integer value, the training will stop after that number of epochs.
80 | 4. **`--max_steps`**: This parameter specifies the maximum number of training steps. It is an integer with a default value of 2000000. Once the model has completed this number of training steps, the training process will stop, regardless of whether the model has fully converged or not. This is useful for setting a limit on the training duration in terms of the number of steps.
81 | 5. **`--every_n_train_steps`**: It is an integer parameter with a default of 2000. It determines how often certain operations (such as saving checkpoints, logging training progress, etc.) will be performed during the training. For example, with a value of 2000, these operations will occur every 2000 training steps.
82 | 
83 | ## 3. Dataset and Experiment Settings
84 | 1. **`--dataset_path`**: This is a string parameter that indicates the path to the dataset in the Hugging Face dataset format. The default value is "./zh_lora_dataset". You need to ensure that the dataset at this path is correctly formatted and contains the necessary data for training.
85 | 2. **`--exp_name`**: It is a string parameter used to name the experiment. The default value is "chinese_rap_lora". This name can be used to distinguish different training experiments, and it is often used in logging and saving checkpoints to organize and identify the results of different runs.
86 | 
87 | ## 4. Training Precision and Gradient Settings
88 | 1. **`--precision`**: This parameter specifies the precision of the training. It is a string with a default value of "32", which usually means 32-bit floating-point precision. Higher precision can lead to more accurate training but may also consume more memory and computational resources. You can adjust this value depending on your hardware capabilities and the requirements of your model.
89 | 2.
**`--accumulate_grad_batches`**: It is an integer parameter with a default value of 1. It determines how many batches of data will be used to accumulate gradients before performing an optimization step. For example, if you set it to 4, the gradients from 4 consecutive batches will be accumulated, and then the model's weights will be updated. This can be useful in scenarios where you want to simulate larger batch sizes when your available memory does not allow for actual large batch training. 90 | 3. **`--gradient_clip_val`**: This is a floating-point parameter with a default value of 0.5. It is used to clip the gradients during the backpropagation process. Clipping the gradients helps prevent the issue of gradient explosion, where the gradients become extremely large and cause the model to become unstable. By setting a clip value, the gradients will be adjusted to be within a certain range. 91 | 4. **`--gradient_clip_algorithm`**: It is a string parameter with a default value of "norm". This parameter specifies the algorithm used for gradient clipping. The "norm" algorithm is one common method, but there may be other algorithms available depending on the implementation of the training framework. 92 | 93 | ## 5. Checkpoint and Logging Settings 94 | 1. **`--devices`**: This is an integer parameter with a default value of 1. It specifies the number of devices (such as GPUs) that will be used for training. If you have multiple GPUs available and want to use them for parallel training, you can increase this number accordingly. 95 | 2. **`--logger_dir`**: It is a string parameter with a default value of "./exps/logs/". This parameter indicates the directory where the training logs will be saved. The logs can be useful for monitoring the training progress, analyzing the performance of the model during training, and debugging any issues that may arise. 96 | 3. **`--ckpt_path`**: It is a string parameter with a default value of None. If you want to resume training from a previously saved checkpoint, you can specify the path to the checkpoint file using this parameter. If set to None, the training will start from scratch. 97 | 4. **`--checkpoint_dir`**: This is a string parameter with a default value of None. It specifies the directory where the checkpoints of the model will be saved during the training process. If set to None, checkpoints may not be saved or may be saved in a default location depending on the training framework. 98 | 99 | ## 6. Validation and Reloading Settings 100 | 1. **`--reload_dataloaders_every_n_epochs`**: It is an integer parameter with a default value of 1. It determines how often the data loaders will be reloaded during the training process. Reloading the data loaders can be useful when you want to ensure that the data is shuffled or processed differently for each epoch, especially when dealing with datasets that may change or have some specific requirements. 101 | 2. **`--every_plot_step`**: It is an integer parameter with a default value of 2000. It specifies how often some visualizations or plots (such as loss curves, accuracy plots, etc.) will be generated during the training process. For example, with a value of 2000, the plots will be updated every 2000 training steps. 102 | 3. **`--val_check_interval`**: This is an integer parameter with a default value of None. It determines how often the validation process will be performed during the training. If set to a positive integer, the model will be evaluated on the validation dataset every specified number of steps. 
If set to None, no regular validation checks will be performed. 103 | 4. **`--lora_config_path`**: It is a string parameter with a default value of "config/zh_rap_lora_config.json". This parameter specifies the path to the configuration file for the Lora (Low-Rank Adaptation) module. The Lora configuration file contains settings related to the Lora module, such as the rank of the low-rank matrices, the learning rate for the Lora parameters, etc. 104 | -------------------------------------------------------------------------------- /ZH_RAP_LORA.md: -------------------------------------------------------------------------------- 1 | # 🎤 RapMachine Release 2 | 3 | We meticulously curated and trained this model on Chinese rap/hip-hop datasets, with rigorous data cleaning and recaptioning. The results include: 4 | - **Improved pronunciation** for Chinese lyrics 5 | - **Enhanced adherence** to hip-hop and electronic music styles 6 | - **Greater diversity** in hip-hop vocal performances 7 | 8 | ### **How to Use** 9 | 1. Generate **higher-quality Chinese songs** (⚠️ It's not just for Chinese songs. You can also use it in other ways. ) 10 | 11 | 2. Create **better hip-hop tracks** 12 | 3. Blend it with other genres to: 13 | - Produce music with **richer vocal details** 14 | - Experiment with **underground or street culture flavors** 15 | 4. Fine-tune outputs using the following dimensions: 16 | 17 | - **`vocal_timbre`**: Describes the inherent qualities of the voice. 18 | - Examples: *Bright, dark, warm, cold, breathy, nasal, gritty, smooth, husky, metallic, whispery, resonant, airy, smoky, sultry, light, clear, high-pitched, raspy, powerful, ethereal, flute-like, hollow, velvety, shrill, hoarse, mellow, thin, thick, reedy, silvery, twangy.* 19 | 20 | - **`techniques`**: 21 | - Examples: 22 | - **Rap styles**: `mumble rap`, `chopper rap`, `melodic rap`, `lyrical rap`, `trap flow`, `double-time rap` 23 | - **Vocal effects**: `auto-tune`, `reverb`, `delay`, `distortion` 24 | - **Delivery styles**: `whispered`, `shouted`, `spoken word`, `narration`, `singing` 25 | - **Other vocalizations**: `ad-libs`, `call-and-response`, `harmonized` 26 | 27 | --- 28 | 29 | ## Community Note 30 | We’ve **revamped and expanded** the LoRA training guide with finer details. This release is a **proof of concept**—showcasing the potential of **ACE-Step**. 31 | 32 | While a Chinese rap LoRA might seem niche for non-Chinese communities, we consistently demonstrate through such projects that ACE-step - as a music generation foundation model - holds boundless potential. It doesn't just improve pronunciation in one language, but spawns new styles. 33 | 34 | The universal human appreciation of music is a precious asset. Like abstract LEGO blocks, these elements will eventually combine in more organic ways. May our open-source contributions propel the evolution of musical history forward. 35 | 36 | **Enjoy it, customize it, and create something entirely new.** 37 | 38 | We can’t wait to hear what you’ll build! 
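As a concrete illustration of wiring a LoRA into the bundled tooling, here is a minimal sketch using `acestep.data_sampler.DataSampler`; the Hugging Face repo id below is an assumed placeholder, so substitute whatever LoRA path or id you actually use:

```python
from acestep.data_sampler import DataSampler

sampler = DataSampler()
# With a LoRA path other than None/"none", sample() draws from the
# examples/zh_rap_lora/input_params presets and writes the path into
# the returned dict's "lora_name_or_path" field.
params = sampler.sample(lora_name_or_path="ACE-Step/ACE-Step-v1-chinese-rap-LoRA")  # assumed id
print(params["lora_name_or_path"])
```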
39 | -------------------------------------------------------------------------------- /acestep/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | ACE-Step: A Step Towards Music Generation Foundation Model 3 | 4 | https://github.com/ace-step/ACE-Step 5 | 6 | Apache 2.0 License 7 | """ 8 | -------------------------------------------------------------------------------- /acestep/apg_guidance.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class MomentumBuffer: 5 | def __init__(self, momentum: float = -0.75): 6 | self.momentum = momentum 7 | self.running_average = 0 8 | 9 | def update(self, update_value: torch.Tensor): 10 | new_average = self.momentum * self.running_average 11 | self.running_average = update_value + new_average 12 | 13 | 14 | def project( 15 | v0: torch.Tensor, # [B, C, H, W] 16 | v1: torch.Tensor, # [B, C, H, W] 17 | dims=[-1, -2], 18 | ): 19 | dtype = v0.dtype 20 | device_type = v0.device.type 21 | if device_type == "mps": 22 | v0, v1 = v0.cpu(), v1.cpu() 23 | 24 | v0, v1 = v0.double(), v1.double() 25 | v1 = torch.nn.functional.normalize(v1, dim=dims) 26 | v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1 27 | v0_orthogonal = v0 - v0_parallel 28 | return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to( 29 | device_type 30 | ) 31 | 32 | 33 | def apg_forward( 34 | pred_cond: torch.Tensor, # [B, C, H, W] 35 | pred_uncond: torch.Tensor, # [B, C, H, W] 36 | guidance_scale: float, 37 | momentum_buffer: MomentumBuffer = None, 38 | eta: float = 0.0, 39 | norm_threshold: float = 2.5, 40 | dims=[-1, -2], 41 | ): 42 | diff = pred_cond - pred_uncond 43 | if momentum_buffer is not None: 44 | momentum_buffer.update(diff) 45 | diff = momentum_buffer.running_average 46 | 47 | if norm_threshold > 0: 48 | ones = torch.ones_like(diff) 49 | diff_norm = diff.norm(p=2, dim=dims, keepdim=True) 50 | scale_factor = torch.minimum(ones, norm_threshold / diff_norm) 51 | diff = diff * scale_factor 52 | 53 | diff_parallel, diff_orthogonal = project(diff, pred_cond, dims) 54 | normalized_update = diff_orthogonal + eta * diff_parallel 55 | pred_guided = pred_cond + (guidance_scale - 1) * normalized_update 56 | return pred_guided 57 | 58 | 59 | def cfg_forward(cond_output, uncond_output, cfg_strength): 60 | return uncond_output + cfg_strength * (cond_output - uncond_output) 61 | 62 | 63 | def cfg_double_condition_forward( 64 | cond_output, 65 | uncond_output, 66 | only_text_cond_output, 67 | guidance_scale_text, 68 | guidance_scale_lyric, 69 | ): 70 | return ( 71 | (1 - guidance_scale_text) * uncond_output 72 | + (guidance_scale_text - guidance_scale_lyric) * only_text_cond_output 73 | + guidance_scale_lyric * cond_output 74 | ) 75 | 76 | 77 | def optimized_scale(positive_flat, negative_flat): 78 | 79 | # Calculate dot production 80 | dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True) 81 | 82 | # Squared norm of uncondition 83 | squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8 84 | 85 | # st_star = v_cond^T * v_uncond / ||v_uncond||^2 86 | st_star = dot_product / squared_norm 87 | 88 | return st_star 89 | 90 | 91 | def cfg_zero_star( 92 | noise_pred_with_cond, 93 | noise_pred_uncond, 94 | guidance_scale, 95 | i, 96 | zero_steps=1, 97 | use_zero_init=True, 98 | ): 99 | bsz = noise_pred_with_cond.shape[0] 100 | positive_flat = noise_pred_with_cond.view(bsz, -1) 101 | negative_flat = noise_pred_uncond.view(bsz, -1) 102 | 
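    # Added note: optimized_scale() above returns the least-squares coefficient
    # alpha = <cond, uncond> / ||uncond||^2, i.e. the projection of the conditional
    # prediction onto the unconditional one. Guidance below pivots around the
    # rescaled unconditional branch (uncond * alpha), and the first `zero_steps`
    # steps are zeroed out when use_zero_init is True (the "CFG-Zero*" trick).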
alpha = optimized_scale(positive_flat, negative_flat)
103 |     alpha = alpha.view(bsz, 1, 1, 1)
104 |     if (i <= zero_steps) and use_zero_init:
105 |         noise_pred = noise_pred_with_cond * 0.0
106 |     else:
107 |         noise_pred = noise_pred_uncond * alpha + guidance_scale * (
108 |             noise_pred_with_cond - noise_pred_uncond * alpha
109 |         )
110 |     return noise_pred
111 | 
--------------------------------------------------------------------------------
/acestep/cpu_offload.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import functools
3 | from typing import Callable, TypeVar
4 | 
5 | 
6 | class CpuOffloader:
7 |     def __init__(self, model, device="cpu"):
8 |         self.model = model
9 |         self.original_device = device
10 |         self.original_dtype = model.dtype
11 | 
12 |     def __enter__(self):
13 |         if not hasattr(self.model, "torchao_quantized"):
14 |             self.model.to(self.original_device, dtype=self.original_dtype)
15 |         return self.model
16 | 
17 |     def __exit__(self, *args):
18 |         if not hasattr(self.model, "torchao_quantized"):
19 |             self.model.to("cpu")
20 |         if torch.cuda.is_available():
21 |             torch.cuda.empty_cache()
22 |             torch.cuda.synchronize()
23 | 
24 | 
25 | T = TypeVar('T')
26 | 
27 | def cpu_offload(model_attr: str):
28 |     def decorator(func: Callable[..., T]) -> Callable[..., T]:
29 |         @functools.wraps(func)
30 |         def wrapper(self, *args, **kwargs):
31 |             if not self.cpu_offload:
32 |                 return func(self, *args, **kwargs)
33 | 
34 |             # Get the device from the class
35 |             device = self.device
36 |             # Get the model from the class attribute
37 |             model = getattr(self, model_attr)
38 | 
39 |             with CpuOffloader(model, device):
40 |                 return func(self, *args, **kwargs)
41 | 
42 |         return wrapper
43 |     return decorator
44 | 
--------------------------------------------------------------------------------
/acestep/data_sampler.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 | import random
4 | 
5 | 
6 | DEFAULT_ROOT_DIR = "examples/default/input_params"
7 | ZH_RAP_LORA_ROOT_DIR = "examples/zh_rap_lora/input_params"
8 | 
9 | class DataSampler:
10 |     def __init__(self, root_dir=DEFAULT_ROOT_DIR):
11 |         self.root_dir = root_dir
12 |         self.input_params_files = list(Path(self.root_dir).glob("*.json"))
13 |         self.zh_rap_lora_input_params_files = list(Path(ZH_RAP_LORA_ROOT_DIR).glob("*.json"))
14 | 
15 |     def load_json(self, file_path):
16 |         with open(file_path, "r", encoding="utf-8") as f:
17 |             return json.load(f)
18 | 
19 |     def sample(self, lora_name_or_path=None):
20 |         if lora_name_or_path is None or lora_name_or_path == "none":
21 |             json_path = random.choice(self.input_params_files)
22 |             json_data = self.load_json(json_path)
23 |         else:
24 |             json_path = random.choice(self.zh_rap_lora_input_params_files)
25 |             json_data = self.load_json(json_path)
26 |             # Update the lora_name_or_path in the json_data
27 |             json_data["lora_name_or_path"] = lora_name_or_path
28 | 
29 |         return json_data
30 | 
--------------------------------------------------------------------------------
/acestep/gui.py:
--------------------------------------------------------------------------------
1 | """
2 | ACE-Step: A Step Towards Music Generation Foundation Model
3 | 
4 | https://github.com/ace-step/ACE-Step
5 | 
6 | Apache 2.0 License
7 | """
8 | 
9 | import os
10 | import click
11 | 
12 | @click.command()
13 | @click.option(
14 |     "--checkpoint_path",
15 |     type=str,
16 |     default="",
17 | help="Path to the checkpoint directory. Downloads automatically if empty.", 18 | ) 19 | @click.option( 20 | "--server_name", 21 | type=str, 22 | default="127.0.0.1", 23 | help="The server name to use for the Gradio app.", 24 | ) 25 | @click.option( 26 | "--port", type=int, default=7865, help="The port to use for the Gradio app." 27 | ) 28 | @click.option("--device_id", type=int, default=0, help="The CUDA device ID to use.") 29 | @click.option( 30 | "--share", 31 | type=click.BOOL, 32 | default=False, 33 | help="Whether to create a public, shareable link for the Gradio app.", 34 | ) 35 | @click.option( 36 | "--bf16", 37 | type=click.BOOL, 38 | default=True, 39 | help="Whether to use bfloat16 precision. Turn off if using MPS.", 40 | ) 41 | @click.option( 42 | "--torch_compile", type=click.BOOL, default=False, help="Whether to use torch.compile." 43 | ) 44 | @click.option( 45 | "--cpu_offload", type=bool, default=False, help="Whether to use CPU offloading (only load current stage's model to GPU)" 46 | ) 47 | @click.option( 48 | "--overlapped_decode", type=bool, default=False, help="Whether to use overlapped decoding (run dcae and vocoder using sliding windows)" 49 | ) 50 | def main(checkpoint_path, server_name, port, device_id, share, bf16, torch_compile, cpu_offload, overlapped_decode): 51 | """ 52 | Main function to launch the ACE Step pipeline demo. 53 | """ 54 | 55 | os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) 56 | 57 | from acestep.ui.components import create_main_demo_ui 58 | from acestep.pipeline_ace_step import ACEStepPipeline 59 | from acestep.data_sampler import DataSampler 60 | 61 | model_demo = ACEStepPipeline( 62 | checkpoint_dir=checkpoint_path, 63 | dtype="bfloat16" if bf16 else "float32", 64 | torch_compile=torch_compile, 65 | cpu_offload=cpu_offload, 66 | overlapped_decode=overlapped_decode 67 | ) 68 | data_sampler = DataSampler() 69 | 70 | demo = create_main_demo_ui( 71 | text2music_process_func=model_demo.__call__, 72 | sample_data_func=data_sampler.sample, 73 | load_data_func=data_sampler.load_json, 74 | ) 75 | demo.launch(server_name=server_name, server_port=port, share=share) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /acestep/language_segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | from acestep.language_segmentation.LangSegment import LangSegment 2 | 3 | 4 | # release 5 | __version__ = "0.3.5" 6 | 7 | 8 | # develop 9 | __develop__ = "dev-0.0.1" 10 | -------------------------------------------------------------------------------- /acestep/language_segmentation/language_filters.py: -------------------------------------------------------------------------------- 1 | default = [ 2 | "af", 3 | "am", 4 | "an", 5 | "ar", 6 | "as", 7 | "az", 8 | "be", 9 | "bg", 10 | "bn", 11 | "br", 12 | "bs", 13 | "ca", 14 | "cs", 15 | "cy", 16 | "da", 17 | "de", 18 | "dz", 19 | "el", 20 | "en", 21 | "eo", 22 | "es", 23 | "et", 24 | "eu", 25 | "fa", 26 | "fi", 27 | "fo", 28 | "fr", 29 | "ga", 30 | "gl", 31 | "gu", 32 | "he", 33 | "hi", 34 | "hr", 35 | "ht", 36 | "hu", 37 | "hy", 38 | "id", 39 | "is", 40 | "it", 41 | "ja", 42 | "jv", 43 | "ka", 44 | "kk", 45 | "km", 46 | "kn", 47 | "ko", 48 | "ku", 49 | "ky", 50 | "la", 51 | "lb", 52 | "lo", 53 | "lt", 54 | "lv", 55 | "mg", 56 | "mk", 57 | "ml", 58 | "mn", 59 | "mr", 60 | "ms", 61 | "mt", 62 | "nb", 63 | "ne", 64 | "nl", 65 | "nn", 66 | "no", 67 | "oc", 68 | "or", 69 | "pa", 
70 | "pl", 71 | "ps", 72 | "pt", 73 | "qu", 74 | "ro", 75 | "ru", 76 | "rw", 77 | "se", 78 | "si", 79 | "sk", 80 | "sl", 81 | "sq", 82 | "sr", 83 | "sv", 84 | "sw", 85 | "ta", 86 | "te", 87 | "th", 88 | "tl", 89 | "tr", 90 | "ug", 91 | "uk", 92 | "ur", 93 | "vi", 94 | "vo", 95 | "wa", 96 | "xh", 97 | "zh", 98 | "zu", 99 | ] -------------------------------------------------------------------------------- /acestep/language_segmentation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # This file intentionally left blank for Python to recognize the directory as a package. 2 | -------------------------------------------------------------------------------- /acestep/language_segmentation/utils/num.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Digital processing from GPT_SoVITS num.py (thanks) 15 | """ 16 | Rules to verbalize numbers into Chinese characters. 17 | https://zh.wikipedia.org/wiki/中文数字#現代中文 18 | """ 19 | 20 | import re 21 | from collections import OrderedDict 22 | from typing import List 23 | 24 | DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")} 25 | UNITS = OrderedDict( 26 | { 27 | 1: "十", 28 | 2: "百", 29 | 3: "千", 30 | 4: "万", 31 | 8: "亿", 32 | } 33 | ) 34 | 35 | COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)" 36 | 37 | # 分数表达式 38 | RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)") 39 | 40 | 41 | def replace_frac(match) -> str: 42 | """ 43 | Args: 44 | match (re.Match) 45 | Returns: 46 | str 47 | """ 48 | sign = match.group(1) 49 | nominator = match.group(2) 50 | denominator = match.group(3) 51 | sign: str = "负" if sign else "" 52 | nominator: str = num2str(nominator) 53 | denominator: str = num2str(denominator) 54 | result = f"{sign}{denominator}分之{nominator}" 55 | return result 56 | 57 | 58 | # 百分数表达式 59 | RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%") 60 | 61 | 62 | def replace_percentage(match) -> str: 63 | """ 64 | Args: 65 | match (re.Match) 66 | Returns: 67 | str 68 | """ 69 | sign = match.group(1) 70 | percent = match.group(2) 71 | sign: str = "负" if sign else "" 72 | percent: str = num2str(percent) 73 | result = f"{sign}百分之{percent}" 74 | return result 75 | 76 | 77 | # 整数表达式 78 | # 带负号的整数 -10 79 | RE_INTEGER = re.compile(r"(-)" r"(\d+)") 80 | 81 | 82 | def replace_negative_num(match) -> str: 83 | """ 84 | Args: 85 | match (re.Match) 86 | Returns: 
87 |         str
88 |     """
89 |     sign = match.group(1)
90 |     number = match.group(2)
91 |     sign: str = "负" if sign else ""
92 |     number: str = num2str(number)
93 |     result = f"{sign}{number}"
94 |     return result
95 | 
96 | 
97 | # IDs: unsigned integers
98 | # e.g. 00078
99 | RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
100 | 
101 | 
102 | def replace_default_num(match):
103 |     """
104 |     Args:
105 |         match (re.Match)
106 |     Returns:
107 |         str
108 |     """
109 |     number = match.group(0)
110 |     return verbalize_digit(number, alt_one=True)
111 | 
112 | 
113 | # addition, subtraction, multiplication, division
114 | # RE_ASMD = re.compile(
115 | #     r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
116 | RE_ASMD = re.compile(
117 |     r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
118 | )
119 | 
120 | asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}
121 | 
122 | 
123 | def replace_asmd(match) -> str:
124 |     """
125 |     Args:
126 |         match (re.Match)
127 |     Returns:
128 |         str
129 |     """
130 |     result = match.group(1) + asmd_map[match.group(8)] + match.group(9)
131 |     return result
132 | 
133 | 
134 | # superscript powers
135 | RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
136 | 
137 | power_map = {
138 |     "⁰": "0",
139 |     "¹": "1",
140 |     "²": "2",
141 |     "³": "3",
142 |     "⁴": "4",
143 |     "⁵": "5",
144 |     "⁶": "6",
145 |     "⁷": "7",
146 |     "⁸": "8",
147 |     "⁹": "9",
148 |     "ˣ": "x",
149 |     "ʸ": "y",
150 |     "ⁿ": "n",
151 | }
152 | 
153 | 
154 | def replace_power(match) -> str:
155 |     """
156 |     Args:
157 |         match (re.Match)
158 |     Returns:
159 |         str
160 |     """
161 |     power_num = ""
162 |     for m in match.group(0):
163 |         power_num += power_map[m]
164 |     result = "的" + power_num + "次方"
165 |     return result
166 | 
167 | 
168 | # number expressions
169 | # pure decimals
170 | RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
171 | # positive integer + quantifier/classifier
172 | RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
173 | RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
174 | 
175 | 
176 | def replace_positive_quantifier(match) -> str:
177 |     """
178 |     Args:
179 |         match (re.Match)
180 |     Returns:
181 |         str
182 |     """
183 |     number = match.group(1)
184 |     match_2 = match.group(2)
185 |     if match_2 == "+":
186 |         match_2 = "多"
187 |     match_2: str = match_2 if match_2 else ""
188 |     quantifiers: str = match.group(3)
189 |     number: str = num2str(number)
190 |     result = f"{number}{match_2}{quantifiers}"
191 |     return result
192 | 
193 | 
194 | def replace_number(match) -> str:
195 |     """
196 |     Args:
197 |         match (re.Match)
198 |     Returns:
199 |         str
200 |     """
201 |     sign = match.group(1)
202 |     number = match.group(2)
203 |     pure_decimal = match.group(5)
204 |     if pure_decimal:
205 |         result = num2str(pure_decimal)
206 |     else:
207 |         sign: str = "负" if sign else ""
208 |         number: str = num2str(number)
209 |         result = f"{sign}{number}"
210 |     return result
211 | 
212 | 
213 | # range expressions
214 | # match.group(1) and match.group(8) are copied from RE_NUMBER
215 | 
216 | RE_RANGE = re.compile(
217 |     r"""
218 |     (?<![\d\+\-\×÷=])     # lookbehind: no digit or operator before the range
219 |     ((-?)((\d+)(\.\d+)?)) # start of the range: a negative or positive number
220 |     [-~]                  # range separator
221 |     ((-?)((\d+)(\.\d+)?)) # end of the range: a negative or positive number
222 |     (?![\d\+\-\×÷=])      # lookahead: no digit or operator after the range
223 |     """,
224 |     re.VERBOSE,
225 | )
226 | 
227 | 
228 | def replace_range(match) -> 
str: 229 | """ 230 | Args: 231 | match (re.Match) 232 | Returns: 233 | str 234 | """ 235 | first, second = match.group(1), match.group(6) 236 | first = RE_NUMBER.sub(replace_number, first) 237 | second = RE_NUMBER.sub(replace_number, second) 238 | result = f"{first}到{second}" 239 | return result 240 | 241 | 242 | # ~至表达式 243 | RE_TO_RANGE = re.compile( 244 | r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)" 245 | ) 246 | 247 | 248 | def replace_to_range(match) -> str: 249 | """ 250 | Args: 251 | match (re.Match) 252 | Returns: 253 | str 254 | """ 255 | result = match.group(0).replace("~", "至") 256 | return result 257 | 258 | 259 | def _get_value(value_string: str, use_zero: bool = True) -> List[str]: 260 | stripped = value_string.lstrip("0") 261 | if len(stripped) == 0: 262 | return [] 263 | elif len(stripped) == 1: 264 | if use_zero and len(stripped) < len(value_string): 265 | return [DIGITS["0"], DIGITS[stripped]] 266 | else: 267 | return [DIGITS[stripped]] 268 | else: 269 | largest_unit = next( 270 | power for power in reversed(UNITS.keys()) if power < len(stripped) 271 | ) 272 | first_part = value_string[:-largest_unit] 273 | second_part = value_string[-largest_unit:] 274 | return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part) 275 | 276 | 277 | def verbalize_cardinal(value_string: str) -> str: 278 | if not value_string: 279 | return "" 280 | 281 | # 000 -> '零' , 0 -> '零' 282 | value_string = value_string.lstrip("0") 283 | if len(value_string) == 0: 284 | return DIGITS["0"] 285 | 286 | result_symbols = _get_value(value_string) 287 | # verbalized number starting with '一十*' is abbreviated as `十*` 288 | if ( 289 | len(result_symbols) >= 2 290 | and result_symbols[0] == DIGITS["1"] 291 | and result_symbols[1] == UNITS[1] 292 | ): 293 | result_symbols = result_symbols[1:] 294 | return "".join(result_symbols) 295 | 296 | 297 | def verbalize_digit(value_string: str, alt_one=False) -> str: 298 | result_symbols = [DIGITS[digit] for digit in value_string] 299 | result = "".join(result_symbols) 300 | if alt_one: 301 | result = result.replace("一", "幺") 302 | return result 303 | 304 | 305 | def num2str(value_string: str) -> str: 306 | integer_decimal = value_string.split(".") 307 | if len(integer_decimal) == 1: 308 | integer = integer_decimal[0] 309 | decimal = "" 310 | elif len(integer_decimal) == 2: 311 | integer, decimal = integer_decimal 312 | else: 313 | raise ValueError( 314 | f"The value string: '${value_string}' has more than one point in it." 
315 | ) 316 | 317 | result = verbalize_cardinal(integer) 318 | 319 | decimal = decimal.rstrip("0") 320 | if decimal: 321 | # '.22' is verbalized as '零点二二' 322 | # '3.20' is verbalized as '三点二 323 | result = result if result else "零" 324 | result += "点" + verbalize_digit(decimal) 325 | return result 326 | 327 | 328 | if __name__ == "__main__": 329 | 330 | text = "" 331 | text = num2str(text) 332 | print(text) 333 | pass 334 | -------------------------------------------------------------------------------- /acestep/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/acestep/models/__init__.py -------------------------------------------------------------------------------- /acestep/models/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Tuple, Union 15 | 16 | import torch 17 | import torch.nn.functional as F 18 | from torch import nn 19 | 20 | from diffusers.utils import logging 21 | from diffusers.models.normalization import RMSNorm 22 | 23 | 24 | try: 25 | # from .dcformer import DCMHAttention 26 | from .customer_attention_processor import ( 27 | Attention, 28 | CustomLiteLAProcessor2_0, 29 | CustomerAttnProcessor2_0, 30 | ) 31 | except ImportError: 32 | # from dcformer import DCMHAttention 33 | from customer_attention_processor import ( 34 | Attention, 35 | CustomLiteLAProcessor2_0, 36 | CustomerAttnProcessor2_0, 37 | ) 38 | 39 | 40 | logger = logging.get_logger(__name__) 41 | 42 | 43 | def val2list(x: list or tuple or any, repeat_time=1) -> list: # type: ignore 44 | """Repeat `val` for `repeat_time` times and return the list or val if list/tuple.""" 45 | if isinstance(x, (list, tuple)): 46 | return list(x) 47 | return [x for _ in range(repeat_time)] 48 | 49 | 50 | def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: # type: ignore 51 | """Return tuple with min_len by repeating element at idx_repeat.""" 52 | # convert to list first 53 | x = val2list(x) 54 | 55 | # repeat elements if necessary 56 | if len(x) > 0: 57 | x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))] 58 | 59 | return tuple(x) 60 | 61 | 62 | def t2i_modulate(x, shift, scale): 63 | return x * (1 + scale) + shift 64 | 65 | 66 | def get_same_padding( 67 | kernel_size: Union[int, Tuple[int, ...]], 68 | ) -> Union[int, Tuple[int, ...]]: 69 | if isinstance(kernel_size, tuple): 70 | return tuple([get_same_padding(ks) for ks in kernel_size]) 71 | else: 72 | assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be odd number" 73 | return kernel_size // 2 74 | 75 | 76 | class ConvLayer(nn.Module): 77 | def __init__( 78 | self, 79 | in_dim: int, 80 | out_dim: int, 81 | kernel_size=3, 82 | stride=1, 83 | 
dilation=1, 84 | groups=1, 85 | padding: Union[int, None] = None, 86 | use_bias=False, 87 | norm=None, 88 | act=None, 89 | ): 90 | super().__init__() 91 | if padding is None: 92 | padding = get_same_padding(kernel_size) 93 | padding *= dilation 94 | 95 | self.in_dim = in_dim 96 | self.out_dim = out_dim 97 | self.kernel_size = kernel_size 98 | self.stride = stride 99 | self.dilation = dilation 100 | self.groups = groups 101 | self.padding = padding 102 | self.use_bias = use_bias 103 | 104 | self.conv = nn.Conv1d( 105 | in_dim, 106 | out_dim, 107 | kernel_size=kernel_size, 108 | stride=stride, 109 | padding=padding, 110 | dilation=dilation, 111 | groups=groups, 112 | bias=use_bias, 113 | ) 114 | if norm is not None: 115 | self.norm = RMSNorm(out_dim, elementwise_affine=False) 116 | else: 117 | self.norm = None 118 | if act is not None: 119 | self.act = nn.SiLU(inplace=True) 120 | else: 121 | self.act = None 122 | 123 | def forward(self, x: torch.Tensor) -> torch.Tensor: 124 | x = self.conv(x) 125 | if self.norm: 126 | x = self.norm(x) 127 | if self.act: 128 | x = self.act(x) 129 | return x 130 | 131 | 132 | class GLUMBConv(nn.Module): 133 | def __init__( 134 | self, 135 | in_features: int, 136 | hidden_features: int, 137 | out_feature=None, 138 | kernel_size=3, 139 | stride=1, 140 | padding: Union[int, None] = None, 141 | use_bias=False, 142 | norm=(None, None, None), 143 | act=("silu", "silu", None), 144 | dilation=1, 145 | ): 146 | out_feature = out_feature or in_features 147 | super().__init__() 148 | use_bias = val2tuple(use_bias, 3) 149 | norm = val2tuple(norm, 3) 150 | act = val2tuple(act, 3) 151 | 152 | self.glu_act = nn.SiLU(inplace=False) 153 | self.inverted_conv = ConvLayer( 154 | in_features, 155 | hidden_features * 2, 156 | 1, 157 | use_bias=use_bias[0], 158 | norm=norm[0], 159 | act=act[0], 160 | ) 161 | self.depth_conv = ConvLayer( 162 | hidden_features * 2, 163 | hidden_features * 2, 164 | kernel_size, 165 | stride=stride, 166 | groups=hidden_features * 2, 167 | padding=padding, 168 | use_bias=use_bias[1], 169 | norm=norm[1], 170 | act=None, 171 | dilation=dilation, 172 | ) 173 | self.point_conv = ConvLayer( 174 | hidden_features, 175 | out_feature, 176 | 1, 177 | use_bias=use_bias[2], 178 | norm=norm[2], 179 | act=act[2], 180 | ) 181 | 182 | def forward(self, x: torch.Tensor) -> torch.Tensor: 183 | x = x.transpose(1, 2) 184 | x = self.inverted_conv(x) 185 | x = self.depth_conv(x) 186 | 187 | x, gate = torch.chunk(x, 2, dim=1) 188 | gate = self.glu_act(gate) 189 | x = x * gate 190 | 191 | x = self.point_conv(x) 192 | x = x.transpose(1, 2) 193 | 194 | return x 195 | 196 | 197 | class LinearTransformerBlock(nn.Module): 198 | """ 199 | A Sana block with global shared adaptive layer norm (adaLN-single) conditioning. 
200 | """ 201 | 202 | def __init__( 203 | self, 204 | dim, 205 | num_attention_heads, 206 | attention_head_dim, 207 | use_adaln_single=True, 208 | cross_attention_dim=None, 209 | added_kv_proj_dim=None, 210 | context_pre_only=False, 211 | mlp_ratio=4.0, 212 | add_cross_attention=False, 213 | add_cross_attention_dim=None, 214 | qk_norm=None, 215 | ): 216 | super().__init__() 217 | 218 | self.norm1 = RMSNorm(dim, elementwise_affine=False, eps=1e-6) 219 | self.attn = Attention( 220 | query_dim=dim, 221 | cross_attention_dim=cross_attention_dim, 222 | added_kv_proj_dim=added_kv_proj_dim, 223 | dim_head=attention_head_dim, 224 | heads=num_attention_heads, 225 | out_dim=dim, 226 | bias=True, 227 | qk_norm=qk_norm, 228 | processor=CustomLiteLAProcessor2_0(), 229 | ) 230 | 231 | self.add_cross_attention = add_cross_attention 232 | self.context_pre_only = context_pre_only 233 | 234 | if add_cross_attention and add_cross_attention_dim is not None: 235 | self.cross_attn = Attention( 236 | query_dim=dim, 237 | cross_attention_dim=add_cross_attention_dim, 238 | added_kv_proj_dim=add_cross_attention_dim, 239 | dim_head=attention_head_dim, 240 | heads=num_attention_heads, 241 | out_dim=dim, 242 | context_pre_only=context_pre_only, 243 | bias=True, 244 | qk_norm=qk_norm, 245 | processor=CustomerAttnProcessor2_0(), 246 | ) 247 | 248 | self.norm2 = RMSNorm(dim, 1e-06, elementwise_affine=False) 249 | 250 | self.ff = GLUMBConv( 251 | in_features=dim, 252 | hidden_features=int(dim * mlp_ratio), 253 | use_bias=(True, True, False), 254 | norm=(None, None, None), 255 | act=("silu", "silu", None), 256 | ) 257 | self.use_adaln_single = use_adaln_single 258 | if use_adaln_single: 259 | self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) 260 | 261 | def forward( 262 | self, 263 | hidden_states: torch.FloatTensor, 264 | encoder_hidden_states: torch.FloatTensor = None, 265 | attention_mask: torch.FloatTensor = None, 266 | encoder_attention_mask: torch.FloatTensor = None, 267 | rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, 268 | rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, 269 | temb: torch.FloatTensor = None, 270 | ): 271 | 272 | N = hidden_states.shape[0] 273 | 274 | # step 1: AdaLN single 275 | if self.use_adaln_single: 276 | shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( 277 | self.scale_shift_table[None] + temb.reshape(N, 6, -1) 278 | ).chunk(6, dim=1) 279 | 280 | norm_hidden_states = self.norm1(hidden_states) 281 | if self.use_adaln_single: 282 | norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa 283 | 284 | # step 2: attention 285 | if not self.add_cross_attention: 286 | attn_output, encoder_hidden_states = self.attn( 287 | hidden_states=norm_hidden_states, 288 | attention_mask=attention_mask, 289 | encoder_hidden_states=encoder_hidden_states, 290 | encoder_attention_mask=encoder_attention_mask, 291 | rotary_freqs_cis=rotary_freqs_cis, 292 | rotary_freqs_cis_cross=rotary_freqs_cis_cross, 293 | ) 294 | else: 295 | attn_output, _ = self.attn( 296 | hidden_states=norm_hidden_states, 297 | attention_mask=attention_mask, 298 | encoder_hidden_states=None, 299 | encoder_attention_mask=None, 300 | rotary_freqs_cis=rotary_freqs_cis, 301 | rotary_freqs_cis_cross=None, 302 | ) 303 | 304 | if self.use_adaln_single: 305 | attn_output = gate_msa * attn_output 306 | hidden_states = attn_output + hidden_states 307 | 308 | if self.add_cross_attention: 309 | attn_output = self.cross_attn( 310 | hidden_states=hidden_states, 311 
| attention_mask=attention_mask, 312 | encoder_hidden_states=encoder_hidden_states, 313 | encoder_attention_mask=encoder_attention_mask, 314 | rotary_freqs_cis=rotary_freqs_cis, 315 | rotary_freqs_cis_cross=rotary_freqs_cis_cross, 316 | ) 317 | hidden_states = attn_output + hidden_states 318 | 319 | # step 3: add norm 320 | norm_hidden_states = self.norm2(hidden_states) 321 | if self.use_adaln_single: 322 | norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp 323 | 324 | # step 4: feed forward 325 | ff_output = self.ff(norm_hidden_states) 326 | if self.use_adaln_single: 327 | ff_output = gate_mlp * ff_output 328 | 329 | hidden_states = hidden_states + ff_output 330 | 331 | return hidden_states 332 | -------------------------------------------------------------------------------- /acestep/models/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_class_name": "Transformer2DModel", 3 | "_diffusers_version": "0.27.2", 4 | "in_channels": 8, 5 | "num_layers": 24, 6 | "inner_dim": 2560, 7 | "attention_head_dim": 128, 8 | "num_attention_heads": 20, 9 | "mlp_ratio": 2.5, 10 | "out_channels": 8, 11 | "max_position": 32768, 12 | "rope_theta": 1000000.0, 13 | "speaker_embedding_dim": 512, 14 | "text_embedding_dim": 768, 15 | "ssl_encoder_depths": [8, 8], 16 | "ssl_names": ["mert", "m-hubert"], 17 | "ssl_latent_dims": [1024, 768], 18 | "patch_size": [16, 1], 19 | "max_height": 16, 20 | "max_width": 32768, 21 | "lyric_encoder_vocab_size": 6693, 22 | "lyric_hidden_size": 1024 23 | } 24 | -------------------------------------------------------------------------------- /acestep/models/lyrics_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/acestep/models/lyrics_utils/__init__.py -------------------------------------------------------------------------------- /acestep/models/lyrics_utils/lyric_normalizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from opencc import OpenCC 3 | 4 | 5 | t2s_converter = OpenCC("t2s") 6 | s2t_converter = OpenCC("s2t") 7 | 8 | 9 | EMOJI_PATTERN = re.compile( 10 | "[" 11 | "\U0001f600-\U0001f64f" # Emoticons 12 | "]+", 13 | flags=re.UNICODE, 14 | ) 15 | 16 | # Build a translation table for replacing and removing characters 17 | TRANSLATION_TABLE = str.maketrans( 18 | { 19 | "-": " ", # replace '-' with a space 20 | ",": None, 21 | ".": None, 22 | ",": None, 23 | "。": None, 24 | "!": None, 25 | "!": None, 26 | "?": None, 27 | "?": None, 28 | "…": None, 29 | ";": None, 30 | ";": None, 31 | ":": None, 32 | ":": None, 33 | "\u3000": " ", # replace the full-width space with a regular space 34 | } 35 | ) 36 | 37 | # Replace bracketed content, both square brackets and parentheses 38 | BACKSLASH_PATTERN = re.compile(r"\(.*?\)|\[.*?\]") 39 | 40 | SPACE_PATTERN = re.compile("(?
Tensor: 35 | if y.ndim == 3: 36 | y = y.squeeze(1) 37 | 38 | y = torch.nn.functional.pad( 39 | y.unsqueeze(1), 40 | ( 41 | (self.win_length - self.hop_length) // 2, 42 | (self.win_length - self.hop_length + 1) // 2, 43 | ), 44 | mode="reflect", 45 | ).squeeze(1) 46 | dtype = y.dtype 47 | spec = torch.stft( 48 | y.float(), 49 | self.n_fft, 50 | hop_length=self.hop_length, 51 | win_length=self.win_length, 52 | window=self.window, 53 | center=self.center, 54 | pad_mode="reflect", 55 | normalized=False, 56 | onesided=True, 57 | return_complex=True, 58 | ) 59 | spec = torch.view_as_real(spec) 60 | 61 | if self.mode == "pow2_sqrt": 62 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 63 | spec = spec.to(dtype) 64 | return spec 65 | 66 | 67 | class LogMelSpectrogram(nn.Module): 68 | def __init__( 69 | self, 70 | sample_rate=44100, 71 | n_fft=2048, 72 | win_length=2048, 73 | hop_length=512, 74 | n_mels=128, 75 | center=False, 76 | f_min=0.0, 77 | f_max=None, 78 | ): 79 | super().__init__() 80 | 81 | self.sample_rate = sample_rate 82 | self.n_fft = n_fft 83 | self.win_length = win_length 84 | self.hop_length = hop_length 85 | self.center = center 86 | self.n_mels = n_mels 87 | self.f_min = f_min 88 | self.f_max = f_max or sample_rate // 2 89 | 90 | self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center) 91 | self.mel_scale = MelScale( 92 | self.n_mels, 93 | self.sample_rate, 94 | self.f_min, 95 | self.f_max, 96 | self.n_fft // 2 + 1, 97 | "slaney", 98 | "slaney", 99 | ) 100 | 101 | def compress(self, x: Tensor) -> Tensor: 102 | return torch.log(torch.clamp(x, min=1e-5)) 103 | 104 | def decompress(self, x: Tensor) -> Tensor: 105 | return torch.exp(x) 106 | 107 | def forward(self, x: Tensor, return_linear: bool = False) -> Tensor: 108 | linear = self.spectrogram(x) 109 | x = self.mel_scale(linear) 110 | x = self.compress(x) 111 | # print(x.shape) 112 | if return_linear: 113 | return x, self.compress(linear) 114 | 115 | return x 116 | -------------------------------------------------------------------------------- /acestep/schedulers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/acestep/schedulers/__init__.py -------------------------------------------------------------------------------- /acestep/schedulers/scheduling_flow_match_heun_discrete.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
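# --- Editorial usage sketch (not part of the original file) ---
# A minimal driver loop for the scheduler defined below, assuming a
# hypothetical denoiser callable `model(x, t)` and a made-up latent shape;
# only the scheduler calls are taken from this file. Note that
# set_timesteps() repeats the interior timesteps (Heun is a two-stage,
# second-order method), so step() is called twice per Heun update:
#
#     scheduler = FlowMatchHeunDiscreteScheduler(num_train_timesteps=1000, shift=3.0)
#     scheduler.set_timesteps(num_inference_steps=30, device="cuda")
#     x = torch.randn(1, 8, 16, 1024, device="cuda")  # hypothetical latent shape
#     for t in scheduler.timesteps:
#         v = model(x, t)  # model's flow prediction at this timestep
#         x = scheduler.step(v, t, x).prev_sample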
14 | 15 | from dataclasses import dataclass 16 | from typing import Optional, Tuple, Union 17 | 18 | import numpy as np 19 | import torch 20 | 21 | from diffusers.configuration_utils import ConfigMixin, register_to_config 22 | from diffusers.utils import BaseOutput, logging 23 | from diffusers.utils.torch_utils import randn_tensor 24 | from diffusers.schedulers.scheduling_utils import SchedulerMixin 25 | 26 | 27 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 28 | 29 | 30 | @dataclass 31 | class FlowMatchHeunDiscreteSchedulerOutput(BaseOutput): 32 | """ 33 | Output class for the scheduler's `step` function output. 34 | 35 | Args: 36 | prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): 37 | Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the 38 | denoising loop. 39 | """ 40 | 41 | prev_sample: torch.FloatTensor 42 | 43 | 44 | class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin): 45 | """ 46 | Heun scheduler. 47 | 48 | This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic 49 | methods the library implements for all schedulers such as loading and saving. 50 | 51 | Args: 52 | num_train_timesteps (`int`, defaults to 1000): 53 | The number of diffusion steps to train the model. 54 | timestep_spacing (`str`, defaults to `"linspace"`): 55 | The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and 56 | Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. 57 | shift (`float`, defaults to 1.0): 58 | The shift value for the timestep schedule. 59 | """ 60 | 61 | _compatibles = [] 62 | order = 2 63 | 64 | @register_to_config 65 | def __init__( 66 | self, 67 | num_train_timesteps: int = 1000, 68 | shift: float = 1.0, 69 | sigma_max: Optional[float] = 1.0, 70 | ): 71 | timesteps = np.linspace( 72 | 1.0, sigma_max*num_train_timesteps, num_train_timesteps, dtype=np.float32 73 | )[::-1].copy() 74 | timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) 75 | 76 | sigmas = timesteps / num_train_timesteps 77 | sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) 78 | 79 | self.timesteps = sigmas * num_train_timesteps 80 | 81 | self._step_index = None 82 | self._begin_index = None 83 | 84 | self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication 85 | self.sigma_min = self.sigmas[-1].item() 86 | self.sigma_max = self.sigmas[0].item() 87 | 88 | @property 89 | def step_index(self): 90 | """ 91 | The index counter for current timestep. It will increase 1 after each scheduler step. 92 | """ 93 | return self._step_index 94 | 95 | @property 96 | def begin_index(self): 97 | """ 98 | The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 99 | """ 100 | return self._begin_index 101 | 102 | # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index 103 | def set_begin_index(self, begin_index: int = 0): 104 | """ 105 | Sets the begin index for the scheduler. This function should be run from pipeline before the inference. 106 | 107 | Args: 108 | begin_index (`int`): 109 | The begin index for the scheduler. 
110 | """ 111 | self._begin_index = begin_index 112 | 113 | def scale_noise( 114 | self, 115 | sample: torch.FloatTensor, 116 | timestep: Union[float, torch.FloatTensor], 117 | noise: Optional[torch.FloatTensor] = None, 118 | ) -> torch.FloatTensor: 119 | """ 120 | Forward process in flow-matching 121 | 122 | Args: 123 | sample (`torch.FloatTensor`): 124 | The input sample. 125 | timestep (`int`, *optional*): 126 | The current timestep in the diffusion chain. 127 | 128 | Returns: 129 | `torch.FloatTensor`: 130 | A scaled input sample. 131 | """ 132 | if self.step_index is None: 133 | self._init_step_index(timestep) 134 | 135 | sigma = self.sigmas[self.step_index] 136 | sample = sigma * noise + (1.0 - sigma) * sample 137 | 138 | return sample 139 | 140 | def _sigma_to_t(self, sigma): 141 | return sigma * self.config.num_train_timesteps 142 | 143 | def set_timesteps( 144 | self, num_inference_steps: int, device: Union[str, torch.device] = None 145 | ): 146 | """ 147 | Sets the discrete timesteps used for the diffusion chain (to be run before inference). 148 | 149 | Args: 150 | num_inference_steps (`int`): 151 | The number of diffusion steps used when generating samples with a pre-trained model. 152 | device (`str` or `torch.device`, *optional*): 153 | The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 154 | """ 155 | self.num_inference_steps = num_inference_steps 156 | 157 | timesteps = np.linspace( 158 | self._sigma_to_t(self.sigma_max), 159 | self._sigma_to_t(self.sigma_min), 160 | num_inference_steps, 161 | ) 162 | 163 | sigmas = timesteps / self.config.num_train_timesteps 164 | sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) 165 | sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) 166 | 167 | timesteps = sigmas * self.config.num_train_timesteps 168 | timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) 169 | self.timesteps = timesteps.to(device=device) 170 | 171 | sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) 172 | self.sigmas = torch.cat( 173 | [sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]] 174 | ) 175 | 176 | # empty dt and derivative 177 | self.prev_derivative = None 178 | self.dt = None 179 | 180 | self._step_index = None 181 | self._begin_index = None 182 | 183 | def index_for_timestep(self, timestep, schedule_timesteps=None): 184 | if schedule_timesteps is None: 185 | schedule_timesteps = self.timesteps 186 | 187 | indices = (schedule_timesteps == timestep).nonzero() 188 | 189 | # The sigma index that is taken for the **very** first `step` 190 | # is always the second index (or the last index if there is only 1) 191 | # This way we can ensure we don't accidentally skip a sigma in 192 | # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) 193 | pos = 1 if len(indices) > 1 else 0 194 | 195 | return indices[pos].item() 196 | 197 | def _init_step_index(self, timestep): 198 | if self.begin_index is None: 199 | if isinstance(timestep, torch.Tensor): 200 | timestep = timestep.to(self.timesteps.device) 201 | self._step_index = self.index_for_timestep(timestep) 202 | else: 203 | self._step_index = self._begin_index 204 | 205 | @property 206 | def state_in_first_order(self): 207 | return self.dt is None 208 | 209 | def step( 210 | self, 211 | model_output: torch.FloatTensor, 212 | timestep: Union[float, torch.FloatTensor], 213 | sample: torch.FloatTensor, 214 | s_churn: float = 0.0, 215 | s_tmin: float = 0.0, 216 | s_tmax: float = float("inf"), 217 | s_noise: float = 1.0, 218 | generator: Optional[torch.Generator] = None, 219 | return_dict: bool = True, 220 | omega: Union[float, np.ndarray] = 0.0, 221 | ) -> Union[FlowMatchHeunDiscreteSchedulerOutput, Tuple]: 222 | """ 223 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 224 | process from the learned model outputs (most often the predicted noise). 225 | 226 | Args: 227 | model_output (`torch.FloatTensor`): 228 | The direct output from learned diffusion model. 229 | timestep (`float`): 230 | The current discrete timestep in the diffusion chain. 231 | sample (`torch.FloatTensor`): 232 | A current instance of a sample created by the diffusion process. 233 | s_churn (`float`): 234 | s_tmin (`float`): 235 | s_tmax (`float`): 236 | s_noise (`float`, defaults to 1.0): 237 | Scaling factor for noise added to the sample. 238 | generator (`torch.Generator`, *optional*): 239 | A random number generator. 240 | return_dict (`bool`): 241 | Whether or not to return a [`~schedulers.scheduling_flow_match_heun_discrete.FlowMatchHeunDiscreteSchedulerOutput`] or 242 | tuple. 243 | 244 | Returns: 245 | [`~schedulers.scheduling_flow_match_heun_discrete.FlowMatchHeunDiscreteSchedulerOutput`] or `tuple`: 246 | If return_dict is `True`, [`~schedulers.scheduling_flow_match_heun_discrete.FlowMatchHeunDiscreteSchedulerOutput`] is 247 | returned, otherwise a tuple is returned where the first element is the sample tensor. 248 | """ 249 | 250 | def logistic_function(x, L=0.9, U=1.1, x_0=0.0, k=1): 251 | # L = Lower bound, U = Upper bound 252 | # x_0 = Midpoint (x corresponding to y = 1.0) 253 | # k = Steepness, can adjust based on preference 254 | device_ = None # only set when the input is a tensor 255 | 256 | if isinstance(x, torch.Tensor): 257 | device_ = x.device 258 | x = x.to(torch.float).cpu().numpy() 259 | 260 | new_x = L + (U - L) / (1 + np.exp(-k * (x - x_0))) 261 | 262 | if isinstance(new_x, np.ndarray) and device_ is not None: 263 | new_x = torch.from_numpy(new_x).to(device_) 264 | return new_x 265 | 266 | self.omega_bef_rescale = omega 267 | omega = logistic_function(omega, k=0.1) 268 | self.omega_aft_rescale = omega 269 | 270 | if ( 271 | isinstance(timestep, int) 272 | or isinstance(timestep, torch.IntTensor) 273 | or isinstance(timestep, torch.LongTensor) 274 | ): 275 | raise ValueError( 276 | ( 277 | "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" 278 | " `FlowMatchHeunDiscreteScheduler.step()` is not supported. Make sure to pass" 279 | " one of the `scheduler.timesteps` as a timestep."
280 | ), 281 | ) 282 | 283 | if self.step_index is None: 284 | self._init_step_index(timestep) 285 | 286 | # Upcast to avoid precision issues when computing prev_sample 287 | sample = sample.to(torch.float32) 288 | 289 | if self.state_in_first_order: 290 | sigma = self.sigmas[self.step_index] 291 | sigma_next = self.sigmas[self.step_index + 1] 292 | else: 293 | # 2nd order / Heun's method 294 | sigma = self.sigmas[self.step_index - 1] 295 | sigma_next = self.sigmas[self.step_index] 296 | 297 | gamma = ( 298 | min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) 299 | if s_tmin <= sigma <= s_tmax 300 | else 0.0 301 | ) 302 | 303 | sigma_hat = sigma * (gamma + 1) 304 | 305 | if gamma > 0: 306 | noise = randn_tensor( 307 | model_output.shape, 308 | dtype=model_output.dtype, 309 | device=model_output.device, 310 | generator=generator, 311 | ) 312 | eps = noise * s_noise 313 | sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 314 | 315 | if self.state_in_first_order: 316 | # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise 317 | denoised = sample - model_output * sigma 318 | # 2. convert to an ODE derivative for 1st order 319 | derivative = (sample - denoised) / sigma_hat 320 | # 3. Delta timestep 321 | dt = sigma_next - sigma_hat 322 | 323 | # store for 2nd order step 324 | self.prev_derivative = derivative 325 | self.dt = dt 326 | self.sample = sample 327 | else: 328 | # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise 329 | denoised = sample - model_output * sigma_next 330 | # 2. 2nd order / Heun's method 331 | derivative = (sample - denoised) / sigma_next 332 | derivative = 0.5 * (self.prev_derivative + derivative) 333 | 334 | # 3. take prev timestep & sample 335 | dt = self.dt 336 | sample = self.sample 337 | 338 | # free dt and derivative 339 | # Note, this puts the scheduler in "first order mode" 340 | self.prev_derivative = None 341 | self.dt = None 342 | self.sample = None 343 | 344 | # original sample way 345 | # prev_sample = sample + derivative * dt 346 | 347 | dx = derivative * dt 348 | m = dx.mean() 349 | dx_ = (dx - m) * omega + m 350 | prev_sample = sample + dx_ 351 | 352 | # Cast sample back to model compatible dtype 353 | prev_sample = prev_sample.to(model_output.dtype) 354 | 355 | # upon completion increase step index by one 356 | self._step_index += 1 357 | 358 | if not return_dict: 359 | return (prev_sample,) 360 | 361 | return FlowMatchHeunDiscreteSchedulerOutput(prev_sample=prev_sample) 362 | 363 | def __len__(self): 364 | return self.config.num_train_timesteps 365 | -------------------------------------------------------------------------------- /acestep/schedulers/scheduling_flow_match_pingpong.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
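# --- Editorial note (a sketch, not part of the original file) ---
# The "ping-pong" update implemented by step() below differs from the Euler
# and Heun flow-match steps in that it re-noises the sample at every
# iteration instead of integrating an ODE derivative:
#
#     denoised    = sample - sigma * model_output                    # jump to x0 ("ping")
#     prev_sample = (1 - sigma_next) * denoised + sigma_next * eps   # re-noise ("pong")
#
# where eps is freshly drawn Gaussian noise, so each step interpolates the
# predicted clean sample back onto the next noise level.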
14 | 15 | import math 16 | from dataclasses import dataclass 17 | from typing import List, Optional, Tuple, Union 18 | 19 | import numpy as np 20 | import torch 21 | 22 | from diffusers.configuration_utils import ConfigMixin, register_to_config 23 | from diffusers.utils import BaseOutput, logging 24 | from diffusers.schedulers.scheduling_utils import SchedulerMixin 25 | 26 | 27 | logger = logging.get_logger(__name__) # pylint: disable=invalid-name 28 | 29 | 30 | @dataclass 31 | class FlowMatchPingPongSchedulerOutput(BaseOutput): 32 | """ 33 | Output class for the scheduler's `step` function output. 34 | 35 | Args: 36 | prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): 37 | Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the 38 | denoising loop. 39 | """ 40 | 41 | prev_sample: torch.FloatTensor 42 | 43 | 44 | class FlowMatchPingPongScheduler(SchedulerMixin, ConfigMixin): 45 | """ 46 | PingPong scheduler. 47 | 48 | This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic 49 | methods the library implements for all schedulers such as loading and saving. 50 | 51 | Args: 52 | num_train_timesteps (`int`, defaults to 1000): 53 | The number of diffusion steps to train the model. 54 | timestep_spacing (`str`, defaults to `"linspace"`): 55 | The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and 56 | Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. 57 | shift (`float`, defaults to 1.0): 58 | The shift value for the timestep schedule. 59 | """ 60 | 61 | _compatibles = [] 62 | order = 1 63 | 64 | @register_to_config 65 | def __init__( 66 | self, 67 | num_train_timesteps: int = 1000, 68 | shift: float = 1.0, 69 | use_dynamic_shifting=False, 70 | base_shift: Optional[float] = 0.5, 71 | max_shift: Optional[float] = 1.15, 72 | base_image_seq_len: Optional[int] = 256, 73 | max_image_seq_len: Optional[int] = 4096, 74 | sigma_max: Optional[float] = 1.0, 75 | ): 76 | timesteps = np.linspace( 77 | 1, sigma_max*num_train_timesteps, num_train_timesteps, dtype=np.float32 78 | )[::-1].copy() 79 | timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) 80 | 81 | sigmas = timesteps / num_train_timesteps 82 | if not use_dynamic_shifting: 83 | # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution 84 | sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) 85 | 86 | self.timesteps = sigmas * num_train_timesteps 87 | 88 | self._step_index = None 89 | self._begin_index = None 90 | 91 | self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication 92 | self.sigma_min = self.sigmas[-1].item() 93 | self.sigma_max = self.sigmas[0].item() 94 | 95 | @property 96 | def step_index(self): 97 | """ 98 | The index counter for current timestep. It will increase 1 after each scheduler step. 99 | """ 100 | return self._step_index 101 | 102 | @property 103 | def begin_index(self): 104 | """ 105 | The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 106 | """ 107 | return self._begin_index 108 | 109 | # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index 110 | def set_begin_index(self, begin_index: int = 0): 111 | """ 112 | Sets the begin index for the scheduler. 
This function should be run from pipeline before the inference. 113 | 114 | Args: 115 | begin_index (`int`): 116 | The begin index for the scheduler. 117 | """ 118 | self._begin_index = begin_index 119 | 120 | def scale_noise( 121 | self, 122 | sample: torch.FloatTensor, 123 | timestep: Union[float, torch.FloatTensor], 124 | noise: Optional[torch.FloatTensor] = None, 125 | ) -> torch.FloatTensor: 126 | """ 127 | Forward process in flow-matching 128 | 129 | Args: 130 | sample (`torch.FloatTensor`): 131 | The input sample. 132 | timestep (`int`, *optional*): 133 | The current timestep in the diffusion chain. 134 | 135 | Returns: 136 | `torch.FloatTensor`: 137 | A scaled input sample. 138 | """ 139 | # Make sure sigmas and timesteps have the same device and dtype as original_samples 140 | sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) 141 | 142 | if sample.device.type == "mps" and torch.is_floating_point(timestep): 143 | # mps does not support float64 144 | schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) 145 | timestep = timestep.to(sample.device, dtype=torch.float32) 146 | else: 147 | schedule_timesteps = self.timesteps.to(sample.device) 148 | timestep = timestep.to(sample.device) 149 | 150 | # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index 151 | if self.begin_index is None: 152 | step_indices = [ 153 | self.index_for_timestep(t, schedule_timesteps) for t in timestep 154 | ] 155 | elif self.step_index is not None: 156 | # add_noise is called after first denoising step (for inpainting) 157 | step_indices = [self.step_index] * timestep.shape[0] 158 | else: 159 | # add noise is called before first denoising step to create initial latent(img2img) 160 | step_indices = [self.begin_index] * timestep.shape[0] 161 | 162 | sigma = sigmas[step_indices].flatten() 163 | while len(sigma.shape) < len(sample.shape): 164 | sigma = sigma.unsqueeze(-1) 165 | 166 | sample = sigma * noise + (1.0 - sigma) * sample 167 | 168 | return sample 169 | 170 | def _sigma_to_t(self, sigma): 171 | return sigma * self.config.num_train_timesteps 172 | 173 | def time_shift(self, mu: float, sigma: float, t: torch.Tensor): 174 | return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) 175 | 176 | def set_timesteps( 177 | self, 178 | num_inference_steps: int = None, 179 | device: Union[str, torch.device] = None, 180 | sigmas: Optional[List[float]] = None, 181 | mu: Optional[float] = None, 182 | ): 183 | """ 184 | Sets the discrete timesteps used for the diffusion chain (to be run before inference). 185 | 186 | Args: 187 | num_inference_steps (`int`): 188 | The number of diffusion steps used when generating samples with a pre-trained model. 189 | device (`str` or `torch.device`, *optional*): 190 | The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
191 | """ 192 | 193 | if self.config.use_dynamic_shifting and mu is None: 194 | raise ValueError( 195 | "you must pass a value for `mu` when `use_dynamic_shifting` is set to `True`" 196 | ) 197 | 198 | if sigmas is None: 199 | self.num_inference_steps = num_inference_steps 200 | timesteps = np.linspace( 201 | self._sigma_to_t(self.sigma_max), 202 | self._sigma_to_t(self.sigma_min), 203 | num_inference_steps, 204 | ) 205 | 206 | sigmas = timesteps / self.config.num_train_timesteps 207 | 208 | if self.config.use_dynamic_shifting: 209 | sigmas = self.time_shift(mu, 1.0, sigmas) 210 | else: 211 | sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) 212 | 213 | sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) 214 | timesteps = sigmas * self.config.num_train_timesteps 215 | 216 | self.timesteps = timesteps.to(device=device) 217 | self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) 218 | 219 | self._step_index = None 220 | self._begin_index = None 221 | 222 | def index_for_timestep(self, timestep, schedule_timesteps=None): 223 | if schedule_timesteps is None: 224 | schedule_timesteps = self.timesteps 225 | 226 | indices = (schedule_timesteps == timestep).nonzero() 227 | 228 | # The sigma index that is taken for the **very** first `step` 229 | # is always the second index (or the last index if there is only 1) 230 | # This way we can ensure we don't accidentally skip a sigma in 231 | # case we start in the middle of the denoising schedule (e.g. for image-to-image) 232 | pos = 1 if len(indices) > 1 else 0 233 | 234 | return indices[pos].item() 235 | 236 | def _init_step_index(self, timestep): 237 | if self.begin_index is None: 238 | if isinstance(timestep, torch.Tensor): 239 | timestep = timestep.to(self.timesteps.device) 240 | self._step_index = self.index_for_timestep(timestep) 241 | else: 242 | self._step_index = self._begin_index 243 | 244 | def step( 245 | self, 246 | model_output: torch.FloatTensor, 247 | timestep: Union[float, torch.FloatTensor], 248 | sample: torch.FloatTensor, 249 | s_churn: float = 0.0, 250 | s_tmin: float = 0.0, 251 | s_tmax: float = float("inf"), 252 | s_noise: float = 1.0, 253 | generator: Optional[torch.Generator] = None, 254 | return_dict: bool = True, 255 | omega: Union[float, np.ndarray] = 0.0, 256 | ) -> Union[FlowMatchPingPongSchedulerOutput, Tuple]: 257 | """ 258 | Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion 259 | process from the learned model outputs (most often the predicted noise). 260 | 261 | Args: 262 | model_output (`torch.FloatTensor`): 263 | The direct output from learned diffusion model. 264 | timestep (`float`): 265 | The current discrete timestep in the diffusion chain. 266 | sample (`torch.FloatTensor`): 267 | A current instance of a sample created by the diffusion process. 268 | s_churn (`float`): 269 | s_tmin (`float`): 270 | s_tmax (`float`): 271 | s_noise (`float`, defaults to 1.0): 272 | Scaling factor for noise added to the sample. 273 | generator (`torch.Generator`, *optional*): 274 | A random number generator. 275 | return_dict (`bool`): 276 | Whether or not to return a [`~schedulers.scheduling_flow_match_pingpong.FlowMatchPingPongSchedulerOutput`] or 277 | tuple.
278 | 279 | Returns: 280 | [`~schedulers.scheduling_flow_match_pingpong.FlowMatchPingPongSchedulerOutput`] or `tuple`: 281 | If return_dict is `True`, [`~schedulers.scheduling_flow_match_pingpong.FlowMatchPingPongSchedulerOutput`] is 282 | returned, otherwise a tuple is returned where the first element is the sample tensor. 283 | """ 284 | 285 | def logistic_function(x, L=0.9, U=1.1, x_0=0.0, k=1): 286 | # L = Lower bound, U = Upper bound 287 | # x_0 = Midpoint (x corresponding to y = 1.0) 288 | # k = Steepness, can adjust based on preference 289 | device_ = None # only set when the input is a tensor 290 | 291 | if isinstance(x, torch.Tensor): 292 | device_ = x.device 293 | x = x.to(torch.float).cpu().numpy() 294 | 295 | new_x = L + (U - L) / (1 + np.exp(-k * (x - x_0))) 296 | 297 | if isinstance(new_x, np.ndarray) and device_ is not None: 298 | new_x = torch.from_numpy(new_x).to(device_) 299 | return new_x 300 | 301 | self.omega_bef_rescale = omega 302 | omega = logistic_function(omega, k=0.1) 303 | self.omega_aft_rescale = omega 304 | 305 | if ( 306 | isinstance(timestep, int) 307 | or isinstance(timestep, torch.IntTensor) 308 | or isinstance(timestep, torch.LongTensor) 309 | ): 310 | raise ValueError( 311 | ( 312 | "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" 313 | " `FlowMatchPingPongScheduler.step()` is not supported. Make sure to pass" 314 | " one of the `scheduler.timesteps` as a timestep." 315 | ), 316 | ) 317 | 318 | if self.step_index is None: 319 | self._init_step_index(timestep) 320 | 321 | # Upcast to avoid precision issues when computing prev_sample 322 | sample = sample.to(torch.float32) 323 | 324 | sigma = self.sigmas[self.step_index] 325 | sigma_next = self.sigmas[self.step_index + 1] 326 | 327 | denoised = sample - sigma * model_output 328 | noise = torch.empty_like(sample).normal_(generator=generator) 329 | prev_sample = (1 - sigma_next) * denoised + sigma_next * noise 330 | 331 | # Cast sample back to model compatible dtype 332 | prev_sample = prev_sample.to(model_output.dtype) 333 | 334 | # upon completion increase step index by one 335 | self._step_index += 1 336 | 337 | if not return_dict: 338 | return (prev_sample,) 339 | 340 | return FlowMatchPingPongSchedulerOutput(prev_sample=prev_sample) 341 | 342 | def __len__(self): 343 | return self.config.num_train_timesteps 344 | -------------------------------------------------------------------------------- /acestep/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/acestep/ui/__init__.py -------------------------------------------------------------------------------- /assets/ACE-Step_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/ACE-Step_framework.png -------------------------------------------------------------------------------- /assets/Logo_StepFun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/Logo_StepFun.png -------------------------------------------------------------------------------- /assets/acestep_tech_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/acestep_tech_report.pdf
-------------------------------------------------------------------------------- /assets/acestudio_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/acestudio_logo.png -------------------------------------------------------------------------------- /assets/application_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/application_map.png -------------------------------------------------------------------------------- /assets/audio2audio_ComfyUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/audio2audio_ComfyUI.png -------------------------------------------------------------------------------- /assets/audio2audio_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/audio2audio_demo.gif -------------------------------------------------------------------------------- /assets/cpu_offload_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/cpu_offload_performance.png -------------------------------------------------------------------------------- /assets/demo_interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/demo_interface.png -------------------------------------------------------------------------------- /assets/orgnization_logos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/orgnization_logos.png -------------------------------------------------------------------------------- /assets/rap_machine_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/rap_machine_demo.gif -------------------------------------------------------------------------------- /assets/train_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/assets/train_demo.gif -------------------------------------------------------------------------------- /colab_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "private_outputs": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 
32 | "source": [ 33 | "# Install" 34 | ], 35 | "metadata": { 36 | "id": "_sjfo37-gDQV" 37 | } 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "id": "0W0bvPq1df_a" 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "#!pip uninstall ace-step -y\n", 48 | "!pip install --upgrade git+https://github.com/ace-step/ACE-Step.git\n", 49 | "import os\n", 50 | "os.environ['ACE_PIPELINE_DTYPE'] = 'float16'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "source": [ 56 | "# Import Model From GDrive (Optional)" 57 | ], 58 | "metadata": { 59 | "id": "2FQ5E6MvgJ09" 60 | } 61 | }, 62 | { 63 | "cell_type": "code", 64 | "source": [ 65 | "from google.colab import drive\n", 66 | "drive.mount('/gdrive')\n", 67 | "!unzip /gdrive/MyDrive/acestep/checkpoints.zip -d /unzip" 68 | ], 69 | "metadata": { 70 | "id": "QZjFgQxGgOdc" 71 | }, 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "source": [ 78 | "# Run Interface" 79 | ], 80 | "metadata": { 81 | "id": "TYaWXOLcgO4A" 82 | } 83 | }, 84 | { 85 | "cell_type": "code", 86 | "source": [ 87 | "torch_compile = True # @param {type: \"boolean\"}\n", 88 | "cpu_offload = False # @param {type: \"boolean\"}\n", 89 | "overlapped_decode = True # @param {type: \"boolean\"}\n", 90 | "#bf16 = True # @param {type: \"boolean\"}\n", 91 | "\n", 92 | "!acestep --checkpoint_path /unzip/checkpoints/ --port 7865 --device_id 0 --share true --torch_compile {torch_compile} --cpu_offload {cpu_offload} --overlapped_decode {overlapped_decode}" 93 | ], 94 | "metadata": { 95 | "id": "Q9S6FxllgPHw" 96 | }, 97 | "execution_count": null, 98 | "outputs": [] 99 | } 100 | ] 101 | } -------------------------------------------------------------------------------- /config/zh_rap_lora_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "r": 256, 3 | "lora_alpha": 32, 4 | "target_modules": [ 5 | "speaker_embedder", 6 | "linear_q", 7 | "linear_k", 8 | "linear_v", 9 | "to_q", 10 | "to_k", 11 | "to_v", 12 | "to_out.0" 13 | ], 14 | "use_rslora": true 15 | } -------------------------------------------------------------------------------- /convert2hf_dataset.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset 2 | from pathlib import Path 3 | import os 4 | 5 | def create_dataset(data_dir="./data", repeat_count=2000, output_name="zh_lora_dataset"): 6 | data_path = Path(data_dir) 7 | all_examples = [] 8 | 9 | for song_path in data_path.glob("*.mp3"): 10 | prompt_path = str(song_path).replace(".mp3", "_prompt.txt") 11 | lyric_path = str(song_path).replace(".mp3", "_lyrics.txt") 12 | try: 13 | assert os.path.exists(prompt_path), f"Prompt file {prompt_path} does not exist." 14 | assert os.path.exists(lyric_path), f"Lyrics file {lyric_path} does not exist." 
15 | with open(prompt_path, "r", encoding="utf-8") as f: 16 | prompt = f.read().strip() 17 | 18 | with open(lyric_path, "r", encoding="utf-8") as f: 19 | lyrics = f.read().strip() 20 | 21 | keys = song_path.stem 22 | example = { 23 | "keys": keys, 24 | "filename": str(song_path), 25 | "tags": prompt.split(", "), 26 | "speaker_emb_path": "", 27 | "norm_lyrics": lyrics, 28 | "recaption": {} 29 | } 30 | all_examples.append(example) 31 | except AssertionError as e: 32 | continue 33 | 34 | # repeat specified times 35 | ds = Dataset.from_list(all_examples * repeat_count) 36 | ds.save_to_disk(output_name) 37 | 38 | import argparse 39 | 40 | def main(): 41 | parser = argparse.ArgumentParser(description="Create a dataset from audio files.") 42 | parser.add_argument("--data_dir", type=str, default="./data", help="Directory containing the audio files.") 43 | parser.add_argument("--repeat_count", type=int, default=1, help="Number of times to repeat the dataset.") 44 | parser.add_argument("--output_name", type=str, default="zh_lora_dataset", help="Name of the output dataset.") 45 | args = parser.parse_args() 46 | 47 | create_dataset(data_dir=args.data_dir, repeat_count=args.repeat_count, output_name=args.output_name) 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /data/test_track_001.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/data/test_track_001.mp3 -------------------------------------------------------------------------------- /data/test_track_001_lyrics.txt: -------------------------------------------------------------------------------- 1 | [Intro] 2 | "System booting... 语言 模型 loading..." 3 | 4 | [Verse 1] 5 | 硅谷 那个 coder 调试 neural network 6 | 北京 的 极客 训练 A I 写 report 7 | 不同 架构 的 chip 不同 算法 的 war 8 | 屏幕上 跑的 全是 machine learning (learning) 9 | 10 | [Bridge] 11 | 多少年 我们 chase 摩尔 定律 的 trend (yeah) 12 | 这两年 换他们 study 中文 N L P 13 | Convolution L S T M 14 | 好烧脑 的 backprop 好暴力 的 big data 15 | 16 | [Verse 2] 17 | Python 强 say加加 刚 Python 调用 C++ 的 A P I 18 | say加加 嫌 Python 太 slow Python 笑 C++ 太 hardcore 19 | L L V M 默默 generate 中间 code 20 | 到底 interpreter 还是 compiler 屌? 21 | 22 | [Verse 3] 23 | P M 和 engineer 24 | 白板 画满 flow chart 服务器 闪着 red light 25 | P M 说 add feature engineer 说 no way 26 | 需求 变更 code 重构 27 | 不知 是 P M 太 fly 还是 deadline 太 high 28 | 29 | [Chorus] 30 | 全世界 都在 train neural network 31 | Transformer 的 paper 越来越 难 go through 32 | 全世界 都在 tune 超参数 33 | 我们 写的 bug 让 G P U 都 say no 34 | 35 | [Verse 4] 36 | 柏林 hackathon demo blockchain contract 37 | 上海 的 dev 用 federated learning 破 data wall 38 | 各种 语言 的 error 各种 框架 的 doc 39 | terminal 里 滚的 全是 dependency 冲突 40 | 41 | [Bridge] 42 | 曾以为 English 才是 coding 的 language (yeah) 43 | 直到见 G P T 用 文言文 generate 正则 expression 44 | Gradient explode 45 | 好硬核 的 prompt 好头秃 的 debug road 46 | 47 | [Verse 5] 48 | 有个 bug 叫 quantum 49 | 测试 环境 run perfect 上线 立即就 crash 50 | 查 log 看 monitor 发现是 thread 不同步 51 | 改 sync 加 lock 慢 deadlock 更难办 52 | 量子 computer 也解不开 这 chaos chain 53 | 54 | [Verse 6] 55 | 你说 996 我说 007 56 | 你说 福报 我说 burnout 57 | Product 要 agile Boss 要 KPI 58 | Code 要 elegant deadline 是 tomorrow 59 | 不如 直接 script 自动 submit 离职信 60 | 61 | [Outro] 62 | "Warning: 内存 leak...core dumping..." 
63 | 全世界 都在 train neural network (neural network) 64 | Loss 还没 converge 天已经亮 65 | 全世界 都在 tune 超参数 66 | 我们 写的 code (让它) 让 world (reboot) 都 reboot 无效 -------------------------------------------------------------------------------- /data/test_track_001_prompt.txt: -------------------------------------------------------------------------------- 1 | articulate, spoken word, young adult, rap music, female, clear, energetic, warm -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | 2 | services: 3 | &name ace-step: 4 | build: 5 | context: https://github.com/ace-step/ACE-Step.git 6 | dockerfile: Dockerfile 7 | container_name: *name 8 | hostname: *name 9 | stop_grace_period: 2s 10 | ports: 11 | - "7865:7865" 12 | volumes: 13 | - ./checkpoints:/app/checkpoints 14 | - ./outputs:/app/outputs 15 | - ./logs:/app/logs 16 | environment: 17 | - ACE_OUTPUT_DIR=/app/outputs 18 | # command: python app.py --server_name 0.0.0.0 --port 7865 --share False --bf16 True --torch_compile True --device-id 0 19 | healthcheck: 20 | test: ["CMD", "curl", "-f", "http://localhost:7865/"] 21 | interval: 60s 22 | timeout: 10s 23 | retries: 30 24 | start_period: 3s 25 | restart: unless-stopped 26 | runtime: nvidia 27 | deploy: 28 | resources: 29 | reservations: 30 | devices: 31 | - driver: nvidia 32 | count: all 33 | capabilities: ["compute", "utility", "graphics", "video"] 34 | -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426071706_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "pop, rap, electronic, blues, hip-house, rhythm and blues", 3 | "lyrics": "[verse]\n我走过深夜的街道\n冷风吹乱思念的漂亮外套\n你的微笑像星光很炫耀\n照亮了我孤独的每分每秒\n\n[chorus]\n愿你是风吹过我的脸\n带我飞过最远最遥远的山间\n愿你是风轻触我的梦\n停在心头不再飘散无迹无踪\n\n[verse]\n一起在喧哗避开世俗的骚动\n独自在天台探望月色的朦胧\n你说爱像音乐带点重节奏\n一拍一跳让我忘了心的温度多空洞\n\n[bridge]\n唱起对你的想念不隐藏\n像诗又像画写满藏不了的渴望\n你的影子挥不掉像风的倔强\n追着你飞扬穿越云海一样泛光\n\n[chorus]\n愿你是风吹过我的手\n暖暖的触碰像春日细雨温柔\n愿你是风盘绕我的身\n深情万万重不会有一天走远走\n\n[verse]\n深夜的钢琴弹起动人的旋律\n低音鼓砸进心底的每一次呼吸\n要是能将爱化作歌声传递\n你是否会听见我心里的真心实意", 4 | "audio_duration": 170.63997916666668, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 3.191075086593628, 19 | "diffusion": 17.459356784820557, 20 | "latent2audio": 1.7095518112182617 21 | }, 22 | "actual_seeds": [ 23 | 3299954530 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426071812_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "country rock, folk rock, southern rock, bluegrass, country pop", 3 | "lyrics": "[verse]\nWoke up to the sunrise glow\nTook my heart and hit the road\nWheels hummin' the only tune I know\nStraight to where the wildflowers grow\n\n[verse]\nGot that old map all wrinkled and torn\nDestination unknown but I'm reborn\nWith a smile that the wind has worn\nChasin' dreams that can't be sworn\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview 
rhyme\nHeart's racing as we chase the time\n\n[verse]\nMet a girl with a heart of gold\nTold stories that never get old\nHer laugh like a tale that's been told\nA melody so bold yet uncontrolled\n\n[bridge]\nClouds roll by like silent ghosts\nAs we drive along the coast\nWe toast to the days we love the most\nFreedom's song is what we post\n\n[chorus]\nRidin' on a highway to sunshine\nGot my shades and my radio on fine\nLeave the shadows in the rearview rhyme\nHeart's racing as we chase the time", 4 | "audio_duration": 224.23997916666667, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 4.262240648269653, 19 | "diffusion": 15.380569219589233, 20 | "latent2audio": 2.3227272033691406 21 | }, 22 | "actual_seeds": [ 23 | 401640 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426072346_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "hip-house, funk", 3 | "lyrics": "[verse]\n哎呀跳起来,脚尖踩节拍 (oo-yeah!)\n灯光闪烁像星星盛开 (uh-huh!)\n人人都醒来,把烦恼踹开 (get it!)\n热血沸腾,汗水自己安排\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[bridge]\n咚咚鼓声啊,让你的灵魂起飞 (woo!)\n手心拍一拍,能量翻倍 (ah-hah!)\n键盘响起来,如宇宙的交汇 (oh yeah!)\n就是这感觉,兄弟姐妹都陶醉\n\n[verse]\n灵魂从不睡,只想继续燃烧 (woo!)\n节奏像热浪,席卷这街道 (ow!)\n大伙儿涌上楼台,满面微笑 (yeah!)\n这一刻属于我们,无可替代\n\n[chorus]\n嘿,你还等啥?快抓住节拍 (come on!)\n光芒指引,让心都不存在 (whoa!)\n点燃热火,我们一起飙high (let’s go!)\n跳入午夜的狂欢时代\n\n[verse]\n世界多精彩,握紧把它打开 (alright!)\n每一步都像星球在摇摆 (uh-huh!)\n无边无际的律动像大海 (oo-yeah!)\n跟着光芒之舞,一起澎湃", 4 | "audio_duration": 204.19997916666668, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.05196118354797363, 19 | "diffusion": 15.530808210372925, 20 | "latent2audio": 2.5604095458984375 21 | }, 22 | "actual_seeds": [ 23 | 401640 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426072508_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "funk, pop, soul, rock, melodic, guitar, drums, bass, keyboard, percussion, 105 BPM, energetic, upbeat, groovy, vibrant, dynamic", 3 | "lyrics": "[verse]\nNeon lights they flicker bright\nCity hums in dead of night\nRhythms pulse through concrete veins\nLost in echoes of refrains\n\n[verse]\nBassline groovin' in my chest\nHeartbeats match the city's zest\nElectric whispers fill the air\nSynthesized dreams everywhere\n\n[chorus]\nTurn it up and let it flow\nFeel the fire let it grow\nIn this rhythm we belong\nHear the night sing out our song\n\n[verse]\nGuitar strings they start to weep\nWake the soul from silent sleep\nEvery note a story told\nIn this night we’re bold and gold\n\n[bridge]\nVoices blend in harmony\nLost in pure cacophony\nTimeless echoes timeless cries\nSoulful shouts beneath the 
skies\n\n[verse]\nKeyboard dances on the keys\nMelodies on evening breeze\nCatch the tune and hold it tight\nIn this moment we take flight", 4 | "audio_duration": 178.87997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.02882218360900879, 19 | "diffusion": 16.91233205795288, 20 | "latent2audio": 1.7794082164764404 21 | }, 22 | "actual_seeds": [ 23 | 401640 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426073829_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "electronic rap", 3 | "lyrics": "[verse]\nWaves on the bass, pulsing in the speakers,\nTurn the dial up, we chasing six-figure features,\nGrinding on the beats, codes in the creases,\nDigital hustler, midnight in sneakers.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb.\n\n[verse]\nSynthesizers blaze, city lights a glow,\nRhythm in the haze, moving with the flow,\nSwagger on stage, energy to blow,\nFrom the blocks to the booth, you already know.\n\n[bridge]\nNight's electric, streets full of dreams,\nBass hits collective, bursting at seams,\nHustle perspective, all in the schemes,\nRise and reflective, ain't no in-betweens.\n\n[verse]\nVibin' with the crew, sync in the wire,\nGot the dance moves, fire in the attire,\nRhythm and blues, soul's our supplier,\nRun the digital zoo, higher and higher.\n\n[chorus]\nElectro vibes, hearts beat with the hum,\nUrban legends ride, we ain't ever numb,\nCircuits sparking live, tapping on the drum,\nLiving on the edge, never succumb.", 4 | "audio_duration": 221.42547916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.024875164031982422, 19 | "diffusion": 20.566852569580078, 20 | "latent2audio": 2.2281734943389893 21 | }, 22 | "actual_seeds": [ 23 | 401640 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426074037_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "electronic, house, electro house, synthesizer, drums, bass, percussion, fast, energetic, uplifting, exciting", 3 | "lyrics": "[verse]\n霓虹灯下我们追逐\n人群跃动像潮水满布\n热浪袭来吹散孤独\n跳进节奏不如停下脚步\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦\n\n[verse]\n天空翻滚黑云入夜\n每颗星星像音乐律贴\n耳边回响那低音线\n环绕耳际如梦境般甜\n\n[pre-chorus]\n脚尖触电快点感受\n迎着风声释放自由\n心跳节拍配合节奏\n一切烦恼请靠边游\n\n[chorus]\n夏夜狂奔没有尽头\n星光闪烁舞池不朽\n尽情挥洒所有节奏\n无边热情把你包裹哦", 4 | "audio_duration": 221.47997916666668, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | 
"guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.028400182723999023, 19 | "diffusion": 13.195815324783325, 20 | "latent2audio": 2.1679723262786865 21 | }, 22 | "actual_seeds": [ 23 | 3440445703 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426074214_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "synth-pop, electronic, pop, synthesizer, drums, bass, piano, 128 BPM, energetic, uplifting, modern", 3 | "lyrics": "[verse]\nWoke up in a city that's always alive\nNeon lights they shimmer they thrive\nElectric pulses beat they drive\nMy heart races just to survive\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[verse]\nLost in the labyrinth of screens\nVirtual love or so it seems\nIn the night the city gleams\nDigital faces haunted by memes\n\n[chorus]\nOh electric dreams they keep me high\nThrough the wires I soar and fly\nMidnight rhythms in the sky\nElectric dreams together we’ll defy\n\n[bridge]\nSilent whispers in my ear\nPixelated love serene and clear\nThrough the chaos find you near\nIn electric dreams no fear\n\n[verse]\nBound by circuits intertwined\nLove like ours is hard to find\nIn this world we’re truly blind\nBut electric dreams free the mind", 4 | "audio_duration": 221.27997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.025463581085205078, 19 | "diffusion": 15.243804454803467, 20 | "latent2audio": 2.170398473739624 21 | }, 22 | "actual_seeds": [ 23 | 3400270027 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426074413_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "Cuban music, salsa, son, Afro-Cuban, traditional Cuban", 3 | "lyrics": "[verse]\nSun dips low the night ignites\nBassline hums with gleaming lights\nElectric guitar singing tales so fine\nIn the rhythm we all intertwine\n\n[verse]\nDrums beat steady calling out\nPercussion guides no room for doubt\nElectric pulse through every vein\nDance away every ounce of pain\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb\n\n[bridge]\nStars above they start to glow\nEchoes of the night's soft glow\nElectric strings weave through the air\nIn this moment none compare\n\n[verse]\nHeartbeats sync with every tone\nLost in music never alone\nElectric tales of love and peace\nIn this groove we find release\n\n[chorus]\nFeel the rhythm feel the flow\nLet the music take control\nBassline deep electric hum\nIn this night we're never numb", 4 | "audio_duration": 208.27997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | 
"guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.026132583618164062, 19 | "diffusion": 15.139378070831299, 20 | "latent2audio": 2.2071540355682373 21 | }, 22 | "actual_seeds": [ 23 | 3358899399 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426075107_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "pop, piano, rap, dark, atmospheric", 3 | "lyrics": "[verse]\n月光爬上窗 染白冷的床\n心跳的方向 带我入迷惘\n黑夜吞噬光 命运的纸张\n爱是血色霜 邪恶又芬芳\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[verse]\n心锁在门外 谁会解开关怀\n温柔的手拍 藏着冷酷杀害\n思绪如尘埃 撞击爱的霹雳\n灵魂的独白 为你沾满血迹\n\n[bridge]\n你是噩梦的歌唱 是灵魂的捆绑\n绝望中带着光 悬崖边的渴望\n心跳被你鼓掌 恶魔也痴痴想\n渐渐没了抵抗 古老诡计流淌\n\n[chorus]\n你是猎人的欲望 我是迷途的小羊\n深陷你眼眸的荒 唐突献出心脏\n我在夜里回荡 是谁给我希望\n黑暗风中飘荡 假装不再受伤\n\n[outro]\n爱如月黑无光 渗进梦的战场\n逃入无声的场 放手或心嚷嚷\n隐秘的极端 爱是极致风浪\n灵魂彻底交偿 你是终极虚妄", 4 | "audio_duration": 146.91997916666668, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.03876018524169922, 19 | "diffusion": 15.962624549865723, 20 | "latent2audio": 1.4594337940216064 21 | }, 22 | "actual_seeds": [ 23 | 2065110378 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426075537_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "surf music", 3 | "lyrics": "[verse]\nSunshine on the boulevard the beach is calling loud\nWaves are dancing golden sand under a cotton cloud\nElectric heartbeat pounding fast the tide is on our side\nCatch a wave and feel alive we’ll take it for a ride\n\n[verse]\nPalm trees swaying left to right they know where we belong\nFeel the rhythm of the night it keeps us moving strong\nSea spray kisses salty air we’re flying with the breeze\nChampagne states of mind we ride we do just as we please\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn\n\n[bridge]\nMoonlight paints the ocean blue reflections in our eyes\nStars align to light our path we’re surfing through the skies\nEvery moment like a song we sing it loud and clear\nEvery day’s a new adventure with you always near\n\n[verse]\nNeon lights and city sounds they blend with ocean views\nWe’re unstoppable tonight no way that we can lose\nDreams are written in the sand they sparkle in the sun\nTogether we’re a masterpiece our story’s just begun\n\n[chorus]\nWe’re riding waves of life together hand in hand\nWith every beat we chase the beat it’s our own wonderland\nFeel the music take you higher as the shorelines blur\nThis is our world our endless summer as we live and learn", 4 | "audio_duration": 236.55997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 
11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.033666133880615234, 19 | "diffusion": 16.291455507278442, 20 | "latent2audio": 2.3726775646209717 21 | }, 22 | "actual_seeds": [ 23 | 508630535 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426075843_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "alternative rock, pop, rock", 3 | "lyrics": "[verse]\nBright lights flashing in the city sky\nRunning fast and we don't know why\nElectric nights got our hearts on fire\nChasing dreams we'll never tire\n\n[verse]\nGrit in our eyes wind in our hair\nBreaking rules we don't even care\nShouting loud above the crowd\nLiving life like we're unbowed\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[verse]\nPiercing through like a lightning strike\nEvery moment feels like a hike\nDaring bold never backing down\nKings and queens without a crown\n\n[chorus]\nRunning wild in the night so free\nFeel the beat pumping endlessly\nHearts collide in the midnight air\nWe belong we don't have a care\n\n[bridge]\nClose your eyes let your spirit soar\nWe are the ones who wanted more\nBreaking chains of the mundane\nIn this world we'll make our claim", 4 | "audio_duration": 202.19997916666668, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.02512216567993164, 19 | "diffusion": 18.860822677612305, 20 | "latent2audio": 2.0361969470977783 21 | }, 22 | "actual_seeds": [ 23 | 1255121549 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426080234_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "rock, hip - hop, orchestral, bass, drums, electric guitar, piano, synthesizer, violin, viola, cello, fast, energetic, motivational, inspirational, empowering", 3 | "lyrics": "### **[Intro – Spoken]** \n*\"The streets whisper, their echoes never fade. \nEvery step I take leaves a mark—this ain't just a game.\"* \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 1]** \nCold nights, empty pockets, dreams laced with fight, \nEvery loss made me sharper, cut deep like a knife. \nThey said I wouldn’t make it, now they watch in despair, \nFrom the curb to the throne, took the pain, made it rare. \nEvery siren’s a melody, every alley holds a tale, \nRose from the shadows, left my name on the trail. \nStreetlights flicker like warnings in the haze, \nBut I move like a phantom, unfazed by the blaze. \n\n### **[Hook/Chorus]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. 
\nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Verse 2]** \nBarbed wire fences couldn't lock in my mind, \nEvery cage they designed, I left broken behind. \nThey want control, but I’m destined to roam, \nWhere the lost find their voice, where the heart sets the tone. \nSteel and concrete, where the lessons run deep, \nEvery crack in the pavement tells a story of heat. \nBut I rise, undefeated, like a king with no throne, \nWriting scripts in the struggle, my legacy’s stone. \n\n### **[Bridge]** \nFeel the rhythm of the underground roar, \nEvery wound tells a story of the battles before. \nBlood, sweat, and echoes fill the cold midnight, \nBut we move with the fire—unshaken, upright. \n\n### **[Verse 3]** \nNo regrets, no retreat, this game has no pause, \nEvery step that I take is a win for the lost. \nI took lessons from hustlers, wisdom from pain, \nNow the echoes of struggle carve power in my name. \nThey built walls, but I walk through the cracks, \nTurned dirt into gold, never looked back. \nThrough the struggle we rise, through the fire we claim, \nThis is more than just music—it's life in the frame. \n\n### **[Hook/Chorus – Reprise]** \nBorn in the chaos, I weather the storm, \nRising from ashes where warriors are born. \nChains couldn't hold me, the system’s a maze, \nI rewrite the rules, set the city ablaze! \n\n### **[Outro – Spoken]** \n*\"The scars, the struggle, the grind—it’s all part of the rhythm. \nWe never break, we never fold. We rise.\"*", 4 | "audio_duration": 153.95997916666667, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.04368758201599121, 19 | "diffusion": 17.16369390487671, 20 | "latent2audio": 1.5405471324920654 21 | }, 22 | "actual_seeds": [ 23 | 2659225017 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426080407_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "tango finlandés, campanas, disco, dark pop, electro, guitarra clásica, corridos tumba", 3 | "lyrics": "[inst]", 4 | "audio_duration": 162.79997916666667, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.011058568954467773, 19 | "diffusion": 9.924944400787354, 20 | "latent2audio": 1.6034839153289795 21 | }, 22 | "actual_seeds": [ 23 | 780297686 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426080601_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "Nightclubs, dance parties, workout playlists, radio broadcasts", 3 | "lyrics": "Burning in motion, set me alight!\nEvery heartbeat turns into a fight!\nCaged in rhythm, chained in time!\nLove’s a battle— You're Mine! 
You're Mine!", 4 | "audio_duration": 221.83997916666667, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.012485980987548828, 19 | "diffusion": 14.345409154891968, 20 | "latent2audio": 2.174558639526367 21 | }, 22 | "actual_seeds": [ 23 | 1318394052 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426081134_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "melancholic, world, sad, medieval, soulful", 3 | "lyrics": "[Verse]\nIn a world so grand he roams the skies alone\nHis heart a heavy stone a tale untold\nWhispers of his past echo through the night\nA lonely dragon searching for the light\n\n[Verse 2]\nOnce a mighty force now he drifts in pain\nHis scales once shimmered now they're dark with shame\nCast out by his kin in shadows he does hide\nA haunting sorrow burns deep inside\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day\n\n[Bridge]\nThe world turns cold the nights grow long\nIn his heart he carries an ancient song\nOf battles fought and love long gone\nA legend now but his soul is torn\n\n[Verse 3]\nHoping for a day he'll find a kindred soul\nTo share his pain and make him whole\nTill then he drifts a shadow in the sky\nA lonely dragon with tears in his eye\n\n[Chorus]\nRoaming endless fields with no friend in sight\nHis roar a mournful cry beneath the moon's pale light\nTears fall like stars as he flies on his way\nA lonely dragon yearning for the break of day", 4 | "audio_duration": 239.99997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.029100656509399414, 19 | "diffusion": 22.503791570663452, 20 | "latent2audio": 2.3603708744049072 21 | }, 22 | "actual_seeds": [ 23 | 2166832218 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426091716_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted", 3 | "lyrics": "[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge]\n目を閉じて、くるっと背を向けて、\n何も見なかったフリするから、\n怒らないで… 許してよ…\n\n[Chorus]\nねぇ、顔が赤いよ?\nどうしたの? 
熱があるの?\nそれとも怒ってるの?\nねぇ、言ってよ!\n\nどうしてそんな目で見るの?\n私、悪いことした?\n何か間違えたの?\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge 2]\n待って、もし私が悪いなら、\nごめんなさいって言うから、\nアイスクリームあげるから、\nもう怒らないで?\n\nOoooh… 言ってよ!", 4 | "audio_duration": 160, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.0282442569732666, 19 | "diffusion": 12.104875326156616, 20 | "latent2audio": 1.587641954421997 21 | }, 22 | "actual_seeds": [ 23 | 4028738662 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426092025_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "dark, death rock, metal, hardcore, electric guitar, powerful, bass, drums, 110 bpm, G major", 3 | "lyrics": "[Verse]\nMy lovers betray me\nThe snake in my garden is hissing\nIn the air is the sweetness of roses\nAnd under my skin\nThere's a thorn\n\n[Verse 2]\nI should have known\nThat God sends his angel in shadows\nWith blood in his veins\nI watch the enemy\nGivin' me the hand of my savior\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me\n\n[Verse 3]\nWith sugar and spice\nIt's hard to ignore the nostalgia\nWith the men on their knees\nAt the gates of my heart\nHow they beg me\n\n[Verse 4]\nThey say\n\"No one will ever love you\nThe way that I do\nNo one will ever touch you\nThe way that I do\"\n\n[Chorus]\nAnd I can't love again\nWith the echo of your name in my head\nWith the demons in my bed\nWith the memories\nYour ghost\nI see it\n'Cause it comes to haunt me\nJust to taunt me\nIt comes to haunt me\nJust to taunt me", 4 | "audio_duration": 174.27997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 3.8372838497161865, 19 | "diffusion": 13.039669275283813, 20 | "latent2audio": 1.7923030853271484 21 | }, 22 | "actual_seeds": [ 23 | 4064916393 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426093007_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "aggressive, Heavy Riffs, Blast Beats, Satanic Black Metal", 3 | "lyrics": "[verse]\nFloating through the galaxy on a midnight ride\nStars are dancing all around in cosmic tides\nFeel the pulse of space and time beneath our feet\nEvery beat a heartbeat in this endless suite\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free\n\n[verse]\nPlanetary whispers in the sky tonight\nEvery constellation's got a secret sight\nDistant worlds and moons we have yet to see\nIn the void of space where we can just 
be\n\n[bridge]\nAsteroids and comets in a ballet they spin\nLost in the rhythm of where our dreams begin\nClose your eyes and let the synths take flight\nWe're voyagers on an electric night\n\n[verse]\nLet the piano keys unlock the stars above\nEvery chord a memory every note is love\nIn this synth symphony we find our grace\nDrifting forever in this boundless space\n\n[chorus]\nGalactic dreams under neon lights\nSailing through the velvet nights\nWe are echoes in a cosmic sea\nIn a universe where we are free", 4 | "audio_duration": 181.99997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.025065898895263672, 19 | "diffusion": 17.176705837249756, 20 | "latent2audio": 1.8225171566009521 21 | }, 22 | "actual_seeds": [ 23 | 1132623236 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/default/input_params/output_20250426093146_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "prompt": "r&b, soul, funk/soul", 3 | "lyrics": "[verse]\nDancing through electric fires\nHeart is buzzing like live wires\nIn your arms I find desire\nFeel the beat as we get higher\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[verse]\nWhisper secrets that make me blush\nUnder the neon city hush\nYour touch gives me such a rush\nTurn it up we're feeling lush\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why\n\n[bridge]\nThrough the lights and the smoky haze\nI see you in a thousand ways\nLove's a script and we’re the play\nTurn the page stay till we sway\n\n[chorus]\nElectric love in the night sky\nWe’re gonna soar baby you and I\nDrop the bass let the rhythm fly\nFeel the heat and don't ask why", 4 | "audio_duration": 195.15997916666666, 5 | "infer_step": 60, 6 | "guidance_scale": 15, 7 | "scheduler_type": "euler", 8 | "cfg_type": "apg", 9 | "omega_scale": 10, 10 | "guidance_interval": 0.5, 11 | "guidance_interval_decay": 0, 12 | "min_guidance_scale": 3, 13 | "use_erg_tag": true, 14 | "use_erg_lyric": true, 15 | "use_erg_diffusion": true, 16 | "oss_steps": [], 17 | "timecosts": { 18 | "preprocess": 0.025553464889526367, 19 | "diffusion": 18.250118494033813, 20 | "latent2audio": 1.9400627613067627 21 | }, 22 | "actual_seeds": [ 23 | 2853131993 24 | ] 25 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512101839_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora", 3 | "task": "text2music", 4 | "prompt": "Rap, adult, male, spoken word, singing, bright, energetic, clear", 5 | "lyrics": "[Intro]\n他们说我来自阴影里\n说我的肤色是原罪的印记\n\n[Verse]\n眼神像刀子刮过 穿透我的皮肤\n带着审判和偏见 让我无处可逃处\n你没听过我的故事 没走过我的路\n凭什么就下一个判决 把我划出你的版图\n你说我威胁到你 抢走了你的机会\n可你可知我付出的 是你不敢想象的血泪\n被贴上标签 被区别对待\n呼吸都是错的 只因我生来就不一样态\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Verse]\n每一次努力争取 都会被审视被放大\n每一个细微的错误 
都变成攻击的靶\n他们选择性失明 看不见我的汗水\n只看见他们想看的 带着恶意的定位\n系统性的歧视 像一张无形的网\n把我困在原地 无法自由地翱翔\n他们在享受特权 却指责我的贫困\n嘲笑我的口音 我的名字 我的出身\n\n[Chorus]\n看不见的墙 把我阻隔在外面\n听不见的声音 屏蔽了我的呼唤\n他们制造偏见 他们散播谎言\n只因为我的存在 让他们觉得不安\n\n[Bridge]\n我不想寻求同情 只想被公平对待\n不想被定义被束缚 有选择自己未来的权利\n什么时候 才能放下心中的成见\n看到真正的我 而不是你脑海里的画面\n\n[Outro]\n画面... 不安...\n偏见... 歧视...\n什么时候能停止...", 6 | "audio_duration": 134.64, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.3, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.032018184661865234, 21 | "diffusion": 13.275121927261353, 22 | "latent2audio": 1.291429042816162 23 | }, 24 | "actual_seeds": [ 25 | 3826585269 26 | ], 27 | "retake_seeds": [ 28 | 2907904223 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512101839_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512114703_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora", 3 | "task": "text2music", 4 | "prompt": "Chorus Hook, Melodic Rap, Ambient Synth Pads, adult, rap, Very Fast, Storytelling, Chinese Rap, male, spoken word, bright, energetic, Melodic Flow, clear, clarity, 130 bpm", 5 | "lyrics": "[Intro]\n舌 头 打 结 了... 快 念 快 念...\n\n[Verse 1]\n这 个 赌 鬼 蹲 在 柜 台 啃 着 苦 瓜 干 快 很 干\n赌 桌 堆 满 骨 牌 古 怪 股 票 和 五 块 钢 镚 儿 钢 镚\n他 甩 出 扑 克 牌 啪 啪 啪 拍 扁 螃 蟹 壳 哦 壳 扁\n又 摸 摸 麻 将 摸 出 幺 鸡 摸 出 发 财 摸 出 一 条 蛇 蛇 蛇\n庄 家 咳 嗽 咳 破 锣 嗓 子 喊 开 开 开 快 开 开\n赌 鬼 咕 嘟 咕 嘟 灌 咖 啡 灌 到 筷 子 戳 穿 碗 快 戳 穿\n空 气 里 飘 着 锅 巴 味 混 合 隔 夜 的 酸 奶 罐 哦 酸\n输 光 裤 带 还 想 翻 盘 翻 成 煎 饼 摊 老 板 快 翻 盘\n\n[Chorus]\n赌 鬼 赌 鬼 哦 赌 鬼 赌 鬼 快 很 快\n舌 头 打 结 着 念 这 段 哦 这 段 绕 口 令 牌\n若 念 错 一 字 就 罚 你 哦 罚 你 吞 十 斤 海 带\n赌 场 规 矩 就 是 绕 晕 你 哦 绕 晕 你 快 很 快\n\n[Verse 2]\n他 掏 出 铜 板 抠 出 口 袋 最 后 一 颗 快 很 颗\n庄 家 哗 啦 哗 啦 摇 骰 子 摇 出 三 点 又 三 点 哦 三 点\n赌 鬼 急 得 咬 牙 切 齿 咬 到 舌 头 打 蝴 蝶 结 快 打 结\n还 想 押 上 祖 传 的 拖 鞋 拖 把 铁 锅 和 半 包 盐 盐 盐\n突 然 警 笛 嘀 嘟 嘀 嘟 吓 得 他 钻 进 垃 圾 罐 哦 垃 圾\n警 察 咔 嚓 咔 嚓 拍 照 拍 到 他 头 顶 菠 菜 叶 快 拍 照\n最 后 赌 鬼 蹲 监 狱 天 天 背 这 首 绕 口 令 哦 背 不 完\n若 背 错 一 句 就 加 刑 十 年 再 加 十 年 快 加 刑\n\n[Outro]\n舌 头 打 结 了... 赌 鬼 哭 了 哦...\n这 首 歌... 
绕 死 人 了 哦...", 6 | "audio_duration": 186.59997916666666, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.7, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.03011012077331543, 21 | "diffusion": 21.696259260177612, 22 | "latent2audio": 1.7648537158966064 23 | }, 24 | "actual_seeds": [ 25 | 3776541388 26 | ], 27 | "retake_seeds": [ 28 | 4274500599 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512114703_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512115409_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora", 3 | "task": "text2music", 4 | "prompt": "electronic, hip-hop, rap, synthesizer, drums, vocals, fast, energetic, modern, uplifting, young adult, male, spoken word, singing, bright, energetic, clear, 140 bpm, female", 5 | "lyrics": "[Verse 1]\n红鲤鱼绿鲤鱼,驴在河里追鲤鱼,\n驴追鲤鱼鱼躲驴,气得驴子直喘气。\n扁担长板凳宽,扁担绑在板凳边,\n扁担要绑板凳不让绑,扁担偏要绑上板凳面!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n红鲤鱼驴扁担板凳,\n一口气念完算你赢!\n\n[Verse 2]\n四是四十是十,十四是十四四十是四十,\n谁说四十是十四,舌头打结别放肆。\n黑化肥会挥发,灰化肥也发黑,\n化肥混一起,黑灰不分嘴发废!\n\n[Chorus]\n绕口令,练嘴皮,\n说快说慢别迟疑,\n四十十四化肥灰,\n念错罚你唱十回!\n\n[Bridge]\n坡上立着一只鹅,坡下流着一条河,\n鹅要过河河渡鹅,河要渡鹅鹅笑河——\n到底谁更啰嗦?!\n\n[Outro]\n嘴皮子功夫别小瞧,\n绕口令rap我最飙,\n下次挑战准备好,\n舌头打结别求饶!", 6 | "audio_duration": 123.2, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.7, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.026150941848754883, 21 | "diffusion": 12.212433099746704, 22 | "latent2audio": 1.1857895851135254 23 | }, 24 | "actual_seeds": [ 25 | 1415752189 26 | ], 27 | "retake_seeds": [ 28 | 685932970 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512115409_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora", 3 | "task": "text2music", 4 | "prompt": "singing, bright, slightly nasal, energetic, spoken word, young adult, male, rap 
music", 5 | "lyrics": "[Intro]\nYo, check it—speed demon, lyrical heat, uh!\nRatatat like a drum when the beat bumps, uh!\n\n[Verse 1]\nRapatapa tap tap, flash like a snap,\nRap tap tap, I don’t chat, I clap clap clap!\nFingers snap, flow don’t slack, rapataptaptap,\nSpit it fast, hit the gas, rap tap tap rap!\n\n[Pre-Chorus]\nBoom-bap, zoom past, leave ’em flat,\nRap taptaprapataptaptap—where ya at?\n\n[Chorus]\nRapatapa tap tap, yeah, I go brrrr,\nRap tap tap, make the crowd stir!\nRapataptaptap, no lag, just spit,\nRap taptaprapataptaptap—I’m lit!\n\n[Verse 2]\nTongue-twist, quick wrist, rapatapa boom,\nTap tap rap, leave ya stuck like glue-gum!\nNo slow-mo, turbo, rapataptaptap,\nRap tap rap, yeah, I clap clap clap!\n\n[Outro]\nRapatapa—TAP! Mic drop—that’s that.", 6 | "audio_duration": 60, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.018491744995117188, 21 | "diffusion": 8.084580898284912, 22 | "latent2audio": 0.5694489479064941 23 | }, 24 | "actual_seeds": [ 25 | 226581098 26 | ], 27 | "retake_seeds": [ 28 | 1603201617 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512120348_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512143242_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA", 3 | "task": "text2music", 4 | "prompt": "G-Funk, Hip Hop, Rap, Female Vocals, Melodic Rap, Summer, Laid-back Groove, Smooth Rhythm, Synthesizer Lead, Heavy Bassline, Groovy, West Coast Hip Hop", 5 | "lyrics": "(Intro)\nOh yeah... 
\n\n(Verse 1)\n阳光下,沙滩排球场,一个身影跳跃\n小麦色,运动背心,闪耀活力四射\n她跳起扣杀,动作利落又巧妙\n汗水浸湿发梢,笑容比阳光更美好\n摇摆的节奏,是她的背景配乐\n每一次移动,都踩在鼓点上那么和谐\n我不由自主地停下脚步\n目光被她紧紧锁住\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n想走上前去搭讪,嫌自己笨拙呆板\n这青春的气息,耀眼,灿烂!\n\n(Verse 3)\n她和队友击掌庆祝,笑声清脆悦耳\n拿起毛巾擦汗,不经意间瞥我一眼\n鼓起勇气走上前,假装问问时间\n她友好地回答,笑容灿烂没有敷衍\n聊了几句,发现彼此爱这摇摆音乐\n她眼中也闪过惊喜和亲切\n这共同点,让气氛变得融洽又热烈!\n夏天的故事,就这样开始了感觉真切!\n\n(Chorus)\n沙滩排球女孩, 摇摆节拍下的身材\n无忧无虑的笑容,把我的心都填满\n不再犹豫和等待,勇敢把脚步迈开\n这夏天的感觉,心跳,不断!", 6 | "audio_duration": 93.93038, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.03020024299621582, 21 | "diffusion": 9.942127704620361, 22 | "latent2audio": 0.9470341205596924 23 | }, 24 | "actual_seeds": [ 25 | 3826585299 26 | ], 27 | "retake_seeds": [ 28 | 2519711205 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512143242_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512145057_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "lyrical rap, young adult, female, rap flow, spoken word, ad-libs, bright, energetic, eat, Fast, Engaging, Energetic", 5 | "lyrics": "[Intro]\n扁擔寬 板凳長 扁擔想綁在板凳上\n扁擔寬 板凳長 扁擔想綁在板凳上\n\n[Verse]\n倫敦 瑪莉蓮 買了 件 旗袍 送 媽媽\n莫斯科 的 夫司基 愛上 牛肉 麵 疙瘩\n各種 顏色 的 皮膚 各種 顏色 的 頭髮\n嘴裡念的 說的 開始 流行 中國話 (中國話)\n\n[Bridge]\n多少年 我們 苦練 英文 發音 和 文法 (yeah)\n這幾年 換他們 捲著 舌頭 學 平上去入 的 變化\n平平 仄仄 平平 仄\n好聰明 的 中國人 好優美 的 中國話\n\n[Verse]\n扁擔寬 板凳長 扁擔想綁在板凳上\n板凳不讓扁擔綁在板凳上 扁擔偏要綁在板凳上\n板凳偏偏不讓扁擔綁在那板凳上\n到底扁擔寬 還是板凳長?\n\n[Verse]\n哥哥弟弟坡前坐\n坡上臥著一隻鵝 坡下流著一條河\n哥哥說 寬寬的河 弟弟說 白白的鵝\n鵝要過河 河要渡鵝\n不知是那鵝過河 還是河渡鵝\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[Verse]\n紐約蘇珊娜開了間禪風 lounge bar\n柏林來的沃夫岡拿胡琴配著電吉他\n各種顏色的皮膚 各種顏色的頭髮\n嘴裡念的 說的 開始流行中國話 (中國話)\n\n[Bridge]\n多少年我們苦練英文發音和文法 (yeah)\n這幾年換他們捲著舌頭學平上去入的變化\n仄仄平平仄仄平\n好聰明的中國人 好優美的中國話\n\n[Verse]\n有個小孩叫小杜 上街打醋又買布\n買了布 打了醋 回頭看見鷹抓兔\n放下布 擱下醋 上前去追鷹和兔\n飛了鷹 跑了兔 灑了醋 濕了布\n\n[Verse]\n嘴說腿 腿說嘴\n嘴說腿 愛跑腿\n腿說嘴 愛賣嘴\n光動嘴 不動腿\n光動腿 不動嘴\n不如不長腿和嘴\n到底是那嘴說腿 還是腿說嘴?\n\n[Chorus]\n全世界都在學中國話\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 讓世界都認真聽話\n\n[outro]\n全世界都在學中國話 (在學中國話)\n孔夫子的話 越來越國際化\n全世界都在講中國話\n我們說的話 (讓他) 讓世界 (認真) 都認真聽話", 6 | "audio_duration": 239.8355625, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.04363536834716797, 21 | "diffusion": 18.706920385360718, 22 | "latent2audio": 2.1645781993865967 23 | }, 24 | "actual_seeds": [ 25 | 2364345905 26 | ], 27 | "retake_seeds": [ 
28 | 2100914041 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512145057_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512152217_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap", 5 | "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球", 6 | "audio_duration": 239.8355625, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.05357813835144043, 21 | "diffusion": 25.644447326660156, 22 | "latent2audio": 2.1787476539611816 23 | }, 24 | "actual_seeds": [ 25 | 3246571430 26 | ], 27 | "retake_seeds": [ 28 | 1352325167 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512152217_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512153616_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "articulate, spoken word, young adult, warm, rap music, male, clear, street, dark, rap flow, hardcore rap, fast", 5 | "lyrics": "[verse]\n球场 的 橡胶味 弥漫 隔壁 是 健身房\n场 边上 的 老教练 战术 有 三套\n教 交叉 运球 的 大叔 会 欧洲步 耍 背后 传\n硬 身板 对抗 最 擅长 还 会 急停跳 后仰 投\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 
柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[chorus]\n看什么 看什么\n变速 突破 心 自在\n看什么 看什么\n假动作 晃 开 防守 来\n看什么 看什么\n每日 训练 绑 沙袋\n空中拉杆 莫 奇怪\n唰唰 入袋\n\n[verse]\n一个 试探 步后 一记 左 变向 右 变向\n一句 挑衅 我 的 人 别 嚣张\n一再 重演 一颗 我 不 投 的 球\n悬在 篮筐 上 它 一直 在 摇晃\n\n[chorus]\n看什么 看什么\n我 激活 小宇宙 来\n看什么 看什么\n菜鸟 新人 的 名号\n看什么 看什么\n已 被 我 一球 击倒\n\n[chorus]\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n篮球 之 人 切记 勇者 无惧\n是 谁 在 玩 花式 引爆 空气\n快 秀出 指尖 转球 砰砰 啪嗒\n快 秀出 指尖 转球 砰砰 啪嗒\n如果 我 有 滞空 逆天 补扣\n为人 热血 不怂 一生 傲骨 吼\n\n[verse]\n他们 徒弟 我 习惯 从小 就 耳濡目染\n什么 胯下 跟 变向 我 都 玩 的 有模有样\n什么 招式 最 喜欢 转身 过 人 柔中 带 刚\n想要 去 纽约 街头 斗 洛克 公园 场\n\n[outro]\n快 秀出 指尖 转球 砰\n快 秀出 指尖 转球 砰\n如果 我 有 滞空 吼\n为人 热血 不怂 一生 傲骨 吼\n快 秀出 指尖 转球 砰\n我 用 背传 助攻 吼\n压哨 的 三分 球", 6 | "audio_duration": 183.23, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.046170711517333984, 21 | "diffusion": 14.21678113937378, 22 | "latent2audio": 2.685957193374634 23 | }, 24 | "actual_seeds": [ 25 | 3072005931 26 | ], 27 | "retake_seeds": [ 28 | 562842491 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512153616_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512154907_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "articulate, spoken word, young adult, rap music, female, clear, energetic, warm", 5 | "lyrics": "[Intro]\n\"System booting... 
语言 模型 loading...\"\n\n[Verse 1]\n硅谷 那个 coder 调试 neural network\n北京 的 极客 训练 A I 写 report\n不同 架构 的 chip 不同 算法 的 war\n屏幕上 跑的 全是 machine learning (learning)\n\n[Bridge]\n多少年 我们 chase 摩尔 定律 的 trend (yeah)\n这两年 换他们 study 中文 N L P\nConvolution L S T M\n好烧脑 的 backprop 好暴力 的 big data\n\n[Verse 2]\nPython 强 say加加 刚 Python 调用 C++ 的 A P I\nsay加加 嫌 Python 太 slow Python 笑 C++ 太 hardcore\nL L V M 默默 generate 中间 code\n到底 interpreter 还是 compiler 屌?\n\n[Verse 3]\nP M 和 engineer\n白板 画满 flow chart 服务器 闪着 red light\nP M 说 add feature engineer 说 no way\n需求 变更 code 重构\n不知 是 P M 太 fly 还是 deadline 太 high\n\n[Chorus]\n全世界 都在 train neural network\nTransformer 的 paper 越来越 难 go through\n全世界 都在 tune 超参数\n我们 写的 bug 让 G P U 都 say no\n\n[Verse 4]\n柏林 hackathon demo blockchain contract\n上海 的 dev 用 federated learning 破 data wall\n各种 语言 的 error 各种 框架 的 doc\nterminal 里 滚的 全是 dependency 冲突\n\n[Bridge]\n曾以为 English 才是 coding 的 language (yeah)\n直到见 G P T 用 文言文 generate 正则 expression\nGradient explode\n好硬核 的 prompt 好头秃 的 debug road\n\n[Verse 5]\n有个 bug 叫 quantum\n测试 环境 run perfect 上线 立即就 crash\n查 log 看 monitor 发现是 thread 不同步\n改 sync 加 lock 慢 deadlock 更难办\n量子 computer 也解不开 这 chaos chain\n\n[Verse 6]\n你说 996 我说 007\n你说 福报 我说 burnout\nProduct 要 agile Boss 要 KPI\nCode 要 elegant deadline 是 tomorrow\n不如 直接 script 自动 submit 离职信\n\n[Outro]\n\"Warning: 内存 leak...core dumping...\"\n全世界 都在 train neural network (neural network)\nLoss 还没 converge 天已经亮\n全世界 都在 tune 超参数\n我们 写的 code (让它) 让 world (reboot) 都 reboot 无效", 6 | "audio_duration": 179.12, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.062120914459228516, 21 | "diffusion": 13.499217987060547, 22 | "latent2audio": 1.6430137157440186 23 | }, 24 | "actual_seeds": [ 25 | 1637990575 26 | ], 27 | "retake_seeds": [ 28 | 101283039 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512154907_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512161832_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club, auto-tune, mumble rap, trap", 5 | "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 
来!\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草!\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来!", 6 | "audio_duration": 169.12, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.04321885108947754, 21 | "diffusion": 14.026689767837524, 22 | "latent2audio": 1.5587565898895264 23 | }, 24 | "actual_seeds": [ 25 | 1905941472 26 | ], 27 | "retake_seeds": [ 28 | 3018484796 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512161832_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512164224_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "四川话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright", 5 | "lyrics": "[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n老子 在 弄堂 斜起 走 想 拦路 的 先 报 名号\n我 早看透 你们 手抖 脚软\n只敢 网上 吠 现实 怂成 猫\n看 你们 混的 真 可怜 整天 蹲在 网吧 蹭 烟\n钱 赚不到 架 不敢打 还 学人 摆 大哥 脸\n\n[verse]\n叫 我 沪上 老 克勒 不是 拉菲 我 不 碰杯\n规矩 我 懒得 讲 太多 钞票 直接 拍 你 脸上 飞\n老子 耐心 差 门槛 高 你 找茬 等于 自 寻 烦恼\n要么 跪 要么 爬 最后 警告 只 说 一 遭\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[verse]\n古巴 雪茄 在 指间 绕 代表 魔都 格调 必须 顶\nOG 在 你 够不到 的 高度 My bro 永远 在 顶层 盯\nCheck my vibe 不靠 大 金劳 留声机 放 周璇 和 白光\n爹妈 太 宠你 养出 巨婴 症 早晚 社会 教你 做人 经\n\n[verse]\n玩 说唱 小囡 太 年轻 要 比 flow 先去 练 气功\n廿年 磨 枪 才 亮 锋芒 我 三十六 招 收 你 入 瓮\n老子 存在 就是 打假 标\n多少 人 眼红 又 不敢 挑\n键盘 侠 的 狠话 像 棉花 糖\n见 真人 秒变 Hello Kitty 叫\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗\n\n[chorus]\n黑 墨镜 金 链子 越 低调 越 霸气\n玩 街机 泡 吧里 再 野的 场子 都 不 怯气\n上海 滩 老 江湖 外滩 钟声 敲 胜负\n陆家嘴 黄浦江 财路 宽 给 你 开 扇窗", 6 | "audio_duration": 135.92, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.038518667221069336, 21 | "diffusion": 
16.47420620918274, 22 | "latent2audio": 2.5094873905181885 23 | }, 24 | "actual_seeds": [ 25 | 2159904788 26 | ], 27 | "retake_seeds": [ 28 | 2403013980 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512164224_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512171227_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "ACE-Step/ACE-Step-v1-chinese-rap-LoRA", 3 | "task": "text2music", 4 | "prompt": "Rap, Chinese Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear", 5 | "lyrics": "(Intro)\nLet's drift away...\n\n(Verse 1)\n现实是灰色的格子间,重复的工作,枯燥的报表 \n敲打着键盘,眼神却放空,意识早已挣脱了肉体的镣铐\n飘向窗外,飞过拥挤的街道,穿过云层,到达想象的群岛\n那里色彩斑斓,形状奇异,逻辑失效,一切都随心所欲地飘摇\n迷幻的鼓点,像心跳的变奏,忽快忽慢,难以预料\n抽象的采样,扭曲的人声,构建一个超现实的音景环绕\n我变成一只鸟,一条鱼,一束光,自由地变换形态和奔跑\n在这白日梦里,我无所不能,摆脱了所有现实的烦恼, feeling the afterglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道 \n迷幻嘻哈的节拍,是白日梦的引导 \n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n\n(Verse 2)\n会议室里老板在讲话,声音模糊,像隔着水听不清道\n我的思绪,早已潜入深海,与发光的水母一起舞蹈\n或者飞向外太空,在星云间穿梭,探索未知的星球和轨道\n现实的规则,在这里被打破,物理定律也失去效劳\n白日梦是我的避难所,是精神的氧气罩\n在乏味的现实里,为我注入一点色彩和奇妙\n虽然短暂,虽然虚幻,但它让我能够喘息,重新把能量找到\n然后回到现实,继续扮演那个,循规蹈矩的角色,把梦藏好, keep the dream aglow\n\n(Chorus)\n意识漫游,逃离乏味的轨道\n迷幻嘻哈的节拍,是白日梦的引导\n抽象的世界,逻辑被重新构造\nMind wandering free, where reality starts to fade slow\n", 6 | "audio_duration": 153.7148, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.04823446273803711, 21 | "diffusion": 13.158645629882812, 22 | "latent2audio": 1.493880033493042 23 | }, 24 | "actual_seeds": [ 25 | 2945962357 26 | ], 27 | "retake_seeds": [ 28 | 2676242300 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0.7, 32 | "guidance_scale_lyric": 1.5, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512171227_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512171809_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora", 3 | "task": "text2music", 4 | "prompt": "J-Pop, Anime, kawaii future bass, Femal vocals, EDM, Boombap, Aggressive, Intense, Crisp Snare, Super Fast, Rap", 5 | "lyrics": "[Intro]\nYo, 
这是来自深渊的怒吼\n\n[Verse]\n指尖飞快刷新,屏幕又亮起\n渴望那点赞,像致命的氧气\n精心修饰的脸庞,完美到诡异\n背后隐藏的疲惫,谁又会在意\n光鲜亮丽的橱窗,贩卖着焦虑\n每个人都在表演,戴着虚伪面具\n比较的游戏,让人逐渐窒息\n迷失在数据洪流,找不到自己\n\n[Chorus]\n这流量的时代,真假早已分不清\n盲目追随潮流,丢掉了初心\n为了那点虚荣,灵魂在沉沦\n看不见的锁链,捆绑每个灵魂\n\n[Verse]\n滤镜下的生活,美得不切实际\n营造虚假繁荣,掩盖内心空虚\n他人的光环下,显得自己多余\n嫉妒和自卑,交织成悲剧\n\n[Chorus]\n朋友圈里炫耀,现实中却叹气\n刷着别人的故事,忘记了呼吸\n算法推荐着你,想看的一切东西\n不知不觉间,你已不再是你\n他们说这是进步,我看是种病\n精神鸦片侵蚀,慢慢要了你的命\n\n[Bridge]\n屏幕亮了又暗,一天又过去\n究竟得到了什么,还是失去了自己\n那真实的连接,在何处寻觅\n困在这迷宫里,找不到出口的轨迹\n\n[Outro]\n我想挣脱,我想呼吸\n这虚拟的繁华,让我喘不过气\n谁能告诉我,这到底有什么意义\n一切都像泡沫,一触就破裂没余地", 6 | "audio_duration": 119.44348, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.04764962196350098, 21 | "diffusion": 10.94297981262207, 22 | "latent2audio": 1.1815783977508545 23 | }, 24 | "actual_seeds": [ 25 | 3826585273 26 | ], 27 | "retake_seeds": [ 28 | 2527594022 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512171809_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250512172941_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k", 3 | "task": "text2music", 4 | "prompt": "Hip Hop, Hi-hat Rolls, spoken word, Melodic Flow, articulate, Female Rap, 120 BPM, clear, warm, female, melodic Rap, adult, super fast", 5 | "lyrics": "[Verse 1]\n打南边来了个喇嘛,手里提拉着五斤鳎目,\n打北边来了个哑巴,腰里别着个喇叭。\n喇嘛想换哑巴的喇叭,哑巴摇头不说话,\n鳎目一甩像道闪电,喇叭一响震天涯!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 2]\n墙上一根钉,钉下绳摇晃,\n绳吊着瓶,瓶碰碎了光。\n灯骂瓶,瓶怪绳,绳怨钉,\n稀里哗啦,一场荒唐!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!\n\n[Verse 3]\n板凳宽,扁担长,\n一个偏要绑,一个偏不让。\n青龙洞里龙翻身,\n千年大梦变稻香!\n\n[Bridge]\n麻婆婆的狗,咬破麻叉口,\n麻线穿针眼,补丁也风流。\n左一句,右一句,\n舌头打结心自由!\n\n[Chorus]\n丁丁当当,乒乓乓乓,\n话赶话,舌绕梁,\n东边的钉,西边的墙,\n绕不完的弯,唱不完的慌!", 6 | "audio_duration": 214.12, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.031190156936645508, 21 | "diffusion": 20.130417823791504, 22 | "latent2audio": 1.9650826454162598 23 | }, 24 | "actual_seeds": [ 25 | 1946426111 26 | ], 27 | "retake_seeds": [ 28 | 331383387 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | 
"ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250512172941_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250513044511_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k", 3 | "task": "text2music", 4 | "prompt": "东北话, spoken word, male, Tempo - Fast, Elements - Chorus Hook, Subgenre-Satirical Hip Hop, Rap, Chinese-language music, energetic, slightly nasal, Instrument - Live Bass Guitar, adult, Vocals - Syncopated Flow, Genre - Hip-Hop, rapping, bright", 5 | "lyrics": "[verse]\n挣着 憋屈的 工资 还得 装乐呵\n猫着 怂样儿 还搁 朋友圈 嘚瑟\n扛着 傻逼的 指标 没人 搭把手\n这儿 不是 托儿所 少整 那出儿 哭唧尿嚎\n\n俺们 就像 一条条 老板的 裤衩子\n陪着 笑脸 接他 每一回 突突\n哎呦 老板 今儿个 穿我呗\n他 撅个腚 眼角 瞟你 那熊样\n\n[chorus]\n他们 骂我 打工仔 太多人 没睡醒\n寻思 抠搜 老板 一天天 穷折腾\n不想 俺的 人生 烂在 这嘎达\n不想 俺的 将来 折在 这破棚\n\n老子 不想 上班 老子 是外星人\n你都 把俺 骂急眼了 俺还 这么淡定\n现实 才是 梦 啥时候 能醒啊\n那 糟践人的 答案 在西北风 里飘\n\n[verse]\n瞅见 二愣子 同事 给老板 舔腚沟子\n瞅见 浪蹄子 女同事 在老板 胯骨轴 扭搭\n瞅见 白瞎的 光阴 耗在 没亮儿的 道儿\n瞅见 公交车上 一帮 僵尸 吐酸水\n\n瞅见 俺的 命 定在 苦逼的 坑里\n瞅见 俺的 爱情 被轮了 成了 老处女\n瞅见 好事儿 全归 高富帅\n还有 那些 臭不要脸 扭腚的 货色\n\n[chorus](重复)\n他们 骂我 打工仔 太多人 没睡醒...\n\n[bridge]\n加班 没补助 俺认了\n欠薪 揍员工 把俺 当牲口\n去你妈 的小姘头\n\n[verse]\n破逼 管理制度 净整 娱乐八卦\n撸管式 管理 也就 你自己 嗨\n出点儿 屁事儿 就往 下属 脑瓜子 扣\n挣俩 钢镚儿 立马 牛逼 不分 公母\n\n你挖个 大坑 把俺们 往里 踹\n说这 叫梦想 你当年 多能耐\n俺们 就当 听传销 洗脑课\n可怜 连骗人 你都 就会 这一套\n\n[outro]\n老子 不想 上班\n老子 不想 上班\n老子 不想 上班", 6 | "audio_duration": 135.92, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.06204533576965332, 21 | "diffusion": 35.75483560562134, 22 | "latent2audio": 1.5193355083465576 23 | }, 24 | "actual_seeds": [ 25 | 4176354214 26 | ], 27 | "retake_seeds": [ 28 | 601086915 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250513044511_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250513050200_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k", 3 | "task": "text2music", 4 | "prompt": "Rap, J-Pop, Anime, kawaii pop, EDM, Aggressive, Intense, Crisp Snare, Super Fast, Clear", 5 | "lyrics": "[Intro]\nNya.\n\n[Verse]\n我 在 五 点 二 十 早 起,十 三 点 十 四 弹 会儿 琴\n习 惯 了 坐 班,习惯了 隔夜 的 剩 饭,\n习 惯 了 没有 你\n\n[Verse]\n怕 你 想 不 开,拦 在 你 的 面 前\n那 时 候 摔 得 差 点 住 院\n东 京 的 春 天 莺 莺 燕 燕\n我 说 想 不 想 来 跟 我 玩 音乐\n\n[Verse]\n带 着 我 的 朋 友 守 在 你 的 门 口\n弹 着 我 的 钢 琴 当 伴 奏\n等 你 放 学 后,陪 你 K T V\n端 着 我 的 红 茶 跟 你 碰 杯\n\n[Pre-Chorus]\n忽然间现实淹没了远方\n万家灯火,盖住月光\n奔走,忍受,变成了人偶\n别再对我伸出你的 双 手,会 受 伤\n\n[Chorus]\n明明都向前走,方向却渐渐不同\n时间让你我越走越近,却越来越陌生\n春 天 在 滂 沱 的 大 雨 里 飘 落\n得 了 心 太 高 脸 太 薄 的病\n\n[Bridge]\n我越难过,春日影越顶\n眼泪晃得我看不清\n埋葬了懦弱还有矫情\n却还是会在半夜摸眼睛\n\n青春期大部分时间在工 
作\n用微笑换来余额几个零\n戴上了面具也明白了生活\n拼的是数字和脸更是命\n\n[Verse]\n我在五点二十早起,十三点十四弹会琴\n早上要做饭,回家时满地的瓶罐\n\n师 徒 二 人 站 在 我 的 面 前\n台 词 很 熟 练,照 着 就 念\n\n背 后 的 小 睦 扭 扭 捏 捏\n我 说 我 还 有 点 事 要 不 改 天 见\n\n然 后 你 的 双手 握 住 我 的 袖 口\n开 始 哭 着 求 我 不 要 走\n\n[Verse]\n我在下班后,忙活柴米油\n你和你的姐妹住着高楼\n\n苦 来 兮 苦,早 就 没 了\n现 实 扬 鞭,赶 着 我 向 前\n没有时间跟你分辨什么对与错\n\n[Bridge]\n没有什么对错,没有罪过\n谁不曾天真,是我太早看破\n生活一片狼藉,却又不想放弃\n一 边 聚 光 灯 下 绽 放,一 边 坠 落\n故作坚强,筑起心的墙\n越是委屈的伤口,越要藏\nLet it all out, it’s all right\n\n[Outro]\n俺 是 东 京 嘞,东 京 打 工 妹\n\n从虎之门带你转到浅草\n再从新宿转到竹桥\n\n俺 是 东 京 嘞,东 京 打 工 妹\n\n带 你 转 羽田 成田 蒲田 神田\n做 你 嘞 小 甜 甜!\n\n俺 是 东 京 嘞,东 京 打 工 妹\n带 你 转 赤 坂,带 你 转 霞 关\n恁 咋 不 早 说,今 天 不 管 饭\n", 6 | "audio_duration": 147.62212, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.5, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.052134037017822266, 21 | "diffusion": 17.909283876419067, 22 | "latent2audio": 1.4904146194458008 23 | }, 24 | "actual_seeds": [ 25 | 2945962357 26 | ], 27 | "retake_seeds": [ 28 | 2252292438 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0.7, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250513050200_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250513055451_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k", 3 | "task": "text2music", 4 | "prompt": "Rap, adult, male, spoken word, rapping, clear, warm, articulate, Lo-Fi Hip Hop, 100-120 BPM, Keyboard Chords, Male Rap, Lazy Rhythm, Melancholy, Rap", 5 | "lyrics": "[Intro]\n夜色 很 淡 像 褪色 的 照片 \n但 记忆 却 像 刀锋 一样 锐利 \n\n[Verse 1]\n你 说过 的 甜言蜜语 现在 听来 像 最 恶毒 的 咒骂 \n你 刺进 我 心里 的 刀 现在 还 在 滴血 未 干 哪 \n慵懒 的 旋律 像 我 的 脚步 拖着 沉重 的 躯壳 \n脑海 里 循环 播放 那 画面 快 把 我 逼疯 了 \n键盘 和弦 低沉 又 忧伤 弹奏 着 我 的 绝望 \n我 曾经 的 信任 像 玻璃 一样 被 你 狠狠 地 摔 在 地上 \n不想 振作 不想 原谅 只 想 让 这 一切 都 停止 \n可 心底 有 个 声音 嘶吼 着 要 你 付出 该 有 的 代价 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Verse 2]\n曾经 的 兄弟 现在 面目全非 像 个 陌生人 \n你 的 自私 像 癌细胞 一点点 吞噬 我 的 纯真 \n我 学着 你 的 样子 把 心 锁 起来 不再 轻易 相信 \n让 懒散 的 节奏 包裹 我 给 自己 一点 喘息 \n键盘 的 音色 变得 更加 阴冷 像 秋天 的 雨滴 \n冲刷 掉 所有 温情 只 剩下 彻骨 的 寒意 \n我 不会 大喊大叫 只是 默默 地 计划 \n每 一步 都 走向 让 你 后悔 的 那 一 刹那 \n\n[Chorus]\n背叛 像 毒药 渗透 我 的 血液 \n复仇 的 火焰 在 我 眼中 燃起 \n哪怕 遍体鳞伤 哪怕 万劫不复 \n我 也 要 亲手 撕碎 你 的 幸福 \n这 是 我 的 哀歌 也 是 我 的 战书 \n键盘 的 音符 每 一下 都 带着 恨意 和 痛苦 \n\n[Bridge]\n也许 复仇 不能 带来 平静 \n也许 只 会 让 我 更 堕落 \n但 如果 不 这样 做 \n我 连 活下去 的 勇气 都 没有 \n\n[Outro]\n复仇 复仇 复仇 \n直到 最后 一刻 \n懒散 地 复仇 着 ", 6 | "audio_duration": 202.64, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.65, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | 
"oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 0.036400794982910156, 21 | "diffusion": 23.055809259414673, 22 | "latent2audio": 1.8787360191345215 23 | }, 24 | "actual_seeds": [ 25 | 3900061002 26 | ], 27 | "retake_seeds": [ 28 | 3037373819 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250513055451_0.wav" 45 | } -------------------------------------------------------------------------------- /examples/zh_rap_lora/input_params/output_20250513060150_0_input_params.json: -------------------------------------------------------------------------------- 1 | { 2 | "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_100k", 3 | "task": "text2music", 4 | "prompt": "Orchestra, Symphony, Sonata, Opera, Concerto, Rap, Beat, DJ, MC, StreetCulture", 5 | "lyrics": "[verse1]\n羊皮卷轴 墨香飘 莫扎特 熬 安魂曲 通宵 \n和弦齿轮 咔哒转 比 瑞士 手表 更 精密 律动 \n八轨磁带 玩叠叠乐 披头士 炸 录音棚 天花板 \nAI 卷起 新风暴 像 灭霸 打响指 般 简单 \n\n[chorus]\n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse2]\n春之祭 召唤 百人 乐团 才够 燥 \n合成器 极客 玩电焊 焊出 赛博 神庙 \nDAW 解放 双手 钢琴卷帘 变 乐高 \n音色库 开挂 像 吃 金币 的 马里奥 \n\nAI 拆解 爵士乐 黑话 像 庖丁 解牛 \nCityPop 复古 滤镜 直接 参数 调油 \n神经网络 偷师 贝多芬 半夜 翻墙头 \n音乐 基因库 被 改写成 超频 万花筒 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[verse3] \n电子琴 被 吐槽 塑料 味 超标 \n卧室 制作人 用 鼠标 单挑 整个 乐团 编制 \nAI 伴奏 刚上线 就被 键盘侠 集火 \n却 忘了 电吉他 曾被 说 是 魔鬼 的 副歌 \n\n现在 我 指尖 蹦迪 在 数据 炼丹炉 \n提示词 召唤 莫扎特 跨次元 碰杯 珍珠奶茶 \n当 比特 海洋 淹没 所有 物理 琴柱 \n最后 的 音轨 永远 连着 心脏 的 跳针 \n\n[bridge] \n鹅毛笔 蘸着 银河 当 墨汁(绝了) \n音浪 在 元宇宙 开 分店(疯了) \n技术 迷雾 散成 像素 烟花 \n而 我们 始终 带着 老派 的 心跳 混搭 \n\n[chorus] \n琴弦 到 代码 进化论 狂飙(skr) \n象牙塔 被 鼠标 点爆 像 泡泡(boom) \n灵感 加 算法 等于 王炸 大招 \n人类 心跳 才是 终极 混音 调料 \n\n[outro] \n从 蒸汽 到 硅基 浪潮 我 冲浪(yo) \n用 脑洞 接住 每个 技术 暴击(叮) \n当 所有 设备 没电 的 凌晨 三点钟 \n最 原始 的 旋律 在 胸腔 敲击 成 龙卷风 ", 6 | "audio_duration": 172.64, 7 | "infer_step": 60, 8 | "guidance_scale": 15, 9 | "scheduler_type": "euler", 10 | "cfg_type": "apg", 11 | "omega_scale": 10, 12 | "guidance_interval": 0.65, 13 | "guidance_interval_decay": 0, 14 | "min_guidance_scale": 3, 15 | "use_erg_tag": true, 16 | "use_erg_lyric": false, 17 | "use_erg_diffusion": true, 18 | "oss_steps": [], 19 | "timecosts": { 20 | "preprocess": 3.648996353149414, 21 | "diffusion": 16.44967818260193, 22 | "latent2audio": 1.614703893661499 23 | }, 24 | "actual_seeds": [ 25 | 1198023141 26 | ], 27 | "retake_seeds": [ 28 | 3389016134 29 | ], 30 | "retake_variance": 0.5, 31 | "guidance_scale_text": 0, 32 | "guidance_scale_lyric": 0, 33 | "repaint_start": 0, 34 | "repaint_end": 0, 35 | "edit_n_min": 0.0, 36 | "edit_n_max": 1.0, 37 | "edit_n_avg": 1, 38 | "src_audio_path": null, 39 | "edit_target_prompt": null, 40 | "edit_target_lyrics": null, 41 | "audio2audio_enable": false, 42 | "ref_audio_strength": 0.5, 43 | "ref_audio_input": null, 44 | "audio_path": "./outputs/output_20250513060150_0.wav" 45 | } -------------------------------------------------------------------------------- /infer-api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException 2 | from pydantic import BaseModel 3 | from typing import List, 
Optional 4 | import os 5 | from acestep.pipeline_ace_step import ACEStepPipeline 6 | from acestep.data_sampler import DataSampler 7 | import uuid 8 | 9 | app = FastAPI(title="ACEStep Pipeline API") 10 | 11 | class ACEStepInput(BaseModel): 12 | checkpoint_path: str 13 | bf16: bool = True 14 | torch_compile: bool = False 15 | device_id: int = 0 16 | output_path: Optional[str] = None 17 | audio_duration: float 18 | prompt: str 19 | lyrics: str 20 | infer_step: int 21 | guidance_scale: float 22 | scheduler_type: str 23 | cfg_type: str 24 | omega_scale: float 25 | actual_seeds: List[int] 26 | guidance_interval: float 27 | guidance_interval_decay: float 28 | min_guidance_scale: float 29 | use_erg_tag: bool 30 | use_erg_lyric: bool 31 | use_erg_diffusion: bool 32 | oss_steps: List[int] 33 | guidance_scale_text: float = 0.0 34 | guidance_scale_lyric: float = 0.0 35 | 36 | class ACEStepOutput(BaseModel): 37 | status: str 38 | output_path: Optional[str] 39 | message: str 40 | 41 | def initialize_pipeline(checkpoint_path: str, bf16: bool, torch_compile: bool, device_id: int) -> ACEStepPipeline: 42 | os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) 43 | return ACEStepPipeline( 44 | checkpoint_dir=checkpoint_path, 45 | dtype="bfloat16" if bf16 else "float32", 46 | torch_compile=torch_compile, 47 | ) 48 | 49 | @app.post("/generate", response_model=ACEStepOutput) 50 | async def generate_audio(input_data: ACEStepInput): 51 | try: 52 | # Initialize pipeline 53 | model_demo = initialize_pipeline( 54 | input_data.checkpoint_path, 55 | input_data.bf16, 56 | input_data.torch_compile, 57 | input_data.device_id 58 | ) 59 | 60 | # Prepare parameters 61 | params = ( 62 | input_data.audio_duration, 63 | input_data.prompt, 64 | input_data.lyrics, 65 | input_data.infer_step, 66 | input_data.guidance_scale, 67 | input_data.scheduler_type, 68 | input_data.cfg_type, 69 | input_data.omega_scale, 70 | ", ".join(map(str, input_data.actual_seeds)), 71 | input_data.guidance_interval, 72 | input_data.guidance_interval_decay, 73 | input_data.min_guidance_scale, 74 | input_data.use_erg_tag, 75 | input_data.use_erg_lyric, 76 | input_data.use_erg_diffusion, 77 | ", ".join(map(str, input_data.oss_steps)), 78 | input_data.guidance_scale_text, 79 | input_data.guidance_scale_lyric, 80 | ) 81 | 82 | # Generate output path if not provided 83 | output_path = input_data.output_path or f"output_{uuid.uuid4().hex}.wav" 84 | 85 | # Run pipeline 86 | model_demo( 87 | *params, 88 | save_path=output_path 89 | ) 90 | 91 | return ACEStepOutput( 92 | status="success", 93 | output_path=output_path, 94 | message="Audio generated successfully" 95 | ) 96 | 97 | except Exception as e: 98 | raise HTTPException(status_code=500, detail=f"Error generating audio: {str(e)}") 99 | 100 | @app.get("/health") 101 | async def health_check(): 102 | return {"status": "healthy"} 103 | 104 | if __name__ == "__main__": 105 | import uvicorn 106 | uvicorn.run(app, host="0.0.0.0", port=8000) 107 | -------------------------------------------------------------------------------- /infer.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | 4 | from acestep.pipeline_ace_step import ACEStepPipeline 5 | from acestep.data_sampler import DataSampler 6 | 7 | 8 | def sample_data(json_data): 9 | return ( 10 | json_data["audio_duration"], 11 | json_data["prompt"], 12 | json_data["lyrics"], 13 | json_data["infer_step"], 14 | json_data["guidance_scale"], 15 | json_data["scheduler_type"], 16 | json_data["cfg_type"], 
17 | json_data["omega_scale"], 18 | ", ".join(map(str, json_data["actual_seeds"])), 19 | json_data["guidance_interval"], 20 | json_data["guidance_interval_decay"], 21 | json_data["min_guidance_scale"], 22 | json_data["use_erg_tag"], 23 | json_data["use_erg_lyric"], 24 | json_data["use_erg_diffusion"], 25 | ", ".join(map(str, json_data["oss_steps"])), 26 | json_data["guidance_scale_text"] if "guidance_scale_text" in json_data else 0.0, 27 | ( 28 | json_data["guidance_scale_lyric"] 29 | if "guidance_scale_lyric" in json_data 30 | else 0.0 31 | ), 32 | ) 33 | 34 | 35 | @click.command() 36 | @click.option( 37 | "--checkpoint_path", type=str, default="", help="Path to the checkpoint directory" 38 | ) 39 | @click.option("--bf16", type=bool, default=True, help="Whether to use bfloat16") 40 | @click.option( 41 | "--torch_compile", type=bool, default=False, help="Whether to use torch compile" 42 | ) 43 | @click.option( 44 | "--cpu_offload", type=bool, default=False, help="Whether to use CPU offloading (only load current stage's model to GPU)" 45 | ) 46 | @click.option( 47 | "--overlapped_decode", type=bool, default=False, help="Whether to use overlapped decoding (run dcae and vocoder using sliding windows)" 48 | ) 49 | @click.option("--device_id", type=int, default=0, help="Device ID to use") 50 | @click.option("--output_path", type=str, default=None, help="Path to save the output") 51 | def main(checkpoint_path, bf16, torch_compile, cpu_offload, overlapped_decode, device_id, output_path): 52 | os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id) 53 | 54 | model_demo = ACEStepPipeline( 55 | checkpoint_dir=checkpoint_path, 56 | dtype="bfloat16" if bf16 else "float32", 57 | torch_compile=torch_compile, 58 | cpu_offload=cpu_offload, 59 | overlapped_decode=overlapped_decode 60 | ) 61 | print(model_demo) 62 | 63 | data_sampler = DataSampler() 64 | 65 | json_data = data_sampler.sample() 66 | json_data = sample_data(json_data) 67 | print(json_data) 68 | 69 | ( 70 | audio_duration, 71 | prompt, 72 | lyrics, 73 | infer_step, 74 | guidance_scale, 75 | scheduler_type, 76 | cfg_type, 77 | omega_scale, 78 | manual_seeds, 79 | guidance_interval, 80 | guidance_interval_decay, 81 | min_guidance_scale, 82 | use_erg_tag, 83 | use_erg_lyric, 84 | use_erg_diffusion, 85 | oss_steps, 86 | guidance_scale_text, 87 | guidance_scale_lyric, 88 | ) = json_data 89 | 90 | model_demo( 91 | audio_duration=audio_duration, 92 | prompt=prompt, 93 | lyrics=lyrics, 94 | infer_step=infer_step, 95 | guidance_scale=guidance_scale, 96 | scheduler_type=scheduler_type, 97 | cfg_type=cfg_type, 98 | omega_scale=omega_scale, 99 | manual_seeds=manual_seeds, 100 | guidance_interval=guidance_interval, 101 | guidance_interval_decay=guidance_interval_decay, 102 | min_guidance_scale=min_guidance_scale, 103 | use_erg_tag=use_erg_tag, 104 | use_erg_lyric=use_erg_lyric, 105 | use_erg_diffusion=use_erg_diffusion, 106 | oss_steps=oss_steps, 107 | guidance_scale_text=guidance_scale_text, 108 | guidance_scale_lyric=guidance_scale_lyric, 109 | save_path=output_path, 110 | ) 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 
14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "source": [ 33 | "# ACE-Step Inference\n", 34 | "\n", 35 | "\n", 36 | "

\n", 37 | " \"StepFun\n", 38 | "

\n", 39 | "\n", 40 | " A Step Towards Music Generation Foundation Model\n", 41 | "\n", 42 | "\n", 43 | "\n", 44 | "## Credits:\n", 45 | "\n", 46 | "* Ace-Step by [Ace-Step](https://github.com/ace-step/ACE-Step)\n", 47 | "\n", 48 | "* Colab improvement by [NeoDev](https://github.com/TheNeodev)" 49 | ], 50 | "metadata": { 51 | "id": "w4sQAC7AB5GV" 52 | } 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "source": [ 57 | "**🖥️ Hardware Performance**\n", 58 | "\n", 59 | "We have evaluated ACE-Step across different hardware setups, yielding the following throughput results:\n", 60 | "\n", 61 | "| Device | RTF (27 steps) | Time to render 1 min audio (27 steps) | RTF (60 steps) | Time to render 1 min audio (60 steps) |\n", 62 | "| --------------- | -------------- | ------------------------------------- | -------------- | ------------------------------------- |\n", 63 | "| NVIDIA RTX 4090 | 34.48 × | 1.74 s | 15.63 × | 3.84 s |\n", 64 | "| NVIDIA A100 | 27.27 × | 2.20 s | 12.27 × | 4.89 s |\n", 65 | "| NVIDIA RTX 3090 | 12.76 × | 4.70 s | 6.48 × | 9.26 s |\n", 66 | "| MacBook M2 Max | 2.27 × | 26.43 s | 1.03 × | 58.25 s |\n", 67 | "\n", 68 | "\n", 69 | "We use RTF (Real-Time Factor) to measure the performance of ACE-Step. Higher values indicate faster generation speed. 27.27x means to generate 1 minute of music, it takes 2.2 seconds (60/27.27). The performance is measured on a single GPU with batch size 1 and 27 steps." 70 | ], 71 | "metadata": { 72 | "id": "pXuz4oKlDFk_" 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "cellView": "form", 80 | "id": "EB_ztF9xch5s" 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "#@title 🎯 Install and Download\n", 85 | "\n", 86 | "from IPython.display import clear_output" 87 | "\n", 88 | "import codecs\n", 89 | "\n", 90 | "\n", 91 | "\n", 92 | "print(\"Installing...\")\n", 93 | "!sudo apt update > /dev/null 2>&1\n", 94 | "!sudo apt install python3.10 > /dev/null 2>&1\n", 95 | "!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 > /dev/null 2>&1\n", 96 | "!sudo update-alternatives --set python3 /usr/bin/python3.10 > /dev/null 2>&1\n", 97 | "!curl -sS https://bootstrap.pypa.io/get-pip.py | python3 > /dev/null 2>&1\n", 98 | "import sys\n", 99 | "sys.path.append('/usr/local/lib/python3.10/dist-packages')\n", 100 | "\n", 101 | "\n", 102 | "repopath = codecs.decode('erdhverzragf.gkg', 'rot_13')\n", 103 | "\n", 104 | "\n", 105 | "!git clone https://github.com/ace-step/ACE-Step\n", 106 | "%cd /content/ACE-Step\n", 107 | "\n", 108 | "\n", 109 | "!pip install uv pyngrok > /dev/null 2>&1\n", 110 | "!uv pip install -r {repopath} > /dev/null 2>&1\n", 111 | "!uv pip install huggingface-hub numpy==1.26.0 > /dev/null 2>&1\n", 112 | "!huggingface-cli download ACE-Step/ACE-Step-v1-3.5B --local-dir /content/ACE-Step/checkpoints --local-dir-use-symlinks False\n", 113 | "\n", 114 | "!pip install e .", 115 | "\n", 116 | "\n", 117 | "import os\n", 118 | "os.environ['MPLBACKEND'] = 'agg'" 119 | "\n\n" 120 | "clear_output()" 121 | "\n\n" 122 | "print("✅ Installation Complete!")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "source": [ 128 | "#@title 🚀 Run Gradio UI\n", 129 | "bf16 = True # @param {\"type\":\"boolean\"}\n", 130 | "\n", 131 | "print(\" 🚀 Running UI...\")\n", 132 | "\n", 133 | "!acestep --checkpoint_path ./checkpoints/ --port 7865 --device_id 0 --share true --bf16 {bf16}" 134 | ], 135 | "metadata": { 136 | "cellView": "form", 137 | "id": "0m835b3ZecQW" 138 | }, 139 | 
"execution_count": null, 140 | "outputs": [] 141 | } 142 | ] 143 | } 144 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==3.4.1 2 | diffusers>=0.33.0 3 | gradio 4 | librosa==0.11.0 5 | loguru==0.7.3 6 | matplotlib==3.10.1 7 | numpy 8 | pypinyin==0.53.0 9 | pytorch_lightning==2.5.1 10 | soundfile==0.13.1 11 | torch 12 | torchaudio 13 | torchvision 14 | tqdm 15 | transformers==4.50.0 16 | py3langid==0.3.0 17 | hangul-romanize==0.1.0 18 | num2words==0.5.14 19 | spacy==3.8.4 20 | accelerate==1.6.0 21 | cutlet 22 | fugashi[unidic-lite] 23 | click 24 | peft 25 | tensorboard 26 | tensorboardX -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_namespace_packages 2 | 3 | setup( 4 | name="ace_step", 5 | description="ACE Step: A Step Towards Music Generation Foundation Model", 6 | long_description=open("README.md", encoding="utf-8").read(), 7 | long_description_content_type="text/markdown", 8 | version="0.2.0", 9 | packages=find_namespace_packages(), 10 | install_requires=open("requirements.txt", encoding="utf-8").read().splitlines(), 11 | author="ACE Studio, StepFun AI", 12 | license="Apache 2.0", 13 | classifiers=[ 14 | "Development Status :: 3 - Alpha", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: Apache Software License", 17 | ], 18 | entry_points={ 19 | "console_scripts": [ 20 | "acestep=acestep.gui:main", 21 | ], 22 | }, 23 | include_package_data=True, # Ensure this is set to True 24 | package_data={ 25 | "acestep.models.lyrics_utils": ["vocab.json"], # Specify the relative path to vocab.json 26 | }, 27 | extras_require={ 28 | "train": [ 29 | "peft", 30 | "tensorboard", 31 | "tensorboardX" 32 | ] 33 | }, 34 | ) 35 | -------------------------------------------------------------------------------- /trainer-api.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from fastapi import FastAPI, HTTPException 4 | from pydantic import BaseModel 5 | from typing import Optional 6 | import os 7 | import random 8 | from diffusers.utils.torch_utils import randn_tensor 9 | from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps 10 | from acestep.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler 11 | from acestep.pipeline_ace_step import ACEStepPipeline 12 | from acestep.apg_guidance import apg_forward, MomentumBuffer 13 | from transformers import AutoTokenizer 14 | from loguru import logger 15 | import uvicorn 16 | import time 17 | from datetime import datetime 18 | 19 | app = FastAPI(title="Text-to-Music API") 20 | 21 | class TextToMusicRequest(BaseModel): 22 | prompt: str 23 | duration: int = 240 # Duration in seconds (default 240s) 24 | infer_steps: int = 60 25 | guidance_scale: float = 15.0 26 | omega_scale: float = 10.0 27 | seed: Optional[int] = None 28 | 29 | class TextToMusicResponse(BaseModel): 30 | audio_path: str 31 | prompt: str 32 | seed: int 33 | sample_rate: int 34 | 35 | class InferencePipeline: 36 | def __init__(self, checkpoint_dir: str, device: str = "cuda"): 37 | self.device = torch.device(device if torch.cuda.is_available() else "cpu") 38 | logger.info(f"Initializing model on device: {self.device}") 
39 | 40 | # Load the ACEStepPipeline 41 | self.acestep_pipeline = ACEStepPipeline(checkpoint_dir) 42 | self.acestep_pipeline.load_checkpoint(checkpoint_dir) 43 | 44 | # Initialize components 45 | self.transformers = self.acestep_pipeline.ace_step_transformer.float().to(self.device).eval() 46 | self.dcae = self.acestep_pipeline.music_dcae.float().to(self.device).eval() 47 | self.text_encoder_model = self.acestep_pipeline.text_encoder_model.float().to(self.device).eval() 48 | self.text_tokenizer = self.acestep_pipeline.text_tokenizer 49 | 50 | # Ensure no gradients are computed 51 | self.transformers.requires_grad_(False) 52 | self.dcae.requires_grad_(False) 53 | self.text_encoder_model.requires_grad_(False) 54 | 55 | # Initialize scheduler 56 | self.scheduler = FlowMatchEulerDiscreteScheduler( 57 | num_train_timesteps=1000, 58 | shift=3.0, 59 | ) 60 | 61 | def get_text_embeddings(self, texts, device, text_max_length=256): 62 | inputs = self.text_tokenizer( 63 | texts, 64 | return_tensors="pt", 65 | padding=True, 66 | truncation=True, 67 | max_length=text_max_length, 68 | ) 69 | inputs = {key: value.to(device) for key, value in inputs.items()} 70 | with torch.no_grad(): 71 | outputs = self.text_encoder_model(**inputs) 72 | last_hidden_states = outputs.last_hidden_state 73 | attention_mask = inputs["attention_mask"] 74 | return last_hidden_states, attention_mask 75 | 76 | def diffusion_process( 77 | self, 78 | duration, 79 | encoder_text_hidden_states, 80 | text_attention_mask, 81 | speaker_embds, 82 | lyric_token_ids, 83 | lyric_mask, 84 | random_generator=None, 85 | infer_steps=60, 86 | guidance_scale=15.0, 87 | omega_scale=10.0, 88 | ): 89 | do_classifier_free_guidance = guidance_scale > 1.0 90 | device = encoder_text_hidden_states.device 91 | dtype = encoder_text_hidden_states.dtype 92 | bsz = encoder_text_hidden_states.shape[0] 93 | 94 | timesteps, num_inference_steps = retrieve_timesteps( 95 | self.scheduler, num_inference_steps=infer_steps, device=device 96 | ) 97 | 98 | frame_length = int(duration * 44100 / 512 / 8) 99 | target_latents = randn_tensor( 100 | shape=(bsz, 8, 16, frame_length), 101 | generator=random_generator, 102 | device=device, 103 | dtype=dtype, 104 | ) 105 | attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype) 106 | 107 | if do_classifier_free_guidance: 108 | attention_mask = torch.cat([attention_mask] * 2, dim=0) 109 | encoder_text_hidden_states = torch.cat( 110 | [encoder_text_hidden_states, torch.zeros_like(encoder_text_hidden_states)], 111 | 0, 112 | ) 113 | text_attention_mask = torch.cat([text_attention_mask] * 2, dim=0) 114 | speaker_embds = torch.cat([speaker_embds, torch.zeros_like(speaker_embds)], 0) 115 | lyric_token_ids = torch.cat([lyric_token_ids, torch.zeros_like(lyric_token_ids)], 0) 116 | lyric_mask = torch.cat([lyric_mask, torch.zeros_like(lyric_mask)], 0) 117 | 118 | momentum_buffer = MomentumBuffer() 119 | 120 | for t in timesteps: 121 | latent_model_input = ( 122 | torch.cat([target_latents] * 2) if do_classifier_free_guidance else target_latents 123 | ) 124 | timestep = t.expand(latent_model_input.shape[0]) 125 | with torch.no_grad(): 126 | noise_pred = self.transformers( 127 | hidden_states=latent_model_input, 128 | attention_mask=attention_mask, 129 | encoder_text_hidden_states=encoder_text_hidden_states, 130 | text_attention_mask=text_attention_mask, 131 | speaker_embeds=speaker_embds, 132 | lyric_token_idx=lyric_token_ids, 133 | lyric_mask=lyric_mask, 134 | timestep=timestep, 135 | ).sample 136 | 137 | if 
do_classifier_free_guidance: 138 | noise_pred_with_cond, noise_pred_uncond = noise_pred.chunk(2) 139 | noise_pred = apg_forward( 140 | pred_cond=noise_pred_with_cond, 141 | pred_uncond=noise_pred_uncond, 142 | guidance_scale=guidance_scale, 143 | momentum_buffer=momentum_buffer, 144 | ) 145 | 146 | target_latents = self.scheduler.step( 147 | model_output=noise_pred, 148 | timestep=t, 149 | sample=target_latents, 150 | omega=omega_scale, 151 | )[0] 152 | 153 | return target_latents 154 | 155 | def generate_audio( 156 | self, 157 | prompt: str, 158 | duration: int, 159 | infer_steps: int, 160 | guidance_scale: float, 161 | omega_scale: float, 162 | seed: Optional[int], 163 | ): 164 | # Set random seed 165 | if seed is not None: 166 | random.seed(seed) 167 | torch.manual_seed(seed) 168 | else: 169 | seed = random.randint(0, 2**32 - 1) 170 | random.seed(seed) 171 | torch.manual_seed(seed) 172 | 173 | generator = torch.Generator(device=self.device).manual_seed(seed) 174 | 175 | # Get text embeddings 176 | encoder_text_hidden_states, text_attention_mask = self.get_text_embeddings( 177 | [prompt], self.device 178 | ) 179 | 180 | # Dummy speaker embeddings and lyrics (since not provided in API request) 181 | bsz = 1 182 | speaker_embds = torch.zeros(bsz, 512, device=self.device, dtype=encoder_text_hidden_states.dtype) 183 | lyric_token_ids = torch.zeros(bsz, 256, device=self.device, dtype=torch.long) 184 | lyric_mask = torch.zeros(bsz, 256, device=self.device, dtype=torch.long) 185 | 186 | # Run diffusion process 187 | pred_latents = self.diffusion_process( 188 | duration=duration, 189 | encoder_text_hidden_states=encoder_text_hidden_states, 190 | text_attention_mask=text_attention_mask, 191 | speaker_embds=speaker_embds, 192 | lyric_token_ids=lyric_token_ids, 193 | lyric_mask=lyric_mask, 194 | random_generator=generator, 195 | infer_steps=infer_steps, 196 | guidance_scale=guidance_scale, 197 | omega_scale=omega_scale, 198 | ) 199 | 200 | # Decode latents to audio 201 | audio_lengths = torch.tensor([int(duration * 44100)], device=self.device) 202 | sr, pred_wavs = self.dcae.decode(pred_latents, audio_lengths=audio_lengths, sr=48000) 203 | 204 | # Save audio 205 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 206 | output_dir = "generated_audio" 207 | os.makedirs(output_dir, exist_ok=True) 208 | audio_path = f"{output_dir}/generated_{timestamp}_{seed}.wav" 209 | torchaudio.save(audio_path, pred_wavs.float().cpu(), sr) 210 | 211 | return audio_path, sr, seed 212 | 213 | # Global model instance 214 | model = None 215 | 216 | @app.on_event("startup") 217 | async def startup_event(): 218 | global model 219 | checkpoint_dir = os.getenv("CHECKPOINT_DIR", "./checkpoints") 220 | model = InferencePipeline(checkpoint_dir=checkpoint_dir) 221 | logger.info("Model loaded successfully") 222 | 223 | @app.post("/generate", response_model=TextToMusicResponse) 224 | async def generate_music(request: TextToMusicRequest): 225 | if model is None: 226 | raise HTTPException(status_code=503, detail="Model not initialized") 227 | 228 | try: 229 | start_time = time.time() 230 | audio_path, sr, seed = model.generate_audio( 231 | prompt=request.prompt, 232 | duration=request.duration, 233 | infer_steps=request.infer_steps, 234 | guidance_scale=request.guidance_scale, 235 | omega_scale=request.omega_scale, 236 | seed=request.seed, 237 | ) 238 | logger.info(f"Generation completed in {time.time() - start_time:.2f} seconds") 239 | return TextToMusicResponse( 240 | audio_path=audio_path, 241 | prompt=request.prompt, 242 | 
seed=seed, 243 | sample_rate=sr, 244 | ) 245 | except Exception as e: 246 | logger.error(f"Error during generation: {str(e)}") 247 | raise HTTPException(status_code=500, detail=str(e)) 248 | 249 | if __name__ == "__main__": 250 | uvicorn.run(app, host="0.0.0.0", port=8000) 251 | -------------------------------------------------------------------------------- /zh_lora_dataset/data-00000-of-00001.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ace-step/ACE-Step/1028991adc5c3d464cf9af5f64633838a062bf07/zh_lora_dataset/data-00000-of-00001.arrow -------------------------------------------------------------------------------- /zh_lora_dataset/dataset_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "citation": "", 3 | "description": "", 4 | "features": { 5 | "keys": { 6 | "dtype": "string", 7 | "_type": "Value" 8 | }, 9 | "filename": { 10 | "dtype": "string", 11 | "_type": "Value" 12 | }, 13 | "tags": { 14 | "feature": { 15 | "dtype": "string", 16 | "_type": "Value" 17 | }, 18 | "_type": "Sequence" 19 | }, 20 | "speaker_emb_path": { 21 | "dtype": "string", 22 | "_type": "Value" 23 | }, 24 | "norm_lyrics": { 25 | "dtype": "string", 26 | "_type": "Value" 27 | }, 28 | "recaption": {} 29 | }, 30 | "homepage": "", 31 | "license": "" 32 | } -------------------------------------------------------------------------------- /zh_lora_dataset/state.json: -------------------------------------------------------------------------------- 1 | { 2 | "_data_files": [ 3 | { 4 | "filename": "data-00000-of-00001.arrow" 5 | } 6 | ], 7 | "_fingerprint": "76c203d4bc1fdd7e", 8 | "_format_columns": null, 9 | "_format_kwargs": {}, 10 | "_format_type": null, 11 | "_output_all_columns": false, 12 | "_split": null 13 | } --------------------------------------------------------------------------------
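For quick testing of the FastAPI services above, here is a minimal client sketch for the /generate endpoint defined in trainer-api.py. This snippet is illustrative and not part of the repository: it assumes the server is running locally on port 8000 (matching the uvicorn.run call) and that the third-party requests package is installed; the prompt and seed values are borrowed from the example input_params files.

import requests  # assumed to be installed; any HTTP client would work

# Field names mirror the TextToMusicRequest model in trainer-api.py.
payload = {
    "prompt": "Rap, adult, male, spoken word, rapping, clear, warm",
    "duration": 60,        # seconds of audio to generate
    "infer_steps": 60,     # same step count as the example input_params JSONs
    "guidance_scale": 15.0,
    "omega_scale": 10.0,
    "seed": 3900061002,    # omit to let the server draw a random seed
}

resp = requests.post("http://localhost:8000/generate", json=payload, timeout=600)
resp.raise_for_status()

# The response follows TextToMusicResponse: audio_path, prompt, seed, sample_rate.
result = resp.json()
print(result["audio_path"], result["sample_rate"], result["seed"])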