├── .github ├── codecov.yml ├── dependabot.yml └── workflows │ ├── close-stale-issues.yml │ ├── docs.yml │ ├── lint.yml │ ├── publish-pypi.yml │ └── pytest.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── __init__.py ├── midi_file_read │ ├── README.md │ ├── benchmark_midi_read.py │ ├── midi_read.csv │ ├── midi_read.md │ └── midi_read.txt ├── miditok_preprocess_file │ ├── README.md │ ├── benchmark_preprocess.py │ ├── preprocess.csv │ ├── preprocess.md │ └── preprocess.txt ├── miditok_tokenize │ ├── README.md │ ├── benchmark_tokenize.py │ ├── tokenize.csv │ ├── tokenize.md │ └── tokenize.txt ├── tokenizer_training │ ├── README.md │ ├── benchmark_training.py │ └── results │ │ ├── seq_split_lengths.csv │ │ ├── seq_split_lengths.md │ │ ├── seq_split_lengths.txt │ │ ├── wordpiece_max_chars_enc_time.csv │ │ ├── wordpiece_max_chars_enc_time.md │ │ ├── wordpiece_max_chars_enc_time.txt │ │ ├── wordpiece_max_chars_train_time.csv │ │ ├── wordpiece_max_chars_train_time.md │ │ └── wordpiece_max_chars_train_time.txt └── utils.py ├── colab-notebooks ├── Example_HuggingFace_Mistral_Transformer.ipynb ├── MidiTok_Full_Workflow_Tutorial.ipynb └── README.md ├── docs ├── Makefile ├── additional_tokens_table.csv ├── assets │ ├── Octuple_TS_Rest │ │ ├── original.png │ │ └── tokenized.png │ ├── bases │ │ ├── pianoroll_daw.png │ │ ├── sheet_music.png │ │ └── spectrogram.png │ ├── cp_word.png │ ├── embeddings.png │ ├── favicon.png │ ├── midi_like.png │ ├── midi_preprocessing_original.png │ ├── midi_preprocessing_preprocessed.png │ ├── miditok_logo.png │ ├── miditok_logo.svg │ ├── miditok_logo_stroke.png │ ├── mumidi.png │ ├── music_sheet.png │ ├── octuple.png │ ├── pitch_intervals.png │ ├── pitch_intervals_original.png │ ├── remi.png │ ├── remiplus.png │ ├── structured.png │ ├── transformer.png │ └── tsd.png ├── attribute_controls.rst ├── citations.rst ├── conf.py ├── configuration.rst ├── data_augmentation.rst ├── examples.rst ├── hf_hub.rst ├── index.rst ├── make.bat ├── midi.rst ├── music_formats.rst ├── pytorch_data.rst ├── sequential_models.rst ├── tokenizations.rst ├── tokenizing_music_with_miditok.rst ├── train.rst └── utils.rst ├── pyproject.toml ├── src └── miditok │ ├── __init__.py │ ├── attribute_controls │ ├── __init__.py │ ├── bar_attribute_controls.py │ ├── classes.py │ └── track_attribute_controls.py │ ├── classes.py │ ├── constants.py │ ├── data_augmentation │ ├── __init__.py │ └── data_augmentation.py │ ├── midi_tokenizer.py │ ├── pytorch_data │ ├── __init__.py │ ├── collators.py │ └── datasets.py │ ├── tokenizations │ ├── __init__.py │ ├── cp_word.py │ ├── midi_like.py │ ├── mmm.py │ ├── mumidi.py │ ├── octuple.py │ ├── pertok.py │ ├── remi.py │ ├── structured.py │ └── tsd.py │ ├── tokenizer_training_iterator.py │ └── utils │ ├── __init__.py │ ├── split.py │ └── utils.py └── tests ├── MIDIs_corrupted └── ValueError_Control168.mid ├── MIDIs_multitrack ├── Aicha.mid ├── All The Small Things.mid ├── Funkytown.mid ├── Girls Just Want to Have Fun.mid ├── I Gotta Feeling.mid ├── In Too Deep.mid ├── Les Yeux Revolvers.mid ├── Mr. Blue Sky.mid ├── Shut Up.mid ├── What a Fool Believes.mid ├── d6caebd1964d9e4a3c5ea59525230e2a.mid └── d8faddb8596fff7abb24d78666f73e4e.mid ├── MIDIs_one_track ├── 6338816_Etude No. 
4.mid ├── 6354774_Macabre Waltz.mid ├── Maestro_1.mid ├── Maestro_10.mid ├── Maestro_2.mid ├── Maestro_3.mid ├── Maestro_4.mid ├── Maestro_5.mid ├── Maestro_6.mid ├── Maestro_7.mid ├── Maestro_8.mid ├── Maestro_9.mid ├── POP909_008.mid ├── POP909_010.mid ├── POP909_022.mid ├── POP909_191.mid └── empty.mid ├── __init__.py ├── abc_files ├── a_morning_in_summer.abc ├── flowers_of_edinburgh.abc ├── rising_sun.abc ├── the_rising_of_the_moon.abc └── the_wheels_of_the_world.abc ├── conftest.py ├── test_attribute_controls.py ├── test_data_augmentation.py ├── test_hf_hub.py ├── test_io_formats.py ├── test_methods.py ├── test_preprocess.py ├── test_pytorch_data_loading.py ├── test_saving_loading_config.py ├── test_tokenize.py ├── test_toksequence.py ├── test_train.py ├── test_utils.py └── utils_tests.py /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | # Codecov params 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: false # disable the default status that measures the entire project 7 | tests: 8 | paths: 9 | - "tests/" 10 | target: 70% 11 | source: 12 | paths: 13 | - "src/miditok/" 14 | target: 75% 15 | threshold: 0.5% 16 | patch: 17 | default: 18 | enabled: no # target: 75% # new contributions should have a coverage at least equal to target 19 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | # Check for updates to GitHub Actions every week 12 | interval: "weekly" 13 | -------------------------------------------------------------------------------- /.github/workflows/close-stale-issues.yml: -------------------------------------------------------------------------------- 1 | # This workflow will automatically mark inactive issues as stale, and close them. 2 | # For more information see: https://github.com/marketplace/actions/close-stale-issues and https://docs.github.com/en/github-ae@latest/actions/managing-issues-and-pull-requests/closing-inactive-issues 3 | 4 | name: Close inactive issues 5 | on: 6 | schedule: 7 | - cron: "30 1 * * *" 8 | 9 | jobs: 10 | close-issues: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | issues: write 14 | pull-requests: write 15 | steps: 16 | - uses: actions/stale@v9.1.0 17 | with: 18 | days-before-issue-stale: 21 19 | days-before-issue-close: 7 20 | stale-issue-label: "stale" 21 | stale-issue-message: "This issue is stale because it has been open for 21 days with no activity." 22 | close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale." 23 | days-before-pr-stale: -1 24 | days-before-pr-close: -1 25 | repo-token: ${{ secrets.GITHUB_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # Build docs preview in pull-requests.
2 | 3 | name: docs/preview 4 | on: 5 | pull_request_target: 6 | types: 7 | - opened 8 | # Execute this action only on PRs that touch 9 | # documentation files. 10 | # paths: 11 | # - "docs/**" 12 | 13 | permissions: 14 | pull-requests: write 15 | 16 | jobs: 17 | documentation-links: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: readthedocs/actions/preview@v1 21 | with: 22 | project-slug: "miditok" 23 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | - uses: pre-commit/action@v3.0.1 16 | env: 17 | RUFF_OUTPUT_FORMAT: github 18 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Publish package on PyPi 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | build: 17 | name: Build distribution 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.x" 25 | - name: Install pypa/build 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install hatch --user 29 | - name: Build a binary wheel and a source tarball 30 | run: hatch build 31 | - name: Store the distribution packages 32 | uses: actions/upload-artifact@v4 33 | with: 34 | name: python-package-distributions 35 | path: dist/ 36 | 37 | pypi-publish: 38 | name: Upload release to PyPI 39 | needs: 40 | - build 41 | runs-on: ubuntu-latest 42 | environment: 43 | name: PyPI 44 | url: https://pypi.org/p/MidiTok 45 | permissions: 46 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 47 | steps: 48 | # retrieve your distributions here 49 | - name: Download all the dists 50 | uses: actions/download-artifact@v4 51 | with: 52 | name: python-package-distributions 53 | path: dist/ 54 | 55 | - name: Publish package distributions to PyPI 56 | uses: pypa/gh-action-pypi-publish@release/v1 57 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # PyTest workflow 2 | 3 | name: Tests 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | pull_request: 9 | branches: [main] 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.9", "3.10", "3.12"] 17 | os: [ ubuntu-latest, macos-latest, windows-latest ] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ 
matrix.python-version }} 26 | cache: pip 27 | cache-dependency-path: pyproject.toml 28 | 29 | - name: Install dependencies 30 | run: | 31 | # Install local package with tests dependencies extras 32 | python -m pip install --upgrade pip 33 | pip install -e ".[tests]" 34 | 35 | # Tokenizer training tests are significantly slower than others. 36 | # So that xdist don't assign chunks of training tests to the same worker, we use 37 | # the `--dist worksteal` distribution mode to dynamically reassign queued tests to 38 | # free workers. 39 | - name: Test with pytest 40 | run: python -m pytest --cov=./ --cov-report=xml -n logical --dist worksteal --durations=0 -v tests 41 | env: 42 | HF_TOKEN_HUB_TESTS: ${{ secrets.HF_TOKEN_HUB_TESTS }} 43 | 44 | - name: Codecov 45 | uses: codecov/codecov-action@v5.4.3 46 | with: 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | build: 50 | runs-on: ubuntu-latest 51 | steps: 52 | - uses: actions/checkout@v4 53 | - name: Set up Python 54 | uses: actions/setup-python@v5 55 | with: 56 | python-version: '3.x' 57 | - name: Install dependencies 58 | run: pip install hatch 59 | - name: Build package 60 | run: hatch build 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated files in test 2 | tests/configs 3 | tests/Multitrack_tokens 4 | tests/Multitrack_tokens_aug 5 | tests/Multitrack_MIDIs_aug 6 | 7 | # Standard Python gitignore from https://github.com/github/gitignore/blob/main/Python.gitignore 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 118 | .pdm.toml 119 | .pdm-python 120 | .pdm-build/ 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 172 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 173 | 174 | # User-specific stuff 175 | .idea/**/workspace.xml 176 | .idea/**/tasks.xml 177 | .idea/**/usage.statistics.xml 178 | .idea/**/dictionaries 179 | .idea/**/shelf 180 | 181 | # AWS User-specific 182 | .idea/**/aws.xml 183 | 184 | # Generated files 185 | .idea/**/contentModel.xml 186 | 187 | # Sensitive or high-churn files 188 | .idea/**/dataSources/ 189 | .idea/**/dataSources.ids 190 | .idea/**/dataSources.local.xml 191 | .idea/**/sqlDataSources.xml 192 | .idea/**/dynamic.xml 193 | .idea/**/uiDesigner.xml 194 | .idea/**/dbnavigator.xml 195 | 196 | # Gradle 197 | .idea/**/gradle.xml 198 | .idea/**/libraries 199 | 200 | # Gradle and Maven with auto-import 201 | # When using Gradle or Maven with auto-import, you should exclude module files, 202 | # since they will be recreated, and may cause churn. Uncomment if using 203 | # auto-import. 
204 | # .idea/artifacts 205 | # .idea/compiler.xml 206 | # .idea/jarRepositories.xml 207 | # .idea/modules.xml 208 | # .idea/*.iml 209 | # .idea/modules 210 | # *.iml 211 | # *.ipr 212 | 213 | # CMake 214 | cmake-build-*/ 215 | 216 | # Mongo Explorer plugin 217 | .idea/**/mongoSettings.xml 218 | 219 | # File-based project format 220 | *.iws 221 | 222 | # IntelliJ 223 | out/ 224 | 225 | # mpeltonen/sbt-idea plugin 226 | .idea_modules/ 227 | 228 | # JIRA plugin 229 | atlassian-ide-plugin.xml 230 | 231 | # Cursive Clojure plugin 232 | .idea/replstate.xml 233 | 234 | # SonarLint plugin 235 | .idea/sonarlint/ 236 | 237 | # Crashlytics plugin (for Android Studio and IntelliJ) 238 | com_crashlytics_export_strings.xml 239 | crashlytics.properties 240 | crashlytics-build.properties 241 | fabric.properties 242 | 243 | # Editor-based Rest Client 244 | .idea/httpRequests 245 | 246 | # Android studio 3.1+ serialized cache file 247 | .idea/caches/build_file_checksums.ser 248 | 249 | # Aider cache directory 250 | .aider* 251 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.5.0 4 | hooks: 5 | - id: ruff 6 | args: 7 | - --fix 8 | - id: ruff-format 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.6.0 11 | hooks: 12 | - id: end-of-file-fixer 13 | - id: trailing-whitespace 14 | #- repo: https://github.com/pre-commit/mirrors-mypy 15 | # rev: v1.10.0 16 | # hooks: 17 | # - id: mypy 18 | # # types: [ python ] 19 | # args: [--strict, --ignore-missing-imports] # --no-warn-return-any 20 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | # You can also specify other tool versions: 13 | # nodejs: "20" 14 | # rust: "1.70" 15 | # golang: "1.20" 16 | 17 | # Build documentation in the "docs/" directory with Sphinx 18 | sphinx: 19 | configuration: docs/conf.py 20 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 21 | # builder: "dirhtml" 22 | # Fail on all warnings to avoid broken references 23 | fail_on_warning: true 24 | 25 | # Optionally build your docs in additional formats such as PDF and ePub 26 | # formats: 27 | # - pdf 28 | # - epub 29 | 30 | # Optional but recommended, declare the Python requirements required 31 | # to build your documentation 32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 33 | python: 34 | install: 35 | - method: pip 36 | path: . 37 | extra_requirements: 38 | - docs 39 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Fradet" 5 | given-names: "Nathan" 6 | orcid: "https://orcid.org/0000-0003-4729-570X" 7 | - family-names: "Briot" 8 | given-names: "Jean-Pierre" 9 | orcid: "https://orcid.org/0000-0003-1621-6335" 10 | - family-names: "Chhel" 11 | given-names: "Fabien" 12 | orcid: "https://orcid.org/0000-0003-2224-8296" 13 | - family-names: "El Fallah Seghrouchni" 14 | given-names: "Amal" 15 | orcid: "https://orcid.org/0000-0002-8390-8780" 16 | - family-names: "Gutowski" 17 | given-names: "Nicolas" 18 | orcid: "https://orcid.org/0000-0002-5765-9901" 19 | title: "MidiTok: A Python package for MIDI file tokenization" 20 | license: MIT 21 | date-released: 2021-11-07 22 | url: "https://github.com/Natooz/MidiTok" 23 | repository-code: "https://github.com/Natooz/MidiTok" 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `MidiTok` 2 | There are many ways to contribute, such as: 3 | - Reporting a bug. 4 | - Discussing the current state of the code. 5 | - Submitting a fix. 6 | - Proposing new features. 7 | - Becoming a maintainer. 8 | 9 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests 10 | 11 | Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 12 | 13 | 1. Fork the repo and create your branch from `main`. 14 | 2. If you've added code that should be tested, add [tests](tests). 15 | 3. If you've changed APIs, update the documentation. 16 | 4. Ensure the test suite passes. 17 | 5. Make sure your code lints. 18 | 6. Issue that pull request! 19 | 20 | ## Report Bugs Using GitHub's [issues](https://github.com/Natooz/MidiTok/issues) 21 | 22 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/Natooz/MidiTok/issues/new). 23 | 24 | ## Write Bug Reports with Detail, Background, and Sample Code 25 | 26 | **Great Bug Reports** tend to have: 27 | 28 | - A quick summary and/or background. 29 | - Steps to reproduce. 30 | - Be specific! 31 | - Give sample code if you can. 32 | - What you expected would happen. 33 | - What actually happens. 34 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work). 35 | 36 | ## Development 37 | 38 | ### Tests 39 | 40 | We use `pytest`/`pytest-xdist` for testing and `pytest-cov` for measuring coverage. Running all the tests can take between 10 and 30 minutes depending on your hardware. You don't need to run all of them, but try to run those affected by your changes. 41 | 42 | ```bash 43 | pip install pytest-cov "pytest-xdist[psutil]" 44 | pytest --cov=./ --cov-report=xml -n auto --durations=0 -v tests/ 45 | ``` 46 | 47 | ### Use a Consistent Coding Style 48 | 49 | We use the [ruff](https://github.com/astral-sh/ruff) formatter for Python in this project. Ruff can automatically analyze the code and format it according to the configured rules. This is handled with pre-commit (see the following section). 50 | 51 | ### Pre-commit Lints 52 | 53 | Linting is configured via [pre-commit](https://www.pre-commit.com/).
You can set up pre-commit by running: 54 | 55 | ```bash 56 | pip install pre-commit 57 | pre-commit install # installs pre-commit Git hook in the repository 58 | ``` 59 | 60 | When your changes are finished and the tests are passing, you can run `pre-commit run` to check if your code lints according to our ruff rules. 61 | If errors are found, we encourage you to fix them to follow the best code practices. If you struggle with this step, don't hesitate to ask for help, and to even commit and push anyway. Contributors will be able to help you. 62 | 63 | ## License 64 | 65 | By contributing, you agree that your contributions will be licensed under the MIT License. 66 | 67 | ## References 68 | 69 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nathan Fradet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MidiTok 2 | 3 | Python package to tokenize music files, introduced at the ISMIR 2021 LBDs. 
4 | 5 | ![MidiTok Logo](docs/assets/miditok_logo_stroke.png?raw=true "") 6 | 7 | [![PyPI version fury.io](https://badge.fury.io/py/miditok.svg)](https://pypi.python.org/pypi/miditok/) 8 | [![Python 3.9](https://img.shields.io/badge/python-≥3.9-blue.svg)](https://www.python.org/downloads/release/) 9 | [![Documentation Status](https://readthedocs.org/projects/miditok/badge/?version=latest)](https://miditok.readthedocs.io/en/latest/?badge=latest) 10 | [![GitHub CI](https://github.com/Natooz/MidiTok/actions/workflows/pytest.yml/badge.svg)](https://github.com/Natooz/MidiTok/actions/workflows/pytest.yml) 11 | [![Codecov](https://img.shields.io/codecov/c/github/Natooz/MidiTok)](https://codecov.io/gh/Natooz/MidiTok) 12 | [![GitHub license](https://img.shields.io/github/license/Natooz/MidiTok.svg)](https://github.com/Natooz/MidiTok/blob/main/LICENSE) 13 | [![Downloads](https://static.pepy.tech/badge/miditok)](https://pepy.tech/project/MidiTok) 14 | [![Code style](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff) 15 | 16 | MidiTok can tokenize MIDI and abc files, i.e. convert them into sequences of tokens ready to be fed to models such as Transformers, for any generation, transcription or MIR task. 17 | MidiTok features the most well-known [music tokenizations](https://miditok.readthedocs.io/en/latest/tokenizations.html) (e.g. [REMI](https://arxiv.org/abs/2002.00212), [Compound Word](https://arxiv.org/abs/2101.02402)...), and is built around the idea that they all share common parameters and methods. Tokenizers can be trained with [Byte Pair Encoding (BPE)](https://aclanthology.org/2023.emnlp-main.123/), [Unigram](https://aclanthology.org/P18-1007/) and [WordPiece](https://arxiv.org/abs/1609.08144), and MidiTok also offers data augmentation methods. 18 | 19 | MidiTok is integrated with the Hugging Face Hub 🤗! Don't hesitate to share your models with the community! 20 | 21 | **Documentation:** [miditok.readthedocs.io](https://miditok.readthedocs.io/en/latest/index.html) 22 | 23 | ## Install 24 | 25 | ```shell 26 | pip install miditok 27 | ``` 28 | MidiTok uses [Symusic](https://github.com/Yikai-Liao/symusic) to read and write MIDI and abc files, and BPE/Unigram training is backed by [Hugging Face 🤗tokenizers](https://github.com/huggingface/tokenizers) for superfast encoding. 29 | 30 | ## Usage example 31 | 32 | Tokenizing and detokenizing can be done by calling the tokenizer: 33 | 34 | ```python 35 | from miditok import REMI, TokenizerConfig 36 | from symusic import Score 37 | 38 | # Create a multitrack tokenizer, read the doc to explore all the parameters 39 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 40 | tokenizer = REMI(config) 41 | 42 | # Load a MIDI, convert it to tokens, and convert the tokens back to a MIDI 43 | midi = Score("path/to/your_midi.mid") 44 | tokens = tokenizer(midi) # calling the tokenizer will automatically detect MIDIs, paths and tokens 45 | converted_back_midi = tokenizer(tokens) # PyTorch, TensorFlow and NumPy tensors are supported 46 | ``` 47 | 48 | Here is a complete yet concise example of how you can use MidiTok to train any PyTorch model. And [here](colab-notebooks/Example_HuggingFace_Mistral_Transformer.ipynb) is a simple notebook example showing how to use Hugging Face models to generate music, with MidiTok taking care of tokenizing music files.
49 | 50 | ```python 51 | from miditok import REMI, TokenizerConfig 52 | from miditok.pytorch_data import DatasetMIDI, DataCollator 53 | from miditok.utils import split_files_for_training 54 | from torch.utils.data import DataLoader 55 | from pathlib import Path 56 | 57 | # Create a multitrack tokenizer, read the doc to explore all the parameters 58 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 59 | tokenizer = REMI(config) 60 | 61 | # Train the tokenizer with Byte Pair Encoding (BPE) 62 | files_paths = list(Path("path", "to", "midis").glob("**/*.mid")) 63 | tokenizer.train(vocab_size=30000, files_paths=files_paths) 64 | tokenizer.save(Path("path", "to", "save", "tokenizer.json")) 65 | # And push it to the Hugging Face hub (you can download it back with .from_pretrained) 66 | tokenizer.push_to_hub("username/model-name", private=True, token="your_hf_token") 67 | 68 | # Split MIDIs into smaller chunks for training 69 | dataset_chunks_dir = Path("path", "to", "midi_chunks") 70 | split_files_for_training( 71 | files_paths=files_paths, 72 | tokenizer=tokenizer, 73 | save_dir=dataset_chunks_dir, 74 | max_seq_len=1024, 75 | ) 76 | 77 | # Create a Dataset, a DataLoader and a collator to train a model 78 | dataset = DatasetMIDI( 79 | files_paths=list(dataset_chunks_dir.glob("**/*.mid")), 80 | tokenizer=tokenizer, 81 | max_seq_len=1024, 82 | bos_token_id=tokenizer["BOS_None"], 83 | eos_token_id=tokenizer["EOS_None"], 84 | ) 85 | collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True) 86 | dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator) 87 | 88 | # Iterate over the dataloader to train a model 89 | for batch in dataloader: 90 | print("Train your model on this batch...") 91 | ``` 92 | 93 | ## Tokenizations 94 | 95 | MidiTok implements the following tokenizations (with links to the original papers): 96 | * [REMI](https://dl.acm.org/doi/10.1145/3394171.3413671) 97 | * [REMI+](https://openreview.net/forum?id=NyR8OZFHw6i) 98 | * [MIDI-Like](https://link.springer.com/article/10.1007/s00521-018-3758-9) 99 | * [TSD](https://arxiv.org/abs/2301.11975) 100 | * [Structured](https://arxiv.org/abs/2107.05944) 101 | * [CPWord](https://ojs.aaai.org/index.php/AAAI/article/view/16091) 102 | * [Octuple](https://aclanthology.org/2021.findings-acl.70) 103 | * [MuMIDI](https://dl.acm.org/doi/10.1145/3394171.3413721) 104 | * [MMM](https://arxiv.org/abs/2008.06048) 105 | * [PerTok](https://www.arxiv.org/abs/2410.02060) 106 | 107 | You can find short presentations in the [documentation](https://miditok.readthedocs.io/en/latest/tokenizations.html). 108 | 109 | ## Contributions 110 | 111 | Contributions are gratefully welcomed; feel free to open an issue or send a PR if you want to add a tokenization or speed up the code. You can read the [contribution guide](CONTRIBUTING.md) for details. 112 | 113 | ### Todos 114 | 115 | * Support MusicXML files; 116 | * `no_duration_drums` option, discarding duration tokens for drum notes; 117 | * Control Change messages; 118 | * Speed up global/track events parsing with Rust or C++ bindings. 119 | 120 | ## Citation 121 | 122 | If you use MidiTok for your research, a citation in your manuscript would be greatly appreciated.
❤️ 123 | 124 | [**[MidiTok paper]**](https://arxiv.org/abs/2310.17202) 125 | [**[MidiTok original ISMIR publication]**](https://archives.ismir.net/ismir2021/latebreaking/000005.pdf) 126 | ```bibtex 127 | @inproceedings{miditok2021, 128 | title={{MidiTok}: A Python package for {MIDI} file tokenization}, 129 | author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 130 | booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 131 | year={2021}, 132 | url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 133 | } 134 | ``` 135 | 136 | The BibTeX citations of all tokenizations can be found [in the documentation](https://miditok.readthedocs.io/en/latest/citations.html). 137 | 138 | 139 | ## Acknowledgments 140 | 141 | @Natooz thanks his employers, who allowed him to develop this project: in chronological order, [Aubay](https://blog.aubay.com/index.php/language/en/home/?lang=en), the [LIP6 (Sorbonne University)](https://www.lip6.fr/?LANG=en), and the [Metacreation Lab (Simon Fraser University)](https://www.metacreation.net). 142 | 143 | ## All Thanks To Our Contributors 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This directory contains several benchmarks, measuring in particular encoding-decoding speeds, training speeds and sequence lengths. 4 | They are intended to give you reference points and help you choose your tokenization parameters. 5 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility methods for benchmarks.""" 2 | 3 | from .utils import mean_std_str 4 | 5 | __all__ = [ 6 | "mean_std_str", 7 | ] 8 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/README.md: -------------------------------------------------------------------------------- 1 | # MIDI files reading 2 | 3 | This benchmark measures the read times of MIDI files, comparing [symusic](https://github.com/Yikai-Liao/symusic), [miditoolkit](https://github.com/YatingMusic/miditoolkit) and [pretty_midi](https://github.com/craffel/pretty-midi), the three main Python libraries that parse MIDI files at the note level. 4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * symusic version: 0.4.5 10 | * miditoolkit version: 1.0.1 11 | * pretty_midi version: 0.2.10 12 | 13 | ## Results 14 | 15 | | Library | Maestro | MetaMIDI | POP909 | 16 | |:------------|:----------------|:----------------|:----------------| 17 | | Symusic | 1.06 ± 0.89 ms | 0.37 ± 0.32 ms | 0.20 ± 0.05 ms | 18 | | MidiToolkit | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 19 | | Pretty MIDI | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 20 | 21 | miditoolkit and pretty_midi perform equally on average. The two libraries are very similar, and both rely on [mido](https://github.com/mido/mido) to read and write MIDI messages. 22 | symusic, on the other hand, is respectively 104, 108 and 100 times faster than the other two on the Maestro, MetaMIDI and POP909 datasets.
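To reproduce the comparison on a single file, here is a minimal sketch of the measurement performed (the file path is a placeholder; see `benchmark_midi_read.py` below for the full protocol):

```python
from time import time

from miditoolkit import MidiFile
from pretty_midi import PrettyMIDI
from symusic import Score

midi_path = "path/to/a_file.mid"  # placeholder path

# Time each library parsing the same file once
for name, load in (
    ("Symusic", Score),
    ("MidiToolkit", MidiFile),
    ("Pretty MIDI", lambda p: PrettyMIDI(str(p))),
):
    t0 = time()
    load(midi_path)
    print(f"{name}: {(time() - t0) * 1e3:.2f} ms")
```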
23 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/benchmark_midi_read.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Benchmark for Python MIDI parsing libraries.""" 4 | 5 | from __future__ import annotations 6 | 7 | import random 8 | from pathlib import Path 9 | from time import time 10 | 11 | import numpy as np 12 | from miditoolkit import MidiFile 13 | from pandas import DataFrame 14 | from pretty_midi import PrettyMIDI 15 | from symusic import Score 16 | from tqdm import tqdm 17 | 18 | HERE = Path(__file__).parent 19 | DATASETS = ["Maestro", "MMD", "POP909"] 20 | LIBRARIES = ["Symusic", "MidiToolkit", "Pretty MIDI"] 21 | MAX_NUM_FILES = 1000 22 | 23 | 24 | def read_midi_files( 25 | midi_paths: list[Path], 26 | ) -> tuple[list[float], list[float], list[float]]: 27 | """ 28 | Read a list of MIDI files and return their reading times. 29 | 30 | :param midi_paths: paths to the midi files to read. 31 | :return: times of files reads for symusic, miditoolkit and pretty_midi. 32 | """ 33 | times_mtk, times_sms, times_ptm = [], [], [] 34 | for midi_path in tqdm(midi_paths, desc="Loading MIDIs"): 35 | # We count times only if all libraries load the file without error 36 | try: 37 | # Miditoolkit 38 | t0 = time() 39 | _ = MidiFile(midi_path) 40 | t_mtk = time() - t0 41 | 42 | # Symusic 43 | t0 = time() 44 | _ = Score(midi_path) 45 | t_sms = time() - t0 46 | 47 | # Pretty MIDI 48 | t0 = time() 49 | _ = PrettyMIDI(str(midi_path)) 50 | t_ptm = time() - t0 51 | except: # noqa: E722, S112 52 | continue 53 | 54 | times_mtk.append(t_mtk) 55 | times_sms.append(t_sms) 56 | times_ptm.append(t_ptm) 57 | 58 | return times_sms, times_mtk, times_ptm 59 | 60 | 61 | def benchmark_midi_parsing( 62 | seed: int = 777, 63 | ) -> None: 64 | r""" 65 | Measure the reading time of MIDI files with different libraries. 
66 | 67 | :param seed: random seed 68 | """ 69 | random.seed(seed) 70 | 71 | df = DataFrame(index=LIBRARIES, columns=DATASETS) 72 | 73 | # Record times 74 | for dataset in DATASETS: 75 | midi_paths = list( 76 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 77 | )[:MAX_NUM_FILES] 78 | all_times = read_midi_files(midi_paths) 79 | for library, times in zip(LIBRARIES, all_times): 80 | times_ = np.array(times) 81 | if library == "Symusic": 82 | times_ *= 1e3 83 | unit = "ms" 84 | else: 85 | unit = "sec" 86 | df.at[library, dataset] = ( 87 | f"{np.mean(times_):.2f} ± {np.std(times_):.2f} {unit}" 88 | ) 89 | 90 | df.to_csv(HERE / "midi_read.csv") 91 | df.to_markdown(HERE / "midi_read.md") 92 | df.to_latex(HERE / "midi_read.txt") 93 | 94 | 95 | if __name__ == "__main__": 96 | benchmark_midi_parsing() 97 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,MMD,POP909 2 | Symusic,1.06 ± 0.89 ms,0.37 ± 0.32 ms,0.20 ± 0.05 ms 3 | MidiToolkit,0.11 ± 0.10 sec,0.04 ± 0.04 sec,0.02 ± 0.01 sec 4 | Pretty MIDI,0.11 ± 0.10 sec,0.04 ± 0.04 sec,0.02 ± 0.01 sec 5 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.md: -------------------------------------------------------------------------------- 1 | | | Maestro | MMD | POP909 | 2 | |:------------|:----------------|:----------------|:----------------| 3 | | Symusic | 1.06 ± 0.89 ms | 0.37 ± 0.32 ms | 0.20 ± 0.05 ms | 4 | | MidiToolkit | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 5 | | Pretty MIDI | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 6 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & MMD & POP909 \\ 4 | \midrule 5 | Symusic & 1.06 ± 0.89 ms & 0.37 ± 0.32 ms & 0.20 ± 0.05 ms \\ 6 | MidiToolkit & 0.11 ± 0.10 sec & 0.04 ± 0.04 sec & 0.02 ± 0.01 sec \\ 7 | Pretty MIDI & 0.11 ± 0.10 sec & 0.04 ± 0.04 sec & 0.02 ± 0.01 sec \\ 8 | \bottomrule 9 | \end{tabular} 10 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/README.md: -------------------------------------------------------------------------------- 1 | # MidiTok preprocessing 2 | 3 | This benchmark measures the preprocessing times of MIDI files, performed by MidiTok with the `tokenizer.preprocess_score` method. 
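As a minimal sketch of what is measured here, the preprocessing of a single file can be timed as follows (the file path is a placeholder, and `REMI` stands in for any tokenizer; the full protocol is in `benchmark_preprocess.py` below):

```python
from time import time

from miditok import REMI, TokenizerConfig
from symusic import Score

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
score = Score("path/to/a_file.mid")  # placeholder path

# Time the preprocessing step alone
t0 = time()
tokenizer.preprocess_score(score)
print(f"Preprocessed in {(time() - t0) * 1e3:.2f} ms")
```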
4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * Maximum number of files per dataset for analysis: 1k 10 | * Using tempo, time signature, sustain pedal and pitch bend tokens 11 | 12 | ## Results 13 | 14 | | | symusic version | Maestro - REMI | Maestro - TSD | Maestro - MIDILike | Maestro - Structured | MMD - REMI | MMD - TSD | MMD - MIDILike | MMD - Structured | POP909 - REMI | POP909 - TSD | POP909 - MIDILike | POP909 - Structured | 15 | |:--------------|:------------------|:-----------------|:----------------|:---------------------|:-----------------------|:-------------|:-------------|:-----------------|:-------------------|:----------------|:---------------|:--------------------|:----------------------| 16 | | miditok 3.0.3 | 0.4.5 | 0.64±0.36 ms | 0.62±0.35 ms | 0.47±0.25 ms | 0.46±0.32 ms | 1.55±3.68 ms | 1.54±3.68 ms | 1.40±3.63 ms | 0.40±0.51 ms | 0.32±0.07 ms | 0.30±0.07 ms | 0.24±0.06 ms | 0.16±0.03 ms | 17 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/benchmark_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Measure the average MIDI preprocessing speed.""" 4 | 5 | from __future__ import annotations 6 | 7 | from importlib.metadata import version 8 | from pathlib import Path 9 | from time import time 10 | 11 | import miditok 12 | import numpy as np 13 | from miditok.constants import SCORE_LOADING_EXCEPTION 14 | from pandas import DataFrame, read_csv 15 | from symusic import Score 16 | from tqdm import tqdm 17 | 18 | from benchmarks.utils import mean_std_str 19 | 20 | TOKENIZER_CONFIG_KWARGS = { 21 | "use_tempos": True, 22 | "use_time_signatures": True, 23 | "use_sustain_pedals": True, 24 | "use_pitch_bends": True, 25 | "log_tempos": True, 26 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 27 | "delete_equal_successive_time_sig_changes": True, 28 | "delete_equal_successive_tempo_changes": True, 29 | } 30 | 31 | HERE = Path(__file__).parent 32 | TOKENIZATIONS = ["REMI", "TSD", "MIDILike", "Structured"] 33 | DATASETS = ["Maestro", "MMD", "POP909"] 34 | MAX_NUM_FILES = 1000 35 | 36 | 37 | def benchmark_preprocess() -> None: 38 | r"""Read MIDI files and call `tokenizer.preprocess_score` on them.""" 39 | results_path = HERE / "preprocess.csv" 40 | if results_path.is_file(): 41 | df = read_csv(results_path, index_col=0) 42 | else: 43 | columns = ["symusic version"] + [ 44 | f"{dataset} - {tokenization}" 45 | for dataset in DATASETS 46 | for tokenization in TOKENIZATIONS 47 | ] 48 | df = DataFrame(index=[], columns=columns) 49 | 50 | # Add a row to the dataframe 51 | index_name = f"miditok {version('miditok')}" 52 | df.at[index_name, "symusic version"] = version("symusic") 53 | 54 | for dataset in DATASETS: 55 | files_paths = list( 56 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 57 | )[:MAX_NUM_FILES] 58 | for tokenization in TOKENIZATIONS: 59 | col_name = f"{dataset} - {tokenization}" 60 | tok_config = miditok.TokenizerConfig(**TOKENIZER_CONFIG_KWARGS) 61 | tokenizer = getattr(miditok, tokenization)(tok_config) 62 | 63 | times = [] 64 | for midi_path in tqdm(files_paths): 65 | try: 66 | midi = Score(midi_path) 67 | except SCORE_LOADING_EXCEPTION: 68 | continue 69 | t0 = time() 70 | tokenizer.preprocess_score(midi) 71 | times.append(time() - t0) 72 | 73 | times = np.array(times) * 1e3 74 | df.at[index_name, col_name] = f"{mean_std_str(times, 2)} ms" 75 
| 76 | df.to_csv(HERE / "preprocess.csv") 77 | df.to_markdown(HERE / "preprocess.md") 78 | df.to_latex(HERE / "preprocess.txt") 79 | 80 | 81 | if __name__ == "__main__": 82 | benchmark_preprocess() 83 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.csv: -------------------------------------------------------------------------------- 1 | ,symusic version,Maestro - REMI,Maestro - TSD,Maestro - MIDILike,Maestro - Structured,MMD - REMI,MMD - TSD,MMD - MIDILike,MMD - Structured,POP909 - REMI,POP909 - TSD,POP909 - MIDILike,POP909 - Structured 2 | miditok 3.0.3,0.4.5,0.64±0.36 ms,0.62±0.35 ms,0.47±0.25 ms,0.46±0.32 ms,1.55±3.68 ms,1.54±3.68 ms,1.40±3.63 ms,0.40±0.51 ms,0.32±0.07 ms,0.30±0.07 ms,0.24±0.06 ms,0.16±0.03 ms 3 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.md: -------------------------------------------------------------------------------- 1 | | | symusic version | Maestro - REMI | Maestro - TSD | Maestro - MIDILike | Maestro - Structured | MMD - REMI | MMD - TSD | MMD - MIDILike | MMD - Structured | POP909 - REMI | POP909 - TSD | POP909 - MIDILike | POP909 - Structured | 2 | |:--------------|:------------------|:-----------------|:----------------|:---------------------|:-----------------------|:-------------|:-------------|:-----------------|:-------------------|:----------------|:---------------|:--------------------|:----------------------| 3 | | miditok 3.0.3 | 0.4.5 | 0.64±0.36 ms | 0.62±0.35 ms | 0.47±0.25 ms | 0.46±0.32 ms | 1.55±3.68 ms | 1.54±3.68 ms | 1.40±3.63 ms | 0.40±0.51 ms | 0.32±0.07 ms | 0.30±0.07 ms | 0.24±0.06 ms | 0.16±0.03 ms | 4 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llllllllllllll} 2 | \toprule 3 | & symusic version & Maestro - REMI & Maestro - TSD & Maestro - MIDILike & Maestro - Structured & MMD - REMI & MMD - TSD & MMD - MIDILike & MMD - Structured & POP909 - REMI & POP909 - TSD & POP909 - MIDILike & POP909 - Structured \\ 4 | \midrule 5 | miditok 3.0.3 & 0.4.5 & 0.64±0.36 ms & 0.62±0.35 ms & 0.47±0.25 ms & 0.46±0.32 ms & 1.55±3.68 ms & 1.54±3.68 ms & 1.40±3.63 ms & 0.40±0.51 ms & 0.32±0.07 ms & 0.30±0.07 ms & 0.24±0.06 ms & 0.16±0.03 ms \\ 6 | \bottomrule 7 | \end{tabular} 8 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/README.md: -------------------------------------------------------------------------------- 1 | # Tokenization times 2 | 3 | This benchmark measures the tokenization times of MIDI files from the [Maestro](https://magenta.tensorflow.org/datasets/maestro), [Lakh](https://colinraffel.com/projects/lmd/) and [POP909](https://arxiv.org/abs/2008.07142) datasets. 
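As a minimal sketch of the per-file measurement (the file path is a placeholder, and `REMI` stands in for any of the tokenizations benchmarked; see `benchmark_tokenize.py` below):

```python
from time import time

from miditok import REMI, TokenizerConfig
from symusic import Score

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
score = Score("path/to/a_file.mid")  # placeholder path

# Time the full tokenization of the file
t0 = time()
tokens = tokenizer.encode(score)
print(f"Tokenized in {(time() - t0) * 1e3:.2f} ms")
```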
4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * miditok: v3.0.3 10 | * symusic: v0.4.5 11 | * tokenizers: v0.19.0 12 | * numpy: v1.26.4 13 | 14 | * Maximum number of files per dataset for analysis: 1k 15 | * Using tempo, time signature, sustain pedal and pitch bend tokens 16 | 17 | ## Results 18 | 19 | | | Maestro | MMD | POP909 | 20 | |:-----------|:---------------|:---------------|:--------------| 21 | | REMI | 38.97±32.92 ms | 24.55±52.25 ms | 11.00±7.73 ms | 22 | | TSD | 52.62±41.59 ms | 31.70±73.93 ms | 13.35±7.66 ms | 23 | | MIDILike | 61.75±48.27 ms | 36.28±76.87 ms | 17.77±8.91 ms | 24 | | Structured | 60.38±46.78 ms | 35.85±88.48 ms | 16.56±8.62 ms | 25 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/benchmark_tokenize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Measure the average MIDI tokenization speed.""" 4 | 5 | from __future__ import annotations 6 | 7 | from pathlib import Path 8 | from time import time 9 | 10 | import miditok 11 | import numpy as np 12 | from miditok.constants import SCORE_LOADING_EXCEPTION 13 | from pandas import DataFrame, read_csv 14 | from symusic import Score 15 | from tqdm import tqdm 16 | 17 | from benchmarks import mean_std_str 18 | 19 | TOKENIZER_CONFIG_KWARGS = { 20 | "use_tempos": True, 21 | "use_time_signatures": True, 22 | "use_sustain_pedals": True, 23 | "use_pitch_bends": True, 24 | "log_tempos": True, 25 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 26 | "delete_equal_successive_time_sig_changes": True, 27 | "delete_equal_successive_tempo_changes": True, 28 | } 29 | 30 | HERE = Path(__file__).parent 31 | TOKENIZATIONS = ["REMI", "TSD", "MIDILike", "Structured"] 32 | DATASETS = ["Maestro", "MMD", "POP909"] 33 | MAX_NUM_FILES = 1000 34 | 35 | 36 | def benchmark_tokenize() -> None: 37 | r"""Read MIDI files and tokenize them.""" 38 | results_path = HERE / "tokenize.csv" 39 | if results_path.is_file(): 40 | df = read_csv(results_path, index_col=0) 41 | else: 42 | df = DataFrame(index=TOKENIZATIONS, columns=DATASETS) 43 | 44 | for dataset in DATASETS: 45 | midi_paths = list( 46 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 47 | )[:MAX_NUM_FILES] 48 | for tokenization in TOKENIZATIONS: 49 | tok_config = miditok.TokenizerConfig(**TOKENIZER_CONFIG_KWARGS) 50 | tokenizer = getattr(miditok, tokenization)(tok_config) 51 | 52 | times = [] 53 | for midi_path in tqdm(midi_paths): 54 | try: 55 | midi = Score(midi_path) 56 | except SCORE_LOADING_EXCEPTION: 57 | continue 58 | t0 = time() 59 | tokenizer.encode(midi) 60 | times.append(time() - t0) 61 | 62 | times = np.array(times) * 1e3 63 | df.at[tokenization, dataset] = f"{mean_std_str(times, 2)} ms" 64 | 65 | df.to_csv(HERE / "tokenize.csv") 66 | df.to_markdown(HERE / "tokenize.md") 67 | df.to_latex(HERE / "tokenize.txt") 68 | 69 | 70 | if __name__ == "__main__": 71 | benchmark_tokenize() 72 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,MMD,POP909 2 | REMI,38.97±32.92 ms,24.55±52.25 ms,11.00±7.73 ms 3 | TSD,52.62±41.59 ms,31.70±73.93 ms,13.35±7.66 ms 4 | MIDILike,61.75±48.27 ms,36.28±76.87 ms,17.77±8.91 ms 5 | Structured,60.38±46.78 ms,35.85±88.48 ms,16.56±8.62 ms 6 | 
-------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.md: -------------------------------------------------------------------------------- 1 | | | Maestro | MMD | POP909 | 2 | |:-----------|:---------------|:---------------|:--------------| 3 | | REMI | 38.97±32.92 ms | 24.55±52.25 ms | 11.00±7.73 ms | 4 | | TSD | 52.62±41.59 ms | 31.70±73.93 ms | 13.35±7.66 ms | 5 | | MIDILike | 61.75±48.27 ms | 36.28±76.87 ms | 17.77±8.91 ms | 6 | | Structured | 60.38±46.78 ms | 35.85±88.48 ms | 16.56±8.62 ms | 7 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & MMD & POP909 \\ 4 | \midrule 5 | REMI & 38.97±32.92 ms & 24.55±52.25 ms & 11.00±7.73 ms \\ 6 | TSD & 52.62±41.59 ms & 31.70±73.93 ms & 13.35±7.66 ms \\ 7 | MIDILike & 61.75±48.27 ms & 36.28±76.87 ms & 17.77±8.91 ms \\ 8 | Structured & 60.38±46.78 ms & 35.85±88.48 ms & 16.56±8.62 ms \\ 9 | \bottomrule 10 | \end{tabular} 11 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/README.md: -------------------------------------------------------------------------------- 1 | # Tokenizer training benchmark 2 | 3 | This benchmark aims to measure the training speeds of the different tokenization algorithms, as well as their encoding-decoding speeds, sequence length reduction, and the impact of some other strategies such as splitting the tokens per bars or beats. 4 | 5 | ## Configuration 6 | 7 | ### Hardware 8 | 9 | Apple M1 Pro, 16GB of memory, macOS 14.4.1 10 | 11 | ### Software 12 | 13 | * miditok: v3.0.3 14 | * symusic: v0.4.5 15 | * tokenizers: v0.19.0 16 | * numpy: v1.26.4 17 | 18 | ### Parameters 19 | 20 | * Maximum number of files per dataset for training: 20k 21 | * Maximum number of files per dataset for analysis: 2k 22 | * Using tempo, time signature, rests, sustain pedal and pitch bend tokens 23 | 24 | ## Training times 25 | 26 | ## Splitting ids per bars and beats 27 | 28 | This measures the sequence lengths of the subsequences obtained when splitting the token sequences of whole music files per bar or beat. 29 | 30 | | | Maestro | Lakh | Lakh monotrack | 31 | |:----------------|:------------------|:---------------------|:------------------| 32 | | REMI - bar | 74.7±45.8 (↑ 460) | 107.1±129.6 (↑ 2525) | 12.5±24.1 (↑ 624) | 33 | | REMI - beat | 18.7±13.1 (↑ 190) | 27.4±34.5 (↑ 659) | 3.3±6.6 (↑ 307) | 34 | | TSD - bar | 70.9±44.3 (↑ 456) | 105.7±128.8 (↑ 2521) | 11.2±22.3 (↑ 623) | 35 | | TSD - beat | 17.7±12.7 (↑ 188) | 27.1±34.2 (↑ 658) | 2.9±6.1 (↑ 306) | 36 | | MIDILike - bar | 77.5±45.9 (↑ 461) | 133.7±163.5 (↑ 3154) | 11.7±23.8 (↑ 624) | 37 | | MIDILike - beat | 19.4±12.8 (↑ 183) | 34.2±43.1 (↑ 832) | 3.1±6.5 (↑ 317) | 38 | 39 | Main observation: beat subsequences are relatively short, and on average four times shorter than bar subsequences, as most files have 4/* time signatures. 40 | 41 | ## WordPiece `max_input_chars_per_word` impact 42 | 43 | This analyzes the impact of the `max_input_chars_per_word` parameter of the WordPiece model on training and encoding times. 44 | The vocabulary size used here is 20k.
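As a sketch of the setup benchmarked below, assuming the `train` method accepts a `model` argument selecting WordPiece and forwards `max_input_chars_per_word` to the underlying Hugging Face model (both are assumptions here, check the MidiTok documentation for the exact signature):

```python
from pathlib import Path

from miditok import REMI, TokenizerConfig

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
files_paths = list(Path("path", "to", "midis").glob("**/*.mid"))  # placeholder paths

# Train a WordPiece tokenizer with a 20k vocabulary; `model` and
# `max_input_chars_per_word` are assumed keyword arguments here.
tokenizer.train(
    vocab_size=20000,
    model="WordPiece",
    files_paths=files_paths,
    max_input_chars_per_word=500,
)
```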
45 | 46 | ### Training time 47 | 48 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 49 | |-----:|:-------------------|:--------------------|:---------------------|:---------------------------|:----------------------------|:-----------------------------| 50 | | 100 | 131.9 sec | 88.2 sec | 99.3 sec | 1216.5 sec | 1463.9 sec | 1538.3 sec | 51 | | 200 | 128.4 sec | 88.2 sec | 98.2 sec | 1140.3 sec | 1283.4 sec | 1505.6 sec | 52 | | 500 | 128.1 sec | 86.6 sec | 98.2 sec | 1171.8 sec | 1457.4 sec | 1604.2 sec | 53 | | 1000 | 127.8 sec | 86.4 sec | 97.0 sec | 1131.1 sec | 1390.0 sec | 1620.8 sec | 54 | | 2000 | 128.5 sec | 86.0 sec | 96.7 sec | 1238.1 sec | 1431.2 sec | 1495.7 sec | 55 | | 5000 | 127.1 sec | 85.5 sec | 96.7 sec | 1229.0 sec | 1543.7 sec | 1709.8 sec | 56 | 57 | `max_input_chars_per_word` has almost no impact on the training time. 58 | 59 | ### Encoding time and ratio of "unknown token" 60 | 61 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 62 | |-----:|:--------------------------|:--------------------------|:--------------------------|:---------------------------|:----------------------------|:-----------------------------| 63 | | 100 | 0.0030±0.0022 (1.000 unk) | 0.0195±0.0156 (0.001 unk) | 0.0238±0.0200 (0.000 unk) | 0.0003±0.0004 (0.937 unk) | 0.0026±0.0159 (0.007 unk) | 0.0044±0.0495 (0.007 unk) | 64 | | 200 | 0.0030±0.0022 (1.000 unk) | 0.0416±0.0332 (0.000 unk) | 0.0239±0.0199 (0.000 unk) | 0.0004±0.0005 (0.866 unk) | 0.0027±0.0146 (0.007 unk) | 0.0038±0.0475 (0.007 unk) | 65 | | 500 | 0.0029±0.0022 (1.000 unk) | 0.0443±0.0365 (0.000 unk) | 0.0235±0.0197 (0.000 unk) | 0.0010±0.0016 (0.698 unk) | 0.0029±0.0156 (0.007 unk) | 0.0038±0.0466 (0.007 unk) | 66 | | 1000 | 0.0030±0.0022 (0.999 unk) | 0.0442±0.0366 (0.000 unk) | 0.0236±0.0202 (0.000 unk) | 0.0057±0.0115 (0.513 unk) | 0.0032±0.0165 (0.007 unk) | 0.0039±0.0478 (0.007 unk) | 67 | | 2000 | 0.0037±0.0127 (0.996 unk) | 0.0442±0.0364 (0.000 unk) | 0.0232±0.0194 (0.000 unk) | 0.0405±0.0771 (0.301 unk) | 0.0029±0.0159 (0.007 unk) | 0.0042±0.0475 (0.007 unk) | 68 | | 5000 | 0.1209±0.6198 (0.955 unk) | 0.0440±0.0363 (0.000 unk) | 0.0238±0.0208 (0.000 unk) | 0.3539±0.8183 (0.102 unk) | 0.0034±0.0174 (0.007 unk) | 0.0043±0.0501 (0.007 unk) | 69 | 70 | `max_input_chars_per_word` has, however, a significant negative impact on the encoding time of the token ids. 71 | The ratios of unknown tokens also highlight the **importance of splitting the token ids per bars or beats**. Not doing so results either in a high proportion of unknown tokens for low `max_input_chars_per_word` values, thus losing data integrity, or in very high encoding times for high `max_input_chars_per_word` values.
72 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,Lakh, Lakh monotrack 2 | REMI - bar,74.7±45.8 (↑ 460),107.1±129.6 (↑ 2525),12.5±24.1 (↑ 624) 3 | REMI - beat,18.7±13.1 (↑ 190),27.4±34.5 (↑ 659),3.3±6.6 (↑ 307) 4 | TSD - bar,70.9±44.3 (↑ 456),105.7±128.8 (↑ 2521),11.2±22.3 (↑ 623) 5 | TSD - beat,17.7±12.7 (↑ 188),27.1±34.2 (↑ 658),2.9±6.1 (↑ 306) 6 | MIDILike - bar,77.5±45.9 (↑ 461),133.7±163.5 (↑ 3154),11.7±23.8 (↑ 624) 7 | MIDILike - beat,19.4±12.8 (↑ 183),34.2±43.1 (↑ 832),3.1±6.5 (↑ 317) 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.md: -------------------------------------------------------------------------------- 1 | | | Maestro | Lakh | Lakh monotrack | 2 | |:----------------|:------------------|:---------------------|:------------------| 3 | | REMI - bar | 74.7±45.8 (↑ 460) | 107.1±129.6 (↑ 2525) | 12.5±24.1 (↑ 624) | 4 | | REMI - beat | 18.7±13.1 (↑ 190) | 27.4±34.5 (↑ 659) | 3.3±6.6 (↑ 307) | 5 | | TSD - bar | 70.9±44.3 (↑ 456) | 105.7±128.8 (↑ 2521) | 11.2±22.3 (↑ 623) | 6 | | TSD - beat | 17.7±12.7 (↑ 188) | 27.1±34.2 (↑ 658) | 2.9±6.1 (↑ 306) | 7 | | MIDILike - bar | 77.5±45.9 (↑ 461) | 133.7±163.5 (↑ 3154) | 11.7±23.8 (↑ 624) | 8 | | MIDILike - beat | 19.4±12.8 (↑ 183) | 34.2±43.1 (↑ 832) | 3.1±6.5 (↑ 317) | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & Lakh & Lakh monotrack \\ 4 | \midrule 5 | REMI - bar & 74.7±45.8 (↑ 460) & 107.1±129.6 (↑ 2525) & 12.5±24.1 (↑ 624) \\ 6 | REMI - beat & 18.7±13.1 (↑ 190) & 27.4±34.5 (↑ 659) & 3.3±6.6 (↑ 307) \\ 7 | TSD - bar & 70.9±44.3 (↑ 456) & 105.7±128.8 (↑ 2521) & 11.2±22.3 (↑ 623) \\ 8 | TSD - beat & 17.7±12.7 (↑ 188) & 27.1±34.2 (↑ 658) & 2.9±6.1 (↑ 306) \\ 9 | MIDILike - bar & 77.5±45.9 (↑ 461) & 133.7±163.5 (↑ 3154) & 11.7±23.8 (↑ 624) \\ 10 | MIDILike - beat & 19.4±12.8 (↑ 183) & 34.2±43.1 (↑ 832) & 3.1±6.5 (↑ 317) \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.csv: -------------------------------------------------------------------------------- 1 | ,Maestro no-split,Maestro bar-split,Maestro beat-split,Lakh multitrack no-split,Lakh multitrack bar-split,Lakh multitrack beat-split 2 | 100,0.0030±0.0022 (1.000 unk),0.0195±0.0156 (0.001 unk),0.0238±0.0200 (0.000 unk),0.0003±0.0004 (0.937 unk),0.0026±0.0159 (0.007 unk),0.0044±0.0495 (0.007 unk) 3 | 200,0.0030±0.0022 (1.000 unk),0.0416±0.0332 (0.000 unk),0.0239±0.0199 (0.000 unk),0.0004±0.0005 (0.866 unk),0.0027±0.0146 (0.007 unk),0.0038±0.0475 (0.007 unk) 4 | 500,0.0029±0.0022 (1.000 unk),0.0443±0.0365 (0.000 unk),0.0235±0.0197 (0.000 unk),0.0010±0.0016 (0.698 unk),0.0029±0.0156 (0.007 unk),0.0038±0.0466 (0.007 unk) 5 | 1000,0.0030±0.0022 (0.999 unk),0.0442±0.0366 (0.000 unk),0.0236±0.0202 (0.000 unk),0.0057±0.0115 (0.513 unk),0.0032±0.0165 (0.007 unk),0.0039±0.0478 (0.007 unk) 6 | 2000,0.0037±0.0127 (0.996 unk),0.0442±0.0364 (0.000 unk),0.0232±0.0194 (0.000 unk),0.0405±0.0771 (0.301 unk),0.0029±0.0159 (0.007 
unk),0.0042±0.0475 (0.007 unk) 7 | 5000,0.1209±0.6198 (0.955 unk),0.0440±0.0363 (0.000 unk),0.0238±0.0208 (0.000 unk),0.3539±0.8183 (0.102 unk),0.0034±0.0174 (0.007 unk),0.0043±0.0501 (0.007 unk) 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.md: -------------------------------------------------------------------------------- 1 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 2 | |-----:|:--------------------------|:--------------------------|:--------------------------|:---------------------------|:----------------------------|:-----------------------------| 3 | | 100 | 0.0030±0.0022 (1.000 unk) | 0.0195±0.0156 (0.001 unk) | 0.0238±0.0200 (0.000 unk) | 0.0003±0.0004 (0.937 unk) | 0.0026±0.0159 (0.007 unk) | 0.0044±0.0495 (0.007 unk) | 4 | | 200 | 0.0030±0.0022 (1.000 unk) | 0.0416±0.0332 (0.000 unk) | 0.0239±0.0199 (0.000 unk) | 0.0004±0.0005 (0.866 unk) | 0.0027±0.0146 (0.007 unk) | 0.0038±0.0475 (0.007 unk) | 5 | | 500 | 0.0029±0.0022 (1.000 unk) | 0.0443±0.0365 (0.000 unk) | 0.0235±0.0197 (0.000 unk) | 0.0010±0.0016 (0.698 unk) | 0.0029±0.0156 (0.007 unk) | 0.0038±0.0466 (0.007 unk) | 6 | | 1000 | 0.0030±0.0022 (0.999 unk) | 0.0442±0.0366 (0.000 unk) | 0.0236±0.0202 (0.000 unk) | 0.0057±0.0115 (0.513 unk) | 0.0032±0.0165 (0.007 unk) | 0.0039±0.0478 (0.007 unk) | 7 | | 2000 | 0.0037±0.0127 (0.996 unk) | 0.0442±0.0364 (0.000 unk) | 0.0232±0.0194 (0.000 unk) | 0.0405±0.0771 (0.301 unk) | 0.0029±0.0159 (0.007 unk) | 0.0042±0.0475 (0.007 unk) | 8 | | 5000 | 0.1209±0.6198 (0.955 unk) | 0.0440±0.0363 (0.000 unk) | 0.0238±0.0208 (0.000 unk) | 0.3539±0.8183 (0.102 unk) | 0.0034±0.0174 (0.007 unk) | 0.0043±0.0501 (0.007 unk) | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lllllll} 2 | \toprule 3 | & Maestro no-split & Maestro bar-split & Maestro beat-split & Lakh multitrack no-split & Lakh multitrack bar-split & Lakh multitrack beat-split \\ 4 | \midrule 5 | 100 & 0.0030±0.0022 (1.000 unk) & 0.0195±0.0156 (0.001 unk) & 0.0238±0.0200 (0.000 unk) & 0.0003±0.0004 (0.937 unk) & 0.0026±0.0159 (0.007 unk) & 0.0044±0.0495 (0.007 unk) \\ 6 | 200 & 0.0030±0.0022 (1.000 unk) & 0.0416±0.0332 (0.000 unk) & 0.0239±0.0199 (0.000 unk) & 0.0004±0.0005 (0.866 unk) & 0.0027±0.0146 (0.007 unk) & 0.0038±0.0475 (0.007 unk) \\ 7 | 500 & 0.0029±0.0022 (1.000 unk) & 0.0443±0.0365 (0.000 unk) & 0.0235±0.0197 (0.000 unk) & 0.0010±0.0016 (0.698 unk) & 0.0029±0.0156 (0.007 unk) & 0.0038±0.0466 (0.007 unk) \\ 8 | 1000 & 0.0030±0.0022 (0.999 unk) & 0.0442±0.0366 (0.000 unk) & 0.0236±0.0202 (0.000 unk) & 0.0057±0.0115 (0.513 unk) & 0.0032±0.0165 (0.007 unk) & 0.0039±0.0478 (0.007 unk) \\ 9 | 2000 & 0.0037±0.0127 (0.996 unk) & 0.0442±0.0364 (0.000 unk) & 0.0232±0.0194 (0.000 unk) & 0.0405±0.0771 (0.301 unk) & 0.0029±0.0159 (0.007 unk) & 0.0042±0.0475 (0.007 unk) \\ 10 | 5000 & 0.1209±0.6198 (0.955 unk) & 0.0440±0.0363 (0.000 unk) & 0.0238±0.0208 (0.000 unk) & 0.3539±0.8183 (0.102 unk) & 0.0034±0.0174 (0.007 unk) & 0.0043±0.0501 (0.007 unk) \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- 
/benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.csv: -------------------------------------------------------------------------------- 1 | ,Maestro no-split,Maestro bar-split,Maestro beat-split,Lakh multitrack no-split,Lakh multitrack bar-split,Lakh multitrack beat-split 2 | 100,131.9 sec,88.2 sec,99.3 sec,1216.5 sec,1463.9 sec,1538.3 sec 3 | 200,128.4 sec,88.2 sec,98.2 sec,1140.3 sec,1283.4 sec,1505.6 sec 4 | 500,128.1 sec,86.6 sec,98.2 sec,1171.8 sec,1457.4 sec,1604.2 sec 5 | 1000,127.8 sec,86.4 sec,97.0 sec,1131.1 sec,1390.0 sec,1620.8 sec 6 | 2000,128.5 sec,86.0 sec,96.7 sec,1238.1 sec,1431.2 sec,1495.7 sec 7 | 5000,127.1 sec,85.5 sec,96.7 sec,1229.0 sec,1543.7 sec,1709.8 sec 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.md: -------------------------------------------------------------------------------- 1 | |      | Maestro no-split   | Maestro bar-split   | Maestro beat-split   | Lakh multitrack no-split   | Lakh multitrack bar-split   | Lakh multitrack beat-split   | 2 | |-----:|:-------------------|:--------------------|:---------------------|:---------------------------|:----------------------------|:-----------------------------| 3 | |  100 | 131.9 sec          | 88.2 sec            | 99.3 sec             | 1216.5 sec                 | 1463.9 sec                  | 1538.3 sec                   | 4 | |  200 | 128.4 sec          | 88.2 sec            | 98.2 sec             | 1140.3 sec                 | 1283.4 sec                  | 1505.6 sec                   | 5 | |  500 | 128.1 sec          | 86.6 sec            | 98.2 sec             | 1171.8 sec                 | 1457.4 sec                  | 1604.2 sec                   | 6 | | 1000 | 127.8 sec          | 86.4 sec            | 97.0 sec             | 1131.1 sec                 | 1390.0 sec                  | 1620.8 sec                   | 7 | | 2000 | 128.5 sec          | 86.0 sec            | 96.7 sec             | 1238.1 sec                 | 1431.2 sec                  | 1495.7 sec                   | 8 | | 5000 | 127.1 sec          | 85.5 sec            | 96.7 sec             | 1229.0 sec                 | 1543.7 sec                  | 1709.8 sec                   | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lllllll} 2 | \toprule 3 |  & Maestro no-split & Maestro bar-split & Maestro beat-split & Lakh multitrack no-split & Lakh multitrack bar-split & Lakh multitrack beat-split \\ 4 | \midrule 5 | 100 & 131.9 sec & 88.2 sec & 99.3 sec & 1216.5 sec & 1463.9 sec & 1538.3 sec \\ 6 | 200 & 128.4 sec & 88.2 sec & 98.2 sec & 1140.3 sec & 1283.4 sec & 1505.6 sec \\ 7 | 500 & 128.1 sec & 86.6 sec & 98.2 sec & 1171.8 sec & 1457.4 sec & 1604.2 sec \\ 8 | 1000 & 127.8 sec & 86.4 sec & 97.0 sec & 1131.1 sec & 1390.0 sec & 1620.8 sec \\ 9 | 2000 & 128.5 sec & 86.0 sec & 96.7 sec & 1238.1 sec & 1431.2 sec & 1495.7 sec \\ 10 | 5000 & 127.1 sec & 85.5 sec & 96.7 sec & 1229.0 sec & 1543.7 sec & 1709.8 sec \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | """Utils methods for benchmarks.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | 8 | def mean_std_str( 9 |     dist: np.ndarray | list[int | float], num_dec: int = 2, latex_pm: bool = False 10 | ) -> str: 11 |     r""" 12 |     Create a nice-looking mean and standard deviation string of a distribution. 13 | 14 |     :param dist: distribution to measure. 15 |     :param num_dec: number of decimals to keep. (default: ``2``) 16 |     :param latex_pm: whether to represent the "±" symbol with LaTeX command ("$\pm$").
17 |         (default: ``False``) 18 |     :return: string of the average and standard deviation of the distribution. 19 |     """ 20 |     if not isinstance(dist, np.ndarray): 21 |         dist = np.array(dist) 22 |     mean, std = float(np.mean(dist)), float(np.std(dist)) 23 |     if latex_pm: 24 |         return f"{mean:.{num_dec}f}" r"$\pm$" f"{std:.{num_dec}f}"  # noqa: ISC001 25 |     return f"{mean:.{num_dec}f}±{std:.{num_dec}f}" 26 | -------------------------------------------------------------------------------- /colab-notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Colab Notebooks 2 | 3 | In this directory you will find Notebooks using MidiTok, from which you can take inspiration for your own projects. 4 | 5 | For beginners, we recommend browsing the **Example Hugging Face** notebook. You'll find an up-to-date, concise and complete example of training a Transformer to generate music. 6 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS    ?= 7 | SPHINXBUILD   ?= sphinx-build 8 | SOURCEDIR     = . 9 | BUILDDIR      = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/additional_tokens_table.csv: -------------------------------------------------------------------------------- 1 | Tokenization,Tempo,Time signature,Chord,Rest,Sustain pedal,Pitch bend,Pitch interval 2 | MIDILike,✅,✅,✅,✅,✅,✅,✅ 3 | REMI,✅,✅,✅,✅,✅,✅,✅ 4 | TSD,✅,✅,✅,✅,✅,✅,✅ 5 | Structured,❌,❌,❌,❌,❌,❌,❌ 6 | CPWord,✅,✅¹,✅,✅¹,❌,❌,❌ 7 | Octuple,✅,✅²,❌,❌,❌,❌,❌ 8 | MuMIDI,✅,❌,✅,❌,❌,❌,❌ 9 | MMM,✅,✅,✅,❌,✅,✅,✅ 10 | -------------------------------------------------------------------------------- /docs/assets/Octuple_TS_Rest/original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/Octuple_TS_Rest/original.png -------------------------------------------------------------------------------- /docs/assets/Octuple_TS_Rest/tokenized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/Octuple_TS_Rest/tokenized.png -------------------------------------------------------------------------------- /docs/assets/bases/pianoroll_daw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/pianoroll_daw.png -------------------------------------------------------------------------------- /docs/assets/bases/sheet_music.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/sheet_music.png -------------------------------------------------------------------------------- /docs/assets/bases/spectrogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/spectrogram.png -------------------------------------------------------------------------------- /docs/assets/cp_word.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/cp_word.png -------------------------------------------------------------------------------- /docs/assets/embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/embeddings.png -------------------------------------------------------------------------------- /docs/assets/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/favicon.png -------------------------------------------------------------------------------- /docs/assets/midi_like.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_like.png -------------------------------------------------------------------------------- /docs/assets/midi_preprocessing_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_preprocessing_original.png -------------------------------------------------------------------------------- /docs/assets/midi_preprocessing_preprocessed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_preprocessing_preprocessed.png -------------------------------------------------------------------------------- /docs/assets/miditok_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/miditok_logo.png -------------------------------------------------------------------------------- /docs/assets/miditok_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/assets/miditok_logo_stroke.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/miditok_logo_stroke.png -------------------------------------------------------------------------------- /docs/assets/mumidi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/mumidi.png -------------------------------------------------------------------------------- /docs/assets/music_sheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/music_sheet.png -------------------------------------------------------------------------------- /docs/assets/octuple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/octuple.png -------------------------------------------------------------------------------- /docs/assets/pitch_intervals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/pitch_intervals.png -------------------------------------------------------------------------------- /docs/assets/pitch_intervals_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/pitch_intervals_original.png -------------------------------------------------------------------------------- /docs/assets/remi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/remi.png -------------------------------------------------------------------------------- /docs/assets/remiplus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/remiplus.png -------------------------------------------------------------------------------- /docs/assets/structured.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/structured.png -------------------------------------------------------------------------------- /docs/assets/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/transformer.png -------------------------------------------------------------------------------- /docs/assets/tsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/tsd.png -------------------------------------------------------------------------------- /docs/attribute_controls.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Attribute Controls 3 | ======================== 4 | 5 | Attribute Controls are special tokens that allow a model to be trained to control music generation during inference. They work at either the track level or the bar level, and specify attributes featured by the tracks or bars they precede. When placed at the beginning of each bar or track in the token sequence, they lead a *causal* model to condition the prediction of the following tokens on them.
At inference, these attribute control tokens can be strategically placed at the beginning of new tracks or bars in order to condition the generated results. 6 | 7 | Attribute controls are not compatible with "multi-vocabulary" (e.g. Octuple) or multitrack "one token stream" tokenizers. 8 | 9 | To train tokenizers and models with attribute control tokens, you can use the :class:`miditok.TokTrainingIterator` and :class:`miditok.pytorch_data.DatasetMIDI` respectively. 10 | 11 | .. automodule:: miditok.attribute_controls 12 |     :members: 13 | 14 | Using custom attribute controls 15 | ------------------------------- 16 | 17 | You can easily add your own attribute controls to an existing tokenizer using the :py:func:`miditok.MusicTokenizer.add_attribute_control` method. Your attribute control must subclass either the :class:`miditok.attribute_controls.AttributeControl` (track-level) or the :class:`miditok.attribute_controls.BarAttributeControl` class, and implement the attribute computation method. 18 | -------------------------------------------------------------------------------- /docs/citations.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Citations 3 | ================= 4 | 5 | Here you will find BibTeX citations of the original works presenting these tokenizations. 6 | 7 | MidiTok 8 | ------------------------ 9 | 10 | .. code-block:: bib 11 | 12 |     @inproceedings{miditok2021, 13 |         title={{MidiTok}: A Python package for {MIDI} file tokenization}, 14 |         author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 15 |         booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 16 |         year={2021}, 17 |         url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 18 |     } 19 | 20 | Tokenizer Training / Byte Pair Encoding / TSD 21 | --------------------------------------------- 22 | 23 | .. code-block:: bib 24 | 25 |     @inproceedings{fradet-etal-2023-byte, 26 |         title = "Byte Pair Encoding for Symbolic Music", 27 |         author = "Fradet, Nathan and 28 |           Gutowski, Nicolas and 29 |           Chhel, Fabien and 30 |           Briot, Jean-Pierre", 31 |         editor = "Bouamor, Houda and 32 |           Pino, Juan and 33 |           Bali, Kalika", 34 |         booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing", 35 |         month = dec, 36 |         year = "2023", 37 |         address = "Singapore", 38 |         publisher = "Association for Computational Linguistics", 39 |         url = "https://aclanthology.org/2023.emnlp-main.123", 40 |         doi = "10.18653/v1/2023.emnlp-main.123", 41 |         pages = "2001--2020", 42 |     } 43 | 44 | REMI (Pop Music Transformer) 45 | ---------------------------- 46 | 47 | ..
code-block:: bib 48 | 49 | @inproceedings{huang_remi_2020, 50 | author = {Huang, Yu-Siang and Yang, Yi-Hsuan}, 51 | title = {Pop Music Transformer: Beat-Based Modeling and Generation of Expressive Pop Piano Compositions}, 52 | year = {2020}, 53 | isbn = {9781450379885}, 54 | publisher = {Association for Computing Machinery}, 55 | address = {New York, NY, USA}, 56 | url = {https://doi.org/10.1145/3394171.3413671}, 57 | doi = {10.1145/3394171.3413671}, 58 | booktitle = {Proceedings of the 28th ACM International Conference on Multimedia}, 59 | pages = {1180–1188}, 60 | numpages = {9}, 61 | keywords = {transformer, neural sequence model, automatic music composition}, 62 | location = {Seattle, WA, USA}, 63 | series = {MM '20} 64 | } 65 | 66 | MIDI-Like (This Time with feeling) 67 | ---------------------------------- 68 | 69 | .. code-block:: bib 70 | 71 | @article{oore_midilike_2018, 72 | author={Sageev Oore and Ian Simon and Sander Dieleman and Douglas Eck and Karen Simonyan}, 73 | title={This Time with Feeling: Learning Expressive Musical Performance}, 74 | journal={Neural Computing and Applications}, 75 | volume={32}, 76 | year={2018}, 77 | pages={955–967}, 78 | url={https://link.springer.com/article/10.1007/s00521-018-3758-9}, 79 | publisher={Springer} 80 | } 81 | 82 | Structured (Piano Inpainting Application) 83 | ----------------------------------------- 84 | 85 | .. code-block:: bib 86 | 87 | @misc{pia2021hadjeres, 88 | title={The Piano Inpainting Application}, 89 | author={Gaëtan Hadjeres and Léopold Crestel}, 90 | year={2021}, 91 | eprint={2107.05944}, 92 | archivePrefix={arXiv}, 93 | primaryClass={cs.SD}, 94 | url={https://arxiv.org/abs/2107.05944}, 95 | } 96 | 97 | CPWord (Compound Word Transformer) 98 | ---------------------------------- 99 | 100 | .. code-block:: bib 101 | 102 | @article{cpword2021, 103 | title={Compound Word Transformer: Learning to Compose Full-Song Music over Dynamic Directed Hypergraphs}, 104 | volume={35}, 105 | url={https://ojs.aaai.org/index.php/AAAI/article/view/16091}, 106 | DOI={10.1609/aaai.v35i1.16091}, 107 | number={1}, 108 | journal={Proceedings of the AAAI Conference on Artificial Intelligence}, 109 | author={Hsiao, Wen-Yi and Liu, Jen-Yu and Yeh, Yin-Cheng and Yang, Yi-Hsuan}, 110 | year={2021}, 111 | month={May}, 112 | pages={178-186} 113 | } 114 | 115 | Octuple (MusicBERT) 116 | ------------------------ 117 | 118 | .. code-block:: bib 119 | 120 | @inproceedings{zeng2021musicbert, 121 | title = "{M}usic{BERT}: Symbolic Music Understanding with Large-Scale Pre-Training", 122 | author = "Zeng, Mingliang and Tan, Xu and Wang, Rui and Ju, Zeqian and Qin, Tao and Liu, Tie-Yan", 123 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 124 | month = aug, 125 | year = "2021", 126 | address = "Online", 127 | publisher = "Association for Computational Linguistics", 128 | url = "https://aclanthology.org/2021.findings-acl.70", 129 | doi = "10.18653/v1/2021.findings-acl.70", 130 | pages = "791--800", 131 | } 132 | 133 | MuMIDI (PopMAG) 134 | ------------------------ 135 | 136 | .. 
code-block:: bib 137 | 138 | @inproceedings{popmag2020, 139 | author = {Ren, Yi and He, Jinzheng and Tan, Xu and Qin, Tao and Zhao, Zhou and Liu, Tie-Yan}, 140 | title = {PopMAG: Pop Music Accompaniment Generation}, 141 | year = {2020}, 142 | isbn = {9781450379885}, 143 | publisher = {Association for Computing Machinery}, 144 | url = {https://arxiv.org/abs/2008.07703}, 145 | doi = {10.1145/3394171.3413721}, 146 | abstract = {"MuMIDI encoding, similar to CP. 147 | Generates multitrack music, filling every track tokens in a single sequence}, 148 | booktitle = {Proceedings of the 28th ACM International Conference on Multimedia}, 149 | pages = {1198–1206}, 150 | numpages = {9}, 151 | keywords = {accompaniment generation, music representation, music generation, sequence-to-sequence model, pop music}, 152 | location = {Seattle, WA, USA} 153 | } 154 | 155 | MMM (Multi-Track Music Machine) 156 | -------------------------------- 157 | 158 | .. code-block:: bib 159 | 160 | @misc{ens2020mmm, 161 | title={MMM : Exploring Conditional Multi-Track Music Generation with the Transformer}, 162 | author={Jeff Ens and Philippe Pasquier}, 163 | year={2020}, 164 | eprint={2008.06048}, 165 | archivePrefix={arXiv}, 166 | primaryClass={cs.SD} 167 | } 168 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for the Sphinx documentation builder. 3 | 4 | For the full list of built-in configuration values, see the documentation: 5 | https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | """ 7 | 8 | import sys 9 | import tomllib 10 | from pathlib import Path 11 | 12 | sys.path.insert(0, str(Path("..").resolve() / "src")) 13 | 14 | # -- Project information ----------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 16 | 17 | project = "MidiTok" 18 | copyright = "2024, Nathan Fradet" # noqa: A001 19 | author = "Nathan Fradet" 20 | 21 | 22 | with (Path(__file__).parent.parent / "pyproject.toml").open("rb") as f: 23 | data = tomllib.load(f) 24 | version = data["project"]["version"] 25 | 26 | # The language for content autogenerated by Sphinx. Refer to documentation 27 | # for a list of supported languages. 28 | # 29 | # This is also used if you do content translation via gettext catalogs. 30 | # Usually you set "language" from the command line for these cases. 
31 | language = "en" 32 | 33 | # -- General configuration --------------------------------------------------- 34 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 35 | 36 | extensions = [ 37 |     "sphinx_copybutton", 38 |     "sphinx.ext.duration", 39 |     "sphinx.ext.doctest", 40 |     "sphinx.ext.autodoc", 41 |     "sphinx.ext.autosummary", 42 |     "sphinx.ext.autosectionlabel", 43 |     # "sphinxcontrib.tikz", 44 | ] 45 | 46 | templates_path = ["_templates"] 47 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 51 | 52 | html_theme = "furo" 53 | html_title = "MidiTok's docs" 54 | html_logo = "assets/miditok_logo_stroke.png" 55 | html_favicon = "assets/favicon.png" 56 | # tikz_proc_suite = "GhostScript"  # required for readthedocs, produce png, not svg 57 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Tokenizer Configuration 3 | ======================= 4 | 5 | MidiTok's tokenizers can be customized with a wide variety of options, and most of the preprocessing and downsampling steps can be tailored to your specifications. 6 | 7 | Tokenizer config 8 | ------------------------ 9 | 10 | All tokenizers are initialized with common parameters, which are held in a :class:`miditok.TokenizerConfig` object, documented below. A tokenizer's configuration can be accessed with ``tokenizer.config``. 11 | Some tokenizers might take additional specific arguments/parameters when being created. 12 | 13 | .. autoclass:: miditok.TokenizerConfig 14 |     :members: 15 | 16 | 17 | How MidiTok handles time 18 | ---------------------------- 19 | 20 | MidiTok handles time by resampling the music file's time division (time resolution) to a new resolution determined by the ``beat_res`` attribute of :class:`miditok.TokenizerConfig`. This argument determines which time tokens are present in the vocabulary. 21 | 22 | It allows creating ``Duration`` and ``TimeShift`` tokens with different resolutions depending on their values. It is common to use higher resolutions for short time durations (i.e. short values will be represented with greater accuracy) and lower resolutions for longer time values (which generally do not need to be represented with great accuracy). 23 | The values of these tokens take the form of a tuple: ``(num_beats, num_samples, resolution)``. For instance, the time value of the token ``(2, 3, 8)`` corresponds to 2 beats and 3/8 of a beat. ``(2, 2, 4)`` corresponds to 2 beats and half of a beat (2.5 beats). 24 | 25 | For position-based tokenizers, the number of ``Position`` tokens in the vocabulary is equal to the maximum resolution found in ``beat_res``. 26 | 27 | An example of the downsampling applied by MidiTok during the preprocessing is shown below. 28 | 29 | .. figure:: /assets/midi_preprocessing_original.png 30 |    :alt: Original MIDI file 31 |    :width: 800 32 | 33 |    Original MIDI file from the `Maestro dataset `_ with a 4/4 time signature. The numbers at the top indicate the bar number (125) followed by the beat number within the bar. 34 | 35 | .. figure:: /assets/midi_preprocessing_preprocessed.png 36 |    :alt: Downsampled MIDI file. 37 |    :width: 800 38 | 39 |    MIDI file with time downsampled to 8 samples per beat.
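A short example of specifying ``beat_res`` is shown below; it reuses the values of the code examples of this documentation, with 8 samples per beat for times between 0 and 4 beats, and 4 samples per beat for times between 4 and 12 beats:

.. code-block:: python

    from miditok import TokenizerConfig

    # Keys are (start_beat, end_beat) ranges, values are samples per beat.
    # Short times (0-4 beats) get a finer resolution than longer ones (4-12).
    config = TokenizerConfig(beat_res={(0, 4): 8, (4, 12): 4})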
40 | 41 | Additional tokens 42 | ------------------------ 43 | 44 | MidiTok can include additional tokens representing other musical information. You can specify them in the ``tokenizer_config`` argument (:class:`miditok.TokenizerConfig`) when creating a tokenizer. The :class:`miditok.TokenizerConfig` documentation details the role of each of them, along with their associated parameters. 45 | 46 | .. csv-table:: Compatibility table of tokenizations and additional tokens. 47 |    :file: additional_tokens_table.csv 48 |    :header-rows: 1 49 | 50 | ¹: using both time signatures and rests with :class:`miditok.CPWord` might result in time alterations, as the time signature changes are carried with the Bar tokens which can be skipped during periods of rests. 51 | ²: using time signatures with :class:`miditok.Octuple` might result in time alterations, as the time signature changes are carried with the note onsets. An example is shown below. 52 | 53 | Additionally, **Velocity** and **Duration** tokens are optional and enabled by default for all tokenizers. 54 | 55 | .. image:: /assets/Octuple_TS_Rest/original.png 56 |    :width: 800 57 |    :alt: Original MIDI sample preprocessed / downsampled 58 | 59 | .. image:: /assets/Octuple_TS_Rest/tokenized.png 60 |    :width: 800 61 |    :alt: MIDI sample after being tokenized, the time has been shifted to a bar during the time signature change 62 | 63 | Below is an example of how pitch intervals would be tokenized, with a ``max_pitch_interval`` of 15. 64 | 65 | .. image:: /assets/pitch_intervals.png 66 |    :width: 800 67 |    :alt: Schema of the pitch intervals over a piano-roll 68 | 69 | 70 | Special tokens 71 | ------------------------ 72 | 73 | MidiTok can include special tokens in the vocabulary. These tokens carry no "musical" information and are used for training purposes. 74 | To use special tokens, you must specify them with the ``special_tokens`` argument when creating a tokenizer. By default, this argument is set to ``["PAD", "BOS", "EOS", "MASK"]``. Their meanings are: 75 | 76 | * **PAD** (``PAD_None``): a padding token to use when training a model with batches of sequences of unequal lengths. The padding token id is often set to 0. If you use Hugging Face models, be sure to pad inputs with this token, and pad labels with *-100*. 77 | * **BOS** (``BOS_None``): "Start Of Sequence" token, indicating that a token sequence is beginning. 78 | * **EOS** (``EOS_None``): "End Of Sequence" token, indicating that a token sequence is ending. For autoregressive generation, this token can be used to stop the generation. 79 | * **MASK** (``MASK_None``): a masking token, to use when pre-training a (bidirectional) model with a self-supervised objective like `BERT `_. 80 | 81 | **Note:** you can use the ``tokenizer.special_tokens`` property to get the list of the special tokens of a tokenizer, and ``tokenizer.special_tokens_ids`` for their ids. 82 | -------------------------------------------------------------------------------- /docs/data_augmentation.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Data augmentation 3 | ======================== 4 | 5 | Data augmentation is a technique to artificially increase the size of a dataset by applying various transformations to the existing data. These transformations consist of altering one or several attributes of the original data. In the context of images, they can include operations such as rotation, scaling, cropping or color adjustments.
This is trickier in the case of natural language, where the meaning of a sentence can easily diverge depending on how the text is modified, but some techniques such as paraphrase generation or back-translation can serve this purpose. 6 | 7 | The purpose of data augmentation is to introduce variability and diversity into the training data without collecting additional real-world data. Data augmentation can improve a model's learning and generalization, as it exposes the model to a wider range of variations and patterns present in the data. In turn, it can increase its robustness and decrease overfitting. 8 | 9 | MidiTok can perform data augmentation at both the MIDI level and the token level. Transformations can be made by shifting the values of the velocities and durations of notes, or by shifting their pitches by octaves. Data augmentation is highly recommended when training a model, as it helps the model learn the global and local harmony of music. In large datasets such as the `Lakh `_ or `Meta MIDI `_ datasets, MIDI files can have various ranges of velocity, duration values, and pitch. By augmenting the data, thus creating more diversified samples, a model can better generalize and learn the melody, harmony and other musical features rather than specific recurrent token successions. 10 | 11 | .. automodule:: miditok.data_augmentation 12 |     :members: 13 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Code examples 3 | ================= 4 | 5 | Create a tokenizer 6 | ------------------------ 7 | 8 | A basic example showing how to create a tokenizer, with a selection of custom parameters. 9 | 10 | ..  code-block:: python 11 | 12 |     from miditok import REMI, TokenizerConfig  # here we choose to use REMI 13 | 14 |     # Our parameters 15 |     TOKENIZER_PARAMS = { 16 |         "pitch_range": (21, 109), 17 |         "beat_res": {(0, 4): 8, (4, 12): 4}, 18 |         "num_velocities": 32, 19 |         "special_tokens": ["PAD", "BOS", "EOS", "MASK"], 20 |         "use_chords": True, 21 |         "use_rests": False, 22 |         "use_tempos": True, 23 |         "use_time_signatures": False, 24 |         "use_programs": False, 25 |         "num_tempos": 32,  # number of tempo bins 26 |         "tempo_range": (40, 250),  # (min, max) 27 |     } 28 |     config = TokenizerConfig(**TOKENIZER_PARAMS) 29 | 30 |     # Create the tokenizer 31 |     tokenizer = REMI(config) 32 | 33 | MIDI - Tokens conversion 34 | ------------------------------- 35 | 36 | Here we convert a MIDI file to tokens, and decode them back to a MIDI file. 37 | 38 | ..  code-block:: python 39 | 40 |     from pathlib import Path 41 | 42 |     # Tokenize a MIDI file 43 |     tokens = tokenizer(Path("to", "your_midi.mid"))  # automatically detects Score objects, paths, tokens 44 | 45 |     # Convert to MIDI and save it 46 |     generated_midi = tokenizer(tokens)  # MidiTok can handle PyTorch/Numpy/Tensorflow tensors 47 |     generated_midi.dump_midi(Path("to", "decoded_midi.mid")) 48 | 49 | 50 | Train a tokenizer with BPE 51 | ----------------------------- 52 | 53 | Here we train the tokenizer with :ref:`Byte Pair Encoding (BPE)`. 54 | BPE reduces the lengths of the token sequences, in turn improving model efficiency, as well as the quality of the results/model performance. 55 | 56 | ..
code-block:: python 57 | 58 |     from miditok import REMI 59 |     from pathlib import Path 60 | 61 |     # Create the tokenizer and list the file paths 62 |     tokenizer = REMI()  # using default parameters (constants.py) 63 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 64 | 65 |     # Build the vocabulary with BPE 66 |     tokenizer.train(vocab_size=30000, files_paths=midi_paths) 67 | 68 | 69 | Prepare a dataset before training 70 | ------------------------------------------- 71 | 72 | MidiTok provides useful methods to split music files into smaller chunks that contain approximately a target number of tokens, allowing you to use most of your data to train and evaluate models. It also provides data augmentation methods to increase the amount of data to train models. 73 | 74 | ..  code-block:: python 75 | 76 |     from random import shuffle 77 | 78 |     from miditok.data_augmentation import augment_dataset 79 |     from miditok.utils import split_files_for_training 80 | 81 |     # Split the dataset into train/valid/test subsets, with 15% of the data for each of the two latter 82 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 83 |     total_num_files = len(midi_paths) 84 |     num_files_valid = round(total_num_files * 0.15) 85 |     num_files_test = round(total_num_files * 0.15) 86 |     shuffle(midi_paths) 87 |     midi_paths_valid = midi_paths[:num_files_valid] 88 |     midi_paths_test = midi_paths[num_files_valid:num_files_valid + num_files_test] 89 |     midi_paths_train = midi_paths[num_files_valid + num_files_test:] 90 | 91 |     # Chunk MIDIs and perform data augmentation on each subset independently 92 |     for files_paths, subset_name in ( 93 |         (midi_paths_train, "train"), (midi_paths_valid, "valid"), (midi_paths_test, "test") 94 |     ): 95 | 96 |         # Split the MIDIs into chunks of approximately 1024 tokens 97 |         subset_chunks_dir = Path(f"dataset_{subset_name}") 98 |         split_files_for_training( 99 |             files_paths=files_paths, 100 |             tokenizer=tokenizer, 101 |             save_dir=subset_chunks_dir, 102 |             max_seq_len=1024, 103 |             num_overlap_bars=2, 104 |         ) 105 | 106 |         # Perform data augmentation 107 |         augment_dataset( 108 |             subset_chunks_dir, 109 |             pitch_offsets=[-12, 12], 110 |             velocity_offsets=[-4, 4], 111 |             duration_offsets=[-0.5, 0.5], 112 |         ) 113 | 114 | Create a Dataset and collator for training 115 | ------------------------------------------- 116 | 117 | Create a Dataset and a collator to be used with a PyTorch ``DataLoader`` to train a model. 118 | 119 | ..  code-block:: python 120 | 121 |     from miditok import REMI 122 |     from miditok.pytorch_data import DatasetMIDI, DataCollator 123 |     from torch.utils.data import DataLoader 124 | 125 |     tokenizer = REMI()  # using default parameters (constants.py) 126 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 127 |     dataset = DatasetMIDI( 128 |         files_paths=midi_paths, 129 |         tokenizer=tokenizer, 130 |         max_seq_len=1024, 131 |         bos_token_id=tokenizer["BOS_None"], 132 |         eos_token_id=tokenizer["EOS_None"], 133 |     ) 134 |     collator = DataCollator(tokenizer.pad_token_id) 135 |     data_loader = DataLoader(dataset=dataset, collate_fn=collator) 136 | 137 |     # Using the data loader in the training loop 138 |     for batch in data_loader: 139 |         print("Train your model on this batch...") 140 | 141 | 142 | Tokenize a dataset 143 | ------------------------ 144 | 145 | Here we tokenize a whole dataset into JSON files storing the token ids. 146 | We also perform data augmentation on the pitch, velocity and duration dimensions. 147 | 148 | ..
code-block:: python 149 | 150 |     from miditok import REMI 151 |     from miditok.data_augmentation import augment_dataset 152 |     from pathlib import Path 153 | 154 |     # Create the tokenizer and list the file paths 155 |     tokenizer = REMI()  # using default parameters (constants.py) 156 |     data_path = Path("path", "to", "dataset") 157 | 158 |     # A validation method to discard MIDIs we do not want 159 |     # It can also be used for custom pre-processing, for instance if you want to merge 160 |     # some tracks before tokenizing a MIDI file 161 |     def midi_valid(midi) -> bool: 162 |         if any(ts.numerator != 4 for ts in midi.time_signature_changes): 163 |             return False  # time signature different from 4/*, 4 beats per bar 164 |         return True 165 | 166 |     # Perform data augmentation on one pitch octave (up and down), velocities and 167 |     # durations 168 |     midi_aug_path = Path("to", "new", "location", "augmented") 169 |     augment_dataset( 170 |         data_path, 171 |         pitch_offsets=[-12, 12], 172 |         velocity_offsets=[-4, 5], 173 |         duration_offsets=[-0.5, 1], 174 |         out_path=midi_aug_path, 175 |     ) 176 |     tokenizer.tokenize_dataset(  # tokenize the augmented dataset 177 |         midi_aug_path, 178 |         Path("path", "to", "tokens"), 179 |         midi_valid, 180 |     ) 181 | -------------------------------------------------------------------------------- /docs/hf_hub.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Hugging Face Hub 3 | ======================== 4 | 5 | What is the Hugging Face hub 6 | --------------------------------- 7 | 8 | The `Hugging Face Hub `_ is a model and dataset sharing platform which is widely used in the AI community. It allows you to freely upload, share and download models and datasets, directly from your code in a very convenient way. Its interactions rely on an open-source Python package named `huggingface_hub `_. As it works seamlessly in the Hugging Face ecosystem, especially the `Transformers `_ or `Diffusers `_ libraries, it stood out and became one of the preferred ways to openly share and download models. 9 | 10 | When downloading a Transformer model, you will also need to download its associated tokenizer to be able to "dialog" with it. Likewise, if you want to share one of your models, you will need to share its tokenizer too for people to be able to use it. MidiTok allows you to push and download tokenizers in a similar way to what is done in the Hugging Face Transformers library. 11 | 12 | How MidiTok interoperates with the hub 13 | ------------------------------------------ 14 | 15 | Internally, MidiTok relies on the ``huggingface_hub.ModelHubMixin`` component. It implements the same methods commonly used in the Hugging Face ecosystem. Note that: 16 | 17 | * :py:func:`miditok.MusicTokenizer.save_pretrained` is equivalent to calling :py:func:`miditok.MusicTokenizer.save_params`; 18 | * :py:func:`miditok.MusicTokenizer.from_pretrained` can be used to load tokenizers whether from the Hugging Face hub or from a file on your local filesystem; 19 | * for :py:func:`miditok.MusicTokenizer.save_pretrained` and :py:func:`miditok.MusicTokenizer.push_to_hub`, you can ignore the ``config`` argument which is meant to be used with models (not applicable for tokenizers); 20 | * you can give a ``filename`` keyword argument with the :py:func:`miditok.MusicTokenizer.save_pretrained` and :py:func:`miditok.MusicTokenizer.from_pretrained` methods to use a specific tokenizer configuration file name, otherwise the default one will be used (``tokenizer.json``).
21 | 22 | .. autofunction:: miditok.MusicTokenizer.from_pretrained 23 |     :noindex: 24 | 25 | .. autofunction:: miditok.MusicTokenizer.save_pretrained 26 |     :noindex: 27 | 28 | .. autofunction:: miditok.MusicTokenizer.push_to_hub 29 |     :noindex: 30 | 31 | Example 32 | ------------------------ 33 | 34 | ..  code-block:: python 35 | 36 |     from pathlib import Path 37 |     from miditok import REMI 38 | 39 |     tokenizer = REMI()  # using default parameters (constants.py) 40 |     hf_token = "your_hf_token"  # to create on huggingface.co 41 | 42 |     # Train the tokenizer with BPE 43 |     tokenizer.train( 44 |         vocab_size=30000, 45 |         files_paths=list(Path("path", "to", "midis").glob("**/*.mid")), 46 |     ) 47 | 48 |     # Push the tokenizer to the HF hub 49 |     tokenizer.push_to_hub("YourUserName/model-name", private=True, token=hf_token) 50 | 51 |     # Recreate it from the configuration saved on the hub 52 |     tokenizer2 = REMI.from_pretrained("YourUserName/model-name", token=hf_token) 53 |     assert tokenizer == tokenizer2 54 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MidiTok documentation master file, created by 2 |    sphinx-quickstart on Sat Feb  4 20:52:11 2023. 3 |    You can adapt this file completely to your liking, but it should at least 4 |    contain the root `toctree` directive. 5 | 6 | Welcome to MidiTok's documentation! 7 | ========================================= 8 | 9 | .. image:: /assets/miditok_logo_stroke.png 10 |   :width: 600 11 |   :alt: 12 | 13 | **MidiTok** is a Python package for MIDI file tokenization, introduced at the ISMIR 2021 LBDs `(paper) `_. 14 | It tokenizes symbolic music files (MIDI, abc), i.e. converts them into sequences of tokens ready to be fed to models such as Transformers, for any generation, transcription or MIR task. 15 | MidiTok features most known MIDI :ref:`tokenizations`, and is built around the idea that they all share common methods. Tokenizers can be trained with BPE, Unigram or WordPiece (:ref:`Training a tokenizer`) and be pushed to and pulled from the Hugging Face hub! 16 | 17 | Installation 18 | ================== 19 | 20 | ..  code-block:: bash 21 | 22 |     pip install miditok 23 | 24 | MidiTok uses `symusic `_ to read and write MIDI files, and tokenizer training is backed by the `Hugging Face 🤗tokenizers `_ for super fast encoding. 25 | 26 | Citation 27 | ================== 28 | 29 | If you use MidiTok for your research, a citation in your manuscript would be gladly appreciated. ❤️ 30 | 31 | You can also find in this documentation BibTeX :ref:`citations` of related research works. 32 | 33 | ..  code-block:: bib 34 | 35 |     @inproceedings{miditok2021, 36 |         title={{MidiTok}: A Python package for {MIDI} file tokenization}, 37 |         author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 38 |         booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 39 |         year={2021}, 40 |         url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 41 |     } 42 | 43 | Contents 44 | ================== 45 | 46 | .. toctree:: 47 |    :maxdepth: 2 48 |    :caption: Bases of Music and AI 49 | 50 |    music_formats 51 |    midi 52 |    sequential_models 53 | 54 | ..
toctree:: 55 |    :maxdepth: 2 56 |    :caption: MidiTok 57 | 58 |    tokenizing_music_with_miditok 59 |    configuration 60 |    tokenizations 61 |    attribute_controls 62 |    train 63 |    hf_hub 64 |    pytorch_data 65 |    data_augmentation 66 |    utils 67 | 68 | .. toctree:: 69 |    :maxdepth: 2 70 |    :caption: Others 71 | 72 |    examples 73 |    citations 74 | 75 | .. toctree:: 76 |    :hidden: 77 |    :caption: Project Links 78 | 79 |    GitHub 80 |    PyPi 81 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | 	set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | 	echo. 16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | 	echo.installed, then set the SPHINXBUILD environment variable to point 18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | 	echo.may add the Sphinx directory to PATH. 20 | 	echo. 21 | 	echo.If you don't have Sphinx installed, grab it from 22 | 	echo.https://www.sphinx-doc.org/ 23 | 	exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/midi.rst: -------------------------------------------------------------------------------- 1 | .. _midi-protocol-label: 2 | 3 | =================================== 4 | The MIDI protocol 5 | =================================== 6 | 7 | MIDI, standing for *Musical Instrument Digital Interface*, is a digital communication protocol standard in the music sector. It describes the protocol itself, the physical connector to transmit the protocol between devices, and a digital file format. 8 | A MIDI file stores MIDI messages as a symbolic music file. It is the most abundant file format among available music datasets. 9 | 10 | History of MIDI 11 | ----------------------------- 12 | 13 | MIDI first appeared in the early eighties, when digital instrument manufacturers needed a digital protocol for communication between devices such as synthesizers and computers. It was standardized in 1983 by the first specifications, and is currently maintained by the `MIDI Manufacturers Association `_\. Since then, `new specifications `_ have been made, the two major ones, and still the norm today, being General MIDI 1 (GM1) and General MIDI 2 (GM2). These specifications aim to guide manufacturers in designing digital music devices compatible with those of other manufacturers, by making sure they implement the protocol following the same recommendations. 14 | 15 | The MIDI protocol can represent **notes, tempos, time signatures, key signatures, instruments (called programs) and effects (called controls) such as sustain pedal, pitch bend or modulation.** 16 | MIDI is an event-based protocol. It consists of a series of messages, which can occur in multiple channels. Each message is composed of two key pieces of information: 1) the delta time, expressed as the distance in ticks from the previous event (in the same channel), which represents its position in time; 2) a series of bytes representing its content.
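As an illustration, a single *Note On* message can be assembled from its bytes as follows. This is a minimal sketch of the byte layout described above, independent of any MIDI library:

.. code-block:: python

    channel = 0  # channels are 0-indexed in the status byte (displayed as 1-16)
    status = 0x90 | channel  # 0x9n means "Note On" on channel n
    pitch, velocity = 60, 100  # middle C, played moderately loud
    note_on = bytes([status, pitch, velocity])  # a 3-byte voice message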
17 | 18 | The latest evolution of the MIDI protocol is the MIDI Polyphonic Expression (shortly called MPE). This new norm allows manufacturers to create MIDI devices on which a specific channel is assigned to each note, allowing the user to apply pitch bend and modulation to each key independently. These devices are typically built with touch-sensitive keys. The MIDI Manufacturers Association released the complete `specifications `_ in March 2018. 19 | 20 | 21 | MIDI Messages 22 | ----------------------------- 23 | 24 | A message expresses an event or a piece of information. It takes the form of a series of bytes. The first is the Status byte, which specifies the type of message and the channel, followed by one or two data bytes which contain the information. All the messages and their meanings are described in the GM1 and GM2 specifications. The most important are: 25 | 26 | - *Note On*: a note is being played, specifies its pitch and velocity; 27 | - *Note Off*: a note is released, specifies the note (by its pitch) to stop and the velocity; 28 | - *Time Signature Change*: indicates the current time signature; 29 | - *Tempo Change*: indicates the current tempo; 30 | - *Program Change*: specifies the current instrument being played; 31 | - *Control Change*: a control parameter is modified or applied. The modulation wheel, foot sustain pedal, volume control or bank select are for instance effects transcribed into Control Change messages. 32 | 33 | Note that these messages are "voice messages", which means that each of them is applied within a channel that is specified in its status byte. The MIDI protocol handles up to sixteen channels, which allows connecting multiple devices playing and communicating simultaneously. Channel 10 is reserved for drums, a specific "program" in which the pitch values correspond to drum sounds like kicks, snares, or hi-hats. 34 | 35 | Time in MIDI 36 | ----------------------------- 37 | 38 | Time in MIDI is determined by its **time division**, which is a clock signal expressed in **ticks per quarter note** (tpq), and can be seen as a time resolution. Common time division values are 384, 480 and 960 tpq, as they are divisible by 3, 4, 6 and 8, which are common time signature numerators and denominators. 39 | The time division can also be set in ticks per second, but this option is rarely encountered, as it makes less sense to use seconds when the tempo and time signature are known in MIDI. 40 | The time division is the first information that can be read at the beginning of a file, and a MIDI file can only have one time division. 41 | 42 | The number of ticks per bar and ticks per beat can be calculated from the MIDI's time division (:math:`time_{div}`) and the current time signature (:math:`\frac{ts_{num}}{ts_{denom}}`): 43 | 44 | - :math:`tpbeat = time_{div} \times \frac{4}{ts_{denom}}` 45 | - :math:`tpbar = tpbeat \times ts_{num}` 46 | 47 | Hence, for a :math:`\frac{4}{4}` time signature, the number of ticks per beat is equal to the time division (as a beat is equal to a quarter note) and the number of ticks per bar is equal to four times the number of ticks per beat.
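These formulas translate directly into code. The sketch below uses illustrative values: a 480 tpq time division and a 6/8 time signature:

.. code-block:: python

    time_div = 480  # ticks per quarter note
    ts_num, ts_denom = 6, 8  # a 6/8 time signature

    ticks_per_beat = time_div * 4 // ts_denom  # 480 * 4 / 8 = 240
    ticks_per_bar = ticks_per_beat * ts_num  # 240 * 6 = 1440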
48 | -------------------------------------------------------------------------------- /docs/music_formats.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Music formats 3 | =================================== 4 | 5 | This page introduces the two representations of music and the symbolic music file formats. It aims to present the basic differences between audio and symbolic music, in order to better understand how they can be used with AI models, without going too much into detail; more comprehensive references are attached for that purpose. 6 | 7 | Music: symbolic and audio 8 | --------------------------- 9 | 10 | Music is a unique modality in the way that it can take two different forms: symbolic and audio. 11 | 12 | Symbolic music represents successions of notes, arranged in time along with other musical elements such as tempos and time signatures, typically found in Western music notation. The `sheet music `_ is the historical handwritten or printed representation of music that shows the notes on staves from left to right and up and down, with the time and key signatures indicated at the beginning. 13 | 14 | .. image:: /assets/bases/sheet_music.png 15 | :width: 800 16 | :alt: Sheet music. 17 | 18 | The `pianoroll `_ is another symbolic representation which consists of a two-axis grid, with one axis for time and one for note pitches. It was originally used in player pianos, and is now used in most `Digital Audio Workstation (DAW) `_ software to show the notes and other effects of a track. 19 | 20 | .. image:: /assets/bases/pianoroll_daw.png 21 | :width: 800 22 | :alt: A piano roll view in the Logic Pro X DAW. 23 | 24 | Audio, on the other hand, represents the *physical* form of music, i.e. a sound signal, more specifically vibrations propagating in a material. Audio music is usually represented as waveforms (time domain) or spectrograms (frequency domain). 25 | 26 | A waveform is strictly the amplitude of a sound as a function of time. In the real world, a waveform is purely continuous. A digital audio waveform as found in audio files such as mp3s will feature a sampling frequency which indicates the number of samples per second used to represent this waveform. This time resolution is usually at least 44.1k samples per second, following the `Nyquist–Shannon theorem `_ . 27 | 28 | A sound, whether from an instrument, a human voice or a music arrangement, is a superposition of many periodic frequencies, defined by their wavelength, amplitude and phase. A spectrogram depicts the intensity in dB of the frequencies as a function of time. It provides a representation of these frequencies which is useful when analyzing sound. It can be computed with a `Fourier Transform `_ , usually a `Short Time Fourier Transform (STFT) `_ . 29 | 30 | .. image:: /assets/bases/spectrogram.png 31 | :width: 800 32 | :alt: The spectrogram of a sound, abscissa is time, ordinate is frequency and the color represents the intensity in dB. 33 | 34 | Symbolic music can be seen as both discrete and continuous, as it represents discrete notes that nevertheless feature "continuous-like" attributes, potentially with a high time resolution (in samples per beat or other specific time duration). **For this reason, it is more commonly used with discrete sequential models** (which we introduce in :ref:`sequential-models-label`), **by being represented as sequences of tokens**, which is the purpose of MidiTok.
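For illustration, the first note of a melody could be serialized into token strings such as the following (a hypothetical excerpt: the exact tokens, including the duration format shown here, depend on the tokenizer and its configuration):

.. code-block:: python

    tokens = [
        "Bar_None",        # a new bar begins
        "Position_0",      # position of the note within the bar
        "Pitch_60",        # middle C
        "Velocity_95",     # how loud the note is played
        "Duration_1.0.8",  # how long the note lasts
    ]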
Pianoroll has also been used with `Convolutional Neural Networks (CNNs) `_ in past works (e.g. `MuseGan `_ ) but is now uncommon due to the limitations it imposes on the representation of musical elements. 35 | 36 | On the other hand, audio is by nature a continuous modality, as it represents the waveform of the sound itself. From a practical point of view, modeling raw waveforms with neural networks is often intractable due to the high time resolution of audio, despite works that managed to do it (`WaveNet `_ , `Jukebox `_ ). For this reason, audio has more commonly been formatted as spectrograms when used with neural networks, and used with CNNs as it conveniently takes the form of a 2-dimensional matrix with distinct continuous patterns, like images. 37 | Research in neural audio codecs has made it possible to "compress" audio waveforms into a reduced number of discrete values, allowing waveforms to be used as sequences of tokens with discrete models such as Transformers. For more details, see `SoundStream `_ and `EnCodec `_ which are respectively used with `MusicLM `_ and `MusicGen `_ . 38 | 39 | 40 | Symbolic music file formats 41 | ----------------------------- 42 | 43 | There are three major file formats for symbolic music: MIDI, ABC and MusicXML. **MidiTok supports MIDI and ABC files.** 44 | 45 | MIDI, standing for *Musical Instrument Digital Interface*, is a digital communication protocol standard in the music sector. It covers the protocol itself, the physical connector used to transmit it between devices, and a digital file format. 46 | A MIDI file stores MIDI messages as a symbolic music file. It is the most abundant file format among available music datasets. It is also the most comprehensive and versatile file format for symbolic music; as such we present it in more detail in :ref:`midi-protocol-label`. 47 | 48 | 49 | The ABC notation is a notation for symbolic music, and a file format with the extension ``abc``. Its simplicity has made it widely used to write and share traditional and folk tunes from Western Europe. 50 | Each tune begins with a few lines indicating its title, time signature, default note length, key and others. Lines following the key represent the notes. A note is indicated by its letter, followed by a ``/x`` or ``x`` to respectively divide or multiply its length by ``x`` :math:`\in \mathbb{N}^{\star}` compared to the default note length. An upper case (e.g., A) means a pitch one octave below that of a lower case (a). 51 | 52 | MusicXML is an open file format and music notation. Based on the XML file format, it is structured with the same item hierarchy. An example is shown below. 53 | 54 | .. code-block:: xml 55 | 56 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> 57 | <!DOCTYPE score-partwise PUBLIC "-//Recordare//DTD MusicXML 4.0 Partwise//EN" 58 | "http://www.musicxml.org/dtds/partwise.dtd"> 59 | <score-partwise version="4.0"> 60 | <part-list> 61 | <score-part id="P1"> 62 | <part-name>Music</part-name> 63 | </score-part> 64 | </part-list> 65 | <part id="P1"> 66 | <measure number="1"> 67 | <attributes> 68 | <divisions>1</divisions> 69 | <key><fifths>0</fifths></key> 70 | <time><beats>4</beats><beat-type>4</beat-type></time> 71 | <clef><sign>G</sign><line>2</line></clef> 72 | </attributes> 73 | <note> 74 | <pitch><step>C</step><octave>4</octave></pitch> 75 | <duration>4</duration> 76 | <type>whole</type> 77 | </note> 78 | </measure> 79 | </part> 80 | </score-partwise> 81 | 82 | 83 | The ``part-list`` references the parts, which are then written within ``part`` tags. A ``measure`` is defined with its attributes, followed by notes and their attributes. 84 | The common file extensions are ``.mxl`` and ``.musicxml``. 85 | -------------------------------------------------------------------------------- /docs/pytorch_data.rst: -------------------------------------------------------------------------------- 1 | ================================= 2 | Using MidiTok with Pytorch 3 | ================================= 4 | 5 | MidiTok features PyTorch `Dataset `_ objects to load music data during training, usually coupled with a PyTorch ``DataLoader``.
A ``Dataset`` is an object storing information about a dataset: the paths of the files to load, or the data itself stored in memory (recommended for small datasets only). 6 | When indexed, the ``Dataset`` will output dictionaries with values corresponding to the inputs and labels. 7 | 8 | Loading data 9 | -------------------------- 10 | 11 | MidiTok provides two dataset classes: :class:`miditok.pytorch_data.DatasetMIDI` and :class:`miditok.pytorch_data.DatasetJSON`. 12 | 13 | :class:`miditok.pytorch_data.DatasetMIDI` loads MIDI files and can either tokenize them on the fly when the dataset is indexed, or pre-tokenize them at creation time and keep the token ids in memory. **For most use cases, this Dataset should fulfill your needs and is recommended.** 14 | 15 | :class:`miditok.pytorch_data.DatasetJSON` loads JSON files containing token ids. It requires the dataset to be tokenized beforehand. This dataset is only compatible with JSON files saved as "one token stream" (``tokenizer.one_token_stream``). In order to use it for all the tracks of a multi-stream tokenizer, you will need to save each track's token sequence as a separate JSON file. 16 | 17 | Preparing data 18 | -------------------------- 19 | 20 | When training a model, you will likely want to limit the token sequence length in order not to run out of memory. The dataset classes handle such cases and can trim the token sequences. However, **it is not uncommon for a single MIDI to be tokenized into sequences of several thousand tokens, depending on its duration and number of notes. In such cases, using only the first portion of the token sequence would considerably reduce the amount of data used to train and test a model.** 21 | 22 | To handle such cases, MidiTok provides the :py:func:`miditok.pytorch_data.split_files_for_training` method to dynamically split MIDI files into chunks that should be tokenized into approximately the number of tokens you want. 23 | If you cannot fit most of your MIDIs into single usable token sequences, we recommend splitting your dataset with this method. 24 | 25 | Data loading example 26 | -------------------------- 27 | 28 | MidiTok also provides an "all-in-one" data collator, :class:`miditok.pytorch_data.DataCollator`, to be used with a PyTorch ``DataLoader`` in order to pad batches and create attention masks. 29 | Here is a complete example showing how to use this module to train any model. 30 | 31 | ..
code-block:: python 32 | 33 | from miditok import REMI, TokenizerConfig 34 | from miditok.pytorch_data import DatasetMIDI, DataCollator, split_files_for_training 35 | from torch.utils.data import DataLoader 36 | from pathlib import Path 37 | 38 | # Creating a multitrack tokenizer configuration, read the doc to explore other parameters 39 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 40 | tokenizer = REMI(config) 41 | 42 | # Train the tokenizer with Byte Pair Encoding (BPE) 43 | midi_paths = list(Path("path", "to", "midis").glob("**/*.mid")) 44 | tokenizer.train(vocab_size=30000, files_paths=midi_paths) 45 | tokenizer.save_params(Path("path", "to", "save", "tokenizer.json")) 46 | # And pushing it to the Hugging Face hub (you can download it back with .from_pretrained) 47 | tokenizer.push_to_hub("username/model-name", private=True, token="your_hf_token") 48 | 49 | # Split MIDIs into smaller chunks for training 50 | dataset_chunks_dir = Path("path", "to", "midi_chunks") 51 | split_files_for_training( 52 | files_paths=midi_paths, 53 | tokenizer=tokenizer, 54 | save_dir=dataset_chunks_dir, 55 | max_seq_len=1024, 56 | ) 57 | 58 | # Create a Dataset, a DataLoader and a collator to train a model 59 | dataset = DatasetMIDI( 60 | files_paths=list(dataset_chunks_dir.glob("**/*.mid")), 61 | tokenizer=tokenizer, 62 | max_seq_len=1024, 63 | bos_token_id=tokenizer["BOS_None"], 64 | eos_token_id=tokenizer["EOS_None"], 65 | ) 66 | collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True) 67 | dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator) 68 | 69 | # Iterate over the dataloader to train a model 70 | for batch in dataloader: 71 | print("Train your model on this batch...") 72 | 73 | **Note:** This module is imported only if ``torch`` is installed in your Python environment. 74 | 75 | .. automodule:: miditok.pytorch_data 76 | :members: 77 | -------------------------------------------------------------------------------- /docs/sequential_models.rst: -------------------------------------------------------------------------------- 1 | .. _sequential-models-label: 2 | 3 | =================================== 4 | Sequential models and tokens 5 | =================================== 6 | 7 | This page introduces the basic concepts of sequential models, often called "language models" as they are commonly used for natural language, which can be trained on music data with MidiTok. 8 | 9 | 10 | Sequential models 11 | ---------------------------- 12 | 13 | We qualify as a sequential model any model that takes as input **sequences of discrete elements**. `RNN `_\, `Long Short Term Memory (LSTM) `_ and `Transformers `_ fall into this category. As a general rule, the operation of these models, noted :math:`p_\theta`, can be formulated as :math:`p_\theta (\mathbf{x}) = y` where :math:`\mathbf{x} \in \mathbb{N}^n` is a sequence of :math:`n` elements (integers here) and :math:`y` can either be a scalar or a sequence. The common feature of these models is that :math:`y` **is conditioned on all the elements from** :math:`\mathbf{x}`. 14 | 15 | .. _transformer-label: 16 | 17 | .. figure:: /assets/transformer.png 18 | :alt: Schema of a Transformer model 19 | :class: with-shadow 20 | :width: 500px 21 | 22 | Schema of a "seq2seq" Transformer model. 23 | 24 | A sequential model can be "seq2seq", "encoder-only" or "decoder-only". 25 | seq2seq means that the model is composed of an encoder and a decoder.
The model's encoder processes an input sequence into intermediate **hidden states**, which condition the decoder that **autoregressively** generates the output sequence. This architecture is commonly used for translation tasks, where the input sequence is in one language and the decoder generates its translation in another one. 26 | 27 | In a seq2seq configuration, the encoder is usually **bi-directional**, meaning that all the output hidden states are conditioned on all the input elements, whereas the decoder is **causal**, meaning that the logits of a position :math:`t` are conditioned only on the input elements at positions :math:`\leq t`, i.e. the previous ones. 28 | 29 | An encoder-only model (e.g. `BERT `_\) is better suited for non-generative tasks, e.g. classification. On the other hand, a decoder-only model is usually designed to generate content. As each position is conditioned on the previous ones, the model is usually trained with **teacher forcing** to predict the next element. Consequently, it can be used to generate content **autoregressively**, i.e. one element after another over :math:`n` iterations, by appending the element generated at a given iteration to the input sequence of the next one. 30 | 31 | 32 | Tokens and vocabulary 33 | ---------------------------- 34 | 35 | This section focuses more specifically on the nature of the inputs of the models. 36 | 37 | Until now, we referred to the sequences as holding "elements" representing discrete attributes of the data. These elements are commonly called **tokens**, and **are fed to a model as integers**. For natural language, these tokens can represent words or parts of words. A sentence can then be tokenized into a sequence of tokens representing the words and punctuation. For symbolic music, tokens can represent the values of the note attributes (pitch, velocity, duration) or time events. The conversion of raw data to tokens is done by a **tokenizer**, which reads it and serializes it into sequences of tokens from its vocabulary. 38 | 39 | The **vocabulary** of a tokenizer is the finite set of all distinct known tokens. For natural language, it represents the set of words, subwords, punctuation and unicode characters. **Each token is associated to a unique id**, its index in the vocabulary, which is fed to a model. A vocabulary is usually (as in MidiTok) a dictionary acting as a lookup table linking tokens (their text forms) to their ids (integer form). 40 | 41 | 42 | Embeddings 43 | ---------------------------- 44 | 45 | This section introduces the notion of embedding, sometimes called *embedding vector* or *word embedding*. 46 | 47 | Vocabularies are often made of thousands of tokens, each of them having a whole variety of meanings. In order for a sequential model to efficiently process them, it must be able to capture their semantic information and features. This step is handled thanks to **embeddings**. 48 | 49 | An embedding :math:`\mathbf{e} \in \mathbb{R}^d` is a vector of :math:`d` dimensions, which represents the semantic information of the associated token. The embeddings are **contextually learned** by the model during training, meaning their positions are adjusted according to the context in which they are found in the data. Embeddings with similar semantics/meanings will be closer in the **continuous embedding space** of the model than embeddings with no related meanings. They offer the model a way to capture the semantics of tokens across these dimensions. 50 | 51 | ..
figure:: /assets/embeddings.png 52 | :alt: Embedding space. 53 | :class: with-shadow 54 | :width: 500px 55 | 56 | Visualization of an embedding space reduced to 2 dimensions with `TSNE `_\. 57 | 58 | The embeddings are actually the real input of a sequential model. Each token id acts as an index for the model's embedding matrix. In :ref:`transformer-label`, the first operation consists of indexing this matrix with the token ids to get their embeddings, which are then processed by the model. 59 | 60 | MidiTok allows you to leverage the features of model embeddings by training the tokenizer (:ref:`training-tokenizer-label`). 61 | -------------------------------------------------------------------------------- /docs/tokenizations.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Tokenizations 3 | ================= 4 | 5 | This page details the tokenizations featured by MidiTok. They inherit from :class:`miditok.MusicTokenizer`; see its documentation to learn how to use the common methods. For each of them, the token equivalent of the lead sheet below is shown. 6 | 7 | .. image:: /assets/music_sheet.png 8 | :width: 800 9 | :alt: Music sheet example 10 | 11 | REMI 12 | ------------------------ 13 | 14 | .. image:: /assets/remi.png 15 | :width: 800 16 | :alt: REMI sequence, time is tracked with Bar and Position tokens 17 | 18 | .. autoclass:: miditok.REMI 19 | :show-inheritance: 20 | 21 | REMIPlus 22 | ------------------------ 23 | 24 | REMI+ is an extended version of :ref:`REMI` (Huang and Yang) for general multi-track, multi-signature symbolic music sequences, introduced in `FIGARO (Rütte et al.) `_, which handles multiple instruments by adding ``Program`` tokens before the ``Pitch`` ones. 25 | 26 | You can get the REMI+ tokenization by using the :ref:`REMI` tokenizer with ``config.use_programs``, ``config.one_token_stream_for_programs`` and ``config.use_time_signatures`` enabled. 27 | 28 | MIDI-Like 29 | ------------------------ 30 | 31 | .. image:: /assets/midi_like.png 32 | :width: 800 33 | :alt: MIDI-Like token sequence, with TimeShift and NoteOff tokens 34 | 35 | .. autoclass:: miditok.MIDILike 36 | :show-inheritance: 37 | 38 | TSD 39 | ------------------------ 40 | 41 | .. image:: /assets/tsd.png 42 | :width: 800 43 | :alt: TSD sequence, like MIDI-Like with Duration tokens 44 | 45 | .. autoclass:: miditok.TSD 46 | :show-inheritance: 47 | 48 | Structured 49 | ------------------------ 50 | 51 | .. image:: /assets/structured.png 52 | :width: 800 53 | :alt: Structured tokenization, the token types always follow the same succession pattern 54 | 55 | .. autoclass:: miditok.Structured 56 | :show-inheritance: 57 | 58 | CPWord 59 | ------------------------ 60 | 61 | .. image:: /assets/cp_word.png 62 | :width: 800 63 | :alt: CP Word sequence, tokens of the same family are grouped together 64 | 65 | .. autoclass:: miditok.CPWord 66 | :show-inheritance: 67 | 68 | Octuple 69 | ------------------------ 70 | 71 | .. image:: /assets/octuple.png 72 | :width: 800 73 | :alt: Octuple sequence, with bar and position embeddings 74 | 75 | .. autoclass:: miditok.Octuple 76 | :show-inheritance: 77 | 78 | MuMIDI 79 | ------------------------ 80 | 81 | .. image:: /assets/mumidi.png 82 | :width: 800 83 | :alt: MuMIDI sequence, with bar and position embeddings 84 | 85 | .. autoclass:: miditok.MuMIDI 86 | :show-inheritance: 87 | 88 | MMM 89 | ------------------------ 90 | 91 | ..
autoclass:: miditok.MMM 92 | :show-inheritance: 93 | 94 | PerTok 95 | ------------------------ 96 | 97 | .. autoclass:: miditok.PerTok 98 | :show-inheritance: 99 | 100 | 101 | Create yours 102 | ------------------------ 103 | 104 | You can easily create your own tokenizer and benefit from the MidiTok framework. Just create a class inheriting from :class:`miditok.MusicTokenizer`, and override: 105 | 106 | * :py:func:`miditok.MusicTokenizer._add_time_events` to create time events from global and track events; 107 | * :py:func:`miditok.MusicTokenizer._tokens_to_score` to decode tokens into a ``Score`` object; 108 | * :py:func:`miditok.MusicTokenizer._create_vocabulary` to create the tokenizer's vocabulary; 109 | * :py:func:`miditok.MusicTokenizer._create_token_types_graph` to create the possible token type successions (used for eval only). 110 | 111 | If needed, you can override the methods: 112 | 113 | * :py:func:`miditok.MusicTokenizer._score_to_tokens`, the main method calling the specific tokenization methods; 114 | * :py:func:`miditok.MusicTokenizer._create_track_events` to include special track events; 115 | * :py:func:`miditok.MusicTokenizer._create_global_events` to include special global events. 116 | 117 | If you think people can benefit from it, feel free to send a pull request on `Github `_. 118 | -------------------------------------------------------------------------------- /docs/tokenizing_music_with_miditok.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Bases of MidiTok 3 | ================= 4 | 5 | This page introduces the bases of MidiTok: how a tokenizer works and what the basic elements of MidiTok are. 6 | 7 | MidiTok's workflow 8 | ------------------------ 9 | 10 | MidiTok uses a common workflow for all its tokenizers, which follows: 11 | 12 | 1. **Music file preprocessing**: time is **downsampled** to match the tokenizer's time resolution, tracks with the same programs are merged, notes with pitches outside the tokenizer's pitch range are removed, note velocities and tempos are downsampled, and finally notes, tempos and time signatures are deduplicated; 13 | 2. **Parsing of global events**: tempo and time signature tokens are created; 14 | 3. **Parsing of the track events**: notes, chords, controls (pedals...) and tokens specific to each track are parsed to create their associated tokens; 15 | 4. **Creating time tokens**: the tokens representing time are created in order to bind the previously created global and track tokens. 16 | 17 | The resulting tokens are provided by the tokenizer as one or several :class:`miditok.TokSequence`, depending on the tokenizer's IO format (:ref:`Tokens & TokSequence input / output format`). 18 | 19 | The first three steps are common to all tokenizers, while the fourth is handled independently by each tokenizer. 20 | The first step formats the music file so that its content fits the tokenizer's vocabulary before being parsed. 21 | 22 | 23 | Vocabulary 24 | ------------------------ 25 | 26 | As introduced in :ref:`Tokens and vocabulary`, the vocabulary acts as a lookup table between the tokens (strings) and their ids (integers). 27 | It can be accessed with ``tokenizer.vocab`` to get the string-to-id mapping. 28 | 29 | For tokenizers with embedding pooling (e.g. :ref:`CPWord` or :ref:`Octuple`), ``tokenizer.vocab`` will be a list of dictionaries, and the ``tokenizer.is_multi_voc`` property will be ``True``.
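For example, a minimal sketch assuming a default :ref:`REMI` tokenizer (whose default pitch range includes a ``Pitch_60`` token):

.. code-block:: python

    from miditok import REMI

    tokenizer = REMI()
    token_id = tokenizer.vocab["Pitch_60"]  # integer id of the Pitch_60 token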
30 | 31 | **With a trained tokenizer:** 32 | ``tokenizer.vocab`` holds all the basic tokens describing the note and time attributes of music. By analogy with text, this vocabulary can be seen as the alphabet of unique characters. 33 | After :ref:`Training a tokenizer`, a new vocabulary is built with tokens newly created from pairs of basic tokens. This vocabulary can be accessed with ``tokenizer.vocab_model``, and maps tokens as bytes (string) to their associated ids (int). This is the vocabulary of the 🤗tokenizers model. 34 | 35 | TokSequence 36 | ------------------------ 37 | 38 | The methods of MidiTok use :class:`miditok.TokSequence` objects as inputs and outputs. A :class:`miditok.TokSequence` holds tokens as strings, integers, ``miditok.Event`` objects and bytes (used internally to encode the token ids with trained tokenizers). TokSequences are subscriptable, can be sliced and concatenated, and implement the ``__len__`` magic method. 39 | 40 | You can use the :py:func:`miditok.MusicTokenizer.complete_sequence` method to automatically fill the non-initialized attributes of a :class:`miditok.TokSequence`. 41 | 42 | .. autoclass:: miditok.TokSequence 43 | :members: 44 | 45 | 46 | The MusicTokenizer class 47 | ------------------------ 48 | 49 | MidiTok features several MIDI tokenizations, all inheriting from the :class:`miditok.MusicTokenizer` class. 50 | You can customize your tokenizer by creating it with a custom :class:`miditok.TokenizerConfig`. 51 | 52 | .. autoclass:: miditok.MusicTokenizer 53 | :members: 54 | 55 | 56 | Tokens & TokSequence input / output format 57 | -------------------------------------------- 58 | 59 | Depending on the tokenizer in use, the **format** of the tokens returned by the :py:func:`miditok.MusicTokenizer.encode` method may vary, as well as the expected format for the :py:func:`miditok.MusicTokenizer.decode` method. The format is given by the :py:func:`miditok.MusicTokenizer.io_format` property. For any tokenizer, the format is the same for both methods. 60 | 61 | The format is deduced from the :py:func:`miditok.MusicTokenizer.is_multi_voc` and ``one_token_stream`` tokenizer attributes. 62 | ``one_token_stream`` determines whether the tokenizer outputs a unique :class:`miditok.TokSequence` covering all the tracks of a music file, or one :class:`miditok.TokSequence` per track. It is equal to ``tokenizer.config.one_token_stream_for_programs``, except for :class:`miditok.MMM` for which it is enabled while ``one_token_stream_for_programs`` is False. 63 | :py:func:`miditok.MusicTokenizer.is_multi_voc` being True means that each "token" within a :class:`miditok.TokSequence` is actually a list of ``C`` "sub-tokens", ``C`` being the number of sub-token classes. 64 | 65 | This results in four situations, where ``I`` (instrument) is the number of tracks, ``T`` (token) is the number of tokens and ``C`` (class) the number of sub-tokens per token step, as illustrated by the sketch after this list: 66 | 67 | * ``is_multi_voc`` and ``one_token_stream`` are both ``False``: ``[I,(T)]``; 68 | * ``is_multi_voc`` is ``False`` and ``one_token_stream`` is ``True``: ``(T)``; 69 | * ``is_multi_voc`` is ``True`` and ``one_token_stream`` is ``False``: ``[I,(T,C)]``; 70 | * ``is_multi_voc`` and ``one_token_stream`` are both ``True``: ``(T,C)``.
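A minimal sketch inspecting this property (the printed values are indicative; the exact formats depend on the tokenizer and its configuration, as detailed below):

.. code-block:: python

    from miditok import REMI, Octuple

    # One token sequence per track, single vocabulary
    print(REMI().io_format)     # expected: ("I", "T")
    # A single multi-vocabulary token stream for all tracks
    print(Octuple().io_format)  # expected: ("T", "C")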
71 | 72 | **Note that if there is no I dimension in the format, the output of** :py:func:`miditok.MusicTokenizer.encode` **is a** :class:`miditok.TokSequence` **object, otherwise it is a list of** :class:`miditok.TokSequence` **objects (one per token stream / track).** 73 | 74 | Some tokenizer examples to illustrate: 75 | 76 | * **TSD** without ``config.use_programs`` will not have multiple vocabularies and will treat each track as a distinct stream of tokens, hence it will convert music files to a list of :class:`miditok.TokSequence` objects, ``(I,T)`` format. 77 | * **TSD** with ``config.use_programs`` being True will convert all tracks to a single stream of tokens, hence one :class:`miditok.TokSequence` object, ``(T)`` format. 78 | * **CPWord** is a multi-voc tokenizer; without ``config.use_programs`` it will treat each track as a distinct stream of tokens, hence it will convert music files to a list of :class:`miditok.TokSequence` objects with the ``(I,T,C)`` format. 79 | * **Octuple** is a multi-voc tokenizer and converts all tracks to a single stream of tokens, hence it will convert music files to a :class:`miditok.TokSequence` object, ``(T,C)`` format. 80 | 81 | 82 | Magic methods 83 | ------------------------ 84 | 85 | `Magic methods `_ allow intuitive access to a tokenizer's attributes and methods. We list them here with some examples. 86 | 87 | .. autofunction:: miditok.MusicTokenizer.__call__ 88 | :noindex: 89 | .. code-block:: python 90 | 91 | tokens = tokenizer(score) 92 | score2 = tokenizer(tokens) 93 | 94 | .. autofunction:: miditok.MusicTokenizer.__getitem__ 95 | :noindex: 96 | .. code-block:: python 97 | 98 | pad_token = tokenizer["PAD_None"] 99 | 100 | .. autofunction:: miditok.MusicTokenizer.__len__ 101 | :noindex: 102 | .. code-block:: python 103 | 104 | num_classes = len(tokenizer) 105 | num_classes_per_vocab = tokenizer.len # applicable to tokenizers with embedding pooling, e.g. CPWord or Octuple 106 | 107 | .. autofunction:: miditok.MusicTokenizer.__eq__ 108 | :noindex: 109 | .. code-block:: python 110 | 111 | if tokenizer1 == tokenizer2: 112 | print("The tokenizers have the same vocabulary and configurations!") 113 | 114 | 115 | Save / Load a tokenizer 116 | ------------------------ 117 | 118 | You can save and load a tokenizer, including its configuration and vocabulary. This is especially useful after :ref:`Training a tokenizer`. 119 | 120 | .. autofunction:: miditok.MusicTokenizer.save 121 | :noindex: 122 | 123 | To load a tokenizer from saved parameters, just use the ``params`` argument when creating it: 124 | 125 | .. code-block:: python 126 | 127 | tokenizer = REMI(params=Path("to", "tokenizer.json")) 128 | -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Utils methods 3 | ======================== 4 | 5 | .. automodule:: miditok.utils 6 | :members: 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "miditok" 7 | version = "3.0.5.post1" 8 | description = "MIDI / symbolic music tokenizers for Deep Learning models."
9 | readme = {file = "README.md", content-type = "text/markdown"} 10 | license = {file = "LICENSE"} 11 | requires-python = ">=3.9" 12 | authors = [ 13 | { name = "Nathan Fradet" }, 14 | ] 15 | keywords = [ 16 | "artificial intelligence", 17 | "deep learning", 18 | "transformer", 19 | "midi", 20 | "tokenization", 21 | "music", 22 | "mir", 23 | ] 24 | classifiers = [ 25 | "Intended Audience :: Developers", 26 | "Intended Audience :: Science/Research", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | "Topic :: Multimedia :: Sound/Audio :: MIDI", 29 | "License :: OSI Approved :: MIT License", 30 | "Programming Language :: Python", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "Programming Language :: Python :: 3.9", 33 | "Programming Language :: Python :: 3.10", 34 | "Programming Language :: Python :: 3.11", 35 | "Programming Language :: Python :: 3.12", 36 | "Programming Language :: Python :: 3.13", 37 | "Operating System :: OS Independent", 38 | ] 39 | dependencies = [ 40 | "numpy>=1.19", 41 | "symusic>=0.5.0", 42 | "tqdm", 43 | "tokenizers>=0.13.0", 44 | "huggingface_hub>=0.16.4", 45 | ] 46 | 47 | [project.optional-dependencies] 48 | tests = [ 49 | "pytest-cov", 50 | "pytest-xdist[psutil]", 51 | "torch", 52 | "tensorflow", 53 | "miditoolkit", 54 | ] 55 | docs = [ 56 | "furo", # theme 57 | "sphinx-copybutton", 58 | "torch", # for pytorch_data module 59 | # "sphinxcontrib-tikz", 60 | ] 61 | 62 | [project.urls] 63 | Homepage = "https://github.com/Natooz/MidiTok" 64 | Repository = "https://github.com/Natooz/MidiTok.git" 65 | Documentation = "https://miditok.readthedocs.io" 66 | Issues = "https://github.com/Natooz/MidiTok/issues" 67 | 68 | [tool.hatch.build.targets.wheel] 69 | packages = ["src/miditok"] 70 | only-packages = true 71 | 72 | [tool.hatch.version] 73 | path = "src/miditok/__init__.py" 74 | 75 | [mypy] 76 | warn_return_any = "True" 77 | warn_unused_configs = "True" 78 | plugins = "numpy.typing.mypy_plugin" 79 | exclude = [ 80 | "venv", 81 | ".venv", 82 | ] 83 | 84 | [tool.pytest.ini_options] 85 | pythonpath = "src" 86 | addopts = [ 87 | "--import-mode=importlib", 88 | ] 89 | 90 | [tool.coverage.report] 91 | exclude_also = [ 92 | "def __repr__", 93 | ] 94 | omit = [ 95 | # files to omit to check 96 | "benchmarks/*" 97 | ] 98 | 99 | [tool.ruff] 100 | target-version = "py313" 101 | 102 | [tool.ruff.lint] 103 | extend-select = [ 104 | "ARG", 105 | "A", 106 | "ANN", 107 | "B", 108 | "BLE", 109 | "C4", 110 | "COM", 111 | "D", 112 | "E", 113 | "EM", 114 | "EXE", 115 | "F", 116 | "FA", 117 | "FBT", 118 | "G", 119 | "I", 120 | "ICN", 121 | "INP", 122 | "INT", 123 | "ISC", 124 | "N", 125 | "NPY", 126 | "PERF", 127 | "PGH", 128 | "PTH", 129 | "PIE", 130 | # "PL", 131 | "PT", 132 | "Q", 133 | "RET", 134 | "RSE", 135 | "RUF", 136 | "S", 137 | # "SLF", 138 | "SIM", 139 | "T", 140 | "TCH", 141 | "TID", 142 | "UP", 143 | "W", 144 | ] 145 | 146 | # Each rule exclusion should be explained here. 147 | # By default, we think it is better to select groups of rules (above), and exclude 148 | # specific problematic rules, instead of selecting specific rules. By doing so, in case 149 | # the ruff rules groups change, this requires us to check and handle the new rules or 150 | # changes, making sure we stay up to date and keep the best practices. 151 | 152 | # ANN003: 153 | # Would mostly apply to args/kwargs that are passed to methods from dependencies, for 154 | # which the signature can change depending on the version. 
This would either be too 155 | # difficult to comply with and/or would add a lot of noqa exceptions. ANN002 is used as it 156 | # adds very few "noqa" exceptions, but ANN003 would add too much complexity. 157 | 158 | # ANN101 and ANN102: 159 | # Yields errors for `self` in methods from classes, which is unnecessary. 160 | # The existence of these rules is currently questioned, they are likely to be removed. 161 | # https://github.com/astral-sh/ruff/issues/4396 162 | 163 | # B905 164 | # The `strict` keyword argument for the `zip` built-in method appeared with Python 165 | # 3.10. As we support previous versions, we cannot comply (yet) with this rule. The 166 | # exclusion should be removed when MidiTok drops support for Python 3.9. 167 | 168 | # D107 169 | # We document classes at the class level (D101). This documentation should cover the 170 | # way classes are initialized. So we do not document `__init__` methods. 171 | 172 | # D203 173 | # "one-blank-line-before-class", incompatible with D211 (blank-line-before-class). 174 | # We follow PEP 257 and other conventions by preferring D211 over D203. 175 | 176 | # D212 177 | # "multi-line-summary-first-line", incompatible with D213 178 | # (multi-line-summary-second-line). 179 | # We follow PEP 257, which recommends putting the summary line on the second line, 180 | # after the blank line following the opening quotes. 181 | 182 | # FBT001 and FBT002 183 | # Refactoring all the methods to make boolean arguments keyword only would add 184 | # complexity and could break code of users. It's ok to have booleans as positional 185 | # arguments with default values. For code readability though, we enable FBT003. 186 | 187 | # COM812: 188 | # Yields errors for one-line portions without a trailing comma. Trailing commas are 189 | # automatically set with ruff format anyway. This exclusion could be removed when this 190 | # behavior is fixed in ruff. 191 | 192 | # UP038 193 | # Recommends using `|` type unions with `isinstance`, which is only supported since 194 | # Python 3.10. The exclusion should be removed when MidiTok drops support for Python 3.9. 195 | 196 | # (ISC001) 197 | # May cause conflicts when used with the ruff formatter. They recommend disabling it. 198 | # We leave it enabled but keep this in mind. 199 | 200 | ignore = [ 201 | "ANN003", 202 | "ANN101", 203 | "ANN102", 204 | "B905", 205 | "COM812", 206 | "D107", 207 | "D203", 208 | "D212", 209 | "FBT001", 210 | "FBT002", 211 | "UP038", 212 | ] 213 | 214 | [tool.ruff.lint.per-file-ignores] 215 | # S105: 216 | # we don't use passwords in MidiTok, only the HF token for the interactions with the 217 | # hub. However we have a lot of variables with "token"(s) in their name, which would 218 | # yield a lot of lint errors or require a lot of noqa exceptions. 219 | "src/miditok/**" = [ 220 | "S105", 221 | ] 222 | "tests/**" = [ 223 | "ANN201", # allow no return type hint for pytest methods 224 | "D103", # no need to document pytest methods 225 | "S101", # allow assertions in tests 226 | "T201", # print allowed 227 | ] 228 | "docs/conf.py" = ["INP001"] # not a package 229 | -------------------------------------------------------------------------------- /src/miditok/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Root module. 3 | 4 | Here we only import tokenizer classes and submodules.
5 | """ 6 | 7 | from miditok import data_augmentation 8 | 9 | from .classes import Event, TokenizerConfig, TokSequence 10 | from .midi_tokenizer import MusicTokenizer 11 | from .tokenizations import ( 12 | MMM, 13 | REMI, 14 | TSD, 15 | CPWord, 16 | MIDILike, 17 | MuMIDI, 18 | Octuple, 19 | PerTok, 20 | Structured, 21 | ) 22 | from .tokenizer_training_iterator import TokTrainingIterator 23 | 24 | __all__ = [ 25 | "MusicTokenizer", 26 | "Event", 27 | "TokSequence", 28 | "TokenizerConfig", 29 | "TokTrainingIterator", 30 | "MIDILike", 31 | "REMI", 32 | "TSD", 33 | "Structured", 34 | "Octuple", 35 | "CPWord", 36 | "MuMIDI", 37 | "MMM", 38 | "PerTok", 39 | "utils", 40 | "data_augmentation", 41 | ] 42 | 43 | try: 44 | from miditok import pytorch_data # noqa: F401 45 | 46 | __all__.append("pytorch_data") 47 | except ImportError: 48 | pass 49 | -------------------------------------------------------------------------------- /src/miditok/attribute_controls/__init__.py: -------------------------------------------------------------------------------- 1 | """Attribute controls module.""" 2 | 3 | from .bar_attribute_controls import ( 4 | BarNoteDensity, 5 | BarNoteDuration, 6 | BarOnsetPolyphony, 7 | BarPitchClass, 8 | ) 9 | from .classes import AttributeControl, BarAttributeControl, create_random_ac_indexes 10 | from .track_attribute_controls import ( 11 | TrackNoteDensity, 12 | TrackNoteDuration, 13 | TrackOnsetPolyphony, 14 | TrackRepetition, 15 | ) 16 | 17 | __all__ = ( 18 | "AttributeControl", 19 | "BarAttributeControl", 20 | "BarNoteDensity", 21 | "BarNoteDuration", 22 | "BarOnsetPolyphony", 23 | "BarPitchClass", 24 | "TrackRepetition", 25 | "TrackNoteDuration", 26 | "TrackNoteDensity", 27 | "TrackOnsetPolyphony", 28 | "create_random_ac_indexes", 29 | ) 30 | -------------------------------------------------------------------------------- /src/miditok/attribute_controls/bar_attribute_controls.py: -------------------------------------------------------------------------------- 1 | """Bar-level attribute controls modules.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from miditok import Event 8 | 9 | from .classes import BarAttributeControl 10 | 11 | 12 | class BarOnsetPolyphony(BarAttributeControl): 13 | """ 14 | Onset polyphony attribute control at the bar level. 15 | 16 | It specifies the minimum and maximum number of notes played simultaneously at a 17 | given time onset. 18 | It can be enabled with the ``ac_polyphony_bar`` argument of 19 | :class:`miditok.TokenizerConfig`. 20 | 21 | :param polyphony_min: minimum number of simultaneous notes to consider. 22 | :param polyphony_max: maximum number of simultaneous notes to consider. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | polyphony_min: int, 28 | polyphony_max: int, 29 | ) -> None: 30 | self.min_polyphony = polyphony_min 31 | self.max_polyphony = polyphony_max 32 | super().__init__( 33 | tokens=[ 34 | f"{tok_type}_{val}" 35 | for tok_type in ("ACBarOnsetPolyphonyMin", "ACBarOnsetPolyphonyMax") 36 | for val in range(polyphony_min, polyphony_max + 1) 37 | ], 38 | ) 39 | 40 | def _compute_on_bar( 41 | self, 42 | notes_soa: dict[str, np.ndarray], 43 | controls_soa: dict[str, np.ndarray], 44 | pitch_bends_soa: dict[str, np.ndarray], 45 | time_division: int, 46 | ) -> list[Event]: 47 | del controls_soa, pitch_bends_soa, time_division 48 | _, counts_onsets = np.unique(notes_soa["time"], return_counts=True) 49 | onset_poly_min, onset_poly_max = np.min(counts_onsets), np.max(counts_onsets) 50 | 51 | min_poly = min(max(onset_poly_min, self.min_polyphony), self.max_polyphony) 52 | max_poly = min(onset_poly_max, self.max_polyphony) 53 | return [ 54 | Event("ACBarOnsetPolyphonyMin", min_poly), 55 | Event("ACBarOnsetPolyphonyMax", max_poly), 56 | ] 57 | 58 | 59 | class BarPitchClass(BarAttributeControl): 60 | """ 61 | Bar-level pitch classes attribute control. 62 | 63 | This attribute control specifies which pitch classes are present within a bar. 64 | """ 65 | 66 | def __init__(self) -> None: 67 | super().__init__(tokens=[f"ACBarPitchClass_{i}" for i in range(12)]) 68 | 69 | def _compute_on_bar( 70 | self, 71 | notes_soa: dict[str, np.ndarray], 72 | controls_soa: dict[str, np.ndarray], 73 | pitch_bends_soa: dict[str, np.ndarray], 74 | time_division: int, 75 | ) -> list[Event]: 76 | del controls_soa, pitch_bends_soa, time_division 77 | pitch_values = notes_soa["pitch"] % 12 78 | pitch_values = np.unique(pitch_values) 79 | return [Event("ACBarPitchClass", pitch) for pitch in pitch_values] 80 | 81 | 82 | class BarNoteDensity(BarAttributeControl): 83 | """ 84 | Bar-level note density attribute control. 85 | 86 | It specifies the number of notes per bar. If a bar contains more than the maximum 87 | density (``density_max``), a ``density_max+`` token will be returned. 88 | 89 | :param density_max: maximum note density per bar to consider. 90 | """ 91 | 92 | def __init__(self, density_max: int) -> None: 93 | self.density_max = density_max 94 | super().__init__( 95 | tokens=[ 96 | *(f"ACBarNoteDensity_{i}" for i in range(density_max)), 97 | f"ACBarNoteDensity_{self.density_max}+", 98 | ], 99 | ) 100 | 101 | def _compute_on_bar( 102 | self, 103 | notes_soa: dict[str, np.ndarray], 104 | controls_soa: dict[str, np.ndarray], 105 | pitch_bends_soa: dict[str, np.ndarray], 106 | time_division: int, 107 | ) -> list[Event]: 108 | del controls_soa, pitch_bends_soa, time_division 109 | n_notes = len(notes_soa["time"]) 110 | if n_notes >= self.density_max: 111 | return [Event("ACBarNoteDensity", f"{self.density_max}+")] 112 | return [Event("ACBarNoteDensity", n_notes)] 113 | 114 | 115 | class BarNoteDuration(BarAttributeControl): 116 | """ 117 | Note duration attribute control. 118 | 119 | This attribute control specifies the note durations (whole, half, quarter, eighth, 120 | sixteenth and thirty-second) present in a bar.
121 | """ 122 | 123 | def __init__(self) -> None: 124 | self._note_durations = ( 125 | "Whole", 126 | "Half", 127 | "Quarter", 128 | "Eight", 129 | "Sixteenth", 130 | "ThirtySecond", 131 | ) 132 | super().__init__( 133 | tokens=[ 134 | f"ACBarNoteDuration{duration}_{val}" 135 | for duration in self._note_durations 136 | for val in (0, 1) 137 | ], 138 | ) 139 | # Factors multiplying ticks/quarter time division, one per duration name 140 | self.factors = (4, 2, 1, 0.5, 0.25, 0.125) 141 | 142 | def _compute_on_bar( 143 | self, 144 | notes_soa: dict[str, np.ndarray], 145 | controls_soa: dict[str, np.ndarray], 146 | pitch_bends_soa: dict[str, np.ndarray], 147 | time_division: int, 148 | ) -> list[Event]: 149 | del controls_soa, pitch_bends_soa 150 | durations = np.unique(notes_soa["duration"]) 151 | controls = [] 152 | for fi, factor in enumerate(self.factors): 153 | controls.append( 154 | Event( 155 | f"ACBarNoteDuration{self._note_durations[fi]}", 156 | 1 if time_division * factor in durations else 0, 157 | ) 158 | ) 159 | return controls 160 | -------------------------------------------------------------------------------- /src/miditok/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data augmentation module. 3 | 4 | The module implements three public methods: 5 | 6 | * :py:func:`miditok.data_augmentation.augment_score`: augment a single score with a 7 | single set of offsets; 8 | * :py:func:`miditok.data_augmentation.augment_score_multiple_offsets`: augment a single 9 | score with combinations of offsets; 10 | * :py:func:`miditok.data_augmentation.augment_dataset`: augment a list of music 11 | files with combinations of offsets. 12 | 13 | """ 14 | 15 | from .data_augmentation import ( 16 | augment_dataset, 17 | augment_score, 18 | augment_score_multiple_offsets, 19 | ) 20 | 21 | __all__ = [ 22 | "augment_score", 23 | "augment_dataset", 24 | "augment_score_multiple_offsets", 25 | ] 26 | -------------------------------------------------------------------------------- /src/miditok/pytorch_data/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataset classes and data collators to be used with PyTorch when training a model.""" 2 | 3 | from .collators import DataCollator 4 | from .datasets import ( 5 | DatasetJSON, 6 | DatasetMIDI, 7 | ) 8 | 9 | __all__ = [ 10 | "DatasetMIDI", 11 | "DatasetJSON", 12 | "DataCollator", 13 | ] 14 | -------------------------------------------------------------------------------- /src/miditok/pytorch_data/collators.py: -------------------------------------------------------------------------------- 1 | """Collator objects for PyTorch ``DataLoader``s.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from copy import deepcopy 7 | from typing import TYPE_CHECKING, Any 8 | 9 | import torch 10 | from torch import LongTensor 11 | 12 | if TYPE_CHECKING: 13 | from collections.abc import Mapping, Sequence 14 | 15 | 16 | class DataCollator: 17 | r""" 18 | All-in-one data collator for PyTorch ``DataLoader``. 19 | 20 | It allows applying padding (on the right or left side of sequences), and prepending 21 | or appending *BOS* and *EOS* tokens. It will also add an ``"attention_mask"`` entry 22 | to the batch, following the padding applied. 23 | 24 | :param pad_token_id: padding token id. 25 | :param pad_on_left: if ``True``, it will pad the sequences on the left.
This 26 | can be required when using some libraries expecting padding on the left, for 27 | example when generating with Hugging Face Transformers. (default: ``False``) 28 | :param copy_inputs_as_labels: will add a labels entry (``labels_kwarg_name``) to 29 | the batch (or replace the existing one), which is a copy of the input entry: 30 | ``decoder_inputs_kwarg_name`` if present in the batch else 31 | ``inputs_kwarg_name``. (default: ``False``) 32 | :param shift_labels: will shift inputs and labels for autoregressive 33 | training/teacher forcing. (default: ``False``) 34 | :param labels_pad_idx: padding id for labels. (default: -100) 35 | :param inputs_kwarg_name: name of dict / kwarg key for inputs. 36 | (default: ``"input_ids"``) 37 | :param labels_kwarg_name: name of dict / kwarg key for labels. 38 | (default: ``"labels"``) 39 | :param decoder_inputs_kwarg_name: name of dict / kwarg key for decoder inputs. 40 | This key is intended to be used for encoder-decoder (seq2seq) models, for the 41 | decoder inputs while ``inputs_kwarg_name`` is for the encoder inputs. 42 | (default: ``"decoder_input_ids"``) 43 | """ 44 | 45 | def __init__( 46 | self, 47 | pad_token_id: int, 48 | pad_on_left: bool = False, 49 | copy_inputs_as_labels: bool = False, 50 | shift_labels: bool = False, 51 | labels_pad_idx: int = -100, 52 | inputs_kwarg_name: str = "input_ids", 53 | labels_kwarg_name: str = "labels", 54 | decoder_inputs_kwarg_name: str = "decoder_input_ids", 55 | ) -> None: 56 | self.pad_token = pad_token_id 57 | self.pad_on_left = pad_on_left 58 | self.copy_inputs_as_labels = copy_inputs_as_labels 59 | self.shift_labels = shift_labels 60 | self.labels_pad_idx = labels_pad_idx 61 | self.inputs_kwarg_name = inputs_kwarg_name 62 | self.labels_kwarg_name = labels_kwarg_name 63 | self.decoder_inputs_kwarg_name = decoder_inputs_kwarg_name 64 | 65 | def __call__(self, batch: list[Mapping[str, Any]]) -> Mapping[str, LongTensor]: 66 | """ 67 | Collate the sequences of a batch, making them ready to be fed to a model. 68 | 69 | :param batch: batch of sequences, as a list of dictionaries containing input ids 70 | and optionally labels. 71 | :return: the output batch as a dictionary linking to input and optionally target 72 | tensors.
73 | """ 74 | out_batch = {} 75 | inputs = [None, None, None] # x, x_dec, y 76 | 77 | # Figure out inputs 78 | for i, key in enumerate( 79 | ( 80 | self.inputs_kwarg_name, 81 | self.decoder_inputs_kwarg_name, 82 | self.labels_kwarg_name, 83 | ) 84 | ): 85 | if key in batch[0]: 86 | inputs[i] = [ 87 | sample[key] 88 | for sample in batch 89 | if sample[key] is not None and len(sample[key]) > 0 90 | ] 91 | x, x_dec, y = inputs 92 | 93 | # Copy labels, decoder input has priority over x 94 | if y is None and self.copy_inputs_as_labels: 95 | y = deepcopy(x_dec if x_dec is not None else x) 96 | 97 | # Pad inputs / convert to Tensors 98 | if x is not None: 99 | x = _pad_batch(x, self.pad_token, self.pad_on_left) 100 | if x_dec is not None: 101 | x_dec = _pad_batch(x_dec, self.pad_token, self.pad_on_left) 102 | if y is not None: 103 | # If labels are sequences of tokens 104 | if y[0].dim() > 0: 105 | y = _pad_batch(y, self.labels_pad_idx, self.pad_on_left) 106 | else: # classification 107 | y = torch.stack(y) 108 | 109 | # Shift labels, otherwise it's handled by models 110 | if self.shift_labels: 111 | if x_dec is not None: 112 | x_dec = x_dec[:, :-1] 113 | else: 114 | x = x[:, :-1] 115 | if y[0].dim() > 0: 116 | y = y[:, 1:] 117 | else: 118 | warnings.warn( 119 | "MidiTok DataCollator: You set shift_labels=True, but provided int " 120 | "labels (for sequence classification tasks), which label shifting " 121 | "is not suited for. Skipping label shifting.", 122 | stacklevel=2, 123 | ) 124 | 125 | # Add inputs / labels to output batch 126 | if x is not None: 127 | out_batch[self.inputs_kwarg_name] = x 128 | if x_dec is not None: 129 | out_batch[self.decoder_inputs_kwarg_name] = x_dec 130 | if y is not None: 131 | out_batch[self.labels_kwarg_name] = y 132 | 133 | # Create attention mask (just for padding, causal mask is handled by models) 134 | if x is not None: 135 | attention_mask = (x != self.pad_token).int() 136 | if attention_mask.dim() == 3: 137 | attention_mask = attention_mask[..., 0] # (N,T,Z) --> (N,T) 138 | out_batch["attention_mask"] = attention_mask 139 | if x_dec is not None: 140 | attention_mask = (x_dec != self.pad_token).int() 141 | if attention_mask.dim() == 3: 142 | attention_mask = attention_mask[..., 0] # (N,T,Z) --> (N,T) 143 | out_batch["decoder_attention_mask"] = attention_mask 144 | 145 | return out_batch 146 | 147 | 148 | def _pad_batch( 149 | batch: Sequence[LongTensor], 150 | pad_token_id: int, 151 | pad_on_left: bool = False, 152 | ) -> LongTensor: 153 | r""" 154 | Pad sequences of a batch. 155 | 156 | :param batch: batch as a list of Tensors. 157 | :param pad_token_id: padding token id. 158 | :param pad_on_left: if ``True``, it will pad the sequences on the left. This can 159 | be required when using some libraries expecting padding on the left, for example 160 | when generating with Hugging Face Transformers. (default: False) 161 | :return: the batch sequences, padded into a unique Tensor. 162 | """ 163 | length_of_first = batch[0].size(0) 164 | 165 | # Check if padding is necessary. 166 | are_tensors_same_length = all(x.size(0) == length_of_first for x in batch) 167 | if are_tensors_same_length: 168 | return torch.stack(batch, dim=0).long() 169 | 170 | # Pad the sequences, on the left if requested.
171 | if pad_on_left: 172 | return _pad_left(batch, pad_token_id) 173 | 174 | return torch.nn.utils.rnn.pad_sequence( 175 | batch, batch_first=True, padding_value=pad_token_id 176 | ).long() 177 | 178 | 179 | def _pad_left(batch: Sequence[LongTensor], pad_token_id: int) -> LongTensor: 180 | r""" 181 | Pad sequences on the left, i.e. on the first indices. 182 | 183 | Padding on the left makes the last element of each sequence the last token, which 184 | is convenient when generating autoregressively, as a method can then more easily 185 | and efficiently append the newly generated tokens. 186 | 187 | :param batch: batch as a list of Tensors. 188 | :param pad_token_id: padding token id. 189 | :return: the batch sequences, padded into a unique Tensor. 190 | """ 191 | batch = [torch.flip(seq, dims=(0,)) for seq in batch] 192 | batch = torch.nn.utils.rnn.pad_sequence( 193 | batch, batch_first=True, padding_value=pad_token_id 194 | ) # (N,T) 195 | return torch.flip(batch, dims=(1,)).long() 196 | -------------------------------------------------------------------------------- /src/miditok/tokenizations/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tokenizer module. 3 | 4 | This module implements tokenizer classes, which inherit from ``MusicTokenizer`` and 5 | override specific methods such as ``_add_time_events`` or ``_tokens_to_score`` with 6 | their specific behaviors/representations. 7 | """ 8 | 9 | from .cp_word import CPWord 10 | from .midi_like import MIDILike 11 | from .mmm import MMM 12 | from .mumidi import MuMIDI 13 | from .octuple import Octuple 14 | from .pertok import PerTok 15 | from .remi import REMI 16 | from .structured import Structured 17 | from .tsd import TSD 18 | 19 | __all__ = [ 20 | "MIDILike", 21 | "REMI", 22 | "TSD", 23 | "Structured", 24 | "Octuple", 25 | "CPWord", 26 | "MuMIDI", 27 | "MMM", 28 | "PerTok", 29 | ] 30 | -------------------------------------------------------------------------------- /src/miditok/tokenizer_training_iterator.py: -------------------------------------------------------------------------------- 1 | """Iterator to be used when training a tokenizer with the 🤗tokenizers library.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from symusic import Score 8 | 9 | from .attribute_controls import create_random_ac_indexes 10 | from .classes import TokSequence 11 | from .constants import SCORE_LOADING_EXCEPTION 12 | 13 | if TYPE_CHECKING: 14 | from collections.abc import Sequence 15 | from pathlib import Path 16 | 17 | from .midi_tokenizer import MusicTokenizer 18 | 19 | 20 | class TokTrainingIterator: 21 | r""" 22 | An iterable class to be used when training a tokenizer. 23 | 24 | It loads music files (MIDI, abc) and tokenizes them on the fly, to be used with the 25 | Hugging Face tokenizers library to build a vocabulary with BPE, Unigram or WordPiece 26 | models. 27 | 28 | :param tokenizer: tokenizer to use for training. 29 | :param files_paths: sequence of paths of files to load for training. 30 | :param tracks_idx_random_ratio_range: range of ratios (between 0 and 1 included) of 31 | tracks to compute attribute controls on. If ``None`` is given, the attribute 32 | controls will be computed for all the tracks. (default: ``None``) 33 | :param bars_idx_random_ratio_range: range of ratios (between 0 and 1 included) of 34 | bars to compute attribute controls on. If ``None`` is given, the attribute 35 | controls will be computed for all the bars.
(default: ``None``) 36 | """ 37 | 38 | def __init__( 39 | self, 40 | tokenizer: MusicTokenizer, 41 | files_paths: Sequence[Path], 42 | tracks_idx_random_ratio_range: tuple[float, float] | None = None, 43 | bars_idx_random_ratio_range: tuple[float, float] | None = None, 44 | ) -> None: 45 | self.tokenizer = tokenizer 46 | self.files_paths = files_paths 47 | self.tracks_idx_random_ratio_range = ( 48 | tracks_idx_random_ratio_range if tracks_idx_random_ratio_range else [] 49 | ) 50 | self.bars_idx_random_ratio_range = ( 51 | bars_idx_random_ratio_range if bars_idx_random_ratio_range else [] 52 | ) 53 | self.__iter_count = 0 54 | 55 | def load_file(self, path: Path) -> list[str]: 56 | """ 57 | Load a music file and convert it to its byte representation. 58 | 59 | :param path: path to the file to load. 60 | :return: the byte representation of the file. 61 | """ 62 | # Load and tokenize file 63 | try: 64 | score = Score(path) 65 | except SCORE_LOADING_EXCEPTION: 66 | return [] 67 | 68 | # Preprocess first to already have the appropriate tracks idx in case of deletes 69 | score = self.tokenizer.preprocess_score(score) 70 | 71 | # Randomly create attribute controls indexes 72 | ac_indexes = None 73 | if ( 74 | len(self.tracks_idx_random_ratio_range) > 0 75 | or len(self.bars_idx_random_ratio_range) > 0 76 | ): 77 | ac_indexes = create_random_ac_indexes( 78 | score, 79 | self.tokenizer.attribute_controls, 80 | self.tracks_idx_random_ratio_range, 81 | self.bars_idx_random_ratio_range, 82 | ) 83 | 84 | # Tokenize the file 85 | # Need to specify `encode_ids=False` as it might be already pretrained 86 | # For MMM, we make sure to have sequences separated per track 87 | kwargs = {} 88 | # can't use isinstance because of circular import 89 | if type(self.tokenizer).__name__ == "MMM": 90 | kwargs["concatenate_track_sequences"] = False 91 | tokseq = self.tokenizer( 92 | score, 93 | encode_ids=False, 94 | no_preprocess_score=True, 95 | attribute_controls_indexes=ac_indexes, 96 | **kwargs, 97 | ) 98 | 99 | # Split ids if requested 100 | if self.tokenizer.config.encode_ids_split in ["bar", "beat"]: 101 | if isinstance(tokseq, TokSequence): 102 | tokseq = [tokseq] 103 | 104 | new_seqs = [] 105 | for seq in tokseq: 106 | if self.tokenizer.config.encode_ids_split == "bar": 107 | new_seqs += seq.split_per_bars() 108 | else: 109 | new_seqs += seq.split_per_beats() 110 | tokseq = [seq for seq in new_seqs if len(seq) > 0] 111 | 112 | # Convert ids to bytes for training 113 | if isinstance(tokseq, TokSequence): 114 | token_ids = tokseq.ids 115 | else: 116 | token_ids = [seq.ids for seq in tokseq] 117 | bytes_ = self.tokenizer._ids_to_bytes(token_ids, as_one_str=True) 118 | if isinstance(bytes_, str): 119 | bytes_ = [bytes_] 120 | 121 | return bytes_ 122 | 123 | def __len__(self) -> int: 124 | """ 125 | Return the number of files in the training corpus. 126 | 127 | :return: number of files in the training corpus. 128 | """ 129 | return len(self.files_paths) 130 | 131 | def __getitem__(self, idx: int) -> list[str]: 132 | """ 133 | Convert the ``idx``th file to its byte representation. 134 | 135 | :param idx: idx of the file to convert. 136 | :return: byte representation of the file. 
137 | """ 138 | return self.load_file(self.files_paths[idx]) 139 | 140 | def __iter__(self) -> TokTrainingIterator: # noqa:D105 141 | return self 142 | 143 | def __next__(self) -> list[str]: # noqa:D105 144 | if self.__iter_count >= len(self): 145 | self.__iter_count = 0 146 | raise StopIteration 147 | 148 | self.__iter_count += 1 149 | return self[self.__iter_count - 1] 150 | 151 | def __str__(self) -> str: 152 | """ 153 | Return the ``str`` representation of the iterator. 154 | 155 | :return: string description. 156 | """ 157 | return f"{self.tokenizer} - {len(self)} files" 158 | -------------------------------------------------------------------------------- /src/miditok/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Module containing utils methods than can be used outside of tokenization.""" 2 | 3 | from .split import ( 4 | get_average_num_tokens_per_note, 5 | split_files_for_training, 6 | split_score_per_beats, 7 | split_score_per_note_density, 8 | split_score_per_ticks, 9 | split_score_per_tracks, 10 | split_seq_in_subsequences, 11 | split_tokens_files_to_subsequences, 12 | ) 13 | from .utils import ( 14 | compute_ticks_per_bar, 15 | compute_ticks_per_beat, 16 | concat_scores, 17 | convert_ids_tensors_to_list, 18 | detect_chords, 19 | filter_dataset, 20 | fix_offsets_overlapping_notes, 21 | get_bars_ticks, 22 | get_beats_ticks, 23 | get_num_notes_per_bar, 24 | get_score_programs, 25 | get_score_ticks_per_beat, 26 | is_track_empty, 27 | merge_same_program_tracks, 28 | merge_scores, 29 | merge_tracks, 30 | merge_tracks_per_class, 31 | num_bar_pos, 32 | remove_duplicated_notes, 33 | ) 34 | 35 | __all__ = [ 36 | "compute_ticks_per_bar", 37 | "compute_ticks_per_beat", 38 | "concat_scores", 39 | "convert_ids_tensors_to_list", 40 | "detect_chords", 41 | "filter_dataset", 42 | "fix_offsets_overlapping_notes", 43 | "get_average_num_tokens_per_note", 44 | "get_bars_ticks", 45 | "get_beats_ticks", 46 | "get_score_programs", 47 | "get_score_ticks_per_beat", 48 | "is_track_empty", 49 | "merge_scores", 50 | "merge_same_program_tracks", 51 | "merge_tracks", 52 | "merge_tracks_per_class", 53 | "num_bar_pos", 54 | "get_num_notes_per_bar", 55 | "remove_duplicated_notes", 56 | "split_score_per_beats", 57 | "split_score_per_ticks", 58 | "split_score_per_tracks", 59 | "split_files_for_training", 60 | "split_score_per_note_density", 61 | "split_tokens_files_to_subsequences", 62 | "split_seq_in_subsequences", 63 | ] 64 | -------------------------------------------------------------------------------- /tests/MIDIs_corrupted/ValueError_Control168.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_corrupted/ValueError_Control168.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Aicha.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Aicha.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/All The Small Things.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/All The Small Things.mid 
-------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Funkytown.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Funkytown.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Girls Just Want to Have Fun.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Girls Just Want to Have Fun.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/I Gotta Feeling.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/I Gotta Feeling.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/In Too Deep.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/In Too Deep.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Les Yeux Revolvers.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Les Yeux Revolvers.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Mr. Blue Sky.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Mr. 
Blue Sky.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Shut Up.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Shut Up.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/What a Fool Believes.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/What a Fool Believes.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/d6caebd1964d9e4a3c5ea59525230e2a.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/d6caebd1964d9e4a3c5ea59525230e2a.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/d8faddb8596fff7abb24d78666f73e4e.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/d8faddb8596fff7abb24d78666f73e4e.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/6338816_Etude No. 4.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/6338816_Etude No. 4.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/6354774_Macabre Waltz.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/6354774_Macabre Waltz.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_1.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_1.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_10.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_10.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_2.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_2.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_3.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_3.mid -------------------------------------------------------------------------------- 
/tests/MIDIs_one_track/Maestro_4.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_4.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_5.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_5.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_6.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_6.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_7.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_7.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_8.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_8.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_9.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_9.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_008.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_008.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_010.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_010.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_022.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_022.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_191.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_191.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/empty.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/empty.mid 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test module. 3 | 4 | Contains the pytest cases (files beginning with "test"), test utils and benchmarks. 5 | """ 6 | -------------------------------------------------------------------------------- /tests/abc_files/a_morning_in_summer.abc: -------------------------------------------------------------------------------- 1 | X: 1 2 | T: A Morning In Summer 3 | R: jig 4 | M: 6/8 5 | L: 1/8 6 | K: Dmaj 7 | |:A|A2DD2E|F2EF2D|G2FG2A|d2cA2G| 8 | A2DD2E|F2EF2D|D2GF2G|A2GG2:| 9 | G|:F2GA2B|=c3AB^c|d2cd2e|=f2dd2e| 10 | =f2ef2d|=c2AA2F|G2FG2A|d2cA2G:| 11 | -------------------------------------------------------------------------------- /tests/abc_files/flowers_of_edinburgh.abc: -------------------------------------------------------------------------------- 1 | X:14 2 | T:Flowers of Edinburgh 3 | R:Reel 4 | O:Scotland 5 | O:Ireland 6 | M:2/4 7 | L:1/16 8 | K:G 9 | "G"GE|D2DE G2GA|BGBd cBAG|"D7"FGFE DEFG|ABAF E2GE| 10 | "G"D2DE G2GA|"G"BABd "C"efge|"D7"dcBA GFGA|"G"B2G2 G2:| 11 | |:"G"Bd|"G"g2gf gbag|"D7"f2fe fagf|"C"edef gfed|"Em"B2e2 e2ge| 12 | "G"dBGB d2dd|"C"edef "Am"g2fe|"D7"dcBA GFGA|"G"B2G2 G2:| 13 | -------------------------------------------------------------------------------- /tests/abc_files/rising_sun.abc: -------------------------------------------------------------------------------- 1 | X:1 2 | T:Rising Sun [4] 3 | S:fiddler Hiram Horner (Southwestern, Pa., 1944) 4 | M:2/4 5 | L:1/8 6 | K:D 7 | (3A/B/c/||Od>d ~ed/e/|~fd~dA|Be e/f/e/d/|cA A(3A/B/c/| 8 | d2 ~ed/e/|~fddA|Be AB/c/|d2 d|| 9 | z2z|f a3|f a3|{a}g~fe~d|c~BAz| 10 | f a3|f a3|^g~f ec/e/|a2 a/g/f/e/O|| 11 | -------------------------------------------------------------------------------- /tests/abc_files/the_rising_of_the_moon.abc: -------------------------------------------------------------------------------- 1 | % Generated more or less automatically by swtoabc by Erich Rickheit KSC 2 | X:1 3 | T:The Rising of the Moon 4 | M:2/4 5 | L:1/8 6 | K:Eb 7 | C3/2 D/2| EF Gc| B/2 G3/2 FD| GF/2-D/2 CC| C2 C3/2 D/2| EF Gc| B/2 G3/2 FD|\ 8 | GF/2-D/2 C3/2 C/2| C2 Gc| c3/2 B/2 d3/2 c/2| BG FG| B/2 G3/2 Bd| c2 Gc|\ 9 | cB d3/2 c/2| B/2 G3/2 FD| GF/2-E/2 C3/2 C/2| C2|| 10 | -------------------------------------------------------------------------------- /tests/abc_files/the_wheels_of_the_world.abc: -------------------------------------------------------------------------------- 1 | X:340 2 | T:the Wheels of the World (reel) 3 | R:Reel 4 | O:Ireland 5 | B:Ceol Rince 1, n168 6 | S:Ceol Rince 1 7 | Z:Transcription, chords:Mike Long 8 | M:C| 9 | L:1/8 10 | K:G 11 | "D"dD~D2 FAGF|"C"EC~C2 EFGA|"D"dD~D2 FAGF|"C"EDCE "D"D2FA|\ 12 | "D"dD~D2 FAGF|"C"EC~C2 EFGc| 13 | "Am/G"AddB cBAG|"C"ECGE "D"D3|]\ 14 | B|\ 15 | "D"dAdf afdf|"C"ecgc acge|"D"dAdf afdf| 16 | "C"edce "D"d3A|\ 17 | "D"dAdf afdf|"C"ecgc acge|"Am"abag efge|"D"fa"C"ge "D"d3z|] 18 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest configuration file. 3 | 4 | Doc: https://docs.pytest.org/en/latest/reference/reference.html. 
5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import os 10 | 11 | import pytest 12 | 13 | 14 | @pytest.fixture() 15 | def hf_token() -> str | None: 16 | """ 17 | Get the Hugging Face token from the environment variable HF_TOKEN_HUB_TESTS. 18 | 19 | If the variable is not set, the test using this fixture will be skipped. 20 | """ 21 | token = os.environ.get("HF_TOKEN_HUB_TESTS") 22 | if not token: 23 | pytest.skip("HF_TOKEN_HUB_TESTS is not set") 24 | return token 25 | -------------------------------------------------------------------------------- /tests/test_attribute_controls.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | from random import seed 6 | from typing import TYPE_CHECKING, Any, Literal 7 | 8 | import miditok 9 | import pytest 10 | from miditok.attribute_controls import create_random_ac_indexes 11 | from symusic import Score 12 | 13 | from .utils_tests import ( 14 | BARS_RANDOM_RATIO_RANGE, 15 | MIDI_PATHS_ALL, 16 | MIDI_PATHS_ONE_TRACK, 17 | SEED, 18 | TRACKS_RANDOM_RATIO_RANGE, 19 | check_control_tokens_are_well_inserted, 20 | ) 21 | 22 | if TYPE_CHECKING: 23 | from collections.abc import Sequence 24 | from pathlib import Path 25 | 26 | TOKENIZATIONS = ["REMI", "TSD", "MMM"] 27 | TOKENIZER_PARAMS = { 28 | "pitch_range": (21, 109), 29 | "beat_res": {(0, 4): 8, (4, 12): 4}, 30 | "num_velocities": 32, 31 | "special_tokens": ["PAD", "BOS", "EOS", "MASK"], 32 | "use_chords": True, 33 | "use_rests": False, 34 | "use_tempos": True, 35 | "use_time_signatures": True, 36 | "use_programs": False, 37 | "num_tempos": 32, # number of tempo bins 38 | "tempo_range": (40, 250), # (min, max) 39 | "base_tokenizer": "REMI", 40 | "ac_polyphony_track": True, 41 | "ac_polyphony_bar": True, 42 | "ac_pitch_class_bar": True, 43 | "ac_note_density_track": True, 44 | "ac_note_density_bar": True, 45 | "ac_note_duration_bar": True, 46 | "ac_note_duration_track": True, 47 | "ac_repetition_track": True, 48 | } 49 | VOCAB_SIZE = 2000 50 | NUM_ADDITIONAL_TOKENS_SECOND_TRAINING = 400 51 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BAR = 500 # higher than default MidiTok values 52 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BEAT = 150 53 | 54 | 55 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ALL, ids=lambda path: path.name) 56 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 57 | @pytest.mark.parametrize( 58 | "random_tracks_idx", 59 | [False, True], 60 | ids=lambda r: "rand_tracks" if r else "all_tracks", 61 | ) 62 | @pytest.mark.parametrize( 63 | "random_bars_idx", [False, True], ids=lambda r: "rand_bars" if r else "all_bars" 64 | ) 65 | def test_attribute_controls_computation( 66 | file_path: Path, 67 | tokenization: str, 68 | random_tracks_idx: bool, 69 | random_bars_idx: bool, 70 | tokenizer_params: dict[str, Any] | None = None, 71 | ) -> None: 72 | if tokenizer_params is None: 73 | tokenizer_params = TOKENIZER_PARAMS 74 | 75 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 76 | tokenizer_config=miditok.TokenizerConfig(**tokenizer_params) 77 | ) 78 | score = Score(file_path) 79 | score = tokenizer.preprocess_score(score) 80 | 81 | # Set attribute controls indexes 82 | seed(SEED) 83 | tracks_idx_ratio = (0, 1) if random_tracks_idx else 1 84 | bars_idx_ratio = (0, 1) if random_bars_idx else 1 85 | ac_indexes = create_random_ac_indexes( 86 | score, 87 | tokenizer.attribute_controls, 88 | tracks_idx_ratio, 89 | bars_idx_ratio, 90 | ) 91 | 92 | # Tokenize Score with 
attribute controls injected 93 | tokens = tokenizer.encode( 94 | score, no_preprocess_score=True, attribute_controls_indexes=ac_indexes 95 | ) 96 | 97 | # Check for errors 98 | injection_errors = check_control_tokens_are_well_inserted( 99 | tokenizer, score, tokens, ac_indexes 100 | ) 101 | assert len(injection_errors) == 0 102 | 103 | 104 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 105 | @pytest.mark.parametrize("model", ["BPE"]) 106 | @pytest.mark.parametrize( 107 | "encode_ids_split", 108 | ["no", "bar", "beat"], 109 | ids=lambda s: f"{s}_split", 110 | ) 111 | def test_tokenizer_training_and_encoding_decoding( 112 | tokenization: str, 113 | model: Literal["BPE", "Unigram", "WordPiece"], 114 | encode_ids_split: Literal["bar", "beat", "no"], 115 | files_paths: Sequence[Path] = MIDI_PATHS_ONE_TRACK, 116 | vocab_size: int = VOCAB_SIZE, 117 | ): 118 | r""" 119 | Train a tokenizer to make sure the training iterator works with attribute controls. 120 | 121 | :param files_paths: list of paths of music files to use for the tests. 122 | :param encode_ids_split: type of token ids split before encoding/training. 123 | """ 124 | if encode_ids_split == "no" and model == "WordPiece": 125 | pytest.skip(f"Skipping training with {model} and {encode_ids_split} split") 126 | 127 | # Creates tokenizers 128 | TOKENIZER_PARAMS["encode_ids_split"] = encode_ids_split 129 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 130 | tokenizer_config=miditok.TokenizerConfig(**TOKENIZER_PARAMS) 131 | ) 132 | 133 | training_kwargs = {} 134 | if model == "WordPiece": 135 | training_kwargs["max_input_chars_per_word"] = ( 136 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BAR 137 | if encode_ids_split == "bar" 138 | else WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BEAT 139 | ) 140 | 141 | # Train the tokenizer 142 | training_iterator = miditok.TokTrainingIterator( 143 | tokenizer, files_paths, TRACKS_RANDOM_RATIO_RANGE, BARS_RANDOM_RATIO_RANGE 144 | ) 145 | tokenizer.train( 146 | vocab_size=vocab_size + NUM_ADDITIONAL_TOKENS_SECOND_TRAINING, 147 | model=model, 148 | iterator=training_iterator, 149 | **training_kwargs, 150 | ) 151 | -------------------------------------------------------------------------------- /tests/test_data_augmentation.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from miditok.data_augmentation import ( 8 | augment_dataset, 9 | ) 10 | from symusic import Score 11 | from tqdm import tqdm 12 | 13 | from .utils_tests import HERE 14 | 15 | if TYPE_CHECKING: 16 | from pathlib import Path 17 | 18 | 19 | def test_augment_dataset( 20 | tmp_path: Path, 21 | data_path: Path = HERE / "MIDIs_multitrack", 22 | ) -> None: 23 | # We only test data augmentation on MIDIs with one tokenization 24 | 25 | midi_aug_path = tmp_path / "Multitrack_MIDIs_aug" 26 | min_duration = 0.03125 27 | augment_dataset( 28 | data_path, 29 | pitch_offsets=[-2, 1, 2], 30 | velocity_offsets=[-4, 5], 31 | duration_offsets=[-0.5, 1], 32 | all_offset_combinations=True, 33 | min_duration=min_duration, 34 | out_path=midi_aug_path, 35 | ) 36 | 37 | aug_midi_paths = list(midi_aug_path.glob("**/*.mid")) 38 | for aug_midi_path in tqdm( 39 | aug_midi_paths, desc="CHECKING DATA AUGMENTATION ON MIDIS" 40 | ): 41 | # Determine offsets of file 42 | parts = aug_midi_path.stem.split("#") 43 | # If original non-augmented file 44 | if len(parts) < 2: 45 | continue 46 | original_stem, 
offsets_str = parts[0], parts[1].split("_") 47 | offsets = [0, 0, 0] 48 | for offset_str in offsets_str: 49 | for pos, letter in enumerate(["p", "v", "d"]): 50 | if offset_str[0] == letter: 51 | offsets[pos] = int(offset_str[1:]) 52 | 53 | # Load the MIDIs to compare 54 | midi_aug = Score(aug_midi_path) 55 | midi_ogi = Score(data_path / f"{original_stem}.mid") 56 | min_duration_ticks = round(min_duration * midi_aug.ticks_per_quarter) 57 | 58 | # Compare them 59 | for track_ogi, track_aug in zip(midi_ogi.tracks, midi_aug.tracks): 60 | if track_ogi.is_drum: 61 | continue 62 | track_ogi.notes.sort(key=lambda x: (x.start, x.pitch, x.end, x.velocity)) 63 | track_aug.notes.sort(key=lambda x: (x.start, x.pitch, x.end, x.velocity)) 64 | for note_o, note_a in zip(track_ogi.notes, track_aug.notes): 65 | if note_a.pitch != note_o.pitch + offsets[0]: 66 | msg = ( 67 | f"Pitch assertion failed: expected " 68 | f"{note_o.pitch + offsets[0]}, got {note_a.pitch}" 69 | ) 70 | raise ValueError(msg) 71 | # If negative duration offset, dur_exp must be greater than or equal to 72 | # min_duration_ticks 73 | if offsets[2] < 0: 74 | dur_exp = max( 75 | note_o.duration + offsets[2], 76 | min_duration_ticks, 77 | ) 78 | # If positive duration offset, the original duration was just shifted 79 | elif offsets[2] > 0: 80 | dur_exp = note_o.duration + offsets[2] 81 | else: 82 | dur_exp = note_o.duration 83 | if note_a.duration != dur_exp: 84 | msg = ( 85 | f"Duration assertion failed: expected {dur_exp}, got " 86 | f"{note_a.duration}" 87 | ) 88 | raise ValueError(msg) 89 | # We need to re-sort the notes with the velocity key in third position 90 | # before checking their values. 91 | track_ogi.notes.sort(key=lambda x: (x.start, x.pitch, x.velocity)) 92 | track_aug.notes.sort(key=lambda x: (x.start, x.pitch, x.velocity)) 93 | for note_o, note_a in zip(track_ogi.notes, track_aug.notes): 94 | if note_a.velocity not in [1, 127, note_o.velocity + offsets[1]]: 95 | msg = ( 96 | f"Velocity assertion failed: expected one in " 97 | f"{[1, 127, note_o.velocity + offsets[1]]}, got {note_a.velocity}" 98 | ) 99 | raise ValueError(msg) 100 | -------------------------------------------------------------------------------- /tests/test_hf_hub.py: -------------------------------------------------------------------------------- 1 | """Test the integration of the Hugging Face Hub.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from time import sleep 7 | from typing import TYPE_CHECKING 8 | 9 | import miditok 10 | import pytest 11 | from huggingface_hub.utils import HfHubHTTPError 12 | 13 | if TYPE_CHECKING: 14 | from pathlib import Path 15 | 16 | MAX_NUM_TRIES_HF_PUSH = 3 17 | NUM_SECONDS_RETRY = 8 18 | 19 | AUTO_TOKENIZER_CASES = [ 20 | # ("class_name", "save_path", "class_name_assert") 21 | ("REMI", "rem", "REMI"), 22 | ("REMI", "rem2", "TSD"), 23 | ("TSD", "tsd", "TSD"), 24 | ] 25 | 26 | 27 | def test_push_and_load_to_hf_hub(hf_token: str): 28 | tokenizer = miditok.REMI( 29 | miditok.TokenizerConfig(num_velocities=62, pitch_range=(12, 44)) 30 | ) 31 | num_tries = 0 32 | while num_tries < MAX_NUM_TRIES_HF_PUSH: 33 | try: 34 | tokenizer.push_to_hub("Natooz/MidiTok-tests", private=True, token=hf_token); break  # stop retrying once the push succeeded 35 | except HfHubHTTPError as e: 36 | if e.response.status_code == 429: # hourly quota exceeded 37 | # We performed too many tests, skip this one so as not to break the HF servers 🥲 38 | pytest.skip( 39 | "Hugging Face hourly quota exceeded, skipping " 40 | "`test_push_and_load_to_hf_hub` test."
41 | ) 42 | elif e.response.status_code in [500, 412]: 43 | num_tries += 1 44 | sleep(NUM_SECONDS_RETRY) 45 | else: 46 | num_tries = MAX_NUM_TRIES_HF_PUSH 47 | 48 | # If the push could not be performed after all the tries, warn without failing 49 | if num_tries == MAX_NUM_TRIES_HF_PUSH: 50 | warnings.warn("Tokenizer push failed", stacklevel=2) 51 | 52 | tokenizer2 = miditok.REMI.from_pretrained("Natooz/MidiTok-tests", token=hf_token) 53 | assert tokenizer == tokenizer2 54 | 55 | 56 | def test_from_pretrained_local(tmp_path: Path): 57 | # Here using paths to directories 58 | tokenizer = miditok.TSD() 59 | tokenizer.save_pretrained(tmp_path) 60 | tokenizer2 = miditok.TSD.from_pretrained(tmp_path) 61 | assert tokenizer == tokenizer2 62 | 63 | 64 | @pytest.mark.parametrize("params_case", AUTO_TOKENIZER_CASES) 65 | def test_autotokenizer(tmp_path: Path, params_case: tuple[str, str, str]): 66 | tok_class, save_path, tok_class2 = params_case 67 | 68 | tokenizer = getattr(miditok, tok_class)() 69 | tokenizer.save_pretrained(tmp_path / save_path) 70 | tokenizer2 = getattr(miditok, tok_class2)( 71 | params=tmp_path / save_path / "tokenizer.json" 72 | ) 73 | 74 | assert (tokenizer == tokenizer2) == (tok_class == tok_class2) 75 | -------------------------------------------------------------------------------- /tests/test_io_formats.py: -------------------------------------------------------------------------------- 1 | """Testing the possible I/O formats of the tokenizers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from copy import deepcopy 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import miditok 9 | import pytest 10 | from symusic import Score 11 | 12 | from .utils_tests import ( 13 | ALL_TOKENIZATIONS, 14 | HERE, 15 | TOKENIZER_CONFIG_KWARGS, 16 | adjust_tok_params_for_tests, 17 | tokenize_and_check_equals, 18 | ) 19 | 20 | if TYPE_CHECKING: 21 | from pathlib import Path 22 | 23 | default_params = deepcopy(TOKENIZER_CONFIG_KWARGS) 24 | default_params.update( 25 | { 26 | "use_chords": True, 27 | "use_rests": True, 28 | "use_tempos": True, 29 | "use_time_signatures": True, 30 | "use_sustain_pedals": True, 31 | "use_pitch_bends": True, 32 | "base_tokenizer": "TSD", 33 | } 34 | ) 35 | tokenizations_no_one_stream = [ 36 | "TSD", 37 | "REMI", 38 | "MIDILike", 39 | "Structured", 40 | "CPWord", 41 | "Octuple", 42 | ] 43 | configs = ( 44 | { 45 | "use_programs": True, 46 | "one_token_stream_for_programs": True, 47 | "program_changes": False, 48 | }, 49 | { 50 | "use_programs": True, 51 | "one_token_stream_for_programs": True, 52 | "program_changes": True, 53 | }, 54 | { 55 | "use_programs": True, 56 | "one_token_stream_for_programs": False, 57 | "program_changes": False, 58 | }, 59 | ) 60 | TOK_PARAMS_IO = [] 61 | for tokenization_ in ALL_TOKENIZATIONS: 62 | params_ = deepcopy(default_params) 63 | adjust_tok_params_for_tests(tokenization_, params_) 64 | TOK_PARAMS_IO.append((tokenization_, params_)) 65 | 66 | if tokenization_ in tokenizations_no_one_stream: 67 | for config in configs: 68 | params_tmp = deepcopy(params_) 69 | params_tmp.update(config) 70 | TOK_PARAMS_IO.append((tokenization_, params_tmp)) 71 | 72 | 73 | @pytest.mark.parametrize("tok_params_set", TOK_PARAMS_IO) 74 | def test_io_formats( 75 | tok_params_set: tuple[str, dict[str, Any]], 76 | midi_path: Path = HERE / "MIDIs_multitrack" / "Funkytown.mid", 77 | ) -> None: 78 | r""" 79 | Tokenize and decode a MIDI back to make sure the possible I/O formats are OK. 80 | 81 | :param tok_params_set: tokenizer and its parameters to run.
82 | :param midi_path: path to the MIDI file to test. 83 | """ 84 | midi = Score(midi_path) 85 | tokenization, params = tok_params_set 86 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 87 | tokenizer_config=miditok.TokenizerConfig(**params) 88 | ) 89 | 90 | _, _, has_errors = tokenize_and_check_equals(midi, tokenizer, midi_path.stem) 91 | assert not has_errors 92 | -------------------------------------------------------------------------------- /tests/test_preprocess.py: -------------------------------------------------------------------------------- 1 | """Tests on the preprocessing steps of music files, before tokenization.""" 2 | 3 | from pathlib import Path 4 | 5 | import miditok 6 | import pytest 7 | from symusic import Score 8 | 9 | from .utils_tests import MIDI_PATHS_ALL 10 | 11 | CONFIG_KWARGS = { 12 | "use_tempos": True, 13 | "use_time_signatures": True, 14 | "use_sustain_pedals": True, 15 | "use_pitch_bends": True, 16 | "log_tempos": True, 17 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 18 | "delete_equal_successive_time_sig_changes": True, 19 | "delete_equal_successive_tempo_changes": True, 20 | } 21 | TOKENIZATIONS = ["MIDILike", "TSD"] 22 | 23 | 24 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 25 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ALL, ids=lambda p: p.name) 26 | def test_preprocess(tokenization: str, file_path: Path): 27 | r""" 28 | Check that preprocessing an already-preprocessed MIDI does not alter it further. 29 | 30 | :param tokenization: name of the tokenizer class. 31 | :param file_path: path to the MIDI file to test. 32 | """ 33 | # Create the tokenizer 34 | tok_config = miditok.TokenizerConfig(**CONFIG_KWARGS) 35 | tokenizer = getattr(miditok, tokenization)(tok_config) 36 | 37 | # Preprocess the original file, then preprocess the result once again 38 | score = Score(file_path) 39 | score_processed1 = tokenizer.preprocess_score(score) 40 | score_processed2 = tokenizer.preprocess_score(score_processed1) 41 | 42 | # The second preprocess shouldn't do anything 43 | assert score_processed1 == score_processed2 44 | -------------------------------------------------------------------------------- /tests/test_pytorch_data_loading.py: -------------------------------------------------------------------------------- 1 | """Test classes and methods from the pytorch_data module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from time import time 6 | from typing import TYPE_CHECKING 7 | 8 | import miditok 9 | import pytest 10 | from torch import randint 11 | from torch.utils.data import DataLoader 12 | 13 | from .utils_tests import ( 14 | ABC_PATHS, 15 | MAX_BAR_EMBEDDING, 16 | MIDI_PATHS_CORRUPTED, 17 | MIDI_PATHS_MULTITRACK, 18 | MIDI_PATHS_ONE_TRACK, 19 | ) 20 | 21 | if TYPE_CHECKING: 22 | from collections.abc import Callable, Sequence 23 | from pathlib import Path 24 | 25 | from symusic import Score 26 | 27 | 28 | def get_labels_seq_len(score: Score, tokseq: miditok.TokSequence, _: Path) -> int: 29 | num_track = 1 if len(score.tracks) == 0 else len(score.tracks) 30 | if isinstance(tokseq, miditok.TokSequence): 31 | return len(tokseq) // num_track 32 | return len(tokseq[0]) // num_track 33 | 34 | 35 | def get_labels_seq(score: Score, tokseq: miditok.TokSequence, _: Path) -> list[int]: 36 | if isinstance(tokseq, list): 37 | return tokseq[0].ids[: -len(score.tracks)] 38 | if len(tokseq) > len(score.tracks): 39 | return tokseq.ids[: -len(score.tracks)] 40 | return tokseq.ids 41 | 42 |
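# An illustrative aside (not part of the original test file): any callable with the
# same signature as the two helpers above can be passed to ``DatasetMIDI`` as
# ``func_to_get_labels``. It receives the ``Score``, the token sequence(s) and the
# file path, and returns the label of the sample. For instance, a hypothetical
# labelling function tagging each sample with its number of tracks:
def get_labels_num_tracks(score: Score, _tokseq: miditok.TokSequence, _: Path) -> int:
    # The label is simply the number of tracks of the score
    return len(score.tracks)
# It could then be passed as ``func_to_get_labels=get_labels_num_tracks`` when
# creating the ``DatasetMIDI`` in the test below.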
"tokenizer_cls", [miditok.TSD, miditok.Octuple], ids=["TSD", "Octuple"] 45 | ) 46 | @pytest.mark.parametrize( 47 | "one_token_stream_for_programs", [True, False], ids=["1 strm", "n strm"] 48 | ) 49 | @pytest.mark.parametrize("split_files", [True, False], ids=["split", "no split"]) 50 | @pytest.mark.parametrize("pre_tokenize", [True, False], ids=["pretok", "no pretok"]) 51 | @pytest.mark.parametrize("ac_random_tracks_ratio", [None, (0.0, 1.0)]) 52 | @pytest.mark.parametrize("ac_random_bars_ratio", [None, (0.0, 1.0)]) 53 | @pytest.mark.parametrize("func_labels", [get_labels_seq_len, get_labels_seq]) 54 | @pytest.mark.parametrize("num_overlap_bars", [0, 1], ids=["no overlap", "overlap"]) 55 | def test_dataset_midi( 56 | tmp_path: Path, 57 | tokenizer_cls: Callable, 58 | one_token_stream_for_programs: bool, 59 | split_files: bool, 60 | pre_tokenize: bool, 61 | ac_random_tracks_ratio: tuple[float, float] | None, 62 | ac_random_bars_ratio: tuple[float, float] | None, 63 | func_labels: Callable, 64 | num_overlap_bars: int, 65 | files_paths: Sequence[Path] = MIDI_PATHS_MULTITRACK 66 | + MIDI_PATHS_ONE_TRACK 67 | + MIDI_PATHS_CORRUPTED 68 | + ABC_PATHS, 69 | max_seq_len: int = 1000, 70 | ): 71 | config = miditok.TokenizerConfig( 72 | use_programs=True, 73 | one_token_stream_for_programs=one_token_stream_for_programs, 74 | max_bar_embedding=MAX_BAR_EMBEDDING, 75 | ) 76 | tokenizer = tokenizer_cls(config) 77 | 78 | # Split files if requested 79 | # We perform it twice as the second time, the method would return the same paths as 80 | # the ones created in the first call. 81 | if split_files: 82 | t0 = time() 83 | file_paths_split1 = miditok.utils.split_files_for_training( 84 | files_paths, 85 | tokenizer, 86 | tmp_path, 87 | max_seq_len, 88 | num_overlap_bars=num_overlap_bars, 89 | ) 90 | t1 = time() - t0 91 | print(f"First Score split call: {t1:.2f} sec") 92 | t0 = time() 93 | file_paths_split2 = miditok.utils.split_files_for_training( 94 | files_paths, 95 | tokenizer, 96 | tmp_path, 97 | max_seq_len, 98 | num_overlap_bars=num_overlap_bars, 99 | ) 100 | t1 = time() - t0 101 | print(f"Second Score split call: {t1:.2f} sec") 102 | 103 | file_paths_split1.sort() 104 | file_paths_split2.sort() 105 | assert file_paths_split1 == file_paths_split2 106 | files_paths = file_paths_split1 107 | 108 | # Creating the Dataset, splitting MIDIs 109 | t0 = time() 110 | dataset = miditok.pytorch_data.DatasetMIDI( 111 | files_paths, 112 | tokenizer, 113 | max_seq_len, 114 | tokenizer["BOS_None"], 115 | tokenizer["EOS_None"], 116 | pre_tokenize=pre_tokenize, 117 | ac_tracks_random_ratio_range=ac_random_tracks_ratio, 118 | ac_bars_random_ratio_range=ac_random_bars_ratio, 119 | func_to_get_labels=func_labels, 120 | ) 121 | t1 = time() - t0 122 | print(f"Dataset init took {t1:.2f} sec") 123 | 124 | # Test iteration, and collator with user labels 125 | batch = [dataset[i] for i in range(min(len(dataset), 10))] 126 | 127 | # Test with DataLoader and collator 128 | collator = miditok.pytorch_data.DataCollator( 129 | tokenizer.pad_token_id, 130 | pad_on_left=True, 131 | ) 132 | _ = collator(batch) 133 | dataloader = DataLoader(dataset, 16, collate_fn=collator) 134 | for _ in dataloader: 135 | pass 136 | 137 | 138 | def test_dataset_json(tmp_path: Path, file_paths: Sequence[Path] | None = None): 139 | if file_paths is None: 140 | file_paths = MIDI_PATHS_MULTITRACK[:5] 141 | tokens_dir_path = tmp_path / "multitrack_tokens_dataset_json" 142 | 143 | config = miditok.TokenizerConfig(use_programs=True) 144 | tokenizer = 
miditok.TSD(config) 145 | if not tokens_dir_path.is_dir(): 146 | tokenizer.tokenize_dataset(file_paths, tokens_dir_path) 147 | 148 | tokens_split_dir_path = tmp_path / "multitrack_tokens_dataset_json_split" 149 | miditok.utils.split_tokens_files_to_subsequences( 150 | list(tokens_dir_path.glob("**/*.json")), 151 | tokens_split_dir_path, 152 | 300, 153 | 1000, 154 | ) 155 | dataset = miditok.pytorch_data.DatasetJSON( 156 | list(tokens_split_dir_path.glob("**/*.json")), 157 | 1000, 158 | tokenizer["BOS_None"], 159 | tokenizer["EOS_None"], 160 | ) 161 | 162 | for _ in dataset: 163 | pass 164 | 165 | 166 | def test_collator(): 167 | collator = miditok.pytorch_data.DataCollator( 168 | 0, 169 | pad_on_left=True, 170 | copy_inputs_as_labels=True, 171 | shift_labels=True, 172 | ) 173 | seq_lengths = [120, 100, 80, 200] 174 | 175 | # Just input ids 176 | batch_from_dataloader = [ 177 | {"input_ids": randint(0, 300, (seq_len,))} for seq_len in seq_lengths 178 | ] 179 | batch_collated = collator(batch_from_dataloader) 180 | # seq_len - 1 as we shift labels 181 | assert list(batch_collated["input_ids"].size()) == [ 182 | len(seq_lengths), 183 | max(seq_lengths) - 1, 184 | ] 185 | 186 | # Encoder and decoder input ids 187 | batch_from_dataloader = [ 188 | { 189 | "input_ids": randint(0, 300, (seq_len,)), 190 | "decoder_input_ids": randint(0, 300, (seq_len,)), 191 | } 192 | for seq_len in seq_lengths 193 | ] 194 | batch_collated = collator(batch_from_dataloader) 195 | # seq_len - 1 as we shift labels 196 | assert list(batch_collated["input_ids"].size()) == [ 197 | len(seq_lengths), 198 | max(seq_lengths), 199 | ] 200 | assert list(batch_collated["decoder_input_ids"].size()) == [ 201 | len(seq_lengths), 202 | max(seq_lengths) - 1, 203 | ] 204 | 205 | # This time with labels already in batch and embed pooling, padding right 206 | collator.pad_on_left = False 207 | batch_from_dataloader = [ 208 | { 209 | "input_ids": randint(0, 300, (seq_len, 5)), 210 | "decoder_input_ids": randint(0, 300, (seq_len, 5)), 211 | "labels": randint(0, 300, (seq_len, 5)), 212 | } 213 | for seq_len in seq_lengths 214 | ] 215 | batch_collated = collator(batch_from_dataloader) 216 | assert list(batch_collated["input_ids"].size()) == [ 217 | len(seq_lengths), 218 | max(seq_lengths), 219 | 5, 220 | ] 221 | assert list(batch_collated["decoder_input_ids"].size()) == [ 222 | len(seq_lengths), 223 | max(seq_lengths) - 1, 224 | 5, 225 | ] 226 | assert list(batch_collated["labels"].size()) == [ 227 | len(seq_lengths), 228 | max(seq_lengths) - 1, 229 | 5, 230 | ] 231 | -------------------------------------------------------------------------------- /tests/test_saving_loading_config.py: -------------------------------------------------------------------------------- 1 | """Tests for the saving/loading methods of tokenizers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from copy import deepcopy 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import miditok 9 | import pytest 10 | 11 | from .utils_tests import ( 12 | ALL_TOKENIZATIONS, 13 | MAX_BAR_EMBEDDING, 14 | MIDI_PATHS_MULTITRACK, 15 | MIDI_PATHS_ONE_TRACK, 16 | ) 17 | 18 | if TYPE_CHECKING: 19 | from pathlib import Path 20 | 21 | ADDITIONAL_TOKENS_TEST = { 22 | "use_chords": False, # False to speed up tests 23 | "use_rests": True, 24 | "use_tempos": True, 25 | "use_time_signatures": True, 26 | "use_programs": False, 27 | "beat_res_rest": {(0, 16): 4}, 28 | "num_tempos": 32, 29 | "tempo_range": (40, 250), 30 | "base_tokenizer": "TSD", 31 | "use_microtiming": True, 32 | 
"ticks_per_quarter": 480, 33 | "max_microtiming_shift": 0.25, 34 | "num_microtiming_bins": 110, 35 | } 36 | 37 | TOK_PARAMS_MULTITRACK = [] 38 | tokenizations_non_one_stream = [ 39 | "TSD", 40 | "REMI", 41 | "MIDILike", 42 | "Structured", 43 | "CPWord", 44 | "Octuple", 45 | ] 46 | for tokenization_ in ALL_TOKENIZATIONS: 47 | params_ = {"use_programs": True} 48 | if tokenization_ == "MMM": 49 | params_["base_tokenizer"] = "TSD" 50 | elif tokenization_ in ["Octuple", "MuMIDI"]: 51 | params_["max_bar_embedding"] = MAX_BAR_EMBEDDING 52 | elif tokenization_ in ["PerTok"]: 53 | params_["use_microtiming"] = True 54 | params_["ticks_per_quarter"] = 220 55 | params_["max_microtiming_shift"] = 0.25 56 | params_["num_microtiming_bins"] = 110 57 | TOK_PARAMS_MULTITRACK.append((tokenization_, params_)) 58 | 59 | if tokenization_ in tokenizations_non_one_stream: 60 | params_tmp = deepcopy(params_) 61 | params_tmp["one_token_stream_for_programs"] = False 62 | # Disable tempos for Octuple with one_token_stream_for_programs, as tempos are 63 | # carried by note tokens 64 | if tokenization_ == "Octuple": 65 | params_tmp["use_tempos"] = False 66 | TOK_PARAMS_MULTITRACK.append((tokenization_, params_tmp)) 67 | 68 | 69 | @pytest.mark.parametrize("tokenization", ALL_TOKENIZATIONS) 70 | def test_saving_loading_tokenizer_config(tokenization: str, tmp_path: Path): 71 | config1 = miditok.TokenizerConfig() 72 | config1.save_to_json(tmp_path / f"tok_conf_{tokenization}.json") 73 | 74 | config2 = miditok.TokenizerConfig.load_from_json( 75 | tmp_path / f"tok_conf_{tokenization}.json" 76 | ) 77 | 78 | assert config1 == config2 79 | config1.pitch_range = (0, 777) 80 | assert config1 != config2 81 | 82 | 83 | @pytest.mark.parametrize("tokenization", ALL_TOKENIZATIONS) 84 | def test_saving_loading_tokenizer(tokenization: str, tmp_path: Path): 85 | r""" 86 | Make sure saving and loading end with the identical tokenizer. 87 | 88 | Create a tokenizer, save its config, and load it back. 89 | If all went well the reloaded tokenizer should be identical. 
90 | """ 91 | tokenizer_config = miditok.TokenizerConfig(**ADDITIONAL_TOKENS_TEST) 92 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 93 | tokenizer_config=tokenizer_config 94 | ) 95 | tokenizer.save(tmp_path / f"{tokenization}.txt") 96 | 97 | tokenizer2: miditok.MusicTokenizer = getattr(miditok, tokenization)( 98 | params=tmp_path / f"{tokenization}.txt" 99 | ) 100 | assert tokenizer == tokenizer2 101 | if tokenization == "Octuple": 102 | tokenizer.vocab[0]["PAD_None"] = 8 103 | assert tokenizer != tokenizer2 104 | 105 | 106 | @pytest.mark.parametrize("file_path", MIDI_PATHS_MULTITRACK[:3], ids=lambda p: p.name) 107 | @pytest.mark.parametrize("tok_params_set", TOK_PARAMS_MULTITRACK) 108 | def test_multitrack_midi_to_tokens_to_midi( 109 | file_path: Path, 110 | tok_params_set: tuple[str, dict[str, Any]], 111 | tmp_path: Path, 112 | ): 113 | # Create tokenizer 114 | tokenization, params = tok_params_set 115 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 116 | tokenizer_config=miditok.TokenizerConfig(**params) 117 | ) 118 | 119 | # Tokenize the file, save tokens and load them back 120 | tokens = tokenizer(file_path) 121 | tokenizer.save_tokens(tokens, tmp_path / "tokens.json") 122 | tokens_loaded = tokenizer.load_tokens(tmp_path / "tokens.json") 123 | 124 | # Assert tokens are the same 125 | assert tokens == tokens_loaded 126 | 127 | 128 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ONE_TRACK[:3], ids=lambda p: p.name) 129 | def test_pertok_microtiming_tick_values(file_path: Path): 130 | # Create the pertok tokenizer 131 | cfg = miditok.TokenizerConfig( 132 | use_chords=False, 133 | use_microtiming=True, 134 | ticks_per_quarter=480, 135 | max_microtiming_shift=0.25, 136 | num_microtiming_bins=110, 137 | ) 138 | tok = miditok.PerTok(cfg) 139 | # Train the tokenizer 140 | tok.train(files_paths=[file_path], vocab_size=1000) 141 | # Dump the tokenizer to a JSON 142 | tok.save("tmp.json") 143 | # Reload the tokenizer 144 | newtok = miditok.PerTok(params="tmp.json") 145 | # Should still have the microtiming_tick_values parameter 146 | assert hasattr(newtok, "microtiming_tick_values") 147 | -------------------------------------------------------------------------------- /tests/test_toksequence.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from collections.abc import Callable 4 | from pathlib import Path 5 | 6 | import pytest 7 | from miditok import TSD, TokenizerConfig, TokSequence 8 | 9 | from .utils_tests import MIDI_PATHS_MULTITRACK 10 | 11 | 12 | def test_tokseq_concat(): 13 | ids1 = list(range(10)) 14 | ids2 = list(range(10, 20)) 15 | str1 = [str(id_ * 2) for id_ in ids1] 16 | str2 = [str(id_ * 2) for id_ in ids2] 17 | bytes1 = "".join(str1) 18 | bytes2 = "".join(str2) 19 | 20 | tokseq1 = TokSequence(ids=ids1, tokens=str1, bytes=bytes1) 21 | tokseq2 = TokSequence(ids=ids2, tokens=str2, bytes=bytes2) 22 | seq_concat = tokseq1 + tokseq2 23 | 24 | assert seq_concat.ids == ids1 + ids2 25 | assert seq_concat.tokens == str1 + str2 26 | assert seq_concat.bytes == bytes1 + bytes2 27 | 28 | 29 | def test_tokseq_slice_and_concat(): 30 | ids1 = list(range(20)) 31 | str1 = [str(id_ * 2) for id_ in ids1] 32 | bytes1 = "".join(str1) 33 | 34 | tokseq = TokSequence(ids=ids1, tokens=str1, bytes=bytes1) 35 | subseq1 = tokseq[:10] 36 | subseq2 = tokseq[10:] 37 | 38 | assert subseq1.ids == ids1[:10] 39 | assert subseq1.tokens == str1[:10] 40 | assert subseq1.bytes == bytes1[:10] 41 | assert 
subseq2.ids == ids1[10:] 42 | assert subseq2.tokens == str1[10:] 43 | assert subseq2.bytes == bytes1[10:] 44 | 45 | tokseq_concat = subseq1 + subseq2 46 | assert tokseq == tokseq_concat 47 | 48 | 49 | @pytest.mark.parametrize("file_path", MIDI_PATHS_MULTITRACK, ids=lambda p: p.name) 50 | def test_split_tokseq_per_bars_beats(file_path: Path, tokenization: Callable = TSD): 51 | tokenizer = tokenization(TokenizerConfig(use_programs=True)) 52 | tokseq = tokenizer(file_path) 53 | 54 | # Split per bars 55 | seqs = tokseq.split_per_bars() 56 | concat_seq = seqs.pop(0) 57 | for seq in seqs: 58 | concat_seq += seq 59 | assert concat_seq == tokseq 60 | 61 | # Split per beats 62 | seqs = tokseq.split_per_beats() 63 | concat_seq = seqs.pop(0) 64 | for seq in seqs: 65 | concat_seq += seq 66 | assert concat_seq == tokseq 67 | --------------------------------------------------------------------------------
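To close the section, a small sketch illustrating the left-padding behaviour of ``DataCollator`` described at the top of this section: with ``pad_on_left=True``, padding ids are placed on the first indices, so the last element of each sequence remains its last token. The padding id 0 and the toy sequences below are placeholders:

from torch import LongTensor

import miditok

collator = miditok.pytorch_data.DataCollator(0, pad_on_left=True)
batch = [
    {"input_ids": LongTensor([5, 6, 7])},
    {"input_ids": LongTensor([8, 9])},
]
collated = collator(batch)
# Expected: [[5, 6, 7], [0, 8, 9]] -- the shorter sequence is padded on the left
print(collated["input_ids"])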