├── .github ├── codecov.yml ├── dependabot.yml └── workflows │ ├── close-stale-issues.yml │ ├── docs.yml │ ├── lint.yml │ ├── publish-pypi.yml │ └── pytest.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarks ├── README.md ├── __init__.py ├── midi_file_read │ ├── README.md │ ├── benchmark_midi_read.py │ ├── midi_read.csv │ ├── midi_read.md │ └── midi_read.txt ├── miditok_preprocess_file │ ├── README.md │ ├── benchmark_preprocess.py │ ├── preprocess.csv │ ├── preprocess.md │ └── preprocess.txt ├── miditok_tokenize │ ├── README.md │ ├── benchmark_tokenize.py │ ├── tokenize.csv │ ├── tokenize.md │ └── tokenize.txt ├── tokenizer_training │ ├── README.md │ ├── benchmark_training.py │ └── results │ │ ├── seq_split_lengths.csv │ │ ├── seq_split_lengths.md │ │ ├── seq_split_lengths.txt │ │ ├── wordpiece_max_chars_enc_time.csv │ │ ├── wordpiece_max_chars_enc_time.md │ │ ├── wordpiece_max_chars_enc_time.txt │ │ ├── wordpiece_max_chars_train_time.csv │ │ ├── wordpiece_max_chars_train_time.md │ │ └── wordpiece_max_chars_train_time.txt └── utils.py ├── colab-notebooks ├── Example_HuggingFace_Mistral_Transformer.ipynb ├── MidiTok_Full_Workflow_Tutorial.ipynb └── README.md ├── docs ├── Makefile ├── additional_tokens_table.csv ├── assets │ ├── Octuple_TS_Rest │ │ ├── original.png │ │ └── tokenized.png │ ├── bases │ │ ├── pianoroll_daw.png │ │ ├── sheet_music.png │ │ └── spectrogram.png │ ├── cp_word.png │ ├── embeddings.png │ ├── favicon.png │ ├── midi_like.png │ ├── midi_preprocessing_original.png │ ├── midi_preprocessing_preprocessed.png │ ├── miditok_logo.png │ ├── miditok_logo.svg │ ├── miditok_logo_stroke.png │ ├── mumidi.png │ ├── music_sheet.png │ ├── octuple.png │ ├── pitch_intervals.png │ ├── pitch_intervals_original.png │ ├── remi.png │ ├── remiplus.png │ ├── structured.png │ ├── transformer.png │ └── tsd.png ├── attribute_controls.rst ├── citations.rst ├── conf.py ├── configuration.rst ├── data_augmentation.rst ├── examples.rst ├── hf_hub.rst ├── index.rst ├── make.bat ├── midi.rst ├── music_formats.rst ├── pytorch_data.rst ├── sequential_models.rst ├── tokenizations.rst ├── tokenizing_music_with_miditok.rst ├── train.rst └── utils.rst ├── pyproject.toml ├── src └── miditok │ ├── __init__.py │ ├── attribute_controls │ ├── __init__.py │ ├── bar_attribute_controls.py │ ├── classes.py │ └── track_attribute_controls.py │ ├── classes.py │ ├── constants.py │ ├── data_augmentation │ ├── __init__.py │ └── data_augmentation.py │ ├── midi_tokenizer.py │ ├── pytorch_data │ ├── __init__.py │ ├── collators.py │ └── datasets.py │ ├── tokenizations │ ├── __init__.py │ ├── cp_word.py │ ├── midi_like.py │ ├── mmm.py │ ├── mumidi.py │ ├── octuple.py │ ├── pertok.py │ ├── remi.py │ ├── structured.py │ └── tsd.py │ ├── tokenizer_training_iterator.py │ └── utils │ ├── __init__.py │ ├── split.py │ └── utils.py └── tests ├── MIDIs_corrupted └── ValueError_Control168.mid ├── MIDIs_multitrack ├── Aicha.mid ├── All The Small Things.mid ├── Funkytown.mid ├── Girls Just Want to Have Fun.mid ├── I Gotta Feeling.mid ├── In Too Deep.mid ├── Les Yeux Revolvers.mid ├── Mr. Blue Sky.mid ├── Shut Up.mid ├── What a Fool Believes.mid ├── d6caebd1964d9e4a3c5ea59525230e2a.mid └── d8faddb8596fff7abb24d78666f73e4e.mid ├── MIDIs_one_track ├── 6338816_Etude No. 
4.mid ├── 6354774_Macabre Waltz.mid ├── Maestro_1.mid ├── Maestro_10.mid ├── Maestro_2.mid ├── Maestro_3.mid ├── Maestro_4.mid ├── Maestro_5.mid ├── Maestro_6.mid ├── Maestro_7.mid ├── Maestro_8.mid ├── Maestro_9.mid ├── POP909_008.mid ├── POP909_010.mid ├── POP909_022.mid ├── POP909_191.mid └── empty.mid ├── __init__.py ├── abc_files ├── a_morning_in_summer.abc ├── flowers_of_edinburgh.abc ├── rising_sun.abc ├── the_rising_of_the_moon.abc └── the_wheels_of_the_world.abc ├── conftest.py ├── test_attribute_controls.py ├── test_data_augmentation.py ├── test_hf_hub.py ├── test_io_formats.py ├── test_methods.py ├── test_preprocess.py ├── test_pytorch_data_loading.py ├── test_saving_loading_config.py ├── test_tokenize.py ├── test_toksequence.py ├── test_train.py ├── test_utils.py └── utils_tests.py /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | # Codecov params 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: false # disable the default status that measures the entire project 7 | tests: 8 | paths: 9 | - "tests/" 10 | target: 70% 11 | source: 12 | paths: 13 | - "src/miditok/" 14 | target: 75% 15 | threshold: 0.5% 16 | patch: 17 | default: 18 | enabled: no # target: 75% # new contributions should have a coverage at least equal to target 19 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | # Check for updates to GitHub Actions every week 12 | interval: "weekly" 13 | -------------------------------------------------------------------------------- /.github/workflows/close-stale-issues.yml: -------------------------------------------------------------------------------- 1 | # This workflow will automatically mark inactive issues as stale, and close them. 2 | # For more information see: https://github.com/marketplace/actions/close-stale-issues and https://docs.github.com/en/github-ae@latest/actions/managing-issues-and-pull-requests/closing-inactive-issues 3 | 4 | name: Close inactive issues 5 | on: 6 | schedule: 7 | - cron: "30 1 * * *" 8 | 9 | jobs: 10 | close-issues: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | issues: write 14 | pull-requests: write 15 | steps: 16 | - uses: actions/stale@v9.1.0 17 | with: 18 | days-before-issue-stale: 21 19 | days-before-issue-close: 7 20 | stale-issue-label: "stale" 21 | stale-issue-message: "This issue is stale because it has been open for 21 days with no activity." 22 | close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale." 23 | days-before-pr-stale: -1 24 | days-before-pr-close: -1 25 | repo-token: ${{ secrets.GITHUB_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # Build docs preview in pull-requests.
2 | 3 | name: docs/preview 4 | on: 5 | pull_request_target: 6 | types: 7 | - opened 8 | # Execute this action only on PRs that touch 9 | # documentation files. 10 | # paths: 11 | # - "docs/**" 12 | 13 | permissions: 14 | pull-requests: write 15 | 16 | jobs: 17 | documentation-links: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: readthedocs/actions/preview@v1 21 | with: 22 | project-slug: "miditok" 23 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | - uses: pre-commit/action@v3.0.1 16 | env: 17 | RUFF_OUTPUT_FORMAT: github 18 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Publish package on PyPi 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | build: 17 | name: Build distribution 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.x" 25 | - name: Install pypa/build 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install hatch --user 29 | - name: Build a binary wheel and a source tarball 30 | run: hatch build 31 | - name: Store the distribution packages 32 | uses: actions/upload-artifact@v4 33 | with: 34 | name: python-package-distributions 35 | path: dist/ 36 | 37 | pypi-publish: 38 | name: Upload release to PyPI 39 | needs: 40 | - build 41 | runs-on: ubuntu-latest 42 | environment: 43 | name: PyPI 44 | url: https://pypi.org/p/MidiTok 45 | permissions: 46 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 47 | steps: 48 | # retrieve your distributions here 49 | - name: Download all the dists 50 | uses: actions/download-artifact@v4 51 | with: 52 | name: python-package-distributions 53 | path: dist/ 54 | 55 | - name: Publish package distributions to PyPI 56 | uses: pypa/gh-action-pypi-publish@release/v1 57 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # PyTest workflow 2 | 3 | name: Tests 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | pull_request: 9 | branches: [main] 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.9", "3.10", "3.12"] 17 | os: [ ubuntu-latest, macos-latest, windows-latest ] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ 
matrix.python-version }} 26 | cache: pip 27 | cache-dependency-path: pyproject.toml 28 | 29 | - name: Install dependencies 30 | run: | 31 | # Install local package with tests dependencies extras 32 | python -m pip install --upgrade pip 33 | pip install -e ".[tests]" 34 | 35 | # Tokenizer training tests are significantly slower than others. 36 | # So that xdist don't assign chunks of training tests to the same worker, we use 37 | # the `--dist worksteal` distribution mode to dynamically reassign queued tests to 38 | # free workers. 39 | - name: Test with pytest 40 | run: python -m pytest --cov=./ --cov-report=xml -n logical --dist worksteal --durations=0 -v tests 41 | env: 42 | HF_TOKEN_HUB_TESTS: ${{ secrets.HF_TOKEN_HUB_TESTS }} 43 | 44 | - name: Codecov 45 | uses: codecov/codecov-action@v5.4.3 46 | with: 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | build: 50 | runs-on: ubuntu-latest 51 | steps: 52 | - uses: actions/checkout@v4 53 | - name: Set up Python 54 | uses: actions/setup-python@v5 55 | with: 56 | python-version: '3.x' 57 | - name: Install dependencies 58 | run: pip install hatch 59 | - name: Build package 60 | run: hatch build 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated files in test 2 | tests/configs 3 | tests/Multitrack_tokens 4 | tests/Multitrack_tokens_aug 5 | tests/Multitrack_MIDIs_aug 6 | 7 | # Standard Python gitignore from https://github.com/github/gitignore/blob/main/Python.gitignore 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 118 | .pdm.toml 119 | .pdm-python 120 | .pdm-build/ 121 | 122 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 123 | __pypackages__/ 124 | 125 | # Celery stuff 126 | celerybeat-schedule 127 | celerybeat.pid 128 | 129 | # SageMath parsed files 130 | *.sage.py 131 | 132 | # Environments 133 | .env 134 | .venv 135 | env/ 136 | venv/ 137 | ENV/ 138 | env.bak/ 139 | venv.bak/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # Cython debug symbols 163 | cython_debug/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 170 | #.idea/ 171 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 172 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 173 | 174 | # User-specific stuff 175 | .idea/**/workspace.xml 176 | .idea/**/tasks.xml 177 | .idea/**/usage.statistics.xml 178 | .idea/**/dictionaries 179 | .idea/**/shelf 180 | 181 | # AWS User-specific 182 | .idea/**/aws.xml 183 | 184 | # Generated files 185 | .idea/**/contentModel.xml 186 | 187 | # Sensitive or high-churn files 188 | .idea/**/dataSources/ 189 | .idea/**/dataSources.ids 190 | .idea/**/dataSources.local.xml 191 | .idea/**/sqlDataSources.xml 192 | .idea/**/dynamic.xml 193 | .idea/**/uiDesigner.xml 194 | .idea/**/dbnavigator.xml 195 | 196 | # Gradle 197 | .idea/**/gradle.xml 198 | .idea/**/libraries 199 | 200 | # Gradle and Maven with auto-import 201 | # When using Gradle or Maven with auto-import, you should exclude module files, 202 | # since they will be recreated, and may cause churn. Uncomment if using 203 | # auto-import. 
204 | # .idea/artifacts 205 | # .idea/compiler.xml 206 | # .idea/jarRepositories.xml 207 | # .idea/modules.xml 208 | # .idea/*.iml 209 | # .idea/modules 210 | # *.iml 211 | # *.ipr 212 | 213 | # CMake 214 | cmake-build-*/ 215 | 216 | # Mongo Explorer plugin 217 | .idea/**/mongoSettings.xml 218 | 219 | # File-based project format 220 | *.iws 221 | 222 | # IntelliJ 223 | out/ 224 | 225 | # mpeltonen/sbt-idea plugin 226 | .idea_modules/ 227 | 228 | # JIRA plugin 229 | atlassian-ide-plugin.xml 230 | 231 | # Cursive Clojure plugin 232 | .idea/replstate.xml 233 | 234 | # SonarLint plugin 235 | .idea/sonarlint/ 236 | 237 | # Crashlytics plugin (for Android Studio and IntelliJ) 238 | com_crashlytics_export_strings.xml 239 | crashlytics.properties 240 | crashlytics-build.properties 241 | fabric.properties 242 | 243 | # Editor-based Rest Client 244 | .idea/httpRequests 245 | 246 | # Android studio 3.1+ serialized cache file 247 | .idea/caches/build_file_checksums.ser 248 | 249 | # Aider cache directory 250 | .aider* 251 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.5.0 4 | hooks: 5 | - id: ruff 6 | args: 7 | - --fix 8 | - id: ruff-format 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.6.0 11 | hooks: 12 | - id: end-of-file-fixer 13 | - id: trailing-whitespace 14 | #- repo: https://github.com/pre-commit/mirrors-mypy 15 | # rev: v1.10.0 16 | # hooks: 17 | # - id: mypy 18 | # # types: [ python ] 19 | # args: [--strict, --ignore-missing-imports] # --no-warn-return-any 20 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | # You can also specify other tool versions: 13 | # nodejs: "20" 14 | # rust: "1.70" 15 | # golang: "1.20" 16 | 17 | # Build documentation in the "docs/" directory with Sphinx 18 | sphinx: 19 | configuration: docs/conf.py 20 | # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs 21 | # builder: "dirhtml" 22 | # Fail on all warnings to avoid broken references 23 | fail_on_warning: true 24 | 25 | # Optionally build your docs in additional formats such as PDF and ePub 26 | # formats: 27 | # - pdf 28 | # - epub 29 | 30 | # Optional but recommended, declare the Python requirements required 31 | # to build your documentation 32 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 33 | python: 34 | install: 35 | - method: pip 36 | path: . 37 | extra_requirements: 38 | - docs 39 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Fradet" 5 | given-names: "Nathan" 6 | orcid: "https://orcid.org/0000-0003-4729-570X" 7 | - family-names: "Briot" 8 | given-names: "Jean-Pierre" 9 | orcid: "https://orcid.org/0000-0003-1621-6335" 10 | - family-names: "Chhel" 11 | given-names: "Fabien" 12 | orcid: "https://orcid.org/0000-0003-2224-8296" 13 | - family-names: "El Fallah Seghrouchni" 14 | given-names: "Amal" 15 | orcid: "https://orcid.org/0000-0002-8390-8780" 16 | - family-names: "Gutowski" 17 | given-names: "Nicolas" 18 | orcid: "https://orcid.org/0000-0002-5765-9901" 19 | title: "MidiTok: A Python package for MIDI file tokenization" 20 | license: MIT 21 | date-released: 2021-11-07 22 | url: "https://github.com/Natooz/MidiTok" 23 | repository-code: "https://github.com/Natooz/MidiTok" 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `MidiTok` 2 | There are many ways to contribute, such as: 3 | - Reporting a bug. 4 | - Discussing the current state of the code. 5 | - Submitting a fix. 6 | - Proposing new features. 7 | - Becoming a maintainer. 8 | 9 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests 10 | 11 | Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 12 | 13 | 1. Fork the repo and create your branch from `main`. 14 | 2. If you've added code that should be tested, add [tests](tests). 15 | 3. If you've changed APIs, update the documentation. 16 | 4. Ensure the test suite passes. 17 | 5. Make sure your code lints. 18 | 6. Issue that pull request! 19 | 20 | ## Report Bugs Using GitHub's [issues](https://github.com/Natooz/MidiTok/issues) 21 | 22 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/Natooz/MidiTok/issues/new). 23 | 24 | ## Write Bug Reports with Detail, Background, and Sample Code 25 | 26 | **Great Bug Reports** tend to have: 27 | 28 | - A quick summary and/or background. 29 | - Steps to reproduce. 30 | - Be specific! 31 | - Give sample code if you can. 32 | - What you expected would happen. 33 | - What actually happens. 34 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work). 35 | 36 | ## Development 37 | 38 | ### Tests 39 | 40 | We use `pytest`/`pytest-xdist` for testing and `pytest-cov` for measuring coverage. Running all the tests can take between 10 and 30 minutes depending on your hardware. You don't need to run all of them, but try to run those affected by your changes. 41 | 42 | ```bash 43 | pip install pytest-cov "pytest-xdist[psutil]" 44 | pytest --cov=./ --cov-report=xml -n auto --durations=0 -v tests/ 45 | ``` 46 | 47 | ### Use a Consistent Coding Style 48 | 49 | We use the [ruff](https://github.com/astral-sh/ruff) formatter for Python in this project. Ruff can automatically analyze the code and format it according to the configured rules. This is handled with pre-commit (see the following section). 50 | 51 | ### Pre-commit Lints 52 | 53 | Linting is configured via [pre-commit](https://www.pre-commit.com/).
You can set up pre-commit by running: 54 | 55 | ```bash 56 | pip install pre-commit 57 | pre-commit install # installs pre-commit Git hook in the repository 58 | ``` 59 | 60 | When your changes are finished and the tests are passing, you can run `pre-commit run` to check if your code lints according to our ruff rules. 61 | If errors are found, we encourage you to fix them to follow the best code practices. If you struggle with this step, don't hesitate to ask for help, and to even commit and push anyway. Contributors will be able to help you. 62 | 63 | ## License 64 | 65 | By contributing, you agree that your contributions will be licensed under the MIT License. 66 | 67 | ## References 68 | 69 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nathan Fradet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MidiTok 2 | 3 | Python package to tokenize music files, introduced at the ISMIR 2021 LBDs. 
4 | 5 | ![MidiTok Logo](docs/assets/miditok_logo_stroke.png?raw=true "") 6 | 7 | [![PyPI version fury.io](https://badge.fury.io/py/miditok.svg)](https://pypi.python.org/pypi/miditok/) 8 | [![Python 3.9](https://img.shields.io/badge/python-≥3.9-blue.svg)](https://www.python.org/downloads/release/) 9 | [![Documentation Status](https://readthedocs.org/projects/miditok/badge/?version=latest)](https://miditok.readthedocs.io/en/latest/?badge=latest) 10 | [![GitHub CI](https://github.com/Natooz/MidiTok/actions/workflows/pytest.yml/badge.svg)](https://github.com/Natooz/MidiTok/actions/workflows/pytest.yml) 11 | [![Codecov](https://img.shields.io/codecov/c/github/Natooz/MidiTok)](https://codecov.io/gh/Natooz/MidiTok) 12 | [![GitHub license](https://img.shields.io/github/license/Natooz/MidiTok.svg)](https://github.com/Natooz/MidiTok/blob/main/LICENSE) 13 | [![Downloads](https://static.pepy.tech/badge/miditok)](https://pepy.tech/project/MidiTok) 14 | [![Code style](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff) 15 | 16 | MidiTok can tokenize MIDI and abc files, i.e. convert them into sequences of tokens ready to be fed to models such as Transformers, for any generation, transcription or MIR task. 17 | MidiTok features the most well-known [music tokenizations](https://miditok.readthedocs.io/en/latest/tokenizations.html) (e.g. [REMI](https://arxiv.org/abs/2002.00212), [Compound Word](https://arxiv.org/abs/2101.02402)...), and is built around the idea that they all share common parameters and methods. Tokenizers can be trained with [Byte Pair Encoding (BPE)](https://aclanthology.org/2023.emnlp-main.123/), [Unigram](https://aclanthology.org/P18-1007/) and [WordPiece](https://arxiv.org/abs/1609.08144), and MidiTok also offers data augmentation methods. 18 | 19 | MidiTok is integrated with the Hugging Face Hub 🤗! Don't hesitate to share your models with the community! 20 | 21 | **Documentation:** [miditok.readthedocs.io](https://miditok.readthedocs.io/en/latest/index.html) 22 | 23 | ## Install 24 | 25 | ```shell 26 | pip install miditok 27 | ``` 28 | MidiTok uses [Symusic](https://github.com/Yikai-Liao/symusic) to read and write MIDI and abc files, and BPE/Unigram training is backed by [Hugging Face 🤗tokenizers](https://github.com/huggingface/tokenizers) for superfast encoding. 29 | 30 | ## Usage example 31 | 32 | Tokenizing and detokenizing can be done by calling the tokenizer: 33 | 34 | ```python 35 | from miditok import REMI, TokenizerConfig 36 | from symusic import Score 37 | 38 | # Create a multitrack tokenizer, read the doc to explore all the parameters 39 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 40 | tokenizer = REMI(config) 41 | 42 | # Load a MIDI, convert it to tokens, and convert the tokens back to a MIDI 43 | midi = Score("path/to/your_midi.mid") 44 | tokens = tokenizer(midi) # calling the tokenizer will automatically detect MIDIs, paths and tokens 45 | converted_back_midi = tokenizer(tokens) # PyTorch, TensorFlow and NumPy tensors are supported 46 | ``` 47 | 48 | Here is a complete yet concise example of how you can use MidiTok to train any PyTorch model. And [here](colab-notebooks/Example_HuggingFace_Mistral_Transformer.ipynb) is a simple notebook example showing how to use Hugging Face models to generate music, with MidiTok taking care of tokenizing music files.
49 | 50 | ```python 51 | from miditok import REMI, TokenizerConfig 52 | from miditok.pytorch_data import DatasetMIDI, DataCollator 53 | from miditok.utils import split_files_for_training 54 | from torch.utils.data import DataLoader 55 | from pathlib import Path 56 | 57 | # Create a multitrack tokenizer, read the doc to explore all the parameters 58 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 59 | tokenizer = REMI(config) 60 | 61 | # Train the tokenizer with Byte Pair Encoding (BPE) 62 | files_paths = list(Path("path", "to", "midis").glob("**/*.mid")) 63 | tokenizer.train(vocab_size=30000, files_paths=files_paths) 64 | tokenizer.save(Path("path", "to", "save", "tokenizer.json")) 65 | # And push it to the Hugging Face hub (you can download it back with .from_pretrained) 66 | tokenizer.push_to_hub("username/model-name", private=True, token="your_hf_token") 67 | 68 | # Split MIDIs into smaller chunks for training 69 | dataset_chunks_dir = Path("path", "to", "midi_chunks") 70 | split_files_for_training( 71 | files_paths=files_paths, 72 | tokenizer=tokenizer, 73 | save_dir=dataset_chunks_dir, 74 | max_seq_len=1024, 75 | ) 76 | 77 | # Create a Dataset, a DataLoader and a collator to train a model 78 | dataset = DatasetMIDI( 79 | files_paths=list(dataset_chunks_dir.glob("**/*.mid")), 80 | tokenizer=tokenizer, 81 | max_seq_len=1024, 82 | bos_token_id=tokenizer["BOS_None"], 83 | eos_token_id=tokenizer["EOS_None"], 84 | ) 85 | collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True) 86 | dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator) 87 | 88 | # Iterate over the dataloader to train a model 89 | for batch in dataloader: 90 | print("Train your model on this batch...") 91 | ``` 92 | 93 | ## Tokenizations 94 | 95 | MidiTok implements the following tokenizations (with links to the original papers): 96 | * [REMI](https://dl.acm.org/doi/10.1145/3394171.3413671) 97 | * [REMI+](https://openreview.net/forum?id=NyR8OZFHw6i) 98 | * [MIDI-Like](https://link.springer.com/article/10.1007/s00521-018-3758-9) 99 | * [TSD](https://arxiv.org/abs/2301.11975) 100 | * [Structured](https://arxiv.org/abs/2107.05944) 101 | * [CPWord](https://ojs.aaai.org/index.php/AAAI/article/view/16091) 102 | * [Octuple](https://aclanthology.org/2021.findings-acl.70) 103 | * [MuMIDI](https://dl.acm.org/doi/10.1145/3394171.3413721) 104 | * [MMM](https://arxiv.org/abs/2008.06048) 105 | * [PerTok](https://www.arxiv.org/abs/2410.02060) 106 | 107 | You can find short presentations in the [documentation](https://miditok.readthedocs.io/en/latest/tokenizations.html). 108 | 109 | ## Contributions 110 | 111 | Contributions are gratefully welcomed; feel free to open an issue or send a PR if you want to add a tokenization or speed up the code. You can read the [contribution guide](CONTRIBUTING.md) for details. 112 | 113 | ### Todos 114 | 115 | * Support MusicXML files; 116 | * `no_duration_drums` option, discarding duration tokens for drum notes; 117 | * Control Change messages; 118 | * Speed up global/track events parsing with Rust or C++ bindings. 119 | 120 | ## Citation 121 | 122 | If you use MidiTok for your research, a citation in your manuscript would be greatly appreciated.
❤️ 123 | 124 | [**[MidiTok paper]**](https://arxiv.org/abs/2310.17202) 125 | [**[MidiTok original ISMIR publication]**](https://archives.ismir.net/ismir2021/latebreaking/000005.pdf) 126 | ```bibtex 127 | @inproceedings{miditok2021, 128 | title={{MidiTok}: A Python package for {MIDI} file tokenization}, 129 | author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 130 | booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 131 | year={2021}, 132 | url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 133 | } 134 | ``` 135 | 136 | The BibTeX citations of all tokenizations can be found [in the documentation](https://miditok.readthedocs.io/en/latest/citations.html). 137 | 138 | 139 | ## Acknowledgments 140 | 141 | @Natooz thanks his employers, who allowed him to develop this project: in chronological order, [Aubay](https://blog.aubay.com/index.php/language/en/home/?lang=en), the [LIP6 (Sorbonne University)](https://www.lip6.fr/?LANG=en), and the [Metacreation Lab (Simon Fraser University)](https://www.metacreation.net). 142 | 143 | ## All Thanks To Our Contributors 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This directory contains several benchmarks, measuring in particular encoding-decoding speeds, training speeds and sequence lengths. 4 | They are intended to give you reference points and help you choose your tokenization parameters. 5 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility methods for benchmarks.""" 2 | 3 | from .utils import mean_std_str 4 | 5 | __all__ = [ 6 | "mean_std_str", 7 | ] 8 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/README.md: -------------------------------------------------------------------------------- 1 | # MIDI files reading 2 | 3 | This benchmark measures the read times of MIDI files, comparing [symusic](https://github.com/Yikai-Liao/symusic), [miditoolkit](https://github.com/YatingMusic/miditoolkit) and [pretty_midi](https://github.com/craffel/pretty-midi), the three main Python libraries that parse MIDI files at the note level. 4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * symusic version: 0.4.5 10 | * miditoolkit version: 1.0.1 11 | * pretty_midi version: 0.2.10 12 | 13 | ## Results 14 | 15 | | Library | Maestro | MetaMIDI | POP909 | 16 | |:------------|:----------------|:----------------|:----------------| 17 | | Symusic | 1.06 ± 0.89 ms | 0.37 ± 0.32 ms | 0.20 ± 0.05 ms | 18 | | MidiToolkit | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 19 | | Pretty MIDI | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 20 | 21 | miditoolkit and pretty_midi perform equally on average. The two libraries are very similar, and both rely on [mido](https://github.com/mido/mido) to read and write MIDI messages. 22 | symusic, on the other hand, is respectively 104, 108 and 100 times faster than the other two on the Maestro, MetaMIDI and POP909 datasets.
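To reproduce the comparison on a single file, here is a minimal sketch of the measurement performed (the file path is a placeholder; see `benchmark_midi_read.py` below for the full protocol):

```python
from time import time

from miditoolkit import MidiFile
from pretty_midi import PrettyMIDI
from symusic import Score

midi_path = "path/to/a_file.mid"  # placeholder path

# Time each library parsing the same file once
for name, load in (
    ("Symusic", Score),
    ("MidiToolkit", MidiFile),
    ("Pretty MIDI", lambda p: PrettyMIDI(str(p))),
):
    t0 = time()
    load(midi_path)
    print(f"{name}: {(time() - t0) * 1e3:.2f} ms")
```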
23 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/benchmark_midi_read.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Benchmark for Python MIDI parsing libraries.""" 4 | 5 | from __future__ import annotations 6 | 7 | import random 8 | from pathlib import Path 9 | from time import time 10 | 11 | import numpy as np 12 | from miditoolkit import MidiFile 13 | from pandas import DataFrame 14 | from pretty_midi import PrettyMIDI 15 | from symusic import Score 16 | from tqdm import tqdm 17 | 18 | HERE = Path(__file__).parent 19 | DATASETS = ["Maestro", "MMD", "POP909"] 20 | LIBRARIES = ["Symusic", "MidiToolkit", "Pretty MIDI"] 21 | MAX_NUM_FILES = 1000 22 | 23 | 24 | def read_midi_files( 25 | midi_paths: list[Path], 26 | ) -> tuple[list[float], list[float], list[float]]: 27 | """ 28 | Read a list of MIDI files and return their reading times. 29 | 30 | :param midi_paths: paths to the midi files to read. 31 | :return: times of files reads for symusic, miditoolkit and pretty_midi. 32 | """ 33 | times_mtk, times_sms, times_ptm = [], [], [] 34 | for midi_path in tqdm(midi_paths, desc="Loading MIDIs"): 35 | # We count times only if all libraries load the file without error 36 | try: 37 | # Miditoolkit 38 | t0 = time() 39 | _ = MidiFile(midi_path) 40 | t_mtk = time() - t0 41 | 42 | # Symusic 43 | t0 = time() 44 | _ = Score(midi_path) 45 | t_sms = time() - t0 46 | 47 | # Pretty MIDI 48 | t0 = time() 49 | _ = PrettyMIDI(str(midi_path)) 50 | t_ptm = time() - t0 51 | except: # noqa: E722, S112 52 | continue 53 | 54 | times_mtk.append(t_mtk) 55 | times_sms.append(t_sms) 56 | times_ptm.append(t_ptm) 57 | 58 | return times_sms, times_mtk, times_ptm 59 | 60 | 61 | def benchmark_midi_parsing( 62 | seed: int = 777, 63 | ) -> None: 64 | r""" 65 | Measure the reading time of MIDI files with different libraries. 
66 | 67 | :param seed: random seed 68 | """ 69 | random.seed(seed) 70 | 71 | df = DataFrame(index=LIBRARIES, columns=DATASETS) 72 | 73 | # Record times 74 | for dataset in DATASETS: 75 | midi_paths = list( 76 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 77 | )[:MAX_NUM_FILES] 78 | all_times = read_midi_files(midi_paths) 79 | for library, times in zip(LIBRARIES, all_times): 80 | times_ = np.array(times) 81 | if library == "Symusic": 82 | times_ *= 1e3 83 | unit = "ms" 84 | else: 85 | unit = "sec" 86 | df.at[library, dataset] = ( 87 | f"{np.mean(times_):.2f} ± {np.std(times_):.2f} {unit}" 88 | ) 89 | 90 | df.to_csv(HERE / "midi_read.csv") 91 | df.to_markdown(HERE / "midi_read.md") 92 | df.to_latex(HERE / "midi_read.txt") 93 | 94 | 95 | if __name__ == "__main__": 96 | benchmark_midi_parsing() 97 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,MMD,POP909 2 | Symusic,1.06 ± 0.89 ms,0.37 ± 0.32 ms,0.20 ± 0.05 ms 3 | MidiToolkit,0.11 ± 0.10 sec,0.04 ± 0.04 sec,0.02 ± 0.01 sec 4 | Pretty MIDI,0.11 ± 0.10 sec,0.04 ± 0.04 sec,0.02 ± 0.01 sec 5 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.md: -------------------------------------------------------------------------------- 1 | | | Maestro | MMD | POP909 | 2 | |:------------|:----------------|:----------------|:----------------| 3 | | Symusic | 1.06 ± 0.89 ms | 0.37 ± 0.32 ms | 0.20 ± 0.05 ms | 4 | | MidiToolkit | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 5 | | Pretty MIDI | 0.11 ± 0.10 sec | 0.04 ± 0.04 sec | 0.02 ± 0.01 sec | 6 | -------------------------------------------------------------------------------- /benchmarks/midi_file_read/midi_read.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & MMD & POP909 \\ 4 | \midrule 5 | Symusic & 1.06 ± 0.89 ms & 0.37 ± 0.32 ms & 0.20 ± 0.05 ms \\ 6 | MidiToolkit & 0.11 ± 0.10 sec & 0.04 ± 0.04 sec & 0.02 ± 0.01 sec \\ 7 | Pretty MIDI & 0.11 ± 0.10 sec & 0.04 ± 0.04 sec & 0.02 ± 0.01 sec \\ 8 | \bottomrule 9 | \end{tabular} 10 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/README.md: -------------------------------------------------------------------------------- 1 | # MidiTok preprocessing 2 | 3 | This benchmark measures the preprocessing times of MIDI files, performed by MidiTok with the `tokenizer.preprocess_score` method. 
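As a minimal sketch of what is measured here, the preprocessing of a single file can be timed as follows (the file path is a placeholder, and `REMI` stands in for any tokenizer; the full protocol is in `benchmark_preprocess.py` below):

```python
from time import time

from miditok import REMI, TokenizerConfig
from symusic import Score

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
score = Score("path/to/a_file.mid")  # placeholder path

# Time the preprocessing step alone
t0 = time()
tokenizer.preprocess_score(score)
print(f"Preprocessed in {(time() - t0) * 1e3:.2f} ms")
```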
4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * Maximum number of files per dataset for analysis: 1k 10 | * Using tempo, time signature, sustain pedal and pitch bend tokens 11 | 12 | ## Results 13 | 14 | | | symusic version | Maestro - REMI | Maestro - TSD | Maestro - MIDILike | Maestro - Structured | MMD - REMI | MMD - TSD | MMD - MIDILike | MMD - Structured | POP909 - REMI | POP909 - TSD | POP909 - MIDILike | POP909 - Structured | 15 | |:--------------|:------------------|:-----------------|:----------------|:---------------------|:-----------------------|:-------------|:-------------|:-----------------|:-------------------|:----------------|:---------------|:--------------------|:----------------------| 16 | | miditok 3.0.3 | 0.4.5 | 0.64±0.36 ms | 0.62±0.35 ms | 0.47±0.25 ms | 0.46±0.32 ms | 1.55±3.68 ms | 1.54±3.68 ms | 1.40±3.63 ms | 0.40±0.51 ms | 0.32±0.07 ms | 0.30±0.07 ms | 0.24±0.06 ms | 0.16±0.03 ms | 17 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/benchmark_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Measure the average MIDI preprocessing speed.""" 4 | 5 | from __future__ import annotations 6 | 7 | from importlib.metadata import version 8 | from pathlib import Path 9 | from time import time 10 | 11 | import miditok 12 | import numpy as np 13 | from miditok.constants import SCORE_LOADING_EXCEPTION 14 | from pandas import DataFrame, read_csv 15 | from symusic import Score 16 | from tqdm import tqdm 17 | 18 | from benchmarks.utils import mean_std_str 19 | 20 | TOKENIZER_CONFIG_KWARGS = { 21 | "use_tempos": True, 22 | "use_time_signatures": True, 23 | "use_sustain_pedals": True, 24 | "use_pitch_bends": True, 25 | "log_tempos": True, 26 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 27 | "delete_equal_successive_time_sig_changes": True, 28 | "delete_equal_successive_tempo_changes": True, 29 | } 30 | 31 | HERE = Path(__file__).parent 32 | TOKENIZATIONS = ["REMI", "TSD", "MIDILike", "Structured"] 33 | DATASETS = ["Maestro", "MMD", "POP909"] 34 | MAX_NUM_FILES = 1000 35 | 36 | 37 | def benchmark_preprocess() -> None: 38 | r"""Read MIDI files and call `tokenizer.preprocess_score` on them.""" 39 | results_path = HERE / "preprocess.csv" 40 | if results_path.is_file(): 41 | df = read_csv(results_path, index_col=0) 42 | else: 43 | columns = ["symusic version"] + [ 44 | f"{dataset} - {tokenization}" 45 | for dataset in DATASETS 46 | for tokenization in TOKENIZATIONS 47 | ] 48 | df = DataFrame(index=[], columns=columns) 49 | 50 | # Add a row to the dataframe 51 | index_name = f"miditok {version('miditok')}" 52 | df.at[index_name, "symusic version"] = version("symusic") 53 | 54 | for dataset in DATASETS: 55 | files_paths = list( 56 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 57 | )[:MAX_NUM_FILES] 58 | for tokenization in TOKENIZATIONS: 59 | col_name = f"{dataset} - {tokenization}" 60 | tok_config = miditok.TokenizerConfig(**TOKENIZER_CONFIG_KWARGS) 61 | tokenizer = getattr(miditok, tokenization)(tok_config) 62 | 63 | times = [] 64 | for midi_path in tqdm(files_paths): 65 | try: 66 | midi = Score(midi_path) 67 | except SCORE_LOADING_EXCEPTION: 68 | continue 69 | t0 = time() 70 | tokenizer.preprocess_score(midi) 71 | times.append(time() - t0) 72 | 73 | times = np.array(times) * 1e3 74 | df.at[index_name, col_name] = f"{mean_std_str(times, 2)} ms" 75 
| 76 | df.to_csv(HERE / "preprocess.csv") 77 | df.to_markdown(HERE / "preprocess.md") 78 | df.to_latex(HERE / "preprocess.txt") 79 | 80 | 81 | if __name__ == "__main__": 82 | benchmark_preprocess() 83 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.csv: -------------------------------------------------------------------------------- 1 | ,symusic version,Maestro - REMI,Maestro - TSD,Maestro - MIDILike,Maestro - Structured,MMD - REMI,MMD - TSD,MMD - MIDILike,MMD - Structured,POP909 - REMI,POP909 - TSD,POP909 - MIDILike,POP909 - Structured 2 | miditok 3.0.3,0.4.5,0.64±0.36 ms,0.62±0.35 ms,0.47±0.25 ms,0.46±0.32 ms,1.55±3.68 ms,1.54±3.68 ms,1.40±3.63 ms,0.40±0.51 ms,0.32±0.07 ms,0.30±0.07 ms,0.24±0.06 ms,0.16±0.03 ms 3 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.md: -------------------------------------------------------------------------------- 1 | | | symusic version | Maestro - REMI | Maestro - TSD | Maestro - MIDILike | Maestro - Structured | MMD - REMI | MMD - TSD | MMD - MIDILike | MMD - Structured | POP909 - REMI | POP909 - TSD | POP909 - MIDILike | POP909 - Structured | 2 | |:--------------|:------------------|:-----------------|:----------------|:---------------------|:-----------------------|:-------------|:-------------|:-----------------|:-------------------|:----------------|:---------------|:--------------------|:----------------------| 3 | | miditok 3.0.3 | 0.4.5 | 0.64±0.36 ms | 0.62±0.35 ms | 0.47±0.25 ms | 0.46±0.32 ms | 1.55±3.68 ms | 1.54±3.68 ms | 1.40±3.63 ms | 0.40±0.51 ms | 0.32±0.07 ms | 0.30±0.07 ms | 0.24±0.06 ms | 0.16±0.03 ms | 4 | -------------------------------------------------------------------------------- /benchmarks/miditok_preprocess_file/preprocess.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llllllllllllll} 2 | \toprule 3 | & symusic version & Maestro - REMI & Maestro - TSD & Maestro - MIDILike & Maestro - Structured & MMD - REMI & MMD - TSD & MMD - MIDILike & MMD - Structured & POP909 - REMI & POP909 - TSD & POP909 - MIDILike & POP909 - Structured \\ 4 | \midrule 5 | miditok 3.0.3 & 0.4.5 & 0.64±0.36 ms & 0.62±0.35 ms & 0.47±0.25 ms & 0.46±0.32 ms & 1.55±3.68 ms & 1.54±3.68 ms & 1.40±3.63 ms & 0.40±0.51 ms & 0.32±0.07 ms & 0.30±0.07 ms & 0.24±0.06 ms & 0.16±0.03 ms \\ 6 | \bottomrule 7 | \end{tabular} 8 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/README.md: -------------------------------------------------------------------------------- 1 | # Tokenization times 2 | 3 | This benchmark measures the tokenization times of MIDI files from the [Maestro](https://magenta.tensorflow.org/datasets/maestro), [Lakh](https://colinraffel.com/projects/lmd/) and [POP909](https://arxiv.org/abs/2008.07142) datasets. 
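As a minimal sketch of the per-file measurement (the file path is a placeholder, and `REMI` stands in for any of the tokenizations benchmarked; see `benchmark_tokenize.py` below):

```python
from time import time

from miditok import REMI, TokenizerConfig
from symusic import Score

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
score = Score("path/to/a_file.mid")  # placeholder path

# Time the full tokenization of the file
t0 = time()
tokens = tokenizer.encode(score)
print(f"Tokenized in {(time() - t0) * 1e3:.2f} ms")
```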
4 | 5 | ## Configuration 6 | 7 | **Hardware:** Apple M1 Pro cpu, 16GB of memory, macOS 14.4.1 8 | 9 | * miditok: v3.0.3 10 | * symusic: v0.4.5 11 | * tokenizers: v0.19.0 12 | * numpy: v1.26.4 13 | 14 | * Maximum number of files per dataset for analysis: 1k 15 | * Using tempo, time signature, sustain pedal and pitch bend tokens 16 | 17 | ## Results 18 | 19 | | | Maestro | MMD | POP909 | 20 | |:-----------|:---------------|:---------------|:--------------| 21 | | REMI | 38.97±32.92 ms | 24.55±52.25 ms | 11.00±7.73 ms | 22 | | TSD | 52.62±41.59 ms | 31.70±73.93 ms | 13.35±7.66 ms | 23 | | MIDILike | 61.75±48.27 ms | 36.28±76.87 ms | 17.77±8.91 ms | 24 | | Structured | 60.38±46.78 ms | 35.85±88.48 ms | 16.56±8.62 ms | 25 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/benchmark_tokenize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 python 2 | 3 | """Measure the average MIDI tokenization speed.""" 4 | 5 | from __future__ import annotations 6 | 7 | from pathlib import Path 8 | from time import time 9 | 10 | import miditok 11 | import numpy as np 12 | from miditok.constants import SCORE_LOADING_EXCEPTION 13 | from pandas import DataFrame, read_csv 14 | from symusic import Score 15 | from tqdm import tqdm 16 | 17 | from benchmarks import mean_std_str 18 | 19 | TOKENIZER_CONFIG_KWARGS = { 20 | "use_tempos": True, 21 | "use_time_signatures": True, 22 | "use_sustain_pedals": True, 23 | "use_pitch_bends": True, 24 | "log_tempos": True, 25 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 26 | "delete_equal_successive_time_sig_changes": True, 27 | "delete_equal_successive_tempo_changes": True, 28 | } 29 | 30 | HERE = Path(__file__).parent 31 | TOKENIZATIONS = ["REMI", "TSD", "MIDILike", "Structured"] 32 | DATASETS = ["Maestro", "MMD", "POP909"] 33 | MAX_NUM_FILES = 1000 34 | 35 | 36 | def benchmark_tokenize() -> None: 37 | r"""Read MIDI files and tokenize them.""" 38 | results_path = HERE / "tokenize.csv" 39 | if results_path.is_file(): 40 | df = read_csv(results_path, index_col=0) 41 | else: 42 | df = DataFrame(index=TOKENIZATIONS, columns=DATASETS) 43 | 44 | for dataset in DATASETS: 45 | midi_paths = list( 46 | (HERE.parent.parent.parent / "data" / dataset).rglob("*.mid") 47 | )[:MAX_NUM_FILES] 48 | for tokenization in TOKENIZATIONS: 49 | tok_config = miditok.TokenizerConfig(**TOKENIZER_CONFIG_KWARGS) 50 | tokenizer = getattr(miditok, tokenization)(tok_config) 51 | 52 | times = [] 53 | for midi_path in tqdm(midi_paths): 54 | try: 55 | midi = Score(midi_path) 56 | except SCORE_LOADING_EXCEPTION: 57 | continue 58 | t0 = time() 59 | tokenizer.encode(midi) 60 | times.append(time() - t0) 61 | 62 | times = np.array(times) * 1e3 63 | df.at[tokenization, dataset] = f"{mean_std_str(times, 2)} ms" 64 | 65 | df.to_csv(HERE / "tokenize.csv") 66 | df.to_markdown(HERE / "tokenize.md") 67 | df.to_latex(HERE / "tokenize.txt") 68 | 69 | 70 | if __name__ == "__main__": 71 | benchmark_tokenize() 72 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,MMD,POP909 2 | REMI,38.97±32.92 ms,24.55±52.25 ms,11.00±7.73 ms 3 | TSD,52.62±41.59 ms,31.70±73.93 ms,13.35±7.66 ms 4 | MIDILike,61.75±48.27 ms,36.28±76.87 ms,17.77±8.91 ms 5 | Structured,60.38±46.78 ms,35.85±88.48 ms,16.56±8.62 ms 6 | 
-------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.md: -------------------------------------------------------------------------------- 1 | | | Maestro | MMD | POP909 | 2 | |:-----------|:---------------|:---------------|:--------------| 3 | | REMI | 38.97±32.92 ms | 24.55±52.25 ms | 11.00±7.73 ms | 4 | | TSD | 52.62±41.59 ms | 31.70±73.93 ms | 13.35±7.66 ms | 5 | | MIDILike | 61.75±48.27 ms | 36.28±76.87 ms | 17.77±8.91 ms | 6 | | Structured | 60.38±46.78 ms | 35.85±88.48 ms | 16.56±8.62 ms | 7 | -------------------------------------------------------------------------------- /benchmarks/miditok_tokenize/tokenize.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & MMD & POP909 \\ 4 | \midrule 5 | REMI & 38.97±32.92 ms & 24.55±52.25 ms & 11.00±7.73 ms \\ 6 | TSD & 52.62±41.59 ms & 31.70±73.93 ms & 13.35±7.66 ms \\ 7 | MIDILike & 61.75±48.27 ms & 36.28±76.87 ms & 17.77±8.91 ms \\ 8 | Structured & 60.38±46.78 ms & 35.85±88.48 ms & 16.56±8.62 ms \\ 9 | \bottomrule 10 | \end{tabular} 11 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/README.md: -------------------------------------------------------------------------------- 1 | # Tokenizer training benchmark 2 | 3 | This benchmark aims to measure the training speeds of the different tokenization algorithms, as well as their encoding-decoding speeds, sequence length reduction, and the impact of some other strategies such as splitting the tokens per bars or beats. 4 | 5 | ## Configuration 6 | 7 | ### Hardware 8 | 9 | Apple M1 Pro, 16GB of memory, macOS 14.4.1 10 | 11 | ### Software 12 | 13 | * miditok: v3.0.3 14 | * symusic: v0.4.5 15 | * tokenizers: v0.19.0 16 | * numpy: v1.26.4 17 | 18 | ### Parameters 19 | 20 | * Maximum number of files per dataset for training: 20k 21 | * Maximum number of files per dataset for analysis: 2k 22 | * Using tempo, time signature, rests, sustain pedal and pitch bend tokens 23 | 24 | ## Training times 25 | 26 | ## Splitting ids per bars and beats 27 | 28 | This measures the sequence lengths of the subsequences obtained when splitting the token sequences of whole music files per bar or beat. 29 | 30 | | | Maestro | Lakh | Lakh monotrack | 31 | |:----------------|:------------------|:---------------------|:------------------| 32 | | REMI - bar | 74.7±45.8 (↑ 460) | 107.1±129.6 (↑ 2525) | 12.5±24.1 (↑ 624) | 33 | | REMI - beat | 18.7±13.1 (↑ 190) | 27.4±34.5 (↑ 659) | 3.3±6.6 (↑ 307) | 34 | | TSD - bar | 70.9±44.3 (↑ 456) | 105.7±128.8 (↑ 2521) | 11.2±22.3 (↑ 623) | 35 | | TSD - beat | 17.7±12.7 (↑ 188) | 27.1±34.2 (↑ 658) | 2.9±6.1 (↑ 306) | 36 | | MIDILike - bar | 77.5±45.9 (↑ 461) | 133.7±163.5 (↑ 3154) | 11.7±23.8 (↑ 624) | 37 | | MIDILike - beat | 19.4±12.8 (↑ 183) | 34.2±43.1 (↑ 832) | 3.1±6.5 (↑ 317) | 38 | 39 | Main observation: beat subsequences are relatively short, and on average four times shorter than bar subsequences, as most files have 4/* time signatures. 40 | 41 | ## WordPiece `max_input_chars_per_word` impact 42 | 43 | This analyzes the impact of the `max_input_chars_per_word` parameter of the WordPiece model on training and encoding times. 44 | The vocabulary size used here is 20k.
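As a sketch of the setup benchmarked below, assuming the `train` method accepts a `model` argument selecting WordPiece and forwards `max_input_chars_per_word` to the underlying Hugging Face model (both are assumptions here, check the MidiTok documentation for the exact signature):

```python
from pathlib import Path

from miditok import REMI, TokenizerConfig

tokenizer = REMI(TokenizerConfig(use_tempos=True, use_time_signatures=True))
files_paths = list(Path("path", "to", "midis").glob("**/*.mid"))  # placeholder paths

# Train a WordPiece tokenizer with a 20k vocabulary; `model` and
# `max_input_chars_per_word` are assumed keyword arguments here.
tokenizer.train(
    vocab_size=20000,
    model="WordPiece",
    files_paths=files_paths,
    max_input_chars_per_word=500,
)
```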
45 | 46 | ### Training time 47 | 48 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 49 | |-----:|:-------------------|:--------------------|:---------------------|:---------------------------|:----------------------------|:-----------------------------| 50 | | 100 | 131.9 sec | 88.2 sec | 99.3 sec | 1216.5 sec | 1463.9 sec | 1538.3 sec | 51 | | 200 | 128.4 sec | 88.2 sec | 98.2 sec | 1140.3 sec | 1283.4 sec | 1505.6 sec | 52 | | 500 | 128.1 sec | 86.6 sec | 98.2 sec | 1171.8 sec | 1457.4 sec | 1604.2 sec | 53 | | 1000 | 127.8 sec | 86.4 sec | 97.0 sec | 1131.1 sec | 1390.0 sec | 1620.8 sec | 54 | | 2000 | 128.5 sec | 86.0 sec | 96.7 sec | 1238.1 sec | 1431.2 sec | 1495.7 sec | 55 | | 5000 | 127.1 sec | 85.5 sec | 96.7 sec | 1229.0 sec | 1543.7 sec | 1709.8 sec | 56 | 57 | `max_input_chars_per_word` has almost no impact on the training time. 58 | 59 | ### Encoding time and ratio of "unknown token" 60 | 61 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 62 | |-----:|:--------------------------|:--------------------------|:--------------------------|:---------------------------|:----------------------------|:-----------------------------| 63 | | 100 | 0.0030±0.0022 (1.000 unk) | 0.0195±0.0156 (0.001 unk) | 0.0238±0.0200 (0.000 unk) | 0.0003±0.0004 (0.937 unk) | 0.0026±0.0159 (0.007 unk) | 0.0044±0.0495 (0.007 unk) | 64 | | 200 | 0.0030±0.0022 (1.000 unk) | 0.0416±0.0332 (0.000 unk) | 0.0239±0.0199 (0.000 unk) | 0.0004±0.0005 (0.866 unk) | 0.0027±0.0146 (0.007 unk) | 0.0038±0.0475 (0.007 unk) | 65 | | 500 | 0.0029±0.0022 (1.000 unk) | 0.0443±0.0365 (0.000 unk) | 0.0235±0.0197 (0.000 unk) | 0.0010±0.0016 (0.698 unk) | 0.0029±0.0156 (0.007 unk) | 0.0038±0.0466 (0.007 unk) | 66 | | 1000 | 0.0030±0.0022 (0.999 unk) | 0.0442±0.0366 (0.000 unk) | 0.0236±0.0202 (0.000 unk) | 0.0057±0.0115 (0.513 unk) | 0.0032±0.0165 (0.007 unk) | 0.0039±0.0478 (0.007 unk) | 67 | | 2000 | 0.0037±0.0127 (0.996 unk) | 0.0442±0.0364 (0.000 unk) | 0.0232±0.0194 (0.000 unk) | 0.0405±0.0771 (0.301 unk) | 0.0029±0.0159 (0.007 unk) | 0.0042±0.0475 (0.007 unk) | 68 | | 5000 | 0.1209±0.6198 (0.955 unk) | 0.0440±0.0363 (0.000 unk) | 0.0238±0.0208 (0.000 unk) | 0.3539±0.8183 (0.102 unk) | 0.0034±0.0174 (0.007 unk) | 0.0043±0.0501 (0.007 unk) | 69 | 70 | `max_input_chars_per_word` has, however, a significant negative impact on the encoding time of the token ids. 71 | The ratios of unknown tokens also highlight the **importance of splitting the token ids per bars or beats**. Not doing so results either in a high proportion of unknown tokens for low `max_input_chars_per_word` values, thus losing data integrity, or in very high encoding times for high `max_input_chars_per_word` values.
72 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.csv: -------------------------------------------------------------------------------- 1 | ,Maestro,Lakh, Lakh monotrack 2 | REMI - bar,74.7±45.8 (↑ 460),107.1±129.6 (↑ 2525),12.5±24.1 (↑ 624) 3 | REMI - beat,18.7±13.1 (↑ 190),27.4±34.5 (↑ 659),3.3±6.6 (↑ 307) 4 | TSD - bar,70.9±44.3 (↑ 456),105.7±128.8 (↑ 2521),11.2±22.3 (↑ 623) 5 | TSD - beat,17.7±12.7 (↑ 188),27.1±34.2 (↑ 658),2.9±6.1 (↑ 306) 6 | MIDILike - bar,77.5±45.9 (↑ 461),133.7±163.5 (↑ 3154),11.7±23.8 (↑ 624) 7 | MIDILike - beat,19.4±12.8 (↑ 183),34.2±43.1 (↑ 832),3.1±6.5 (↑ 317) 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.md: -------------------------------------------------------------------------------- 1 | | | Maestro | Lakh | Lakh monotrack | 2 | |:----------------|:------------------|:---------------------|:------------------| 3 | | REMI - bar | 74.7±45.8 (↑ 460) | 107.1±129.6 (↑ 2525) | 12.5±24.1 (↑ 624) | 4 | | REMI - beat | 18.7±13.1 (↑ 190) | 27.4±34.5 (↑ 659) | 3.3±6.6 (↑ 307) | 5 | | TSD - bar | 70.9±44.3 (↑ 456) | 105.7±128.8 (↑ 2521) | 11.2±22.3 (↑ 623) | 6 | | TSD - beat | 17.7±12.7 (↑ 188) | 27.1±34.2 (↑ 658) | 2.9±6.1 (↑ 306) | 7 | | MIDILike - bar | 77.5±45.9 (↑ 461) | 133.7±163.5 (↑ 3154) | 11.7±23.8 (↑ 624) | 8 | | MIDILike - beat | 19.4±12.8 (↑ 183) | 34.2±43.1 (↑ 832) | 3.1±6.5 (↑ 317) | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/seq_split_lengths.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llll} 2 | \toprule 3 | & Maestro & Lakh & Lakh monotrack \\ 4 | \midrule 5 | REMI - bar & 74.7±45.8 (↑ 460) & 107.1±129.6 (↑ 2525) & 12.5±24.1 (↑ 624) \\ 6 | REMI - beat & 18.7±13.1 (↑ 190) & 27.4±34.5 (↑ 659) & 3.3±6.6 (↑ 307) \\ 7 | TSD - bar & 70.9±44.3 (↑ 456) & 105.7±128.8 (↑ 2521) & 11.2±22.3 (↑ 623) \\ 8 | TSD - beat & 17.7±12.7 (↑ 188) & 27.1±34.2 (↑ 658) & 2.9±6.1 (↑ 306) \\ 9 | MIDILike - bar & 77.5±45.9 (↑ 461) & 133.7±163.5 (↑ 3154) & 11.7±23.8 (↑ 624) \\ 10 | MIDILike - beat & 19.4±12.8 (↑ 183) & 34.2±43.1 (↑ 832) & 3.1±6.5 (↑ 317) \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.csv: -------------------------------------------------------------------------------- 1 | ,Maestro no-split,Maestro bar-split,Maestro beat-split,Lakh multitrack no-split,Lakh multitrack bar-split,Lakh multitrack beat-split 2 | 100,0.0030±0.0022 (1.000 unk),0.0195±0.0156 (0.001 unk),0.0238±0.0200 (0.000 unk),0.0003±0.0004 (0.937 unk),0.0026±0.0159 (0.007 unk),0.0044±0.0495 (0.007 unk) 3 | 200,0.0030±0.0022 (1.000 unk),0.0416±0.0332 (0.000 unk),0.0239±0.0199 (0.000 unk),0.0004±0.0005 (0.866 unk),0.0027±0.0146 (0.007 unk),0.0038±0.0475 (0.007 unk) 4 | 500,0.0029±0.0022 (1.000 unk),0.0443±0.0365 (0.000 unk),0.0235±0.0197 (0.000 unk),0.0010±0.0016 (0.698 unk),0.0029±0.0156 (0.007 unk),0.0038±0.0466 (0.007 unk) 5 | 1000,0.0030±0.0022 (0.999 unk),0.0442±0.0366 (0.000 unk),0.0236±0.0202 (0.000 unk),0.0057±0.0115 (0.513 unk),0.0032±0.0165 (0.007 unk),0.0039±0.0478 (0.007 unk) 6 | 2000,0.0037±0.0127 (0.996 unk),0.0442±0.0364 (0.000 unk),0.0232±0.0194 (0.000 unk),0.0405±0.0771 (0.301 unk),0.0029±0.0159 (0.007 
unk),0.0042±0.0475 (0.007 unk) 7 | 5000,0.1209±0.6198 (0.955 unk),0.0440±0.0363 (0.000 unk),0.0238±0.0208 (0.000 unk),0.3539±0.8183 (0.102 unk),0.0034±0.0174 (0.007 unk),0.0043±0.0501 (0.007 unk) 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.md: -------------------------------------------------------------------------------- 1 | | | Maestro no-split | Maestro bar-split | Maestro beat-split | Lakh multitrack no-split | Lakh multitrack bar-split | Lakh multitrack beat-split | 2 | |-----:|:--------------------------|:--------------------------|:--------------------------|:---------------------------|:----------------------------|:-----------------------------| 3 | | 100 | 0.0030±0.0022 (1.000 unk) | 0.0195±0.0156 (0.001 unk) | 0.0238±0.0200 (0.000 unk) | 0.0003±0.0004 (0.937 unk) | 0.0026±0.0159 (0.007 unk) | 0.0044±0.0495 (0.007 unk) | 4 | | 200 | 0.0030±0.0022 (1.000 unk) | 0.0416±0.0332 (0.000 unk) | 0.0239±0.0199 (0.000 unk) | 0.0004±0.0005 (0.866 unk) | 0.0027±0.0146 (0.007 unk) | 0.0038±0.0475 (0.007 unk) | 5 | | 500 | 0.0029±0.0022 (1.000 unk) | 0.0443±0.0365 (0.000 unk) | 0.0235±0.0197 (0.000 unk) | 0.0010±0.0016 (0.698 unk) | 0.0029±0.0156 (0.007 unk) | 0.0038±0.0466 (0.007 unk) | 6 | | 1000 | 0.0030±0.0022 (0.999 unk) | 0.0442±0.0366 (0.000 unk) | 0.0236±0.0202 (0.000 unk) | 0.0057±0.0115 (0.513 unk) | 0.0032±0.0165 (0.007 unk) | 0.0039±0.0478 (0.007 unk) | 7 | | 2000 | 0.0037±0.0127 (0.996 unk) | 0.0442±0.0364 (0.000 unk) | 0.0232±0.0194 (0.000 unk) | 0.0405±0.0771 (0.301 unk) | 0.0029±0.0159 (0.007 unk) | 0.0042±0.0475 (0.007 unk) | 8 | | 5000 | 0.1209±0.6198 (0.955 unk) | 0.0440±0.0363 (0.000 unk) | 0.0238±0.0208 (0.000 unk) | 0.3539±0.8183 (0.102 unk) | 0.0034±0.0174 (0.007 unk) | 0.0043±0.0501 (0.007 unk) | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_enc_time.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lllllll} 2 | \toprule 3 | & Maestro no-split & Maestro bar-split & Maestro beat-split & Lakh multitrack no-split & Lakh multitrack bar-split & Lakh multitrack beat-split \\ 4 | \midrule 5 | 100 & 0.0030±0.0022 (1.000 unk) & 0.0195±0.0156 (0.001 unk) & 0.0238±0.0200 (0.000 unk) & 0.0003±0.0004 (0.937 unk) & 0.0026±0.0159 (0.007 unk) & 0.0044±0.0495 (0.007 unk) \\ 6 | 200 & 0.0030±0.0022 (1.000 unk) & 0.0416±0.0332 (0.000 unk) & 0.0239±0.0199 (0.000 unk) & 0.0004±0.0005 (0.866 unk) & 0.0027±0.0146 (0.007 unk) & 0.0038±0.0475 (0.007 unk) \\ 7 | 500 & 0.0029±0.0022 (1.000 unk) & 0.0443±0.0365 (0.000 unk) & 0.0235±0.0197 (0.000 unk) & 0.0010±0.0016 (0.698 unk) & 0.0029±0.0156 (0.007 unk) & 0.0038±0.0466 (0.007 unk) \\ 8 | 1000 & 0.0030±0.0022 (0.999 unk) & 0.0442±0.0366 (0.000 unk) & 0.0236±0.0202 (0.000 unk) & 0.0057±0.0115 (0.513 unk) & 0.0032±0.0165 (0.007 unk) & 0.0039±0.0478 (0.007 unk) \\ 9 | 2000 & 0.0037±0.0127 (0.996 unk) & 0.0442±0.0364 (0.000 unk) & 0.0232±0.0194 (0.000 unk) & 0.0405±0.0771 (0.301 unk) & 0.0029±0.0159 (0.007 unk) & 0.0042±0.0475 (0.007 unk) \\ 10 | 5000 & 0.1209±0.6198 (0.955 unk) & 0.0440±0.0363 (0.000 unk) & 0.0238±0.0208 (0.000 unk) & 0.3539±0.8183 (0.102 unk) & 0.0034±0.0174 (0.007 unk) & 0.0043±0.0501 (0.007 unk) \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- 
/benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.csv: -------------------------------------------------------------------------------- 1 | ,Maestro no-split,Maestro bar-split,Maestro beat-split,Lakh multitrack no-split,Lakh multitrack bar-split,Lakh multitrack beat-split 2 | 100,131.9 sec,88.2 sec,99.3 sec,1216.5 sec,1463.9 sec,1538.3 sec 3 | 200,128.4 sec,88.2 sec,98.2 sec,1140.3 sec,1283.4 sec,1505.6 sec 4 | 500,128.1 sec,86.6 sec,98.2 sec,1171.8 sec,1457.4 sec,1604.2 sec 5 | 1000,127.8 sec,86.4 sec,97.0 sec,1131.1 sec,1390.0 sec,1620.8 sec 6 | 2000,128.5 sec,86.0 sec,96.7 sec,1238.1 sec,1431.2 sec,1495.7 sec 7 | 5000,127.1 sec,85.5 sec,96.7 sec,1229.0 sec,1543.7 sec,1709.8 sec 8 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.md: -------------------------------------------------------------------------------- 1 | |      | Maestro no-split   | Maestro bar-split   | Maestro beat-split   | Lakh multitrack no-split   | Lakh multitrack bar-split   | Lakh multitrack beat-split   | 2 | |-----:|:-------------------|:--------------------|:---------------------|:---------------------------|:----------------------------|:-----------------------------| 3 | |  100 | 131.9 sec          | 88.2 sec            | 99.3 sec             | 1216.5 sec                 | 1463.9 sec                  | 1538.3 sec                   | 4 | |  200 | 128.4 sec          | 88.2 sec            | 98.2 sec             | 1140.3 sec                 | 1283.4 sec                  | 1505.6 sec                   | 5 | |  500 | 128.1 sec          | 86.6 sec            | 98.2 sec             | 1171.8 sec                 | 1457.4 sec                  | 1604.2 sec                   | 6 | | 1000 | 127.8 sec          | 86.4 sec            | 97.0 sec             | 1131.1 sec                 | 1390.0 sec                  | 1620.8 sec                   | 7 | | 2000 | 128.5 sec          | 86.0 sec            | 96.7 sec             | 1238.1 sec                 | 1431.2 sec                  | 1495.7 sec                   | 8 | | 5000 | 127.1 sec          | 85.5 sec            | 96.7 sec             | 1229.0 sec                 | 1543.7 sec                  | 1709.8 sec                   | 9 | -------------------------------------------------------------------------------- /benchmarks/tokenizer_training/results/wordpiece_max_chars_train_time.txt: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lllllll} 2 | \toprule 3 |  & Maestro no-split & Maestro bar-split & Maestro beat-split & Lakh multitrack no-split & Lakh multitrack bar-split & Lakh multitrack beat-split \\ 4 | \midrule 5 | 100 & 131.9 sec & 88.2 sec & 99.3 sec & 1216.5 sec & 1463.9 sec & 1538.3 sec \\ 6 | 200 & 128.4 sec & 88.2 sec & 98.2 sec & 1140.3 sec & 1283.4 sec & 1505.6 sec \\ 7 | 500 & 128.1 sec & 86.6 sec & 98.2 sec & 1171.8 sec & 1457.4 sec & 1604.2 sec \\ 8 | 1000 & 127.8 sec & 86.4 sec & 97.0 sec & 1131.1 sec & 1390.0 sec & 1620.8 sec \\ 9 | 2000 & 128.5 sec & 86.0 sec & 96.7 sec & 1238.1 sec & 1431.2 sec & 1495.7 sec \\ 10 | 5000 & 127.1 sec & 85.5 sec & 96.7 sec & 1229.0 sec & 1543.7 sec & 1709.8 sec \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- /benchmarks/utils.py: -------------------------------------------------------------------------------- 1 | """Utils methods for benchmarks.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | 8 | def mean_std_str( 9 |     dist: np.ndarray | list[int | float], num_dec: int = 2, latex_pm: bool = False 10 | ) -> str: 11 |     r""" 12 |     Create a nice-looking mean and standard deviation string of a distribution. 13 | 14 |     :param dist: distribution to measure. 15 |     :param num_dec: number of decimals to keep. (default: ``2``) 16 |     :param latex_pm: whether to represent the "±" symbol with LaTeX command ("$\pm$").
17 |         (default: ``False``) 18 |     :return: string of the average and standard deviation of the distribution. 19 |     """ 20 |     if not isinstance(dist, np.ndarray): 21 |         dist = np.array(dist) 22 |     mean, std = float(np.mean(dist)), float(np.std(dist)) 23 |     if latex_pm: 24 |         return f"{mean:.{num_dec}f}" r"$\pm$" f"{std:.{num_dec}f}"  # noqa: ISC001 25 |     return f"{mean:.{num_dec}f}±{std:.{num_dec}f}" 26 | -------------------------------------------------------------------------------- /colab-notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Colab Notebooks 2 | 3 | In this directory you will find Notebooks using MidiTok, from which you can take inspiration for your own projects. 4 | 5 | For beginners, we recommend browsing the **Example Hugging Face** notebook. You'll find an up-to-date, concise and complete example of training a Transformer to generate music. 6 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS    ?= 7 | SPHINXBUILD   ?= sphinx-build 8 | SOURCEDIR     = . 9 | BUILDDIR      = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/additional_tokens_table.csv: -------------------------------------------------------------------------------- 1 | Tokenization,Tempo,Time signature,Chord,Rest,Sustain pedal,Pitch bend,Pitch interval 2 | MIDILike,✅,✅,✅,✅,✅,✅,✅ 3 | REMI,✅,✅,✅,✅,✅,✅,✅ 4 | TSD,✅,✅,✅,✅,✅,✅,✅ 5 | Structured,❌,❌,❌,❌,❌,❌,❌ 6 | CPWord,✅,✅¹,✅,✅¹,❌,❌,❌ 7 | Octuple,✅,✅²,❌,❌,❌,❌,❌ 8 | MuMIDI,✅,❌,✅,❌,❌,❌,❌ 9 | MMM,✅,✅,✅,❌,✅,✅,✅ 10 | -------------------------------------------------------------------------------- /docs/assets/Octuple_TS_Rest/original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/Octuple_TS_Rest/original.png -------------------------------------------------------------------------------- /docs/assets/Octuple_TS_Rest/tokenized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/Octuple_TS_Rest/tokenized.png -------------------------------------------------------------------------------- /docs/assets/bases/pianoroll_daw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/pianoroll_daw.png -------------------------------------------------------------------------------- /docs/assets/bases/sheet_music.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/sheet_music.png -------------------------------------------------------------------------------- /docs/assets/bases/spectrogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/bases/spectrogram.png -------------------------------------------------------------------------------- /docs/assets/cp_word.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/cp_word.png -------------------------------------------------------------------------------- /docs/assets/embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/embeddings.png -------------------------------------------------------------------------------- /docs/assets/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/favicon.png -------------------------------------------------------------------------------- /docs/assets/midi_like.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_like.png -------------------------------------------------------------------------------- /docs/assets/midi_preprocessing_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_preprocessing_original.png -------------------------------------------------------------------------------- /docs/assets/midi_preprocessing_preprocessed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/midi_preprocessing_preprocessed.png -------------------------------------------------------------------------------- /docs/assets/miditok_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/miditok_logo.png -------------------------------------------------------------------------------- /docs/assets/miditok_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/assets/miditok_logo_stroke.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/miditok_logo_stroke.png -------------------------------------------------------------------------------- /docs/assets/mumidi.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/mumidi.png -------------------------------------------------------------------------------- /docs/assets/music_sheet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/music_sheet.png -------------------------------------------------------------------------------- /docs/assets/octuple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/octuple.png -------------------------------------------------------------------------------- /docs/assets/pitch_intervals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/pitch_intervals.png -------------------------------------------------------------------------------- /docs/assets/pitch_intervals_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/pitch_intervals_original.png -------------------------------------------------------------------------------- /docs/assets/remi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/remi.png -------------------------------------------------------------------------------- /docs/assets/remiplus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/remiplus.png -------------------------------------------------------------------------------- /docs/assets/structured.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/structured.png -------------------------------------------------------------------------------- /docs/assets/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/transformer.png -------------------------------------------------------------------------------- /docs/assets/tsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/docs/assets/tsd.png -------------------------------------------------------------------------------- /docs/attribute_controls.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Attribute Controls 3 | ======================== 4 | 5 | Attribute Controls are special tokens that allow a model to be trained to control music generation during inference. They work at either the track level or the bar level, and specify attributes featured by the tracks or bars they precede. When placed at the beginning of each bar or track in the token sequence, they lead a *causal* model to condition the prediction of the following tokens on them.
At inference, these attribute control tokens can be strategically placed at the beginning of new tracks or bars in order to condition the generated results. 6 | 7 | Attribute controls are not compatible with "multi-vocabulary" (e.g. Octuple) or multitrack "one token stream" tokenizers. 8 | 9 | To train tokenizers and models with attribute control tokens, you can use the :class:`miditok.TokTrainingIterator` and :class:`miditok.pytorch_data.DatasetMIDI` respectively. 10 | 11 | .. automodule:: miditok.attribute_controls 12 |     :members: 13 | 14 | Using custom attribute controls 15 | ------------------------------- 16 | 17 | You can easily add your own attribute controls to an existing tokenizer using the :py:func:`miditok.MusicTokenizer.add_attribute_control` method. Your attribute control must subclass either the :class:`miditok.attribute_controls.AttributeControl` (track-level) or the :class:`miditok.attribute_controls.BarAttributeControl` class, and implement the attribute computation method. 18 | -------------------------------------------------------------------------------- /docs/citations.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Citations 3 | ================= 4 | 5 | Here you will find BibTeX citations of the original works presenting these tokenizations. 6 | 7 | MidiTok 8 | ------------------------ 9 | 10 | .. code-block:: bib 11 | 12 |     @inproceedings{miditok2021, 13 |         title={{MidiTok}: A Python package for {MIDI} file tokenization}, 14 |         author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 15 |         booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 16 |         year={2021}, 17 |         url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 18 |     } 19 | 20 | Tokenizer Training / Byte Pair Encoding / TSD 21 | --------------------------------------------- 22 | 23 | .. code-block:: bib 24 | 25 |     @inproceedings{fradet-etal-2023-byte, 26 |         title = "Byte Pair Encoding for Symbolic Music", 27 |         author = "Fradet, Nathan and 28 |           Gutowski, Nicolas and 29 |           Chhel, Fabien and 30 |           Briot, Jean-Pierre", 31 |         editor = "Bouamor, Houda and 32 |           Pino, Juan and 33 |           Bali, Kalika", 34 |         booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing", 35 |         month = dec, 36 |         year = "2023", 37 |         address = "Singapore", 38 |         publisher = "Association for Computational Linguistics", 39 |         url = "https://aclanthology.org/2023.emnlp-main.123", 40 |         doi = "10.18653/v1/2023.emnlp-main.123", 41 |         pages = "2001--2020", 42 |     } 43 | 44 | REMI (Pop Music Transformer) 45 | ---------------------------- 46 | 47 | ..
code-block:: bib 48 | 49 | @inproceedings{huang_remi_2020, 50 | author = {Huang, Yu-Siang and Yang, Yi-Hsuan}, 51 | title = {Pop Music Transformer: Beat-Based Modeling and Generation of Expressive Pop Piano Compositions}, 52 | year = {2020}, 53 | isbn = {9781450379885}, 54 | publisher = {Association for Computing Machinery}, 55 | address = {New York, NY, USA}, 56 | url = {https://doi.org/10.1145/3394171.3413671}, 57 | doi = {10.1145/3394171.3413671}, 58 | booktitle = {Proceedings of the 28th ACM International Conference on Multimedia}, 59 | pages = {1180–1188}, 60 | numpages = {9}, 61 | keywords = {transformer, neural sequence model, automatic music composition}, 62 | location = {Seattle, WA, USA}, 63 | series = {MM '20} 64 | } 65 | 66 | MIDI-Like (This Time with feeling) 67 | ---------------------------------- 68 | 69 | .. code-block:: bib 70 | 71 | @article{oore_midilike_2018, 72 | author={Sageev Oore and Ian Simon and Sander Dieleman and Douglas Eck and Karen Simonyan}, 73 | title={This Time with Feeling: Learning Expressive Musical Performance}, 74 | journal={Neural Computing and Applications}, 75 | volume={32}, 76 | year={2018}, 77 | pages={955–967}, 78 | url={https://link.springer.com/article/10.1007/s00521-018-3758-9}, 79 | publisher={Springer} 80 | } 81 | 82 | Structured (Piano Inpainting Application) 83 | ----------------------------------------- 84 | 85 | .. code-block:: bib 86 | 87 | @misc{pia2021hadjeres, 88 | title={The Piano Inpainting Application}, 89 | author={Gaëtan Hadjeres and Léopold Crestel}, 90 | year={2021}, 91 | eprint={2107.05944}, 92 | archivePrefix={arXiv}, 93 | primaryClass={cs.SD}, 94 | url={https://arxiv.org/abs/2107.05944}, 95 | } 96 | 97 | CPWord (Compound Word Transformer) 98 | ---------------------------------- 99 | 100 | .. code-block:: bib 101 | 102 | @article{cpword2021, 103 | title={Compound Word Transformer: Learning to Compose Full-Song Music over Dynamic Directed Hypergraphs}, 104 | volume={35}, 105 | url={https://ojs.aaai.org/index.php/AAAI/article/view/16091}, 106 | DOI={10.1609/aaai.v35i1.16091}, 107 | number={1}, 108 | journal={Proceedings of the AAAI Conference on Artificial Intelligence}, 109 | author={Hsiao, Wen-Yi and Liu, Jen-Yu and Yeh, Yin-Cheng and Yang, Yi-Hsuan}, 110 | year={2021}, 111 | month={May}, 112 | pages={178-186} 113 | } 114 | 115 | Octuple (MusicBERT) 116 | ------------------------ 117 | 118 | .. code-block:: bib 119 | 120 | @inproceedings{zeng2021musicbert, 121 | title = "{M}usic{BERT}: Symbolic Music Understanding with Large-Scale Pre-Training", 122 | author = "Zeng, Mingliang and Tan, Xu and Wang, Rui and Ju, Zeqian and Qin, Tao and Liu, Tie-Yan", 123 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 124 | month = aug, 125 | year = "2021", 126 | address = "Online", 127 | publisher = "Association for Computational Linguistics", 128 | url = "https://aclanthology.org/2021.findings-acl.70", 129 | doi = "10.18653/v1/2021.findings-acl.70", 130 | pages = "791--800", 131 | } 132 | 133 | MuMIDI (PopMAG) 134 | ------------------------ 135 | 136 | .. 
code-block:: bib 137 | 138 | @inproceedings{popmag2020, 139 | author = {Ren, Yi and He, Jinzheng and Tan, Xu and Qin, Tao and Zhao, Zhou and Liu, Tie-Yan}, 140 | title = {PopMAG: Pop Music Accompaniment Generation}, 141 | year = {2020}, 142 | isbn = {9781450379885}, 143 | publisher = {Association for Computing Machinery}, 144 | url = {https://arxiv.org/abs/2008.07703}, 145 | doi = {10.1145/3394171.3413721}, 146 | abstract = {"MuMIDI encoding, similar to CP. 147 | Generates multitrack music, filling every track tokens in a single sequence}, 148 | booktitle = {Proceedings of the 28th ACM International Conference on Multimedia}, 149 | pages = {1198–1206}, 150 | numpages = {9}, 151 | keywords = {accompaniment generation, music representation, music generation, sequence-to-sequence model, pop music}, 152 | location = {Seattle, WA, USA} 153 | } 154 | 155 | MMM (Multi-Track Music Machine) 156 | -------------------------------- 157 | 158 | .. code-block:: bib 159 | 160 | @misc{ens2020mmm, 161 | title={MMM : Exploring Conditional Multi-Track Music Generation with the Transformer}, 162 | author={Jeff Ens and Philippe Pasquier}, 163 | year={2020}, 164 | eprint={2008.06048}, 165 | archivePrefix={arXiv}, 166 | primaryClass={cs.SD} 167 | } 168 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration file for the Sphinx documentation builder. 3 | 4 | For the full list of built-in configuration values, see the documentation: 5 | https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | """ 7 | 8 | import sys 9 | import tomllib 10 | from pathlib import Path 11 | 12 | sys.path.insert(0, str(Path("..").resolve() / "src")) 13 | 14 | # -- Project information ----------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 16 | 17 | project = "MidiTok" 18 | copyright = "2024, Nathan Fradet" # noqa: A001 19 | author = "Nathan Fradet" 20 | 21 | 22 | with (Path(__file__).parent.parent / "pyproject.toml").open("rb") as f: 23 | data = tomllib.load(f) 24 | version = data["project"]["version"] 25 | 26 | # The language for content autogenerated by Sphinx. Refer to documentation 27 | # for a list of supported languages. 28 | # 29 | # This is also used if you do content translation via gettext catalogs. 30 | # Usually you set "language" from the command line for these cases. 
31 | language = "en" 32 | 33 | # -- General configuration --------------------------------------------------- 34 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 35 | 36 | extensions = [ 37 |     "sphinx_copybutton", 38 |     "sphinx.ext.duration", 39 |     "sphinx.ext.doctest", 40 |     "sphinx.ext.autodoc", 41 |     "sphinx.ext.autosummary", 42 |     "sphinx.ext.autosectionlabel", 43 |     # "sphinxcontrib.tikz", 44 | ] 45 | 46 | templates_path = ["_templates"] 47 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 51 | 52 | html_theme = "furo" 53 | html_title = "MidiTok's docs" 54 | html_logo = "assets/miditok_logo_stroke.png" 55 | html_favicon = "assets/favicon.png" 56 | # tikz_proc_suite = "GhostScript"  # required for readthedocs, produce png, not svg 57 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Tokenizer Configuration 3 | ======================= 4 | 5 | MidiTok's tokenizers can be customized with a wide variety of options, and most of the preprocessing and downsampling steps can be tailored to your specifications. 6 | 7 | Tokenizer config 8 | ------------------------ 9 | 10 | All tokenizers are initialized with common parameters, which are held in a :class:`miditok.TokenizerConfig` object, documented below. A tokenizer's configuration can be accessed with ``tokenizer.config``. 11 | Some tokenizers might take additional specific arguments/parameters when being created. 12 | 13 | .. autoclass:: miditok.TokenizerConfig 14 |     :members: 15 | 16 | 17 | How MidiTok handles time 18 | ---------------------------- 19 | 20 | MidiTok handles time by resampling the music file's time division (time resolution) to a new resolution determined by the ``beat_res`` attribute of :class:`miditok.TokenizerConfig`. This argument determines which time tokens are present in the vocabulary. 21 | 22 | It allows creating ``Duration`` and ``TimeShift`` tokens with different resolutions depending on their values. It is common to use higher resolutions for short time durations (i.e. short values will be represented with greater accuracy) and lower resolutions for longer time values (which generally do not need to be represented with great accuracy). 23 | The values of these tokens take the form of a tuple: ``(num_beats, num_samples, resolution)``. For instance, the time value of the token ``(2, 3, 8)`` corresponds to 2 beats and 3/8 of a beat. ``(2, 2, 4)`` corresponds to 2 beats and half of a beat (2.5 beats). 24 | 25 | For position-based tokenizers, the number of ``Position`` tokens in the vocabulary is equal to the maximum resolution found in ``beat_res``. 26 | 27 | An example of the downsampling applied by MidiTok during the preprocessing is shown below. 28 | 29 | .. figure:: /assets/midi_preprocessing_original.png 30 |    :alt: Original MIDI file 31 |    :width: 800 32 | 33 |    Original MIDI file from the `Maestro dataset `_ with a 4/4 time signature. The numbers at the top indicate the bar number (125) followed by the beat number within the bar. 34 | 35 | .. figure:: /assets/midi_preprocessing_preprocessed.png 36 |    :alt: Downsampled MIDI file. 37 |    :width: 800 38 | 39 |    MIDI file with time downsampled to 8 samples per beat.
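A short example of specifying ``beat_res`` is shown below; it reuses the values of the code examples of this documentation, with 8 samples per beat for times between 0 and 4 beats, and 4 samples per beat for times between 4 and 12 beats:

.. code-block:: python

    from miditok import TokenizerConfig

    # Keys are (start_beat, end_beat) ranges, values are samples per beat.
    # Short times (0-4 beats) get a finer resolution than longer ones (4-12).
    config = TokenizerConfig(beat_res={(0, 4): 8, (4, 12): 4})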
40 | 41 | Additional tokens 42 | ------------------------ 43 | 44 | MidiTok can include additional tokens representing other musical information. You can specify them in the ``tokenizer_config`` argument (:class:`miditok.TokenizerConfig`) when creating a tokenizer. The :class:`miditok.TokenizerConfig` documentation details the role of each of them, along with their associated parameters. 45 | 46 | .. csv-table:: Compatibility table of tokenizations and additional tokens. 47 |    :file: additional_tokens_table.csv 48 |    :header-rows: 1 49 | 50 | ¹: using both time signatures and rests with :class:`miditok.CPWord` might result in time alterations, as the time signature changes are carried with the Bar tokens which can be skipped during periods of rests. 51 | ²: using time signatures with :class:`miditok.Octuple` might result in time alterations, as the time signature changes are carried with the note onsets. An example is shown below. 52 | 53 | Additionally, **Velocity** and **Duration** tokens are optional and enabled by default for all tokenizers. 54 | 55 | .. image:: /assets/Octuple_TS_Rest/original.png 56 |    :width: 800 57 |    :alt: Original MIDI sample preprocessed / downsampled 58 | 59 | .. image:: /assets/Octuple_TS_Rest/tokenized.png 60 |    :width: 800 61 |    :alt: MIDI sample after being tokenized, the time has been shifted to a bar during the time signature change 62 | 63 | Below is an example of how pitch intervals would be tokenized, with a ``max_pitch_interval`` of 15. 64 | 65 | .. image:: /assets/pitch_intervals.png 66 |    :width: 800 67 |    :alt: Schema of the pitch intervals over a piano-roll 68 | 69 | 70 | Special tokens 71 | ------------------------ 72 | 73 | MidiTok can include special tokens in the vocabulary. These tokens carry no "musical" information and are used for training purposes. 74 | To use special tokens, you must specify them with the ``special_tokens`` argument when creating a tokenizer. By default, this argument is set to ``["PAD", "BOS", "EOS", "MASK"]``. Their meanings are: 75 | 76 | * **PAD** (``PAD_None``): a padding token to use when training a model with batches of sequences of unequal lengths. The padding token id is often set to 0. If you use Hugging Face models, be sure to pad inputs with this token, and pad labels with *-100*. 77 | * **BOS** (``BOS_None``): "Start Of Sequence" token, indicating that a token sequence is beginning. 78 | * **EOS** (``EOS_None``): "End Of Sequence" token, indicating that a token sequence is ending. For autoregressive generation, this token can be used to stop the generation. 79 | * **MASK** (``MASK_None``): a masking token, to use when pre-training a (bidirectional) model with a self-supervised objective like `BERT `_. 80 | 81 | **Note:** you can use the ``tokenizer.special_tokens`` property to get the list of the special tokens of a tokenizer, and ``tokenizer.special_tokens_ids`` for their ids. 82 | -------------------------------------------------------------------------------- /docs/data_augmentation.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Data augmentation 3 | ======================== 4 | 5 | Data augmentation is a technique to artificially increase the size of a dataset by applying various transformations to the existing data. These transformations consist of altering one or several attributes of the original data. In the context of images, they can include operations such as rotation, scaling, cropping or color adjustments.
This is trickier in the case of natural language, where the meaning of a sentence can easily diverge depending on how the text is modified, but some techniques such as paraphrase generation or back-translation can serve this purpose. 6 | 7 | The purpose of data augmentation is to introduce variability and diversity into the training data without collecting additional real-world data. Data augmentation can improve a model's learning and generalization, as it exposes the model to a wider range of variations and patterns present in the data. In turn, it can increase its robustness and decrease overfitting. 8 | 9 | MidiTok can perform data augmentation at both the MIDI level and the token level. Transformations can be made by shifting the values of the velocities and durations of notes, or by shifting their pitches by octaves. Data augmentation is highly recommended when training a model, as it helps the model learn the global and local harmony of music. In large datasets such as the `Lakh `_ or `Meta MIDI `_ datasets, MIDI files can have various ranges of velocity, duration values, and pitch. By augmenting the data, thus creating more diversified samples, a model can better generalize and learn the melody, harmony and other musical features rather than specific recurrent token successions. 10 | 11 | .. automodule:: miditok.data_augmentation 12 |     :members: 13 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Code examples 3 | ================= 4 | 5 | Create a tokenizer 6 | ------------------------ 7 | 8 | A basic example showing how to create a tokenizer, with a selection of custom parameters. 9 | 10 | ..  code-block:: python 11 | 12 |     from miditok import REMI, TokenizerConfig  # here we choose to use REMI 13 | 14 |     # Our parameters 15 |     TOKENIZER_PARAMS = { 16 |         "pitch_range": (21, 109), 17 |         "beat_res": {(0, 4): 8, (4, 12): 4}, 18 |         "num_velocities": 32, 19 |         "special_tokens": ["PAD", "BOS", "EOS", "MASK"], 20 |         "use_chords": True, 21 |         "use_rests": False, 22 |         "use_tempos": True, 23 |         "use_time_signatures": False, 24 |         "use_programs": False, 25 |         "num_tempos": 32,  # number of tempo bins 26 |         "tempo_range": (40, 250),  # (min, max) 27 |     } 28 |     config = TokenizerConfig(**TOKENIZER_PARAMS) 29 | 30 |     # Create the tokenizer 31 |     tokenizer = REMI(config) 32 | 33 | MIDI - Tokens conversion 34 | ------------------------------- 35 | 36 | Here we convert a MIDI file to tokens, and decode them back to a MIDI file. 37 | 38 | ..  code-block:: python 39 | 40 |     from pathlib import Path 41 | 42 |     # Tokenize a MIDI file 43 |     tokens = tokenizer(Path("to", "your_midi.mid"))  # automatically detects Score objects, paths, tokens 44 | 45 |     # Convert to MIDI and save it 46 |     generated_midi = tokenizer(tokens)  # MidiTok can handle PyTorch/Numpy/Tensorflow tensors 47 |     generated_midi.dump_midi(Path("to", "decoded_midi.mid")) 48 | 49 | 50 | Train a tokenizer with BPE 51 | ----------------------------- 52 | 53 | Here we train the tokenizer with :ref:`Byte Pair Encoding (BPE)`. 54 | BPE reduces the lengths of the token sequences, in turn improving model efficiency, as well as the quality of the results/model performance. 55 | 56 | ..
code-block:: python 57 | 58 |     from miditok import REMI 59 |     from pathlib import Path 60 | 61 |     # Create the tokenizer and list the file paths 62 |     tokenizer = REMI()  # using default parameters (constants.py) 63 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 64 | 65 |     # Build the vocabulary with BPE 66 |     tokenizer.train(vocab_size=30000, files_paths=midi_paths) 67 | 68 | 69 | Prepare a dataset before training 70 | ------------------------------------------- 71 | 72 | MidiTok provides useful methods to split music files into smaller chunks that contain approximately a target number of tokens, allowing you to use most of your data to train and evaluate models. It also provides data augmentation methods to increase the amount of data to train models. 73 | 74 | ..  code-block:: python 75 | 76 |     from random import shuffle 77 | 78 |     from miditok.data_augmentation import augment_dataset 79 |     from miditok.utils import split_files_for_training 80 | 81 |     # Split the dataset into train/valid/test subsets, with 15% of the data for each of the two latter 82 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 83 |     total_num_files = len(midi_paths) 84 |     num_files_valid = round(total_num_files * 0.15) 85 |     num_files_test = round(total_num_files * 0.15) 86 |     shuffle(midi_paths) 87 |     midi_paths_valid = midi_paths[:num_files_valid] 88 |     midi_paths_test = midi_paths[num_files_valid:num_files_valid + num_files_test] 89 |     midi_paths_train = midi_paths[num_files_valid + num_files_test:] 90 | 91 |     # Chunk MIDIs and perform data augmentation on each subset independently 92 |     for files_paths, subset_name in ( 93 |         (midi_paths_train, "train"), (midi_paths_valid, "valid"), (midi_paths_test, "test") 94 |     ): 95 | 96 |         # Split the MIDIs into chunks of approximately 1024 tokens 97 |         subset_chunks_dir = Path(f"dataset_{subset_name}") 98 |         split_files_for_training( 99 |             files_paths=files_paths, 100 |             tokenizer=tokenizer, 101 |             save_dir=subset_chunks_dir, 102 |             max_seq_len=1024, 103 |             num_overlap_bars=2, 104 |         ) 105 | 106 |         # Perform data augmentation 107 |         augment_dataset( 108 |             subset_chunks_dir, 109 |             pitch_offsets=[-12, 12], 110 |             velocity_offsets=[-4, 4], 111 |             duration_offsets=[-0.5, 0.5], 112 |         ) 113 | 114 | Create a Dataset and collator for training 115 | ------------------------------------------- 116 | 117 | Create a Dataset and a collator to be used with a PyTorch ``DataLoader`` to train a model. 118 | 119 | ..  code-block:: python 120 | 121 |     from miditok import REMI 122 |     from miditok.pytorch_data import DatasetMIDI, DataCollator 123 |     from torch.utils.data import DataLoader 124 | 125 |     tokenizer = REMI()  # using default parameters (constants.py) 126 |     midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid")) 127 |     dataset = DatasetMIDI( 128 |         files_paths=midi_paths, 129 |         tokenizer=tokenizer, 130 |         max_seq_len=1024, 131 |         bos_token_id=tokenizer["BOS_None"], 132 |         eos_token_id=tokenizer["EOS_None"], 133 |     ) 134 |     collator = DataCollator(tokenizer.pad_token_id) 135 |     data_loader = DataLoader(dataset=dataset, collate_fn=collator) 136 | 137 |     # Using the data loader in the training loop 138 |     for batch in data_loader: 139 |         print("Train your model on this batch...") 140 | 141 | 142 | Tokenize a dataset 143 | ------------------------ 144 | 145 | Here we tokenize a whole dataset into JSON files storing the token ids. 146 | We also perform data augmentation on the pitch, velocity and duration dimensions. 147 | 148 | ..
code-block:: python 149 | 150 |     from miditok import REMI 151 |     from miditok.data_augmentation import augment_dataset 152 |     from pathlib import Path 153 | 154 |     # Create the tokenizer and list the file paths 155 |     tokenizer = REMI()  # using default parameters (constants.py) 156 |     data_path = Path("path", "to", "dataset") 157 | 158 |     # A validation method to discard MIDIs we do not want 159 |     # It can also be used for custom pre-processing, for instance if you want to merge 160 |     # some tracks before tokenizing a MIDI file 161 |     def midi_valid(midi) -> bool: 162 |         if any(ts.numerator != 4 for ts in midi.time_signature_changes): 163 |             return False  # time signature different from 4/*, 4 beats per bar 164 |         return True 165 | 166 |     # Perform data augmentation on one pitch octave (up and down), velocities and 167 |     # durations 168 |     midi_aug_path = Path("to", "new", "location", "augmented") 169 |     augment_dataset( 170 |         data_path, 171 |         pitch_offsets=[-12, 12], 172 |         velocity_offsets=[-4, 5], 173 |         duration_offsets=[-0.5, 1], 174 |         out_path=midi_aug_path, 175 |     ) 176 |     tokenizer.tokenize_dataset(  # tokenize the augmented dataset 177 |         midi_aug_path, 178 |         Path("path", "to", "tokens"), 179 |         midi_valid, 180 |     ) 181 | -------------------------------------------------------------------------------- /docs/hf_hub.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Hugging Face Hub 3 | ======================== 4 | 5 | What is the Hugging Face hub 6 | --------------------------------- 7 | 8 | The `Hugging Face Hub `_ is a model and dataset sharing platform which is widely used in the AI community. It allows you to freely upload, share and download models and datasets, directly from your code in a very convenient way. Its interactions rely on an open-source Python package named `huggingface_hub `_. As it works seamlessly in the Hugging Face ecosystem, especially the `Transformers `_ or `Diffusers `_ libraries, it stood out and became one of the preferred ways to openly share and download models. 9 | 10 | When downloading a Transformer model, you will also need to download its associated tokenizer to be able to "dialog" with it. Likewise, if you want to share one of your models, you will need to share its tokenizer too for people to be able to use it. MidiTok allows you to push and download tokenizers in a similar way to what is done in the Hugging Face Transformers library. 11 | 12 | How MidiTok interoperates with the hub 13 | ------------------------------------------ 14 | 15 | Internally, MidiTok relies on the ``huggingface_hub.ModelHubMixin`` component. It implements the same methods commonly used in the Hugging Face ecosystem. Note that: 16 | 17 | * :py:func:`miditok.MusicTokenizer.save_pretrained` is equivalent to calling :py:func:`miditok.MusicTokenizer.save_params`; 18 | * :py:func:`miditok.MusicTokenizer.from_pretrained` can be used to load tokenizers whether from the Hugging Face hub or from a file on your local filesystem; 19 | * for :py:func:`miditok.MusicTokenizer.save_pretrained` and :py:func:`miditok.MusicTokenizer.push_to_hub`, you can ignore the ``config`` argument which is meant to be used with models (not applicable for tokenizers); 20 | * you can give a ``filename`` keyword argument with the :py:func:`miditok.MusicTokenizer.save_pretrained` and :py:func:`miditok.MusicTokenizer.from_pretrained` methods to use a specific tokenizer configuration file name, otherwise the default one will be used (``tokenizer.json``).
21 | 22 | .. autofunction:: miditok.MusicTokenizer.from_pretrained 23 |     :noindex: 24 | 25 | .. autofunction:: miditok.MusicTokenizer.save_pretrained 26 |     :noindex: 27 | 28 | .. autofunction:: miditok.MusicTokenizer.push_to_hub 29 |     :noindex: 30 | 31 | Example 32 | ------------------------ 33 | 34 | ..  code-block:: python 35 | 36 |     from pathlib import Path 37 |     from miditok import REMI 38 | 39 |     tokenizer = REMI()  # using default parameters (constants.py) 40 |     hf_token = "your_hf_token"  # to create on huggingface.co 41 | 42 |     # Train the tokenizer with BPE 43 |     tokenizer.train( 44 |         vocab_size=30000, 45 |         files_paths=list(Path("path", "to", "midis").glob("**/*.mid")), 46 |     ) 47 | 48 |     # Push the tokenizer to the HF hub 49 |     tokenizer.push_to_hub("YourUserName/model-name", private=True, token=hf_token) 50 | 51 |     # Recreate it from the configuration saved on the hub 52 |     tokenizer2 = REMI.from_pretrained("YourUserName/model-name", token=hf_token) 53 |     assert tokenizer == tokenizer2 54 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. MidiTok documentation master file, created by 2 |    sphinx-quickstart on Sat Feb  4 20:52:11 2023. 3 |    You can adapt this file completely to your liking, but it should at least 4 |    contain the root `toctree` directive. 5 | 6 | Welcome to MidiTok's documentation! 7 | ========================================= 8 | 9 | .. image:: /assets/miditok_logo_stroke.png 10 |   :width: 600 11 |   :alt: 12 | 13 | **MidiTok** is a Python package for MIDI file tokenization, introduced at the ISMIR 2021 LBDs `(paper) `_. 14 | It tokenizes symbolic music files (MIDI, abc), i.e. converts them into sequences of tokens ready to be fed to models such as Transformers, for any generation, transcription or MIR task. 15 | MidiTok features most known MIDI :ref:`tokenizations`, and is built around the idea that they all share common methods. Tokenizers can be trained with BPE, Unigram or WordPiece (:ref:`Training a tokenizer`) and be pushed to and pulled from the Hugging Face hub! 16 | 17 | Installation 18 | ================== 19 | 20 | ..  code-block:: bash 21 | 22 |     pip install miditok 23 | 24 | MidiTok uses `symusic `_ to read and write MIDI files, and tokenizer training is backed by the `Hugging Face 🤗tokenizers `_ for super fast encoding. 25 | 26 | Citation 27 | ================== 28 | 29 | If you use MidiTok for your research, a citation in your manuscript would be gladly appreciated. ❤️ 30 | 31 | You can also find in this documentation BibTeX :ref:`citations` of related research works. 32 | 33 | ..  code-block:: bib 34 | 35 |     @inproceedings{miditok2021, 36 |         title={{MidiTok}: A Python package for {MIDI} file tokenization}, 37 |         author={Fradet, Nathan and Briot, Jean-Pierre and Chhel, Fabien and El Fallah Seghrouchni, Amal and Gutowski, Nicolas}, 38 |         booktitle={Extended Abstracts for the Late-Breaking Demo Session of the 22nd International Society for Music Information Retrieval Conference}, 39 |         year={2021}, 40 |         url={https://archives.ismir.net/ismir2021/latebreaking/000005.pdf}, 41 |     } 42 | 43 | Contents 44 | ================== 45 | 46 | .. toctree:: 47 |    :maxdepth: 2 48 |    :caption: Bases of Music and AI 49 | 50 |    music_formats 51 |    midi 52 |    sequential_models 53 | 54 | ..
toctree:: 55 |    :maxdepth: 2 56 |    :caption: MidiTok 57 | 58 |    tokenizing_music_with_miditok 59 |    configuration 60 |    tokenizations 61 |    attribute_controls 62 |    train 63 |    hf_hub 64 |    pytorch_data 65 |    data_augmentation 66 |    utils 67 | 68 | .. toctree:: 69 |    :maxdepth: 2 70 |    :caption: Others 71 | 72 |    examples 73 |    citations 74 | 75 | .. toctree:: 76 |    :hidden: 77 |    :caption: Project Links 78 | 79 |    GitHub 80 |    PyPi 81 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | 	set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | 	echo. 16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | 	echo.installed, then set the SPHINXBUILD environment variable to point 18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | 	echo.may add the Sphinx directory to PATH. 20 | 	echo. 21 | 	echo.If you don't have Sphinx installed, grab it from 22 | 	echo.https://www.sphinx-doc.org/ 23 | 	exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/midi.rst: -------------------------------------------------------------------------------- 1 | .. _midi-protocol-label: 2 | 3 | =================================== 4 | The MIDI protocol 5 | =================================== 6 | 7 | MIDI, standing for *Musical Instrument Digital Interface*, is a digital communication protocol standard in the music sector. It describes the protocol itself, the physical connector to transmit the protocol between devices, and a digital file format. 8 | A MIDI file stores MIDI messages as a symbolic music file. It is the most abundant file format among available music datasets. 9 | 10 | History of MIDI 11 | ----------------------------- 12 | 13 | MIDI first appeared in the early eighties, when digital instrument manufacturers needed a digital protocol for communication between devices such as synthesizers and computers. It was standardized in 1983 by the first specifications, and is currently maintained by the `MIDI Manufacturers Association `_\. Since then, `new specifications `_ have been made, the two major ones, and still the norm today, being General MIDI 1 (GM1) and General MIDI 2 (GM2). These specifications aim to guide manufacturers in designing digital music devices compatible with those of other manufacturers, by making sure they implement the protocol following the same recommendations. 14 | 15 | The MIDI protocol can represent **notes, tempos, time signatures, key signatures, instruments (called programs) and effects (called controls) such as sustain pedal, pitch bend or modulation.** 16 | MIDI is an event-based protocol. It consists of a series of messages, which can occur in multiple channels. Each message is composed of two key pieces of information: 1) the delta time, expressed as the distance in ticks from the previous event (in the same channel), which represents its position in time; 2) a series of bytes representing its content.
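As an illustration, a single *Note On* message can be assembled from its bytes as follows. This is a minimal sketch of the byte layout described above, independent of any MIDI library:

.. code-block:: python

    channel = 0  # channels are 0-indexed in the status byte (displayed as 1-16)
    status = 0x90 | channel  # 0x9n means "Note On" on channel n
    pitch, velocity = 60, 100  # middle C, played moderately loud
    note_on = bytes([status, pitch, velocity])  # a 3-byte voice message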
17 | 18 | The latest evolution of the MIDI protocol is the MIDI Polyphonic Expression (shortly called MPE). This new norm allows manufacturers to create MIDI devices on which a specific channel is assigned to each note, allowing the user to apply pitch bend and modulation to each key independently. These devices are typically built with touch-sensitive keys. The MIDI Manufacturers Association released the complete `specifications `_ in March 2018. 19 | 20 | 21 | MIDI Messages 22 | ----------------------------- 23 | 24 | A message expresses an event or a piece of information. It takes the form of a series of bytes. The first is the Status byte, which specifies the type of message and the channel, followed by one or two data bytes which contain the information. All the messages and their meanings are described in the GM1 and GM2 specifications. The most important are: 25 | 26 | - *Note On*: a note is being played, specifies its pitch and velocity; 27 | - *Note Off*: a note is released, specifies the note (by its pitch) to stop and the velocity; 28 | - *Time Signature Change*: indicates the current time signature; 29 | - *Tempo Change*: indicates the current tempo; 30 | - *Program Change*: specifies the current instrument being played; 31 | - *Control Change*: a control parameter is modified or applied. The modulation wheel, foot sustain pedal, volume control or bank select are for instance effects transcribed into Control Change messages. 32 | 33 | Note that these messages are "voice messages", which means that each of them is applied within a channel that is specified in its status byte. The MIDI protocol handles up to sixteen channels, which allows connecting multiple devices playing and communicating simultaneously. Channel 10 is reserved for drums, a specific "program" in which the pitch values correspond to drum sounds like kicks, snares, or hi-hats. 34 | 35 | Time in MIDI 36 | ----------------------------- 37 | 38 | Time in MIDI is determined by its **time division**, which is a clock signal expressed in **ticks per quarter note** (tpq), and can be seen as a time resolution. Common time division values are 384, 480 and 960 tpq, as they are divisible by 3, 4, 6 and 8, which are common time signature numerators and denominators. 39 | The time division can also be set in ticks per second, but this option is rarely encountered, as it makes less sense to use seconds when the tempo and time signature are known in MIDI. 40 | The time division is the first information that can be read at the beginning of a file, and a MIDI file can only have one time division. 41 | 42 | The number of ticks per bar and ticks per beat can be calculated from the MIDI's time division (:math:`time_{div}`) and the current time signature (:math:`\frac{ts_{num}}{ts_{denom}}`): 43 | 44 | - :math:`tpbeat = time_{div} \times \frac{4}{ts_{denom}}` 45 | - :math:`tpbar = tpbeat \times ts_{num}` 46 | 47 | Hence, for a :math:`\frac{4}{4}` time signature, the number of ticks per beat is equal to the time division (as a beat is equal to a quarter note) and the number of ticks per bar is equal to four times the number of ticks per beat.
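These formulas translate directly into code. The sketch below uses illustrative values: a 480 tpq time division and a 6/8 time signature:

.. code-block:: python

    time_div = 480  # ticks per quarter note
    ts_num, ts_denom = 6, 8  # a 6/8 time signature

    ticks_per_beat = time_div * 4 // ts_denom  # 480 * 4 / 8 = 240
    ticks_per_bar = ticks_per_beat * ts_num  # 240 * 6 = 1440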
48 | -------------------------------------------------------------------------------- /docs/music_formats.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Music formats 3 | =================================== 4 | 5 | This page introduces the two representations of music and the symbolic music file formats. It aims to present the basic differences between audio and symbolic music, in order to better understand how they can be used with AI models, without going too much into detail; more comprehensive references are attached for that purpose. 6 | 7 | Music: symbolic and audio 8 | --------------------------- 9 | 10 | Music is a unique modality in the way that it can take two different forms: symbolic and audio. 11 | 12 | Symbolic music represents successions of notes, arranged in time along with other musical elements such as tempos and time signatures, typically found in Western music notation. The `sheet music `_ is the historical handwritten or printed representation of music that shows the notes on staves from left to right and up and down, with the time and key signatures indicated at the beginning. 13 | 14 | .. image:: /assets/bases/sheet_music.png 15 | :width: 800 16 | :alt: Sheet music. 17 | 18 | The `pianoroll `_ is another symbolic representation which consists of a two-axis grid, with one axis for time and one for note pitches. It was originally used in player pianos, and is now used in most `Digital Audio Workstation (DAW) `_ software to show the notes and other effects of a track. 19 | 20 | .. image:: /assets/bases/pianoroll_daw.png 21 | :width: 800 22 | :alt: A piano roll view in the Logic Pro X DAW. 23 | 24 | Audio, on the other hand, represents the *physical* form of music, i.e. a sound signal, more specifically vibrations propagating in a material. Audio music is usually represented as waveforms (time domain) or spectrograms (frequency domain). 25 | 26 | A waveform is strictly the amplitude of a sound as a function of time. In the real world, a waveform is purely continuous. A digital audio waveform as found in audio files such as mp3s will feature a sampling frequency which indicates the number of samples per second used to represent this waveform. This time resolution is usually at least 44.1k samples per second, following the `Nyquist–Shannon theorem `_ . 27 | 28 | A sound, whether from an instrument, a human voice or a music arrangement, is a superposition of many periodic frequencies, defined by their wavelength, amplitude and phase. A spectrogram depicts the intensity in dB of the frequencies as a function of time. It provides a representation of these frequencies which is useful when analyzing sound. It can be computed with a `Fourier Transform `_ , usually a `Short Time Fourier Transform (STFT) `_ . 29 | 30 | .. image:: /assets/bases/spectrogram.png 31 | :width: 800 32 | :alt: The spectrogram of a sound, abscissa is time, ordinate is frequency and the color represents the intensity in dB. 33 | 34 | Symbolic music can be seen as both discrete and continuous, as it represents discrete notes that nevertheless feature "continuous-like" attributes, potentially with a high time resolution (in samples per beat or other specific time duration). **For this reason, it is more commonly used with discrete sequential models** (which we introduce in :ref:`sequential-models-label`), **by being represented as sequences of tokens**, which is the purpose of MidiTok.
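For illustration, the first note of a melody could be serialized into token strings such as the following (a hypothetical excerpt: the exact tokens, including the duration format shown here, depend on the tokenizer and its configuration):

.. code-block:: python

    tokens = [
        "Bar_None",        # a new bar begins
        "Position_0",      # position of the note within the bar
        "Pitch_60",        # middle C
        "Velocity_95",     # how loud the note is played
        "Duration_1.0.8",  # how long the note lasts
    ]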
Pianoroll has also been used with `Convolutional Neural Networks (CNNs) `_ in past works (e.g. `MuseGan `_ ) but is now uncommon due to the limitations it imposes on the representation of musical elements. 35 | 36 | On the other hand, audio is by nature a continuous modality, as it represents the waveform of the sound itself. From a practical point of view, modeling raw waveforms with neural networks is often intractable due to the high time resolution of audio, despite works that managed to do it (`WaveNet `_ , `Jukebox `_ ). For this reason, audio has more commonly been formatted as spectrograms when used with neural networks, and used with CNNs as it conveniently takes the form of a 2-dimensional matrix with distinct continuous patterns, like images. 37 | Research in neural audio codecs has made it possible to "compress" audio waveforms into a reduced number of discrete values, allowing waveforms to be used as sequences of tokens with discrete models such as Transformers. For more details, see `SoundStream `_ and `EnCodec `_ which are respectively used with `MusicLM `_ and `MusicGen `_ . 38 | 39 | 40 | Symbolic music file formats 41 | ----------------------------- 42 | 43 | There are three major file formats for symbolic music: MIDI, ABC and MusicXML. **MidiTok supports MIDI and ABC files.** 44 | 45 | MIDI, standing for *Musical Instrument Digital Interface*, is a digital communication protocol standard in the music sector. It covers the protocol itself, the physical connector used to transmit it between devices, and a digital file format. 46 | A MIDI file stores MIDI messages as a symbolic music file. It is the most abundant file format among available music datasets. It is also the most comprehensive and versatile file format for symbolic music; as such we present it in more detail in :ref:`midi-protocol-label`. 47 | 48 | 49 | The ABC notation is a notation for symbolic music, and a file format with the extension ``abc``. Its simplicity has made it widely used to write and share traditional and folk tunes from Western Europe. 50 | Each tune begins with a few lines indicating its title, time signature, default note length, key and others. Lines following the key represent the notes. A note is indicated by its letter, followed by a ``/x`` or ``x`` to respectively divide or multiply its length by ``x`` :math:`\in \mathbb{N}^{\star}` compared to the default note length. An upper case (e.g., A) means a pitch one octave below that of a lower case (a). 51 | 52 | MusicXML is an open file format and music notation. Based on the XML file format, it is structured with the same item hierarchy. An example is shown below. 53 | 54 | .. code-block:: xml 55 | 56 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> 57 | <!DOCTYPE score-partwise PUBLIC "-//Recordare//DTD MusicXML 4.0 Partwise//EN" 58 | "http://www.musicxml.org/dtds/partwise.dtd"> 59 | <score-partwise version="4.0"> 60 | <part-list> 61 | <score-part id="P1"> 62 | <part-name>Music</part-name> 63 | </score-part> 64 | </part-list> 65 | <part id="P1"> 66 | <measure number="1"> 67 | <attributes> 68 | <divisions>1</divisions> 69 | <key><fifths>0</fifths></key> 70 | <time><beats>4</beats><beat-type>4</beat-type></time> 71 | <clef><sign>G</sign><line>2</line></clef> 72 | </attributes> 73 | <note> 74 | <pitch><step>C</step><octave>4</octave></pitch> 75 | <duration>4</duration> 76 | <type>whole</type> 77 | </note> 78 | </measure> 79 | </part> 80 | </score-partwise> 81 | 82 | 83 | The ``part-list`` references the parts, which are then written within ``part`` tags. A ``measure`` is defined with its attributes, followed by notes and their attributes. 84 | The common file extensions are ``.mxl`` and ``.musicxml``. 85 | -------------------------------------------------------------------------------- /docs/pytorch_data.rst: -------------------------------------------------------------------------------- 1 | ================================= 2 | Using MidiTok with Pytorch 3 | ================================= 4 | 5 | MidiTok features PyTorch `Dataset `_ objects to load music data during training, usually coupled with a PyTorch ``DataLoader``.
A ``Dataset`` is an object storing information about a dataset: the paths of the files to load, or the data itself stored in memory (recommended for small datasets only). 6 | When indexed, the ``Dataset`` will output dictionaries with values corresponding to the inputs and labels. 7 | 8 | Loading data 9 | -------------------------- 10 | 11 | MidiTok provides two dataset classes: :class:`miditok.pytorch_data.DatasetMIDI` and :class:`miditok.pytorch_data.DatasetJSON`. 12 | 13 | :class:`miditok.pytorch_data.DatasetMIDI` loads MIDI files and can either tokenize them on the fly when the dataset is indexed, or pre-tokenize them at creation time and keep the token ids in memory. **For most use cases, this Dataset should fulfill your needs and is recommended.** 14 | 15 | :class:`miditok.pytorch_data.DatasetJSON` loads JSON files containing token ids. It requires the dataset to be tokenized beforehand. This dataset is only compatible with JSON files saved as "one token stream" (``tokenizer.one_token_stream``). In order to use it for all the tracks of a multi-stream tokenizer, you will need to save each track's token sequence as a separate JSON file. 16 | 17 | Preparing data 18 | -------------------------- 19 | 20 | When training a model, you will likely want to limit the token sequence length in order not to run out of memory. The dataset classes handle such cases and can trim the token sequences. However, **it is not uncommon for a single MIDI to be tokenized into sequences of several thousand tokens, depending on its duration and number of notes. In such cases, using only the first portion of the token sequence would considerably reduce the amount of data used to train and test a model.** 21 | 22 | To handle such cases, MidiTok provides the :py:func:`miditok.pytorch_data.split_files_for_training` method to dynamically split MIDI files into chunks that should be tokenized into approximately the number of tokens you want. 23 | If you cannot fit most of your MIDIs into single usable token sequences, we recommend splitting your dataset with this method. 24 | 25 | Data loading example 26 | -------------------------- 27 | 28 | MidiTok also provides an "all-in-one" data collator, :class:`miditok.pytorch_data.DataCollator`, to be used with a PyTorch ``DataLoader`` in order to pad batches and create attention masks. 29 | Here is a complete example showing how to use this module to train any model. 30 | 31 | ..
code-block:: python 32 | 33 | from miditok import REMI, TokenizerConfig 34 | from miditok.pytorch_data import DatasetMIDI, DataCollator, split_files_for_training 35 | from torch.utils.data import DataLoader 36 | from pathlib import Path 37 | 38 | # Creating a multitrack tokenizer configuration, read the doc to explore other parameters 39 | config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True) 40 | tokenizer = REMI(config) 41 | 42 | # Train the tokenizer with Byte Pair Encoding (BPE) 43 | midi_paths = list(Path("path", "to", "midis").glob("**/*.mid")) 44 | tokenizer.train(vocab_size=30000, files_paths=midi_paths) 45 | tokenizer.save_params(Path("path", "to", "save", "tokenizer.json")) 46 | # And pushing it to the Hugging Face hub (you can download it back with .from_pretrained) 47 | tokenizer.push_to_hub("username/model-name", private=True, token="your_hf_token") 48 | 49 | # Split MIDIs into smaller chunks for training 50 | dataset_chunks_dir = Path("path", "to", "midi_chunks") 51 | split_files_for_training( 52 | files_paths=midi_paths, 53 | tokenizer=tokenizer, 54 | save_dir=dataset_chunks_dir, 55 | max_seq_len=1024, 56 | ) 57 | 58 | # Create a Dataset, a DataLoader and a collator to train a model 59 | dataset = DatasetMIDI( 60 | files_paths=list(dataset_chunks_dir.glob("**/*.mid")), 61 | tokenizer=tokenizer, 62 | max_seq_len=1024, 63 | bos_token_id=tokenizer["BOS_None"], 64 | eos_token_id=tokenizer["EOS_None"], 65 | ) 66 | collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True) 67 | dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator) 68 | 69 | # Iterate over the dataloader to train a model 70 | for batch in dataloader: 71 | print("Train your model on this batch...") 72 | 73 | **Note:** This module is imported only if ``torch`` is installed in your Python environment. 74 | 75 | .. automodule:: miditok.pytorch_data 76 | :members: 77 | -------------------------------------------------------------------------------- /docs/sequential_models.rst: -------------------------------------------------------------------------------- 1 | .. _sequential-models-label: 2 | 3 | =================================== 4 | Sequential models and tokens 5 | =================================== 6 | 7 | This page introduces the basic concepts of sequential models, often called "language models" as they are commonly used for natural language, which can be trained on music data with MidiTok. 8 | 9 | 10 | Sequential models 11 | ---------------------------- 12 | 13 | We qualify as a sequential model any model that takes as input **sequences of discrete elements**. `RNN `_\, `Long Short Term Memory (LSTM) `_ and `Transformers `_ fall into this category. As a general rule, the operation of these models, noted :math:`p_\theta`, can be formulated as :math:`p_\theta (\mathbf{x}) = y` where :math:`\mathbf{x} \in \mathbb{N}^n` is a sequence of :math:`n` elements (integers here) and :math:`y` can either be a scalar or a sequence. The common feature of these models is that :math:`y` **is conditioned on all the elements from** :math:`\mathbf{x}`. 14 | 15 | .. _transformer-label: 16 | 17 | .. figure:: /assets/transformer.png 18 | :alt: Schema of a Transformer model 19 | :class: with-shadow 20 | :width: 500px 21 | 22 | Schema of a "seq2seq" Transformer model. 23 | 24 | A sequential model can be "seq2seq", "encoder-only" or "decoder-only". 25 | seq2seq means that the model is composed of an encoder and a decoder.
The model's encoder processes an input sequence into intermediate **hidden states**, which condition the decoder that **autoregressively** generates the output sequence. This architecture is commonly used for translation tasks, where the input sequence is in one language and the decoder generates its translation in another one. 26 | 27 | In a seq2seq configuration, the encoder is usually **bi-directional**, meaning that all the output hidden states are conditioned on all the input elements, whereas the decoder is **causal**, meaning that the logits of a position :math:`t` are conditioned only on the input elements at positions :math:`\leq t`, i.e. the previous ones. 28 | 29 | An encoder-only model (e.g. `BERT `_\) is better suited for non-generative tasks, e.g. classification. On the other hand, a decoder-only model is usually designed to generate content. As each position is conditioned on the previous ones, the model is usually trained with **teacher forcing** to predict the next element. Consequently, it can be used to generate content **autoregressively**, i.e. one element after another over :math:`n` iterations, by appending the element generated at a given iteration to the input sequence of the next one. 30 | 31 | 32 | Tokens and vocabulary 33 | ---------------------------- 34 | 35 | This section focuses more specifically on the nature of the inputs of the models. 36 | 37 | Until now, we referred to the sequences as holding "elements" representing discrete attributes of the data. These elements are commonly called **tokens**, and **are fed to a model as integers**. For natural language, these tokens can represent words or parts of words. A sentence can then be tokenized into a sequence of tokens representing the words and punctuation. For symbolic music, tokens can represent the values of the note attributes (pitch, velocity, duration) or time events. The conversion of raw data to tokens is done by a **tokenizer**, which reads it and serializes it into sequences of tokens from its vocabulary. 38 | 39 | The **vocabulary** of a tokenizer is the finite set of all distinct known tokens. For natural language, it represents the set of words, subwords, punctuation and unicode characters. **Each token is associated to a unique id**, its index in the vocabulary, which is fed to a model. A vocabulary is usually (as in MidiTok) a dictionary acting as a lookup table linking tokens (their text forms) to their ids (integer form). 40 | 41 | 42 | Embeddings 43 | ---------------------------- 44 | 45 | This section introduces the notion of embedding, sometimes called *embedding vector* or *word embedding*. 46 | 47 | Vocabularies are often made of thousands of tokens, each of them having a whole variety of meanings. In order for a sequential model to efficiently process them, it must be able to capture their semantic information and features. This step is handled thanks to **embeddings**. 48 | 49 | An embedding :math:`\mathbf{e} \in \mathbb{R}^d` is a vector of :math:`d` dimensions, which represents the semantic information of the associated token. The embeddings are **contextually learned** by the model during training, meaning their positions are adjusted according to the context in which they are found in the data. Embeddings with similar semantics/meanings will be closer in the **continuous embedding space** of the model than embeddings with no related meanings. They offer the model a way to capture the semantics of tokens across these dimensions. 50 | 51 | ..
figure:: /assets/embeddings.png 52 | :alt: Embedding space. 53 | :class: with-shadow 54 | :width: 500px 55 | 56 | Visualization of an embedding space reduced to 2 dimensions with `TSNE `_\. 57 | 58 | The embeddings are actually the real input of a sequential model. Each token id acts as an index for the model's embedding matrix. In :ref:`transformer-label`, the first operation consists of indexing this matrix with the token ids to get their embeddings, which are then processed by the model. 59 | 60 | MidiTok allows you to leverage the features of model embeddings by training the tokenizer (:ref:`training-tokenizer-label`). 61 | -------------------------------------------------------------------------------- /docs/tokenizations.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Tokenizations 3 | ================= 4 | 5 | This page details the tokenizations featured by MidiTok. They inherit from :class:`miditok.MusicTokenizer`; see its documentation to learn how to use the common methods. For each of them, the token equivalent of the lead sheet below is shown. 6 | 7 | .. image:: /assets/music_sheet.png 8 | :width: 800 9 | :alt: Music sheet example 10 | 11 | REMI 12 | ------------------------ 13 | 14 | .. image:: /assets/remi.png 15 | :width: 800 16 | :alt: REMI sequence, time is tracked with Bar and Position tokens 17 | 18 | .. autoclass:: miditok.REMI 19 | :show-inheritance: 20 | 21 | REMIPlus 22 | ------------------------ 23 | 24 | REMI+ is an extended version of :ref:`REMI` (Huang and Yang) for general multi-track, multi-signature symbolic music sequences, introduced in `FIGARO (Rütte et al.) `_, which handles multiple instruments by adding ``Program`` tokens before the ``Pitch`` ones. 25 | 26 | You can get the REMI+ tokenization by using the :ref:`REMI` tokenizer with ``config.use_programs``, ``config.one_token_stream_for_programs`` and ``config.use_time_signatures`` enabled. 27 | 28 | MIDI-Like 29 | ------------------------ 30 | 31 | .. image:: /assets/midi_like.png 32 | :width: 800 33 | :alt: MIDI-Like token sequence, with TimeShift and NoteOff tokens 34 | 35 | .. autoclass:: miditok.MIDILike 36 | :show-inheritance: 37 | 38 | TSD 39 | ------------------------ 40 | 41 | .. image:: /assets/tsd.png 42 | :width: 800 43 | :alt: TSD sequence, like MIDI-Like with Duration tokens 44 | 45 | .. autoclass:: miditok.TSD 46 | :show-inheritance: 47 | 48 | Structured 49 | ------------------------ 50 | 51 | .. image:: /assets/structured.png 52 | :width: 800 53 | :alt: Structured tokenization, the token types always follow the same succession pattern 54 | 55 | .. autoclass:: miditok.Structured 56 | :show-inheritance: 57 | 58 | CPWord 59 | ------------------------ 60 | 61 | .. image:: /assets/cp_word.png 62 | :width: 800 63 | :alt: CP Word sequence, tokens of the same family are grouped together 64 | 65 | .. autoclass:: miditok.CPWord 66 | :show-inheritance: 67 | 68 | Octuple 69 | ------------------------ 70 | 71 | .. image:: /assets/octuple.png 72 | :width: 800 73 | :alt: Octuple sequence, with bar and position embeddings 74 | 75 | .. autoclass:: miditok.Octuple 76 | :show-inheritance: 77 | 78 | MuMIDI 79 | ------------------------ 80 | 81 | .. image:: /assets/mumidi.png 82 | :width: 800 83 | :alt: MuMIDI sequence, with bar and position embeddings 84 | 85 | .. autoclass:: miditok.MuMIDI 86 | :show-inheritance: 87 | 88 | MMM 89 | ------------------------ 90 | 91 | ..
autoclass:: miditok.MMM 92 | :show-inheritance: 93 | 94 | PerTok 95 | ------------------------ 96 | 97 | .. autoclass:: miditok.PerTok 98 | :show-inheritance: 99 | 100 | 101 | Create yours 102 | ------------------------ 103 | 104 | You can easily create your own tokenizer and benefit from the MidiTok framework. Just create a class inheriting from :class:`miditok.MusicTokenizer`, and override: 105 | 106 | * :py:func:`miditok.MusicTokenizer._add_time_events` to create time events from global and track events; 107 | * :py:func:`miditok.MusicTokenizer._tokens_to_score` to decode tokens into a ``Score`` object; 108 | * :py:func:`miditok.MusicTokenizer._create_vocabulary` to create the tokenizer's vocabulary; 109 | * :py:func:`miditok.MusicTokenizer._create_token_types_graph` to create the possible token type successions (used for eval only). 110 | 111 | If needed, you can override the methods: 112 | 113 | * :py:func:`miditok.MusicTokenizer._score_to_tokens`, the main method calling the specific tokenization methods; 114 | * :py:func:`miditok.MusicTokenizer._create_track_events` to include special track events; 115 | * :py:func:`miditok.MusicTokenizer._create_global_events` to include special global events. 116 | 117 | If you think people can benefit from it, feel free to send a pull request on `Github `_. 118 | -------------------------------------------------------------------------------- /docs/tokenizing_music_with_miditok.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Bases of MidiTok 3 | ================= 4 | 5 | This page introduces the bases of MidiTok: how a tokenizer works and what the basic elements of MidiTok are. 6 | 7 | MidiTok's workflow 8 | ------------------------ 9 | 10 | MidiTok uses a common workflow for all its tokenizers, which follows: 11 | 12 | 1. **Music file preprocessing**: time is **downsampled** to match the tokenizer's time resolution, tracks with the same programs are merged, notes with pitches outside the tokenizer's pitch range are removed, note velocities and tempos are downsampled, and finally notes, tempos and time signatures are deduplicated; 13 | 2. **Parsing of global events**: tempo and time signature tokens are created; 14 | 3. **Parsing of the track events**: notes, chords, controls (pedals...) and tokens specific to each track are parsed to create their associated tokens; 15 | 4. **Creating time tokens**: the tokens representing time are created in order to bind the previously created global and track tokens. 16 | 17 | The resulting tokens are provided by the tokenizer as one or several :class:`miditok.TokSequence`, depending on the tokenizer's IO format (:ref:`Tokens & TokSequence input / output format`). 18 | 19 | The first three steps are common to all tokenizers, while the fourth is handled independently by each tokenizer. 20 | The first step formats the music file so that its content fits the tokenizer's vocabulary before being parsed. 21 | 22 | 23 | Vocabulary 24 | ------------------------ 25 | 26 | As introduced in :ref:`Tokens and vocabulary`, the vocabulary acts as a lookup table between the tokens (strings) and their ids (integers). 27 | It can be accessed with ``tokenizer.vocab`` to get the string-to-id mapping. 28 | 29 | For tokenizers with embedding pooling (e.g. :ref:`CPWord` or :ref:`Octuple`), ``tokenizer.vocab`` will be a list of dictionaries, and the ``tokenizer.is_multi_voc`` property will be ``True``.
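For example, a minimal sketch assuming a default :ref:`REMI` tokenizer (whose default pitch range includes a ``Pitch_60`` token):

.. code-block:: python

    from miditok import REMI

    tokenizer = REMI()
    token_id = tokenizer.vocab["Pitch_60"]  # integer id of the Pitch_60 token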
30 | 31 | **With a trained tokenizer:** 32 | ``tokenizer.vocab`` holds all the basic tokens describing the note and time attributes of music. By analogy with text, this vocabulary can be seen as the alphabet of unique characters. 33 | After :ref:`Training a tokenizer`, a new vocabulary is built with tokens newly created from pairs of basic tokens. This vocabulary can be accessed with ``tokenizer.vocab_model``, and maps tokens as bytes (string) to their associated ids (int). This is the vocabulary of the 🤗tokenizers model. 34 | 35 | TokSequence 36 | ------------------------ 37 | 38 | The methods of MidiTok use :class:`miditok.TokSequence` objects as inputs and outputs. A :class:`miditok.TokSequence` holds tokens as strings, integers, ``miditok.Event`` objects and bytes (used internally to encode the token ids with trained tokenizers). TokSequences are subscriptable, can be sliced and concatenated, and implement the ``__len__`` magic method. 39 | 40 | You can use the :py:func:`miditok.MusicTokenizer.complete_sequence` method to automatically fill the non-initialized attributes of a :class:`miditok.TokSequence`. 41 | 42 | .. autoclass:: miditok.TokSequence 43 | :members: 44 | 45 | 46 | The MusicTokenizer class 47 | ------------------------ 48 | 49 | MidiTok features several MIDI tokenizations, all inheriting from the :class:`miditok.MusicTokenizer` class. 50 | You can customize your tokenizer by creating it with a custom :class:`miditok.TokenizerConfig`. 51 | 52 | .. autoclass:: miditok.MusicTokenizer 53 | :members: 54 | 55 | 56 | Tokens & TokSequence input / output format 57 | -------------------------------------------- 58 | 59 | Depending on the tokenizer in use, the **format** of the tokens returned by the :py:func:`miditok.MusicTokenizer.encode` method may vary, as well as the expected format for the :py:func:`miditok.MusicTokenizer.decode` method. The format is given by the :py:func:`miditok.MusicTokenizer.io_format` property. For any tokenizer, the format is the same for both methods. 60 | 61 | The format is deduced from the :py:func:`miditok.MusicTokenizer.is_multi_voc` and ``one_token_stream`` tokenizer attributes. 62 | ``one_token_stream`` determines whether the tokenizer outputs a unique :class:`miditok.TokSequence` covering all the tracks of a music file, or one :class:`miditok.TokSequence` per track. It is equal to ``tokenizer.config.one_token_stream_for_programs``, except for :class:`miditok.MMM` for which it is enabled while ``one_token_stream_for_programs`` is False. 63 | :py:func:`miditok.MusicTokenizer.is_multi_voc` being True means that each "token" within a :class:`miditok.TokSequence` is actually a list of ``C`` "sub-tokens", ``C`` being the number of sub-token classes. 64 | 65 | This results in four situations, where ``I`` (instrument) is the number of tracks, ``T`` (token) is the number of tokens and ``C`` (class) the number of sub-tokens per token step, as illustrated by the sketch after this list: 66 | 67 | * ``is_multi_voc`` and ``one_token_stream`` are both ``False``: ``[I,(T)]``; 68 | * ``is_multi_voc`` is ``False`` and ``one_token_stream`` is ``True``: ``(T)``; 69 | * ``is_multi_voc`` is ``True`` and ``one_token_stream`` is ``False``: ``[I,(T,C)]``; 70 | * ``is_multi_voc`` and ``one_token_stream`` are both ``True``: ``(T,C)``.
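A minimal sketch inspecting this property (the printed values are indicative; the exact formats depend on the tokenizer and its configuration, as detailed below):

.. code-block:: python

    from miditok import REMI, Octuple

    # One token sequence per track, single vocabulary
    print(REMI().io_format)     # expected: ("I", "T")
    # A single multi-vocabulary token stream for all tracks
    print(Octuple().io_format)  # expected: ("T", "C")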
71 | 72 | **Note that if there is no I dimension in the format, the output of** :py:func:`miditok.MusicTokenizer.encode` **is a** :class:`miditok.TokSequence` **object, otherwise it is a list of** :class:`miditok.TokSequence` **objects (one per token stream / track).** 73 | 74 | Some tokenizer examples to illustrate: 75 | 76 | * **TSD** without ``config.use_programs`` will not have multiple vocabularies and will treat each track as a distinct stream of tokens, hence it will convert music files to a list of :class:`miditok.TokSequence` objects, ``(I,T)`` format. 77 | * **TSD** with ``config.use_programs`` being True will convert all tracks to a single stream of tokens, hence one :class:`miditok.TokSequence` object, ``(T)`` format. 78 | * **CPWord** is a multi-voc tokenizer; without ``config.use_programs`` it will treat each track as a distinct stream of tokens, hence it will convert music files to a list of :class:`miditok.TokSequence` objects with the ``(I,T,C)`` format. 79 | * **Octuple** is a multi-voc tokenizer and converts all tracks to a single stream of tokens, hence it will convert music files to a :class:`miditok.TokSequence` object, ``(T,C)`` format. 80 | 81 | 82 | Magic methods 83 | ------------------------ 84 | 85 | `Magic methods `_ allow intuitive access to a tokenizer's attributes and methods. We list them here with some examples. 86 | 87 | .. autofunction:: miditok.MusicTokenizer.__call__ 88 | :noindex: 89 | .. code-block:: python 90 | 91 | tokens = tokenizer(score) 92 | score2 = tokenizer(tokens) 93 | 94 | .. autofunction:: miditok.MusicTokenizer.__getitem__ 95 | :noindex: 96 | .. code-block:: python 97 | 98 | pad_token = tokenizer["PAD_None"] 99 | 100 | .. autofunction:: miditok.MusicTokenizer.__len__ 101 | :noindex: 102 | .. code-block:: python 103 | 104 | num_classes = len(tokenizer) 105 | num_classes_per_vocab = tokenizer.len # applicable to tokenizers with embedding pooling, e.g. CPWord or Octuple 106 | 107 | .. autofunction:: miditok.MusicTokenizer.__eq__ 108 | :noindex: 109 | .. code-block:: python 110 | 111 | if tokenizer1 == tokenizer2: 112 | print("The tokenizers have the same vocabulary and configurations!") 113 | 114 | 115 | Save / Load a tokenizer 116 | ------------------------ 117 | 118 | You can save and load a tokenizer, including its configuration and vocabulary. This is especially useful after :ref:`Training a tokenizer`. 119 | 120 | .. autofunction:: miditok.MusicTokenizer.save 121 | :noindex: 122 | 123 | To load a tokenizer from saved parameters, just use the ``params`` argument when creating it: 124 | 125 | .. code-block:: python 126 | 127 | tokenizer = REMI(params=Path("to", "tokenizer.json")) 128 | -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Utils methods 3 | ======================== 4 | 5 | .. automodule:: miditok.utils 6 | :members: 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "miditok" 7 | version = "3.0.5.post1" 8 | description = "MIDI / symbolic music tokenizers for Deep Learning models."
9 | readme = {file = "README.md", content-type = "text/markdown"} 10 | license = {file = "LICENSE"} 11 | requires-python = ">=3.9" 12 | authors = [ 13 | { name = "Nathan Fradet" }, 14 | ] 15 | keywords = [ 16 | "artificial intelligence", 17 | "deep learning", 18 | "transformer", 19 | "midi", 20 | "tokenization", 21 | "music", 22 | "mir", 23 | ] 24 | classifiers = [ 25 | "Intended Audience :: Developers", 26 | "Intended Audience :: Science/Research", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | "Topic :: Multimedia :: Sound/Audio :: MIDI", 29 | "License :: OSI Approved :: MIT License", 30 | "Programming Language :: Python", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "Programming Language :: Python :: 3.9", 33 | "Programming Language :: Python :: 3.10", 34 | "Programming Language :: Python :: 3.11", 35 | "Programming Language :: Python :: 3.12", 36 | "Programming Language :: Python :: 3.13", 37 | "Operating System :: OS Independent", 38 | ] 39 | dependencies = [ 40 | "numpy>=1.19", 41 | "symusic>=0.5.0", 42 | "tqdm", 43 | "tokenizers>=0.13.0", 44 | "huggingface_hub>=0.16.4", 45 | ] 46 | 47 | [project.optional-dependencies] 48 | tests = [ 49 | "pytest-cov", 50 | "pytest-xdist[psutil]", 51 | "torch", 52 | "tensorflow", 53 | "miditoolkit", 54 | ] 55 | docs = [ 56 | "furo", # theme 57 | "sphinx-copybutton", 58 | "torch", # for pytorch_data module 59 | # "sphinxcontrib-tikz", 60 | ] 61 | 62 | [project.urls] 63 | Homepage = "https://github.com/Natooz/MidiTok" 64 | Repository = "https://github.com/Natooz/MidiTok.git" 65 | Documentation = "https://miditok.readthedocs.io" 66 | Issues = "https://github.com/Natooz/MidiTok/issues" 67 | 68 | [tool.hatch.build.targets.wheel] 69 | packages = ["src/miditok"] 70 | only-packages = true 71 | 72 | [tool.hatch.version] 73 | path = "src/miditok/__init__.py" 74 | 75 | [mypy] 76 | warn_return_any = "True" 77 | warn_unused_configs = "True" 78 | plugins = "numpy.typing.mypy_plugin" 79 | exclude = [ 80 | "venv", 81 | ".venv", 82 | ] 83 | 84 | [tool.pytest.ini_options] 85 | pythonpath = "src" 86 | addopts = [ 87 | "--import-mode=importlib", 88 | ] 89 | 90 | [tool.coverage.report] 91 | exclude_also = [ 92 | "def __repr__", 93 | ] 94 | omit = [ 95 | # files to omit to check 96 | "benchmarks/*" 97 | ] 98 | 99 | [tool.ruff] 100 | target-version = "py313" 101 | 102 | [tool.ruff.lint] 103 | extend-select = [ 104 | "ARG", 105 | "A", 106 | "ANN", 107 | "B", 108 | "BLE", 109 | "C4", 110 | "COM", 111 | "D", 112 | "E", 113 | "EM", 114 | "EXE", 115 | "F", 116 | "FA", 117 | "FBT", 118 | "G", 119 | "I", 120 | "ICN", 121 | "INP", 122 | "INT", 123 | "ISC", 124 | "N", 125 | "NPY", 126 | "PERF", 127 | "PGH", 128 | "PTH", 129 | "PIE", 130 | # "PL", 131 | "PT", 132 | "Q", 133 | "RET", 134 | "RSE", 135 | "RUF", 136 | "S", 137 | # "SLF", 138 | "SIM", 139 | "T", 140 | "TCH", 141 | "TID", 142 | "UP", 143 | "W", 144 | ] 145 | 146 | # Each rule exclusion should be explained here. 147 | # By default, we think it is better to select groups of rules (above), and exclude 148 | # specific problematic rules, instead of selecting specific rules. By doing so, in case 149 | # the ruff rules groups change, this requires us to check and handle the new rules or 150 | # changes, making sure we stay up to date and keep the best practices. 151 | 152 | # ANN003: 153 | # Would mostly apply to args/kwargs that are passed to methods from dependencies, for 154 | # which the signature can change depending on the version. 
This would either be too 155 | # difficult to comply with and/or would add a lot of noqa exceptions. ANN002 is used as it 156 | # adds very few "noqa" exceptions, but ANN003 would add too much complexity. 157 | 158 | # ANN101 and ANN102: 159 | # Yields errors for `self` in methods from classes, which is unnecessary. 160 | # The existence of these rules is currently questioned, they are likely to be removed. 161 | # https://github.com/astral-sh/ruff/issues/4396 162 | 163 | # B905 164 | # The `strict` keyword argument for the `zip` built-in method appeared with Python 165 | # 3.10. As we support previous versions, we cannot comply (yet) with this rule. The 166 | # exclusion should be removed when MidiTok drops support for Python 3.9. 167 | 168 | # D107 169 | # We document classes at the class level (D101). This documentation should cover the 170 | # way classes are initialized. So we do not document `__init__` methods. 171 | 172 | # D203 173 | # "one-blank-line-before-class", incompatible with D211 (blank-line-before-class). 174 | # We follow PEP 257 and other conventions by preferring D211 over D203. 175 | 176 | # D212 177 | # "multi-line-summary-first-line", incompatible with D213 178 | # (multi-line-summary-second-line). 179 | # We follow PEP 257, which recommends putting the summary line on the second line, 180 | # after the blank line following the opening quotes. 181 | 182 | # FBT001 and FBT002 183 | # Refactoring all the methods to make boolean arguments keyword only would add 184 | # complexity and could break code of users. It's ok to have booleans as positional 185 | # arguments with default values. For code readability though, we enable FBT003. 186 | 187 | # COM812: 188 | # Yields errors for one-line portions without a trailing comma. Trailing commas are 189 | # automatically set with ruff format anyway. This exclusion could be removed when this 190 | # behavior is fixed in ruff. 191 | 192 | # UP038 193 | # Recommends using `|` type unions with `isinstance`, which is only supported since 194 | # Python 3.10. The exclusion should be removed when MidiTok drops support for Python 3.9. 195 | 196 | # (ISC001) 197 | # May cause conflicts when used with the ruff formatter. They recommend disabling it. 198 | # We leave it enabled but keep this in mind. 199 | 200 | ignore = [ 201 | "ANN003", 202 | "ANN101", 203 | "ANN102", 204 | "B905", 205 | "COM812", 206 | "D107", 207 | "D203", 208 | "D212", 209 | "FBT001", 210 | "FBT002", 211 | "UP038", 212 | ] 213 | 214 | [tool.ruff.lint.per-file-ignores] 215 | # S105: 216 | # we don't use passwords in MidiTok, only the HF token for the interactions with the 217 | # hub. However we have a lot of variables with "token"(s) in their name, which would 218 | # yield a lot of lint errors or require a lot of noqa exceptions. 219 | "src/miditok/**" = [ 220 | "S105", 221 | ] 222 | "tests/**" = [ 223 | "ANN201", # allow no return type hint for pytest methods 224 | "D103", # no need to document pytest methods 225 | "S101", # allow assertions in tests 226 | "T201", # print allowed 227 | ] 228 | "docs/conf.py" = ["INP001"] # not a package 229 | -------------------------------------------------------------------------------- /src/miditok/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Root module. 3 | 4 | Here we only import tokenizer classes and submodules.
5 | """ 6 | 7 | from miditok import data_augmentation 8 | 9 | from .classes import Event, TokenizerConfig, TokSequence 10 | from .midi_tokenizer import MusicTokenizer 11 | from .tokenizations import ( 12 | MMM, 13 | REMI, 14 | TSD, 15 | CPWord, 16 | MIDILike, 17 | MuMIDI, 18 | Octuple, 19 | PerTok, 20 | Structured, 21 | ) 22 | from .tokenizer_training_iterator import TokTrainingIterator 23 | 24 | __all__ = [ 25 | "MusicTokenizer", 26 | "Event", 27 | "TokSequence", 28 | "TokenizerConfig", 29 | "TokTrainingIterator", 30 | "MIDILike", 31 | "REMI", 32 | "TSD", 33 | "Structured", 34 | "Octuple", 35 | "CPWord", 36 | "MuMIDI", 37 | "MMM", 38 | "PerTok", 39 | "utils", 40 | "data_augmentation", 41 | ] 42 | 43 | try: 44 | from miditok import pytorch_data # noqa: F401 45 | 46 | __all__.append("pytorch_data") 47 | except ImportError: 48 | pass 49 | -------------------------------------------------------------------------------- /src/miditok/attribute_controls/__init__.py: -------------------------------------------------------------------------------- 1 | """Attribute controls module.""" 2 | 3 | from .bar_attribute_controls import ( 4 | BarNoteDensity, 5 | BarNoteDuration, 6 | BarOnsetPolyphony, 7 | BarPitchClass, 8 | ) 9 | from .classes import AttributeControl, BarAttributeControl, create_random_ac_indexes 10 | from .track_attribute_controls import ( 11 | TrackNoteDensity, 12 | TrackNoteDuration, 13 | TrackOnsetPolyphony, 14 | TrackRepetition, 15 | ) 16 | 17 | __all__ = ( 18 | "AttributeControl", 19 | "BarAttributeControl", 20 | "BarNoteDensity", 21 | "BarNoteDuration", 22 | "BarOnsetPolyphony", 23 | "BarPitchClass", 24 | "TrackRepetition", 25 | "TrackNoteDuration", 26 | "TrackNoteDensity", 27 | "TrackOnsetPolyphony", 28 | "create_random_ac_indexes", 29 | ) 30 | -------------------------------------------------------------------------------- /src/miditok/attribute_controls/bar_attribute_controls.py: -------------------------------------------------------------------------------- 1 | """Bar-level attribute controls modules.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from miditok import Event 8 | 9 | from .classes import BarAttributeControl 10 | 11 | 12 | class BarOnsetPolyphony(BarAttributeControl): 13 | """ 14 | Onset polyphony attribute control at the bar level. 15 | 16 | It specifies the minimum and maximum number of notes played simultaneously at a 17 | given time onset. 18 | It can be enabled with the ``ac_polyphony_bar`` argument of 19 | :class:`miditok.TokenizerConfig`. 20 | 21 | :param polyphony_min: minimum number of simultaneous notes to consider. 22 | :param polyphony_max: maximum number of simultaneous notes to consider. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | polyphony_min: int, 28 | polyphony_max: int, 29 | ) -> None: 30 | self.min_polyphony = polyphony_min 31 | self.max_polyphony = polyphony_max 32 | super().__init__( 33 | tokens=[ 34 | f"{tok_type}_{val}" 35 | for tok_type in ("ACBarOnsetPolyphonyMin", "ACBarOnsetPolyphonyMax") 36 | for val in range(polyphony_min, polyphony_max + 1) 37 | ], 38 | ) 39 | 40 | def _compute_on_bar( 41 | self, 42 | notes_soa: dict[str, np.ndarray], 43 | controls_soa: dict[str, np.ndarray], 44 | pitch_bends_soa: dict[str, np.ndarray], 45 | time_division: int, 46 | ) -> list[Event]: 47 | del controls_soa, pitch_bends_soa, time_division 48 | _, counts_onsets = np.unique(notes_soa["time"], return_counts=True) 49 | onset_poly_min, onset_poly_max = np.min(counts_onsets), np.max(counts_onsets) 50 | 51 | min_poly = min(max(onset_poly_min, self.min_polyphony), self.max_polyphony) 52 | max_poly = min(onset_poly_max, self.max_polyphony) 53 | return [ 54 | Event("ACBarOnsetPolyphonyMin", min_poly), 55 | Event("ACBarOnsetPolyphonyMax", max_poly), 56 | ] 57 | 58 | 59 | class BarPitchClass(BarAttributeControl): 60 | """ 61 | Bar-level pitch classes attribute control. 62 | 63 | This attribute control specifies which pitch classes are present within a bar. 64 | """ 65 | 66 | def __init__(self) -> None: 67 | super().__init__(tokens=[f"ACBarPitchClass_{i}" for i in range(12)]) 68 | 69 | def _compute_on_bar( 70 | self, 71 | notes_soa: dict[str, np.ndarray], 72 | controls_soa: dict[str, np.ndarray], 73 | pitch_bends_soa: dict[str, np.ndarray], 74 | time_division: int, 75 | ) -> list[Event]: 76 | del controls_soa, pitch_bends_soa, time_division 77 | pitch_values = notes_soa["pitch"] % 12 78 | pitch_values = np.unique(pitch_values) 79 | return [Event("ACBarPitchClass", pitch) for pitch in pitch_values] 80 | 81 | 82 | class BarNoteDensity(BarAttributeControl): 83 | """ 84 | Bar-level note density attribute control. 85 | 86 | It specifies the number of notes per bar. If a bar contains more than the maximum 87 | density (``density_max``), a ``density_max+`` token will be returned. 88 | 89 | :param density_max: maximum note density per bar to consider. 90 | """ 91 | 92 | def __init__(self, density_max: int) -> None: 93 | self.density_max = density_max 94 | super().__init__( 95 | tokens=[ 96 | *(f"ACBarNoteDensity_{i}" for i in range(density_max)), 97 | f"ACBarNoteDensity_{self.density_max}+", 98 | ], 99 | ) 100 | 101 | def _compute_on_bar( 102 | self, 103 | notes_soa: dict[str, np.ndarray], 104 | controls_soa: dict[str, np.ndarray], 105 | pitch_bends_soa: dict[str, np.ndarray], 106 | time_division: int, 107 | ) -> list[Event]: 108 | del controls_soa, pitch_bends_soa, time_division 109 | n_notes = len(notes_soa["time"]) 110 | if n_notes >= self.density_max: 111 | return [Event("ACBarNoteDensity", f"{self.density_max}+")] 112 | return [Event("ACBarNoteDensity", n_notes)] 113 | 114 | 115 | class BarNoteDuration(BarAttributeControl): 116 | """ 117 | Note duration attribute control. 118 | 119 | This attribute control specifies the note durations (whole, half, quarter, eighth, 120 | sixteenth and thirty-second) present in a bar.
121 | """ 122 | 123 | def __init__(self) -> None: 124 | self._note_durations = ( 125 | "Whole", 126 | "Half", 127 | "Quarter", 128 | "Eight", 129 | "Sixteenth", 130 | "ThirtySecond", 131 | ) 132 | super().__init__( 133 | tokens=[ 134 | f"ACBarNoteDuration{duration}_{val}" 135 | for duration in self._note_durations 136 | for val in (0, 1) 137 | ], 138 | ) 139 | # Factors multiplying ticks/quarter time division, one per duration name 140 | self.factors = (4, 2, 1, 0.5, 0.25, 0.125) 141 | 142 | def _compute_on_bar( 143 | self, 144 | notes_soa: dict[str, np.ndarray], 145 | controls_soa: dict[str, np.ndarray], 146 | pitch_bends_soa: dict[str, np.ndarray], 147 | time_division: int, 148 | ) -> list[Event]: 149 | del controls_soa, pitch_bends_soa 150 | durations = np.unique(notes_soa["duration"]) 151 | controls = [] 152 | for fi, factor in enumerate(self.factors): 153 | controls.append( 154 | Event( 155 | f"ACBarNoteDuration{self._note_durations[fi]}", 156 | 1 if time_division * factor in durations else 0, 157 | ) 158 | ) 159 | return controls 160 | -------------------------------------------------------------------------------- /src/miditok/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data augmentation module. 3 | 4 | The module implements three public methods: 5 | 6 | * :py:func:`miditok.data_augmentation.augment_score`: augment a single score with a 7 | single set of offsets; 8 | * :py:func:`miditok.data_augmentation.augment_score_multiple_offsets`: augment a single 9 | score with combinations of offsets; 10 | * :py:func:`miditok.data_augmentation.augment_dataset`: augment a list of music 11 | files with combinations of offsets. 12 | 13 | """ 14 | 15 | from .data_augmentation import ( 16 | augment_dataset, 17 | augment_score, 18 | augment_score_multiple_offsets, 19 | ) 20 | 21 | __all__ = [ 22 | "augment_score", 23 | "augment_dataset", 24 | "augment_score_multiple_offsets", 25 | ] 26 | -------------------------------------------------------------------------------- /src/miditok/pytorch_data/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataset classes and data collators to be used with PyTorch when training a model.""" 2 | 3 | from .collators import DataCollator 4 | from .datasets import ( 5 | DatasetJSON, 6 | DatasetMIDI, 7 | ) 8 | 9 | __all__ = [ 10 | "DatasetMIDI", 11 | "DatasetJSON", 12 | "DataCollator", 13 | ] 14 | -------------------------------------------------------------------------------- /src/miditok/pytorch_data/collators.py: -------------------------------------------------------------------------------- 1 | """Collator objects for PyTorch ``DataLoader``s.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from copy import deepcopy 7 | from typing import TYPE_CHECKING, Any 8 | 9 | import torch 10 | from torch import LongTensor 11 | 12 | if TYPE_CHECKING: 13 | from collections.abc import Mapping, Sequence 14 | 15 | 16 | class DataCollator: 17 | r""" 18 | All-in-one data collator for PyTorch ``DataLoader``. 19 | 20 | It allows applying padding (on the right or left side of sequences), and prepending 21 | or appending *BOS* and *EOS* tokens. It will also add an ``"attention_mask"`` entry 22 | to the batch, following the padding applied. 23 | 24 | :param pad_token_id: padding token id. 25 | :param pad_on_left: if ``True``, it will pad the sequences on the left.
This 26 | can be required when using some libraries expecting padding on the left, for 27 | example when generating with Hugging Face Transformers. (default: ``False``) 28 | :param copy_inputs_as_labels: will add a labels entry (``labels_kwarg_name``) to 29 | the batch (or replace the existing one), which is a copy of the input entry: 30 | ``decoder_inputs_kwarg_name`` if present in the batch else 31 | ``inputs_kwarg_name``. (default: ``False``) 32 | :param shift_labels: will shift inputs and labels for autoregressive 33 | training/teacher forcing. (default: ``False``) 34 | :param labels_pad_idx: padding id for labels. (default: -100) 35 | :param inputs_kwarg_name: name of dict / kwarg key for inputs. 36 | (default: ``"input_ids"``) 37 | :param labels_kwarg_name: name of dict / kwarg key for labels. 38 | (default: ``"labels"``) 39 | :param decoder_inputs_kwarg_name: name of dict / kwarg key for decoder inputs. 40 | This key is intended to be used for encoder-decoder (seq2seq) models, for the 41 | decoder inputs while ``inputs_kwarg_name`` is for the encoder inputs. 42 | (default: ``"decoder_input_ids"``) 43 | """ 44 | 45 | def __init__( 46 | self, 47 | pad_token_id: int, 48 | pad_on_left: bool = False, 49 | copy_inputs_as_labels: bool = False, 50 | shift_labels: bool = False, 51 | labels_pad_idx: int = -100, 52 | inputs_kwarg_name: str = "input_ids", 53 | labels_kwarg_name: str = "labels", 54 | decoder_inputs_kwarg_name: str = "decoder_input_ids", 55 | ) -> None: 56 | self.pad_token = pad_token_id 57 | self.pad_on_left = pad_on_left 58 | self.copy_inputs_as_labels = copy_inputs_as_labels 59 | self.shift_labels = shift_labels 60 | self.labels_pad_idx = labels_pad_idx 61 | self.inputs_kwarg_name = inputs_kwarg_name 62 | self.labels_kwarg_name = labels_kwarg_name 63 | self.decoder_inputs_kwarg_name = decoder_inputs_kwarg_name 64 | 65 | def __call__(self, batch: list[Mapping[str, Any]]) -> Mapping[str, LongTensor]: 66 | """ 67 | Collate the sequences of a batch, making them ready to be fed to a model. 68 | 69 | :param batch: batch of sequences, as a list of dictionaries containing input ids 70 | and optionally labels. 71 | :return: the output batch as a dictionary linking to input and optionally target 72 | tensors.
73 | """ 74 | out_batch = {} 75 | inputs = [None, None, None] # x, x_dec, y 76 | 77 | # Figure out inputs 78 | for i, key in enumerate( 79 | ( 80 | self.inputs_kwarg_name, 81 | self.decoder_inputs_kwarg_name, 82 | self.labels_kwarg_name, 83 | ) 84 | ): 85 | if key in batch[0]: 86 | inputs[i] = [ 87 | sample[key] 88 | for sample in batch 89 | if sample[key] is not None and len(sample[key]) > 0 90 | ] 91 | x, x_dec, y = inputs 92 | 93 | # Copy labels, decoder input has priority over x 94 | if y is None and self.copy_inputs_as_labels: 95 | y = deepcopy(x_dec if x_dec is not None else x) 96 | 97 | # Pad inputs / convert to Tensors 98 | if x is not None: 99 | x = _pad_batch(x, self.pad_token, self.pad_on_left) 100 | if x_dec is not None: 101 | x_dec = _pad_batch(x_dec, self.pad_token, self.pad_on_left) 102 | if y is not None: 103 | # If labels are sequences of tokens 104 | if y[0].dim() > 0: 105 | y = _pad_batch(y, self.labels_pad_idx, self.pad_on_left) 106 | else: # classification 107 | y = torch.stack(y) 108 | 109 | # Shift labels, otherwise it's handled by models 110 | if self.shift_labels: 111 | if x_dec is not None: 112 | x_dec = x_dec[:, :-1] 113 | else: 114 | x = x[:, :-1] 115 | if y[0].dim() > 0: 116 | y = y[:, 1:] 117 | else: 118 | warnings.warn( 119 | "MidiTok DataCollator: You set shift_labels=True, but provided int " 120 | "labels (for sequence classification tasks), which label shifting " 121 | "is not suited for. Skipping label shifting.", 122 | stacklevel=2, 123 | ) 124 | 125 | # Add inputs / labels to output batch 126 | if x is not None: 127 | out_batch[self.inputs_kwarg_name] = x 128 | if x_dec is not None: 129 | out_batch[self.decoder_inputs_kwarg_name] = x_dec 130 | if y is not None: 131 | out_batch[self.labels_kwarg_name] = y 132 | 133 | # Create attention mask (just for padding, causal mask is handled by models) 134 | if x is not None: 135 | attention_mask = (x != self.pad_token).int() 136 | if attention_mask.dim() == 3: 137 | attention_mask = attention_mask[..., 0] # (N,T,Z) --> (N,T) 138 | out_batch["attention_mask"] = attention_mask 139 | if x_dec is not None: 140 | attention_mask = (x_dec != self.pad_token).int() 141 | if attention_mask.dim() == 3: 142 | attention_mask = attention_mask[..., 0] # (N,T,Z) --> (N,T) 143 | out_batch["decoder_attention_mask"] = attention_mask 144 | 145 | return out_batch 146 | 147 | 148 | def _pad_batch( 149 | batch: Sequence[LongTensor], 150 | pad_token_id: int, 151 | pad_on_left: bool = False, 152 | ) -> LongTensor: 153 | r""" 154 | Pad sequences of a batch. 155 | 156 | :param batch: batch as a list of Tensors. 157 | :param pad_token_id: padding token id. 158 | :param pad_on_left: if ``True``, it will pad the sequences on the left. This can 159 | be required when using some libraries expecting padding on the left, for example 160 | when generating with Hugging Face Transformers. (default: False) 161 | :return: the batch sequences, padded into a unique Tensor. 162 | """ 163 | length_of_first = batch[0].size(0) 164 | 165 | # Check if padding is necessary. 166 | are_tensors_same_length = all(x.size(0) == length_of_first for x in batch) 167 | if are_tensors_same_length: 168 | return torch.stack(batch, dim=0).long() 169 | 170 | # Pad the sequences, on the left if requested.
171 | if pad_on_left: 172 | return _pad_left(batch, pad_token_id) 173 | 174 | return torch.nn.utils.rnn.pad_sequence( 175 | batch, batch_first=True, padding_value=pad_token_id 176 | ).long() 177 | 178 | 179 | def _pad_left(batch: Sequence[LongTensor], pad_token_id: int) -> LongTensor: 180 | r""" 181 | Pad sequences on the left, i.e. on the first indices. 182 | 183 | Padding on the left makes the last element of each sequence the last token, which 184 | is convenient when generating autoregressively, as a method can then more easily 185 | and efficiently append the newly generated tokens. 186 | 187 | :param batch: batch as a list of Tensors. 188 | :param pad_token_id: padding token id. 189 | :return: the batch sequences, padded into a unique Tensor. 190 | """ 191 | batch = [torch.flip(seq, dims=(0,)) for seq in batch] 192 | batch = torch.nn.utils.rnn.pad_sequence( 193 | batch, batch_first=True, padding_value=pad_token_id 194 | ) # (N,T) 195 | return torch.flip(batch, dims=(1,)).long() 196 | -------------------------------------------------------------------------------- /src/miditok/tokenizations/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tokenizer module. 3 | 4 | This module implements tokenizer classes, which inherit from ``MusicTokenizer`` and 5 | override specific methods such as ``_add_time_events`` or ``_tokens_to_score`` with 6 | their specific behaviors/representations. 7 | """ 8 | 9 | from .cp_word import CPWord 10 | from .midi_like import MIDILike 11 | from .mmm import MMM 12 | from .mumidi import MuMIDI 13 | from .octuple import Octuple 14 | from .pertok import PerTok 15 | from .remi import REMI 16 | from .structured import Structured 17 | from .tsd import TSD 18 | 19 | __all__ = [ 20 | "MIDILike", 21 | "REMI", 22 | "TSD", 23 | "Structured", 24 | "Octuple", 25 | "CPWord", 26 | "MuMIDI", 27 | "MMM", 28 | "PerTok", 29 | ] 30 | -------------------------------------------------------------------------------- /src/miditok/tokenizer_training_iterator.py: -------------------------------------------------------------------------------- 1 | """Iterator to be used when training a tokenizer with the 🤗tokenizers library.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from symusic import Score 8 | 9 | from .attribute_controls import create_random_ac_indexes 10 | from .classes import TokSequence 11 | from .constants import SCORE_LOADING_EXCEPTION 12 | 13 | if TYPE_CHECKING: 14 | from collections.abc import Sequence 15 | from pathlib import Path 16 | 17 | from .midi_tokenizer import MusicTokenizer 18 | 19 | 20 | class TokTrainingIterator: 21 | r""" 22 | An iterable class to be used when training a tokenizer. 23 | 24 | It loads music files (MIDI, abc) and tokenizes them on the fly, to be used with the 25 | Hugging Face tokenizers library to build a vocabulary with BPE, Unigram or WordPiece 26 | models. 27 | 28 | :param tokenizer: tokenizer to use for training. 29 | :param files_paths: sequence of paths of files to load for training. 30 | :param tracks_idx_random_ratio_range: range of ratios (between 0 and 1 included) of 31 | tracks to compute attribute controls on. If ``None`` is given, the attribute 32 | controls will be computed for all the tracks. (default: ``None``) 33 | :param bars_idx_random_ratio_range: range of ratios (between 0 and 1 included) of 34 | bars to compute attribute controls on. If ``None`` is given, the attribute 35 | controls will be computed for all the bars.
(default: ``None``) 36 | """ 37 | 38 | def __init__( 39 | self, 40 | tokenizer: MusicTokenizer, 41 | files_paths: Sequence[Path], 42 | tracks_idx_random_ratio_range: tuple[float, float] | None = None, 43 | bars_idx_random_ratio_range: tuple[float, float] | None = None, 44 | ) -> None: 45 | self.tokenizer = tokenizer 46 | self.files_paths = files_paths 47 | self.tracks_idx_random_ratio_range = ( 48 | tracks_idx_random_ratio_range if tracks_idx_random_ratio_range else [] 49 | ) 50 | self.bars_idx_random_ratio_range = ( 51 | bars_idx_random_ratio_range if bars_idx_random_ratio_range else [] 52 | ) 53 | self.__iter_count = 0 54 | 55 | def load_file(self, path: Path) -> list[str]: 56 | """ 57 | Load a music file and convert it to its byte representation. 58 | 59 | :param path: path to the file to load. 60 | :return: the byte representation of the file. 61 | """ 62 | # Load and tokenize file 63 | try: 64 | score = Score(path) 65 | except SCORE_LOADING_EXCEPTION: 66 | return [] 67 | 68 | # Preprocess first to already have the appropriate tracks idx in case of deletes 69 | score = self.tokenizer.preprocess_score(score) 70 | 71 | # Randomly create attribute controls indexes 72 | ac_indexes = None 73 | if ( 74 | len(self.tracks_idx_random_ratio_range) > 0 75 | or len(self.bars_idx_random_ratio_range) > 0 76 | ): 77 | ac_indexes = create_random_ac_indexes( 78 | score, 79 | self.tokenizer.attribute_controls, 80 | self.tracks_idx_random_ratio_range, 81 | self.bars_idx_random_ratio_range, 82 | ) 83 | 84 | # Tokenize the file 85 | # Need to specify `encode_ids=False` as it might be already pretrained 86 | # For MMM, we make sure to have sequences separated per track 87 | kwargs = {} 88 | # can't use isinstance because of circular import 89 | if type(self.tokenizer).__name__ == "MMM": 90 | kwargs["concatenate_track_sequences"] = False 91 | tokseq = self.tokenizer( 92 | score, 93 | encode_ids=False, 94 | no_preprocess_score=True, 95 | attribute_controls_indexes=ac_indexes, 96 | **kwargs, 97 | ) 98 | 99 | # Split ids if requested 100 | if self.tokenizer.config.encode_ids_split in ["bar", "beat"]: 101 | if isinstance(tokseq, TokSequence): 102 | tokseq = [tokseq] 103 | 104 | new_seqs = [] 105 | for seq in tokseq: 106 | if self.tokenizer.config.encode_ids_split == "bar": 107 | new_seqs += seq.split_per_bars() 108 | else: 109 | new_seqs += seq.split_per_beats() 110 | tokseq = [seq for seq in new_seqs if len(seq) > 0] 111 | 112 | # Convert ids to bytes for training 113 | if isinstance(tokseq, TokSequence): 114 | token_ids = tokseq.ids 115 | else: 116 | token_ids = [seq.ids for seq in tokseq] 117 | bytes_ = self.tokenizer._ids_to_bytes(token_ids, as_one_str=True) 118 | if isinstance(bytes_, str): 119 | bytes_ = [bytes_] 120 | 121 | return bytes_ 122 | 123 | def __len__(self) -> int: 124 | """ 125 | Return the number of files in the training corpus. 126 | 127 | :return: number of files in the training corpus. 128 | """ 129 | return len(self.files_paths) 130 | 131 | def __getitem__(self, idx: int) -> list[str]: 132 | """ 133 | Convert the ``idx``th file to its byte representation. 134 | 135 | :param idx: idx of the file to convert. 136 | :return: byte representation of the file. 
137 | """ 138 | return self.load_file(self.files_paths[idx]) 139 | 140 | def __iter__(self) -> TokTrainingIterator: # noqa:D105 141 | return self 142 | 143 | def __next__(self) -> list[str]: # noqa:D105 144 | if self.__iter_count >= len(self): 145 | self.__iter_count = 0 146 | raise StopIteration 147 | 148 | self.__iter_count += 1 149 | return self[self.__iter_count - 1] 150 | 151 | def __str__(self) -> str: 152 | """ 153 | Return the ``str`` representation of the iterator. 154 | 155 | :return: string description. 156 | """ 157 | return f"{self.tokenizer} - {len(self)} files" 158 | -------------------------------------------------------------------------------- /src/miditok/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Module containing utils methods than can be used outside of tokenization.""" 2 | 3 | from .split import ( 4 | get_average_num_tokens_per_note, 5 | split_files_for_training, 6 | split_score_per_beats, 7 | split_score_per_note_density, 8 | split_score_per_ticks, 9 | split_score_per_tracks, 10 | split_seq_in_subsequences, 11 | split_tokens_files_to_subsequences, 12 | ) 13 | from .utils import ( 14 | compute_ticks_per_bar, 15 | compute_ticks_per_beat, 16 | concat_scores, 17 | convert_ids_tensors_to_list, 18 | detect_chords, 19 | filter_dataset, 20 | fix_offsets_overlapping_notes, 21 | get_bars_ticks, 22 | get_beats_ticks, 23 | get_num_notes_per_bar, 24 | get_score_programs, 25 | get_score_ticks_per_beat, 26 | is_track_empty, 27 | merge_same_program_tracks, 28 | merge_scores, 29 | merge_tracks, 30 | merge_tracks_per_class, 31 | num_bar_pos, 32 | remove_duplicated_notes, 33 | ) 34 | 35 | __all__ = [ 36 | "compute_ticks_per_bar", 37 | "compute_ticks_per_beat", 38 | "concat_scores", 39 | "convert_ids_tensors_to_list", 40 | "detect_chords", 41 | "filter_dataset", 42 | "fix_offsets_overlapping_notes", 43 | "get_average_num_tokens_per_note", 44 | "get_bars_ticks", 45 | "get_beats_ticks", 46 | "get_score_programs", 47 | "get_score_ticks_per_beat", 48 | "is_track_empty", 49 | "merge_scores", 50 | "merge_same_program_tracks", 51 | "merge_tracks", 52 | "merge_tracks_per_class", 53 | "num_bar_pos", 54 | "get_num_notes_per_bar", 55 | "remove_duplicated_notes", 56 | "split_score_per_beats", 57 | "split_score_per_ticks", 58 | "split_score_per_tracks", 59 | "split_files_for_training", 60 | "split_score_per_note_density", 61 | "split_tokens_files_to_subsequences", 62 | "split_seq_in_subsequences", 63 | ] 64 | -------------------------------------------------------------------------------- /tests/MIDIs_corrupted/ValueError_Control168.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_corrupted/ValueError_Control168.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Aicha.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Aicha.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/All The Small Things.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/All The Small Things.mid 
-------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Funkytown.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Funkytown.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Girls Just Want to Have Fun.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Girls Just Want to Have Fun.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/I Gotta Feeling.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/I Gotta Feeling.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/In Too Deep.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/In Too Deep.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Les Yeux Revolvers.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Les Yeux Revolvers.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Mr. Blue Sky.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Mr. 
Blue Sky.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/Shut Up.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/Shut Up.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/What a Fool Believes.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/What a Fool Believes.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/d6caebd1964d9e4a3c5ea59525230e2a.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/d6caebd1964d9e4a3c5ea59525230e2a.mid -------------------------------------------------------------------------------- /tests/MIDIs_multitrack/d8faddb8596fff7abb24d78666f73e4e.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_multitrack/d8faddb8596fff7abb24d78666f73e4e.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/6338816_Etude No. 4.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/6338816_Etude No. 4.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/6354774_Macabre Waltz.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/6354774_Macabre Waltz.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_1.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_1.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_10.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_10.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_2.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_2.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_3.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_3.mid -------------------------------------------------------------------------------- 
/tests/MIDIs_one_track/Maestro_4.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_4.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_5.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_5.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_6.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_6.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_7.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_7.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_8.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_8.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/Maestro_9.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/Maestro_9.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_008.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_008.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_010.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_010.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_022.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_022.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/POP909_191.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/POP909_191.mid -------------------------------------------------------------------------------- /tests/MIDIs_one_track/empty.mid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Natooz/MidiTok/1a06c4a3243911c23a76e5dddfeea284c4cad56a/tests/MIDIs_one_track/empty.mid 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test module. 3 | 4 | Contains the pytest cases (files beginning with "test"), test utils and benchmarks. 5 | """ 6 | -------------------------------------------------------------------------------- /tests/abc_files/a_morning_in_summer.abc: -------------------------------------------------------------------------------- 1 | X: 1 2 | T: A Morning In Summer 3 | R: jig 4 | M: 6/8 5 | L: 1/8 6 | K: Dmaj 7 | |:A|A2DD2E|F2EF2D|G2FG2A|d2cA2G| 8 | A2DD2E|F2EF2D|D2GF2G|A2GG2:| 9 | G|:F2GA2B|=c3AB^c|d2cd2e|=f2dd2e| 10 | =f2ef2d|=c2AA2F|G2FG2A|d2cA2G:| 11 | -------------------------------------------------------------------------------- /tests/abc_files/flowers_of_edinburgh.abc: -------------------------------------------------------------------------------- 1 | X:14 2 | T:Flowers of Edinburgh 3 | R:Reel 4 | O:Scotland 5 | O:Ireland 6 | M:2/4 7 | L:1/16 8 | K:G 9 | "G"GE|D2DE G2GA|BGBd cBAG|"D7"FGFE DEFG|ABAF E2GE| 10 | "G"D2DE G2GA|"G"BABd "C"efge|"D7"dcBA GFGA|"G"B2G2 G2:| 11 | |:"G"Bd|"G"g2gf gbag|"D7"f2fe fagf|"C"edef gfed|"Em"B2e2 e2ge| 12 | "G"dBGB d2dd|"C"edef "Am"g2fe|"D7"dcBA GFGA|"G"B2G2 G2:| 13 | -------------------------------------------------------------------------------- /tests/abc_files/rising_sun.abc: -------------------------------------------------------------------------------- 1 | X:1 2 | T:Rising Sun [4] 3 | S:fiddler Hiram Horner (Southwestern, Pa., 1944) 4 | M:2/4 5 | L:1/8 6 | K:D 7 | (3A/B/c/||Od>d ~ed/e/|~fd~dA|Be e/f/e/d/|cA A(3A/B/c/| 8 | d2 ~ed/e/|~fddA|Be AB/c/|d2 d|| 9 | z2z|f a3|f a3|{a}g~fe~d|c~BAz| 10 | f a3|f a3|^g~f ec/e/|a2 a/g/f/e/O|| 11 | -------------------------------------------------------------------------------- /tests/abc_files/the_rising_of_the_moon.abc: -------------------------------------------------------------------------------- 1 | % Generated more or less automatically by swtoabc by Erich Rickheit KSC 2 | X:1 3 | T:The Rising of the Moon 4 | M:2/4 5 | L:1/8 6 | K:Eb 7 | C3/2 D/2| EF Gc| B/2 G3/2 FD| GF/2-D/2 CC| C2 C3/2 D/2| EF Gc| B/2 G3/2 FD|\ 8 | GF/2-D/2 C3/2 C/2| C2 Gc| c3/2 B/2 d3/2 c/2| BG FG| B/2 G3/2 Bd| c2 Gc|\ 9 | cB d3/2 c/2| B/2 G3/2 FD| GF/2-E/2 C3/2 C/2| C2|| 10 | -------------------------------------------------------------------------------- /tests/abc_files/the_wheels_of_the_world.abc: -------------------------------------------------------------------------------- 1 | X:340 2 | T:the Wheels of the World (reel) 3 | R:Reel 4 | O:Ireland 5 | B:Ceol Rince 1, n168 6 | S:Ceol Rince 1 7 | Z:Transcription, chords:Mike Long 8 | M:C| 9 | L:1/8 10 | K:G 11 | "D"dD~D2 FAGF|"C"EC~C2 EFGA|"D"dD~D2 FAGF|"C"EDCE "D"D2FA|\ 12 | "D"dD~D2 FAGF|"C"EC~C2 EFGc| 13 | "Am/G"AddB cBAG|"C"ECGE "D"D3|]\ 14 | B|\ 15 | "D"dAdf afdf|"C"ecgc acge|"D"dAdf afdf| 16 | "C"edce "D"d3A|\ 17 | "D"dAdf afdf|"C"ecgc acge|"Am"abag efge|"D"fa"C"ge "D"d3z|] 18 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest configuration file. 3 | 4 | Doc: https://docs.pytest.org/en/latest/reference/reference.html. 
5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | import os 10 | 11 | import pytest 12 | 13 | 14 | @pytest.fixture() 15 | def hf_token() -> str | None: 16 | """ 17 | Get the Hugging Face token from the environment variable HF_TOKEN_HUB_TESTS. 18 | 19 | If the variable is not set, the test using this fixture will be skipped. 20 | """ 21 | token = os.environ.get("HF_TOKEN_HUB_TESTS") 22 | if not token: 23 | pytest.skip("HF_TOKEN_HUB_TESTS is not set") 24 | return token 25 | -------------------------------------------------------------------------------- /tests/test_attribute_controls.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | from random import seed 6 | from typing import TYPE_CHECKING, Any, Literal 7 | 8 | import miditok 9 | import pytest 10 | from miditok.attribute_controls import create_random_ac_indexes 11 | from symusic import Score 12 | 13 | from .utils_tests import ( 14 | BARS_RANDOM_RATIO_RANGE, 15 | MIDI_PATHS_ALL, 16 | MIDI_PATHS_ONE_TRACK, 17 | SEED, 18 | TRACKS_RANDOM_RATIO_RANGE, 19 | check_control_tokens_are_well_inserted, 20 | ) 21 | 22 | if TYPE_CHECKING: 23 | from collections.abc import Sequence 24 | from pathlib import Path 25 | 26 | TOKENIZATIONS = ["REMI", "TSD", "MMM"] 27 | TOKENIZER_PARAMS = { 28 | "pitch_range": (21, 109), 29 | "beat_res": {(0, 4): 8, (4, 12): 4}, 30 | "num_velocities": 32, 31 | "special_tokens": ["PAD", "BOS", "EOS", "MASK"], 32 | "use_chords": True, 33 | "use_rests": False, 34 | "use_tempos": True, 35 | "use_time_signatures": True, 36 | "use_programs": False, 37 | "num_tempos": 32, # number of tempo bins 38 | "tempo_range": (40, 250), # (min, max) 39 | "base_tokenizer": "REMI", 40 | "ac_polyphony_track": True, 41 | "ac_polyphony_bar": True, 42 | "ac_pitch_class_bar": True, 43 | "ac_note_density_track": True, 44 | "ac_note_density_bar": True, 45 | "ac_note_duration_bar": True, 46 | "ac_note_duration_track": True, 47 | "ac_repetition_track": True, 48 | } 49 | VOCAB_SIZE = 2000 50 | NUM_ADDITIONAL_TOKENS_SECOND_TRAINING = 400 51 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BAR = 500 # higher than default MidiTok values 52 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BEAT = 150 53 | 54 | 55 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ALL, ids=lambda path: path.name) 56 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 57 | @pytest.mark.parametrize( 58 | "random_tracks_idx", 59 | [False, True], 60 | ids=lambda r: "rand_tracks" if r else "all_tracks", 61 | ) 62 | @pytest.mark.parametrize( 63 | "random_bars_idx", [False, True], ids=lambda r: "rand_bars" if r else "all_bars" 64 | ) 65 | def test_attribute_controls_computation( 66 | file_path: Path, 67 | tokenization: str, 68 | random_tracks_idx: bool, 69 | random_bars_idx: bool, 70 | tokenizer_params: dict[str, Any] | None = None, 71 | ) -> None: 72 | if tokenizer_params is None: 73 | tokenizer_params = TOKENIZER_PARAMS 74 | 75 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 76 | tokenizer_config=miditok.TokenizerConfig(**tokenizer_params) 77 | ) 78 | score = Score(file_path) 79 | score = tokenizer.preprocess_score(score) 80 | 81 | # Set attribute controls indexes 82 | seed(SEED) 83 | tracks_idx_ratio = (0, 1) if random_tracks_idx else 1 84 | bars_idx_ratio = (0, 1) if random_bars_idx else 1 85 | ac_indexes = create_random_ac_indexes( 86 | score, 87 | tokenizer.attribute_controls, 88 | tracks_idx_ratio, 89 | bars_idx_ratio, 90 | ) 91 | 92 | # Tokenize Score with 
attribute controls injected 93 | tokens = tokenizer.encode( 94 | score, no_preprocess_score=True, attribute_controls_indexes=ac_indexes 95 | ) 96 | 97 | # Check for errors 98 | injection_errors = check_control_tokens_are_well_inserted( 99 | tokenizer, score, tokens, ac_indexes 100 | ) 101 | assert len(injection_errors) == 0 102 | 103 | 104 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 105 | @pytest.mark.parametrize("model", ["BPE"]) 106 | @pytest.mark.parametrize( 107 | "encode_ids_split", 108 | ["no", "bar", "beat"], 109 | ids=lambda s: f"{s}_split", 110 | ) 111 | def test_tokenizer_training_and_encoding_decoding( 112 | tokenization: str, 113 | model: Literal["BPE", "Unigram", "WordPiece"], 114 | encode_ids_split: Literal["bar", "beat", "no"], 115 | files_paths: Sequence[Path] = MIDI_PATHS_ONE_TRACK, 116 | vocab_size: int = VOCAB_SIZE, 117 | ): 118 | r""" 119 | Train a tokenizer to make sure the training iterator works with attribute controls. 120 | 121 | :param files_paths: list of paths of music files to use for the tests. 122 | :param encode_ids_split: type of token ids split before encoding/training. 123 | """ 124 | if encode_ids_split == "no" and model == "WordPiece": 125 | pytest.skip(f"Skipping training with {model} and {encode_ids_split} split") 126 | 127 | # Creates tokenizers 128 | TOKENIZER_PARAMS["encode_ids_split"] = encode_ids_split 129 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 130 | tokenizer_config=miditok.TokenizerConfig(**TOKENIZER_PARAMS) 131 | ) 132 | 133 | training_kwargs = {} 134 | if model == "WordPiece": 135 | training_kwargs["max_input_chars_per_word"] = ( 136 | WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BAR 137 | if encode_ids_split == "bar" 138 | else WORDPIECE_MAX_INPUT_CHARS_PER_WORD_BEAT 139 | ) 140 | 141 | # Train the tokenizer 142 | training_iterator = miditok.TokTrainingIterator( 143 | tokenizer, files_paths, TRACKS_RANDOM_RATIO_RANGE, BARS_RANDOM_RATIO_RANGE 144 | ) 145 | tokenizer.train( 146 | vocab_size=vocab_size + NUM_ADDITIONAL_TOKENS_SECOND_TRAINING, 147 | model=model, 148 | iterator=training_iterator, 149 | **training_kwargs, 150 | ) 151 | -------------------------------------------------------------------------------- /tests/test_data_augmentation.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from miditok.data_augmentation import ( 8 | augment_dataset, 9 | ) 10 | from symusic import Score 11 | from tqdm import tqdm 12 | 13 | from .utils_tests import HERE 14 | 15 | if TYPE_CHECKING: 16 | from pathlib import Path 17 | 18 | 19 | def test_augment_dataset( 20 | tmp_path: Path, 21 | data_path: Path = HERE / "MIDIs_multitrack", 22 | ) -> None: 23 | # We only test data augmentation on MIDIs with one tokenization 24 | 25 | midi_aug_path = tmp_path / "Multitrack_MIDIs_aug" 26 | min_duration = 0.03125 27 | augment_dataset( 28 | data_path, 29 | pitch_offsets=[-2, 1, 2], 30 | velocity_offsets=[-4, 5], 31 | duration_offsets=[-0.5, 1], 32 | all_offset_combinations=True, 33 | min_duration=min_duration, 34 | out_path=midi_aug_path, 35 | ) 36 | 37 | aug_midi_paths = list(midi_aug_path.glob("**/*.mid")) 38 | for aug_midi_path in tqdm( 39 | aug_midi_paths, desc="CHECKING DATA AUGMENTATION ON MIDIS" 40 | ): 41 | # Determine offsets of file 42 | parts = aug_midi_path.stem.split("#") 43 | # If original non-augmented file 44 | if len(parts) < 2: 45 | continue 46 | original_stem, 
offsets_str = parts[0], parts[1].split("_") 47 | offsets = [0, 0, 0] 48 | for offset_str in offsets_str: 49 | for pos, letter in enumerate(["p", "v", "d"]): 50 | if offset_str[0] == letter: 51 | offsets[pos] = int(offset_str[1:]) 52 | 53 | # Load the MIDIs to compare 54 | midi_aug = Score(aug_midi_path) 55 | midi_ogi = Score(data_path / f"{original_stem}.mid") 56 | min_duration_ticks = round(min_duration * midi_aug.ticks_per_quarter) 57 | 58 | # Compare them 59 | for track_ogi, track_aug in zip(midi_ogi.tracks, midi_aug.tracks): 60 | if track_ogi.is_drum: 61 | continue 62 | track_ogi.notes.sort(key=lambda x: (x.start, x.pitch, x.end, x.velocity)) 63 | track_aug.notes.sort(key=lambda x: (x.start, x.pitch, x.end, x.velocity)) 64 | for note_o, note_a in zip(track_ogi.notes, track_aug.notes): 65 | if note_a.pitch != note_o.pitch + offsets[0]: 66 | msg = ( 67 | f"Pitch assertion failed: expected " 68 | f"{note_o.pitch + offsets[0]}, got {note_a.pitch}" 69 | ) 70 | raise ValueError(msg) 71 | # If negative duration offset, dur_exp must be greater than or equal to 72 | # min_duration_ticks 73 | if offsets[2] < 0: 74 | dur_exp = max( 75 | note_o.duration + offsets[2], 76 | min_duration_ticks, 77 | ) 78 | # If positive duration offset, the original duration was just shifted 79 | elif offsets[2] > 0: 80 | dur_exp = note_o.duration + offsets[2] 81 | else: 82 | dur_exp = note_o.duration 83 | if note_a.duration != dur_exp: 84 | msg = ( 85 | f"Duration assertion failed: expected {dur_exp}, got " 86 | f"{note_a.duration}" 87 | ) 88 | raise ValueError(msg) 89 | # We need to re-sort the notes with the velocity key in third position 90 | # before checking their values. 91 | track_ogi.notes.sort(key=lambda x: (x.start, x.pitch, x.velocity)) 92 | track_aug.notes.sort(key=lambda x: (x.start, x.pitch, x.velocity)) 93 | for note_o, note_a in zip(track_ogi.notes, track_aug.notes): 94 | if note_a.velocity not in [1, 127, note_o.velocity + offsets[1]]: 95 | msg = ( 96 | f"Velocity assertion failed: expected one in " 97 | f"{[1, 127, note_o.velocity + offsets[1]]}, got {note_a.velocity}" 98 | ) 99 | raise ValueError(msg) 100 | -------------------------------------------------------------------------------- /tests/test_hf_hub.py: -------------------------------------------------------------------------------- 1 | """Test the integration of the Hugging Face Hub.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from time import sleep 7 | from typing import TYPE_CHECKING 8 | 9 | import miditok 10 | import pytest 11 | from huggingface_hub.utils import HfHubHTTPError 12 | 13 | if TYPE_CHECKING: 14 | from pathlib import Path 15 | 16 | MAX_NUM_TRIES_HF_PUSH = 3 17 | NUM_SECONDS_RETRY = 8 18 | 19 | AUTO_TOKENIZER_CASES = [ 20 | # ("class_name", "save_path", "class_name_assert") 21 | ("REMI", "rem", "REMI"), 22 | ("REMI", "rem2", "TSD"), 23 | ("TSD", "tsd", "TSD"), 24 | ] 25 | 26 | 27 | def test_push_and_load_to_hf_hub(hf_token: str): 28 | tokenizer = miditok.REMI( 29 | miditok.TokenizerConfig(num_velocities=62, pitch_range=(12, 44)) 30 | ) 31 | num_tries = 0 32 | while num_tries < MAX_NUM_TRIES_HF_PUSH: 33 | try: 34 | tokenizer.push_to_hub("Natooz/MidiTok-tests", private=True, token=hf_token); break  # stop retrying once the push succeeded 35 | except HfHubHTTPError as e: 36 | if e.response.status_code == 429: # hourly quota exceeded 37 | # We performed too many tests, skip this one so as not to break the HF servers 🥲 38 | pytest.skip( 39 | "Hugging Face hourly quota exceeded, skipping " 40 | "`test_push_and_load_to_hf_hub` test."
41 | ) 42 | elif e.response.status_code in [500, 412]: 43 | num_tries += 1 44 | sleep(NUM_SECONDS_RETRY) 45 | else: 46 | num_tries = MAX_NUM_TRIES_HF_PUSH 47 | 48 | # If the push could not be performed after all the tries, warn without failing 49 | if num_tries == MAX_NUM_TRIES_HF_PUSH: 50 | warnings.warn("Tokenizer push failed", stacklevel=2) 51 | 52 | tokenizer2 = miditok.REMI.from_pretrained("Natooz/MidiTok-tests", token=hf_token) 53 | assert tokenizer == tokenizer2 54 | 55 | 56 | def test_from_pretrained_local(tmp_path: Path): 57 | # Here using paths to directories 58 | tokenizer = miditok.TSD() 59 | tokenizer.save_pretrained(tmp_path) 60 | tokenizer2 = miditok.TSD.from_pretrained(tmp_path) 61 | assert tokenizer == tokenizer2 62 | 63 | 64 | @pytest.mark.parametrize("params_case", AUTO_TOKENIZER_CASES) 65 | def test_autotokenizer(tmp_path: Path, params_case: tuple[str, str, str]): 66 | tok_class, save_path, tok_class2 = params_case 67 | 68 | tokenizer = getattr(miditok, tok_class)() 69 | tokenizer.save_pretrained(tmp_path / save_path) 70 | tokenizer2 = getattr(miditok, tok_class2)( 71 | params=tmp_path / save_path / "tokenizer.json" 72 | ) 73 | 74 | assert (tokenizer == tokenizer2) == (tok_class == tok_class2) 75 | -------------------------------------------------------------------------------- /tests/test_io_formats.py: -------------------------------------------------------------------------------- 1 | """Testing the possible I/O formats of the tokenizers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from copy import deepcopy 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import miditok 9 | import pytest 10 | from symusic import Score 11 | 12 | from .utils_tests import ( 13 | ALL_TOKENIZATIONS, 14 | HERE, 15 | TOKENIZER_CONFIG_KWARGS, 16 | adjust_tok_params_for_tests, 17 | tokenize_and_check_equals, 18 | ) 19 | 20 | if TYPE_CHECKING: 21 | from pathlib import Path 22 | 23 | default_params = deepcopy(TOKENIZER_CONFIG_KWARGS) 24 | default_params.update( 25 | { 26 | "use_chords": True, 27 | "use_rests": True, 28 | "use_tempos": True, 29 | "use_time_signatures": True, 30 | "use_sustain_pedals": True, 31 | "use_pitch_bends": True, 32 | "base_tokenizer": "TSD", 33 | } 34 | ) 35 | tokenizations_no_one_stream = [ 36 | "TSD", 37 | "REMI", 38 | "MIDILike", 39 | "Structured", 40 | "CPWord", 41 | "Octuple", 42 | ] 43 | configs = ( 44 | { 45 | "use_programs": True, 46 | "one_token_stream_for_programs": True, 47 | "program_changes": False, 48 | }, 49 | { 50 | "use_programs": True, 51 | "one_token_stream_for_programs": True, 52 | "program_changes": True, 53 | }, 54 | { 55 | "use_programs": True, 56 | "one_token_stream_for_programs": False, 57 | "program_changes": False, 58 | }, 59 | ) 60 | TOK_PARAMS_IO = [] 61 | for tokenization_ in ALL_TOKENIZATIONS: 62 | params_ = deepcopy(default_params) 63 | adjust_tok_params_for_tests(tokenization_, params_) 64 | TOK_PARAMS_IO.append((tokenization_, params_)) 65 | 66 | if tokenization_ in tokenizations_no_one_stream: 67 | for config in configs: 68 | params_tmp = deepcopy(params_) 69 | params_tmp.update(config) 70 | TOK_PARAMS_IO.append((tokenization_, params_tmp)) 71 | 72 | 73 | @pytest.mark.parametrize("tok_params_set", TOK_PARAMS_IO) 74 | def test_io_formats( 75 | tok_params_set: tuple[str, dict[str, Any]], 76 | midi_path: Path = HERE / "MIDIs_multitrack" / "Funkytown.mid", 77 | ) -> None: 78 | r""" 79 | Tokenize and decode a MIDI back to make sure the possible I/O formats are OK. 80 | 81 | :param tok_params_set: tokenizer and its parameters to run.
82 | :param midi_path: path to the MIDI file to test. 83 | """ 84 | midi = Score(midi_path) 85 | tokenization, params = tok_params_set 86 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 87 | tokenizer_config=miditok.TokenizerConfig(**params) 88 | ) 89 | 90 | _, _, has_errors = tokenize_and_check_equals(midi, tokenizer, midi_path.stem) 91 | assert not has_errors 92 | -------------------------------------------------------------------------------- /tests/test_preprocess.py: -------------------------------------------------------------------------------- 1 | """Tests on the preprocessing steps of music files, before tokenization.""" 2 | 3 | from pathlib import Path 4 | 5 | import miditok 6 | import pytest 7 | from symusic import Score 8 | 9 | from .utils_tests import MIDI_PATHS_ALL 10 | 11 | CONFIG_KWARGS = { 12 | "use_tempos": True, 13 | "use_time_signatures": True, 14 | "use_sustain_pedals": True, 15 | "use_pitch_bends": True, 16 | "log_tempos": True, 17 | "beat_res": {(0, 4): 8, (4, 12): 4, (12, 16): 2}, 18 | "delete_equal_successive_time_sig_changes": True, 19 | "delete_equal_successive_tempo_changes": True, 20 | } 21 | TOKENIZATIONS = ["MIDILike", "TSD"] 22 | 23 | 24 | @pytest.mark.parametrize("tokenization", TOKENIZATIONS) 25 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ALL, ids=lambda p: p.name) 26 | def test_preprocess(tokenization: str, file_path: Path): 27 | r""" 28 | Check that preprocessing an already-preprocessed MIDI does not alter it further. 29 | 30 | :param tokenization: name of the tokenizer class. 31 | :param file_path: path to the MIDI file to test. 32 | """ 33 | # Create the tokenizer 34 | tok_config = miditok.TokenizerConfig(**CONFIG_KWARGS) 35 | tokenizer = getattr(miditok, tokenization)(tok_config) 36 | 37 | # Preprocess the original file, then preprocess the result once again 38 | score = Score(file_path) 39 | score_processed1 = tokenizer.preprocess_score(score) 40 | score_processed2 = tokenizer.preprocess_score(score_processed1) 41 | 42 | # The second preprocess shouldn't do anything 43 | assert score_processed1 == score_processed2 44 | -------------------------------------------------------------------------------- /tests/test_pytorch_data_loading.py: -------------------------------------------------------------------------------- 1 | """Test classes and methods from the pytorch_data module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from time import time 6 | from typing import TYPE_CHECKING 7 | 8 | import miditok 9 | import pytest 10 | from torch import randint 11 | from torch.utils.data import DataLoader 12 | 13 | from .utils_tests import ( 14 | ABC_PATHS, 15 | MAX_BAR_EMBEDDING, 16 | MIDI_PATHS_CORRUPTED, 17 | MIDI_PATHS_MULTITRACK, 18 | MIDI_PATHS_ONE_TRACK, 19 | ) 20 | 21 | if TYPE_CHECKING: 22 | from collections.abc import Callable, Sequence 23 | from pathlib import Path 24 | 25 | from symusic import Score 26 | 27 | 28 | def get_labels_seq_len(score: Score, tokseq: miditok.TokSequence, _: Path) -> int: 29 | num_track = 1 if len(score.tracks) == 0 else len(score.tracks) 30 | if isinstance(tokseq, miditok.TokSequence): 31 | return len(tokseq) // num_track 32 | return len(tokseq[0]) // num_track 33 | 34 | 35 | def get_labels_seq(score: Score, tokseq: miditok.TokSequence, _: Path) -> list[int]: 36 | if isinstance(tokseq, list): 37 | return tokseq[0].ids[: -len(score.tracks)] 38 | if len(tokseq) > len(score.tracks): 39 | return tokseq.ids[: -len(score.tracks)] 40 | return tokseq.ids 41 | 42 |
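# An illustrative aside (not part of the original test file): any callable with the
# same signature as the two helpers above can be passed to ``DatasetMIDI`` as
# ``func_to_get_labels``. It receives the ``Score``, the token sequence(s) and the
# file path, and returns the label of the sample. For instance, a hypothetical
# labelling function tagging each sample with its number of tracks:
def get_labels_num_tracks(score: Score, _tokseq: miditok.TokSequence, _: Path) -> int:
    # The label is simply the number of tracks of the score
    return len(score.tracks)
# It could then be passed as ``func_to_get_labels=get_labels_num_tracks`` when
# creating the ``DatasetMIDI`` in the test below.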
"tokenizer_cls", [miditok.TSD, miditok.Octuple], ids=["TSD", "Octuple"] 45 | ) 46 | @pytest.mark.parametrize( 47 | "one_token_stream_for_programs", [True, False], ids=["1 strm", "n strm"] 48 | ) 49 | @pytest.mark.parametrize("split_files", [True, False], ids=["split", "no split"]) 50 | @pytest.mark.parametrize("pre_tokenize", [True, False], ids=["pretok", "no pretok"]) 51 | @pytest.mark.parametrize("ac_random_tracks_ratio", [None, (0.0, 1.0)]) 52 | @pytest.mark.parametrize("ac_random_bars_ratio", [None, (0.0, 1.0)]) 53 | @pytest.mark.parametrize("func_labels", [get_labels_seq_len, get_labels_seq]) 54 | @pytest.mark.parametrize("num_overlap_bars", [0, 1], ids=["no overlap", "overlap"]) 55 | def test_dataset_midi( 56 | tmp_path: Path, 57 | tokenizer_cls: Callable, 58 | one_token_stream_for_programs: bool, 59 | split_files: bool, 60 | pre_tokenize: bool, 61 | ac_random_tracks_ratio: tuple[float, float] | None, 62 | ac_random_bars_ratio: tuple[float, float] | None, 63 | func_labels: Callable, 64 | num_overlap_bars: int, 65 | files_paths: Sequence[Path] = MIDI_PATHS_MULTITRACK 66 | + MIDI_PATHS_ONE_TRACK 67 | + MIDI_PATHS_CORRUPTED 68 | + ABC_PATHS, 69 | max_seq_len: int = 1000, 70 | ): 71 | config = miditok.TokenizerConfig( 72 | use_programs=True, 73 | one_token_stream_for_programs=one_token_stream_for_programs, 74 | max_bar_embedding=MAX_BAR_EMBEDDING, 75 | ) 76 | tokenizer = tokenizer_cls(config) 77 | 78 | # Split files if requested 79 | # We perform it twice as the second time, the method would return the same paths as 80 | # the ones created in the first call. 81 | if split_files: 82 | t0 = time() 83 | file_paths_split1 = miditok.utils.split_files_for_training( 84 | files_paths, 85 | tokenizer, 86 | tmp_path, 87 | max_seq_len, 88 | num_overlap_bars=num_overlap_bars, 89 | ) 90 | t1 = time() - t0 91 | print(f"First Score split call: {t1:.2f} sec") 92 | t0 = time() 93 | file_paths_split2 = miditok.utils.split_files_for_training( 94 | files_paths, 95 | tokenizer, 96 | tmp_path, 97 | max_seq_len, 98 | num_overlap_bars=num_overlap_bars, 99 | ) 100 | t1 = time() - t0 101 | print(f"Second Score split call: {t1:.2f} sec") 102 | 103 | file_paths_split1.sort() 104 | file_paths_split2.sort() 105 | assert file_paths_split1 == file_paths_split2 106 | files_paths = file_paths_split1 107 | 108 | # Creating the Dataset, splitting MIDIs 109 | t0 = time() 110 | dataset = miditok.pytorch_data.DatasetMIDI( 111 | files_paths, 112 | tokenizer, 113 | max_seq_len, 114 | tokenizer["BOS_None"], 115 | tokenizer["EOS_None"], 116 | pre_tokenize=pre_tokenize, 117 | ac_tracks_random_ratio_range=ac_random_tracks_ratio, 118 | ac_bars_random_ratio_range=ac_random_bars_ratio, 119 | func_to_get_labels=func_labels, 120 | ) 121 | t1 = time() - t0 122 | print(f"Dataset init took {t1:.2f} sec") 123 | 124 | # Test iteration, and collator with user labels 125 | batch = [dataset[i] for i in range(min(len(dataset), 10))] 126 | 127 | # Test with DataLoader and collator 128 | collator = miditok.pytorch_data.DataCollator( 129 | tokenizer.pad_token_id, 130 | pad_on_left=True, 131 | ) 132 | _ = collator(batch) 133 | dataloader = DataLoader(dataset, 16, collate_fn=collator) 134 | for _ in dataloader: 135 | pass 136 | 137 | 138 | def test_dataset_json(tmp_path: Path, file_paths: Sequence[Path] | None = None): 139 | if file_paths is None: 140 | file_paths = MIDI_PATHS_MULTITRACK[:5] 141 | tokens_dir_path = tmp_path / "multitrack_tokens_dataset_json" 142 | 143 | config = miditok.TokenizerConfig(use_programs=True) 144 | tokenizer = 
miditok.TSD(config) 145 | if not tokens_dir_path.is_dir(): 146 | tokenizer.tokenize_dataset(file_paths, tokens_dir_path) 147 | 148 | tokens_split_dir_path = tmp_path / "multitrack_tokens_dataset_json_split" 149 | miditok.utils.split_tokens_files_to_subsequences( 150 | list(tokens_dir_path.glob("**/*.json")), 151 | tokens_split_dir_path, 152 | 300, 153 | 1000, 154 | ) 155 | dataset = miditok.pytorch_data.DatasetJSON( 156 | list(tokens_split_dir_path.glob("**/*.json")), 157 | 1000, 158 | tokenizer["BOS_None"], 159 | tokenizer["EOS_None"], 160 | ) 161 | 162 | for _ in dataset: 163 | pass 164 | 165 | 166 | def test_collator(): 167 | collator = miditok.pytorch_data.DataCollator( 168 | 0, 169 | pad_on_left=True, 170 | copy_inputs_as_labels=True, 171 | shift_labels=True, 172 | ) 173 | seq_lengths = [120, 100, 80, 200] 174 | 175 | # Just input ids 176 | batch_from_dataloader = [ 177 | {"input_ids": randint(0, 300, (seq_len,))} for seq_len in seq_lengths 178 | ] 179 | batch_collated = collator(batch_from_dataloader) 180 | # seq_len - 1 as we shift labels 181 | assert list(batch_collated["input_ids"].size()) == [ 182 | len(seq_lengths), 183 | max(seq_lengths) - 1, 184 | ] 185 | 186 | # Encoder and decoder input ids 187 | batch_from_dataloader = [ 188 | { 189 | "input_ids": randint(0, 300, (seq_len,)), 190 | "decoder_input_ids": randint(0, 300, (seq_len,)), 191 | } 192 | for seq_len in seq_lengths 193 | ] 194 | batch_collated = collator(batch_from_dataloader) 195 | # seq_len - 1 as we shift labels 196 | assert list(batch_collated["input_ids"].size()) == [ 197 | len(seq_lengths), 198 | max(seq_lengths), 199 | ] 200 | assert list(batch_collated["decoder_input_ids"].size()) == [ 201 | len(seq_lengths), 202 | max(seq_lengths) - 1, 203 | ] 204 | 205 | # This time with labels already in batch and embed pooling, padding right 206 | collator.pad_on_left = False 207 | batch_from_dataloader = [ 208 | { 209 | "input_ids": randint(0, 300, (seq_len, 5)), 210 | "decoder_input_ids": randint(0, 300, (seq_len, 5)), 211 | "labels": randint(0, 300, (seq_len, 5)), 212 | } 213 | for seq_len in seq_lengths 214 | ] 215 | batch_collated = collator(batch_from_dataloader) 216 | assert list(batch_collated["input_ids"].size()) == [ 217 | len(seq_lengths), 218 | max(seq_lengths), 219 | 5, 220 | ] 221 | assert list(batch_collated["decoder_input_ids"].size()) == [ 222 | len(seq_lengths), 223 | max(seq_lengths) - 1, 224 | 5, 225 | ] 226 | assert list(batch_collated["labels"].size()) == [ 227 | len(seq_lengths), 228 | max(seq_lengths) - 1, 229 | 5, 230 | ] 231 | -------------------------------------------------------------------------------- /tests/test_saving_loading_config.py: -------------------------------------------------------------------------------- 1 | """Tests for the saving/loading methods of tokenizers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from copy import deepcopy 6 | from typing import TYPE_CHECKING, Any 7 | 8 | import miditok 9 | import pytest 10 | 11 | from .utils_tests import ( 12 | ALL_TOKENIZATIONS, 13 | MAX_BAR_EMBEDDING, 14 | MIDI_PATHS_MULTITRACK, 15 | MIDI_PATHS_ONE_TRACK, 16 | ) 17 | 18 | if TYPE_CHECKING: 19 | from pathlib import Path 20 | 21 | ADDITIONAL_TOKENS_TEST = { 22 | "use_chords": False, # False to speed up tests 23 | "use_rests": True, 24 | "use_tempos": True, 25 | "use_time_signatures": True, 26 | "use_programs": False, 27 | "beat_res_rest": {(0, 16): 4}, 28 | "num_tempos": 32, 29 | "tempo_range": (40, 250), 30 | "base_tokenizer": "TSD", 31 | "use_microtiming": True, 32 | 
"ticks_per_quarter": 480, 33 | "max_microtiming_shift": 0.25, 34 | "num_microtiming_bins": 110, 35 | } 36 | 37 | TOK_PARAMS_MULTITRACK = [] 38 | tokenizations_non_one_stream = [ 39 | "TSD", 40 | "REMI", 41 | "MIDILike", 42 | "Structured", 43 | "CPWord", 44 | "Octuple", 45 | ] 46 | for tokenization_ in ALL_TOKENIZATIONS: 47 | params_ = {"use_programs": True} 48 | if tokenization_ == "MMM": 49 | params_["base_tokenizer"] = "TSD" 50 | elif tokenization_ in ["Octuple", "MuMIDI"]: 51 | params_["max_bar_embedding"] = MAX_BAR_EMBEDDING 52 | elif tokenization_ in ["PerTok"]: 53 | params_["use_microtiming"] = True 54 | params_["ticks_per_quarter"] = 220 55 | params_["max_microtiming_shift"] = 0.25 56 | params_["num_microtiming_bins"] = 110 57 | TOK_PARAMS_MULTITRACK.append((tokenization_, params_)) 58 | 59 | if tokenization_ in tokenizations_non_one_stream: 60 | params_tmp = deepcopy(params_) 61 | params_tmp["one_token_stream_for_programs"] = False 62 | # Disable tempos for Octuple with one_token_stream_for_programs, as tempos are 63 | # carried by note tokens 64 | if tokenization_ == "Octuple": 65 | params_tmp["use_tempos"] = False 66 | TOK_PARAMS_MULTITRACK.append((tokenization_, params_tmp)) 67 | 68 | 69 | @pytest.mark.parametrize("tokenization", ALL_TOKENIZATIONS) 70 | def test_saving_loading_tokenizer_config(tokenization: str, tmp_path: Path): 71 | config1 = miditok.TokenizerConfig() 72 | config1.save_to_json(tmp_path / f"tok_conf_{tokenization}.json") 73 | 74 | config2 = miditok.TokenizerConfig.load_from_json( 75 | tmp_path / f"tok_conf_{tokenization}.json" 76 | ) 77 | 78 | assert config1 == config2 79 | config1.pitch_range = (0, 777) 80 | assert config1 != config2 81 | 82 | 83 | @pytest.mark.parametrize("tokenization", ALL_TOKENIZATIONS) 84 | def test_saving_loading_tokenizer(tokenization: str, tmp_path: Path): 85 | r""" 86 | Make sure saving and loading end with the identical tokenizer. 87 | 88 | Create a tokenizer, save its config, and load it back. 89 | If all went well the reloaded tokenizer should be identical. 
90 | """ 91 | tokenizer_config = miditok.TokenizerConfig(**ADDITIONAL_TOKENS_TEST) 92 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 93 | tokenizer_config=tokenizer_config 94 | ) 95 | tokenizer.save(tmp_path / f"{tokenization}.txt") 96 | 97 | tokenizer2: miditok.MusicTokenizer = getattr(miditok, tokenization)( 98 | params=tmp_path / f"{tokenization}.txt" 99 | ) 100 | assert tokenizer == tokenizer2 101 | if tokenization == "Octuple": 102 | tokenizer.vocab[0]["PAD_None"] = 8 103 | assert tokenizer != tokenizer2 104 | 105 | 106 | @pytest.mark.parametrize("file_path", MIDI_PATHS_MULTITRACK[:3], ids=lambda p: p.name) 107 | @pytest.mark.parametrize("tok_params_set", TOK_PARAMS_MULTITRACK) 108 | def test_multitrack_midi_to_tokens_to_midi( 109 | file_path: Path, 110 | tok_params_set: tuple[str, dict[str, Any]], 111 | tmp_path: Path, 112 | ): 113 | # Create tokenizer 114 | tokenization, params = tok_params_set 115 | tokenizer: miditok.MusicTokenizer = getattr(miditok, tokenization)( 116 | tokenizer_config=miditok.TokenizerConfig(**params) 117 | ) 118 | 119 | # Tokenize the file, save tokens and load them back 120 | tokens = tokenizer(file_path) 121 | tokenizer.save_tokens(tokens, tmp_path / "tokens.json") 122 | tokens_loaded = tokenizer.load_tokens(tmp_path / "tokens.json") 123 | 124 | # Assert tokens are the same 125 | assert tokens == tokens_loaded 126 | 127 | 128 | @pytest.mark.parametrize("file_path", MIDI_PATHS_ONE_TRACK[:3], ids=lambda p: p.name) 129 | def test_pertok_microtiming_tick_values(file_path: Path): 130 | # Create the pertok tokenizer 131 | cfg = miditok.TokenizerConfig( 132 | use_chords=False, 133 | use_microtiming=True, 134 | ticks_per_quarter=480, 135 | max_microtiming_shift=0.25, 136 | num_microtiming_bins=110, 137 | ) 138 | tok = miditok.PerTok(cfg) 139 | # Train the tokenizer 140 | tok.train(files_paths=[file_path], vocab_size=1000) 141 | # Dump the tokenizer to a JSON 142 | tok.save("tmp.json") 143 | # Reload the tokenizer 144 | newtok = miditok.PerTok(params="tmp.json") 145 | # Should still have the microtiming_tick_values parameter 146 | assert hasattr(newtok, "microtiming_tick_values") 147 | -------------------------------------------------------------------------------- /tests/test_toksequence.py: -------------------------------------------------------------------------------- 1 | """Test methods.""" 2 | 3 | from collections.abc import Callable 4 | from pathlib import Path 5 | 6 | import pytest 7 | from miditok import TSD, TokenizerConfig, TokSequence 8 | 9 | from .utils_tests import MIDI_PATHS_MULTITRACK 10 | 11 | 12 | def test_tokseq_concat(): 13 | ids1 = list(range(10)) 14 | ids2 = list(range(10, 20)) 15 | str1 = [str(id_ * 2) for id_ in ids1] 16 | str2 = [str(id_ * 2) for id_ in ids2] 17 | bytes1 = "".join(str1) 18 | bytes2 = "".join(str2) 19 | 20 | tokseq1 = TokSequence(ids=ids1, tokens=str1, bytes=bytes1) 21 | tokseq2 = TokSequence(ids=ids2, tokens=str2, bytes=bytes2) 22 | seq_concat = tokseq1 + tokseq2 23 | 24 | assert seq_concat.ids == ids1 + ids2 25 | assert seq_concat.tokens == str1 + str2 26 | assert seq_concat.bytes == bytes1 + bytes2 27 | 28 | 29 | def test_tokseq_slice_and_concat(): 30 | ids1 = list(range(20)) 31 | str1 = [str(id_ * 2) for id_ in ids1] 32 | bytes1 = "".join(str1) 33 | 34 | tokseq = TokSequence(ids=ids1, tokens=str1, bytes=bytes1) 35 | subseq1 = tokseq[:10] 36 | subseq2 = tokseq[10:] 37 | 38 | assert subseq1.ids == ids1[:10] 39 | assert subseq1.tokens == str1[:10] 40 | assert subseq1.bytes == bytes1[:10] 41 | assert 
subseq2.ids == ids1[10:] 42 | assert subseq2.tokens == str1[10:] 43 | assert subseq2.bytes == bytes1[10:] 44 | 45 | tokseq_concat = subseq1 + subseq2 46 | assert tokseq == tokseq_concat 47 | 48 | 49 | @pytest.mark.parametrize("file_path", MIDI_PATHS_MULTITRACK, ids=lambda p: p.name) 50 | def test_split_tokseq_per_bars_beats(file_path: Path, tokenization: Callable = TSD): 51 | tokenizer = tokenization(TokenizerConfig(use_programs=True)) 52 | tokseq = tokenizer(file_path) 53 | 54 | # Split per bars 55 | seqs = tokseq.split_per_bars() 56 | concat_seq = seqs.pop(0) 57 | for seq in seqs: 58 | concat_seq += seq 59 | assert concat_seq == tokseq 60 | 61 | # Split per beats 62 | seqs = tokseq.split_per_beats() 63 | concat_seq = seqs.pop(0) 64 | for seq in seqs: 65 | concat_seq += seq 66 | assert concat_seq == tokseq 67 | --------------------------------------------------------------------------------
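To close the section, a small sketch illustrating the left-padding behaviour of ``DataCollator`` described at the top of this section: with ``pad_on_left=True``, padding ids are placed on the first indices, so the last element of each sequence remains its last token. The padding id 0 and the toy sequences below are placeholders:

from torch import LongTensor

import miditok

collator = miditok.pytorch_data.DataCollator(0, pad_on_left=True)
batch = [
    {"input_ids": LongTensor([5, 6, 7])},
    {"input_ids": LongTensor([8, 9])},
]
collated = collator(batch)
# Expected: [[5, 6, 7], [0, 8, 9]] -- the shorter sequence is padded on the left
print(collated["input_ids"])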