├── .faq
├── FAQ.md
└── suggest.md
├── .gitattributes
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.yml
│ └── config.yml
├── stale.yml
└── workflows
│ ├── doc.yml
│ ├── pypi.yml
│ ├── test.yml
│ └── test_cli.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── FAQ.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── doc
├── gen_docs.py
├── requirements.txt
└── source
│ └── index.rst
├── environment.yaml
├── faq.yml
├── notebook
├── augmentation.ipynb
├── example.ipynb
├── freeze.ipynb
├── inference.ipynb
└── sharing.ipynb
├── pyannote
├── __init__.py
└── audio
│ ├── __init__.py
│ ├── augmentation
│ ├── __init__.py
│ ├── mix.py
│ └── registry.py
│ ├── cli
│ ├── __init__.py
│ ├── config
│ │ └── hydra
│ │ │ └── default.yaml
│ ├── evaluate.py
│ ├── evaluate_config
│ │ ├── __init__.py
│ │ ├── config.yaml
│ │ └── hydra
│ │ │ └── default.yaml
│ ├── lr_schedulers
│ │ ├── CosineAnnealingWarmRestarts.py
│ │ ├── CyclicLR.py
│ │ ├── ReduceLROnPlateau.py
│ │ └── __init__.py
│ ├── pretrained.py
│ ├── train.py
│ └── train_config
│ │ ├── __init__.py
│ │ ├── config.yaml
│ │ ├── hydra
│ │ └── default.yaml
│ │ ├── model
│ │ ├── DebugEmbedding.yaml
│ │ ├── DebugSegmentation.yaml
│ │ ├── Pretrained.yaml
│ │ ├── PyanNet.yaml
│ │ ├── SSeRiouSS.yaml
│ │ ├── XVectorMFCC.yaml
│ │ └── XVectorSincNet.yaml
│ │ ├── optimizer
│ │ ├── Adam.yaml
│ │ ├── AdamW.yaml
│ │ └── Adan.yaml
│ │ ├── preprocessor
│ │ └── LowerTemporalResolution.yaml
│ │ ├── scheduler
│ │ ├── CosineAnnealingWarmRestarts.yaml
│ │ ├── CyclicLR.yaml
│ │ └── ReduceLROnPlateau.yaml
│ │ ├── task
│ │ ├── MultiLabelSegmentation.yaml
│ │ ├── OverlappedSpeechDetection.yaml
│ │ ├── SpeakerDiarization.yaml
│ │ ├── SpeakerEmbedding.yaml
│ │ └── VoiceActivityDetection.yaml
│ │ └── trainer
│ │ ├── default.yaml
│ │ └── fast_dev_run.yaml
│ ├── core
│ ├── __init__.py
│ ├── callback.py
│ ├── inference.py
│ ├── io.py
│ ├── model.py
│ ├── pipeline.py
│ └── task.py
│ ├── models
│ ├── __init__.py
│ ├── blocks
│ │ ├── pooling.py
│ │ └── sincnet.py
│ ├── embedding
│ │ ├── __init__.py
│ │ ├── debug.py
│ │ ├── wespeaker
│ │ │ ├── LICENSE.WeSpeaker
│ │ │ ├── __init__.py
│ │ │ ├── convert.py
│ │ │ └── resnet.py
│ │ └── xvector.py
│ ├── segmentation
│ │ ├── PyanNet.py
│ │ ├── SSeRiouSS.py
│ │ ├── __init__.py
│ │ └── debug.py
│ └── separation
│ │ ├── ToTaToNet.py
│ │ └── __init__.py
│ ├── pipelines
│ ├── __init__.py
│ ├── clustering.py
│ ├── multilabel.py
│ ├── overlapped_speech_detection.py
│ ├── resegmentation.py
│ ├── speaker_diarization.py
│ ├── speaker_verification.py
│ ├── speech_separation.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── diarization.py
│ │ ├── getter.py
│ │ ├── hook.py
│ │ └── oracle.py
│ └── voice_activity_detection.py
│ ├── sample
│ ├── __init__.py
│ ├── sample.rttm
│ └── sample.wav
│ ├── tasks
│ ├── __init__.py
│ ├── embedding
│ │ ├── __init__.py
│ │ ├── arcface.py
│ │ └── mixins.py
│ ├── segmentation
│ │ ├── __init__.py
│ │ ├── mixins.py
│ │ ├── multilabel.py
│ │ ├── overlapped_speech_detection.py
│ │ ├── speaker_diarization.py
│ │ └── voice_activity_detection.py
│ └── separation
│ │ ├── PixIT.py
│ │ └── __init__.py
│ ├── torchmetrics
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ └── diarization_error_rate.py
│ ├── classification
│ │ ├── __init__.py
│ │ └── equal_error_rate.py
│ └── functional
│ │ ├── __init__.py
│ │ └── audio
│ │ ├── __init__.py
│ │ └── diarization_error_rate.py
│ └── utils
│ ├── __init__.py
│ ├── loss.py
│ ├── metric.py
│ ├── multi_task.py
│ ├── params.py
│ ├── permutation.py
│ ├── powerset.py
│ ├── preprocessors.py
│ ├── preview.py
│ ├── probe.py
│ ├── protocol.py
│ ├── random.py
│ ├── receptive_field.py
│ ├── reproducibility.py
│ ├── signal.py
│ └── version.py
├── questions
├── README.md
├── bad_performance.question.md
├── from_memory.question.md
├── offline.question.md
├── pyannote.question.md
└── streaming.question.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
├── conftest.py
├── data
│ ├── database.yml
│ ├── debug.development.lst
│ ├── debug.development.rttm
│ ├── debug.development.uem
│ ├── debug.test.lst
│ ├── debug.test.rttm
│ ├── debug.test.uem
│ ├── debug.train.lst
│ ├── debug.train.rttm
│ ├── debug.train.uem
│ ├── dev00.wav
│ ├── dev01.wav
│ ├── empty.wav
│ ├── trn01.wav
│ ├── trn02.wav
│ ├── trn03.wav
│ ├── trn04.wav
│ ├── trn05.wav
│ ├── trn06.wav
│ ├── trn07.wav
│ ├── trn08.wav
│ ├── trn09.wav
│ ├── trñ00.wav
│ ├── tst00.wav
│ └── tst01.wav
├── inference_test.py
├── io_test.py
├── tasks
│ ├── test_reproducibility.py
│ └── test_specifications.py
├── test_cli.py
├── test_clustering.py
├── test_import_lib.py
├── test_metrics.py
├── test_run_notebooks.py
├── test_sample.py
├── test_speechbrain.py
├── test_stats_pool.py
├── test_train.py
└── utils
│ ├── preview.py
│ ├── probe_util_test.py
│ ├── test_permutation.py
│ └── test_powerset.py
├── tutorials
├── MRE_template.ipynb
├── adapting_pretrained_pipeline.ipynb
├── add_your_own_model.ipynb
├── add_your_own_task.ipynb
├── applying_a_model.ipynb
├── applying_a_pipeline.ipynb
├── assets
│ ├── download-model.png
│ ├── download-pipeline.png
│ ├── prodigy-pyannote.audio.png
│ ├── pyannote.diff.PNG
│ ├── pyannote.review.PNG
│ ├── sample.rttm
│ └── sample.wav
├── community
│ └── offline_usage_speaker_diarization.ipynb
├── intro.ipynb
├── overlapped_speech_detection.ipynb
├── speaker_verification.ipynb
├── training_a_model.ipynb
├── training_with_cli.md
└── voice_activity_detection.ipynb
└── version.txt
/.faq/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | 4 | {%- for question in questions %} 5 | - [{{ question.title }}](#{{ question.slug }}) 6 | {%- endfor %} 7 | 8 | 9 | {%- for question in questions %} 10 | 11 | 12 | ## {{ question.title }} 13 | 14 | {{ question.body }} 15 | 16 | {%- endfor %} 17 | 18 |
19 | 20 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory) 21 | -------------------------------------------------------------------------------- /.faq/suggest.md: -------------------------------------------------------------------------------- 1 | Thank you for your issue. 2 | 3 | {%- if questions -%} 4 | {% if questions|length == 1 %} 5 | We found the following entry in the [FAQ]({{ faq_url }}) which you may find helpful: 6 | {%- else %} 7 | We found the following entries in the [FAQ]({{ faq_url }}) which you may find helpful: 8 | {%- endif %} 9 | 10 | {% for question in questions %} 11 | - [{{ question.title }}]({{ faq_url }}#{{ question.slug }}) 12 | {%- endfor %} 13 | 14 | {%- else -%} 15 | You might want to check the [FAQ]({{ faq_url }}) if you haven't done so already. 16 | {%- endif %} 17 | 18 | Feel free to close this issue if you found an answer in the FAQ. 19 | 20 | If your issue is a feature request, please read [this](https://xyproblem.info/) first and update your request accordingly, if needed. 21 | 22 | If your issue is a bug report, please provide a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) as a link to a self-contained [Google Colab](https://colab.research.google.com/) notebook containing everything needed to reproduce the bug: 23 | - installation 24 | - data preparation 25 | - model download 26 | - etc. 27 | 28 | Providing an MRE will increase your chances of getting an answer from the community (either maintainers or other power users). 29 | 30 | Companies relying on `pyannote.audio` in production may contact [me](https://herve.niderb.fr) via email regarding: 31 | * paid scientific consulting around speaker diarization and speech processing in general; 32 | * custom models and tailored features (via the local tech transfer office). 33 | 34 | > This is an automated reply, generated by [FAQtory](https://github.com/willmcgugan/faqtory) 35 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | pyannote/audio/_version.py export-subst 2 | notebooks/* linguist-documentation 3 | tutorials/* linguist-documentation 4 | versioneer.py linguist-vendored 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [hbredin] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Report a bug in pyannote.audio 3 | body: 4 | 5 | - type: markdown 6 | attributes: 7 | value: | 8 | When reporting bugs, please follow the guidelines in this template. This helps identify the problem precisely and thus enables contributors to fix it faster. 9 | - Write a descriptive issue title above. 10 | - The golden rule is to **always open *one* issue for *one* bug**. If you notice several bugs and want to report them, make sure to create one new issue for each of them. 11 | - Search [open](https://github.com/pyannote/pyannote-audio/issues) and [closed](https://github.com/pyannote/pyannote-audio/issues?q=is%3Aissue+is%3Aclosed) issues to ensure it has not already been reported.
If you don't find a relevant match or if you're unsure, don't hesitate to **open a new issue**. The bugsquad will handle it from there if it's a duplicate. 12 | - Please always check if your issue is reproducible in the latest version – it may already have been fixed! 13 | - If you use a custom build, please test if your issue is reproducible in official releases too. 14 | 15 | - type: textarea 16 | attributes: 17 | label: Tested versions 18 | description: | 19 | To properly fix a bug, we need to identify if the bug was recently introduced in the library, or if it was always present. 20 | - Please specify the pyannote.audio version you found the issue in, including the **Git commit hash** if using a development build. 21 | - If you can, **please test earlier pyannote.audio versions** and, if applicable, newer versions (development branch). Mention whether the bug is reproducible or not in the versions you tested. 22 | - The aim is for us to identify whether a bug is a **regression**, i.e. an issue that didn't exist in a previous version, but was introduced later on, breaking existing functionality. For example, if a bug is reproducible in 3.2 but not in 3.0, we would like you to test intermediate 3.1 to find which version is the first one where the issue can be reproduced. 23 | placeholder: | 24 | - Reproducible in: 3.1, 3.2, and later 25 | - Not reproducible in: 3.0 26 | validations: 27 | required: true 28 | 29 | - type: input 30 | attributes: 31 | label: System information 32 | description: | 33 | - Specify the OS version, and when relevant hardware information. 34 | - For issues that are likely OS-specific and/or GPU-related, please specify the GPU model and architecture. 35 | - **Bug reports not including the required information may be closed at the maintainers' discretion.** If in doubt, always include all the requested information; it's better to include too much information than not enough. 36 | placeholder: macOS 13.6 - pyannote.audio 3.1.1 - M1 Pro 37 | validations: 38 | required: true 39 | 40 | - type: textarea 41 | attributes: 42 | label: Issue description 43 | description: | 44 | Describe your issue briefly. What doesn't work, and how do you expect it to work instead? 45 | You can include audio, images or videos with drag and drop, and format code blocks or logs with ``` tags. 46 | validations: 47 | required: true 48 | 49 | - type: input 50 | attributes: 51 | label: Minimal reproduction example (MRE) 52 | description: | 53 | Having reproducible issues is a prerequisite for contributors to be able to solve them. 54 | Include a link to a minimal reproduction example using [this Google Colab notebook](https://colab.research.google.com/github/pyannote/pyannote-audio/blob/develop/tutorials/MRE_template.ipynb) as a starting point. 55 | validations: 56 | required: true 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | 3 | contact_links: 4 | 5 | - name: Feature request 6 | url: https://github.com/pyannote/pyannote-audio/discussions 7 | about: Suggest an idea for this project. 8 | 9 | - name: Consulting 10 | url: https://herve.niderb.fr/consulting 11 | about: Using pyannote.audio in production? Make the most of it thanks to our consulting services.
12 | 13 | - name: Premium models 14 | url: https://forms.office.com/e/GdqwVgkZ5C 15 | about: We are considering selling premium models, extensions, or services around pyannote.audio. 16 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 180 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 30 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/doc.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | branches: 5 | - master 6 | 7 | jobs: 8 | build-and-deploy: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | max-parallel: 4 12 | matrix: 13 | python-version: ["3.9"] 14 | 15 | steps: 16 | - uses: actions/checkout@v1 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install . 
25 | pip install -r doc/requirements.txt 26 | - name: Build documentation 27 | run: | 28 | make --directory=doc html 29 | touch ./doc/build/html/.nojekyll 30 | - name: Deploy 31 | env: 32 | ACTIONS_DEPLOY_KEY: ${{ secrets.ACTIONS_DEPLOY_KEY }} 33 | PUBLISH_BRANCH: gh-pages 34 | PUBLISH_DIR: ./doc/build/html 35 | SCRIPT_MODE: true 36 | run: | 37 | wget https://raw.githubusercontent.com/peaceiris/actions-gh-pages/v2/entrypoint.sh 38 | bash ./entrypoint.sh 39 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: '3.x' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install setuptools wheel twine 21 | - name: Build and publish 22 | env: 23 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 24 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine upload dist/* 28 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | branches: [develop] 8 | 9 | jobs: 10 | build: 11 | timeout-minutes: 20 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.9", "3.10", "3.11"] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install libsndfile 24 | if: matrix.os == 'ubuntu-latest' 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install libsndfile1 28 | - name: Install pyannote.audio 29 | run: | 30 | pip install -e .[dev,testing] 31 | - name: Test with pytest 32 | run: | 33 | pytest -k "not test_cli.py" 34 | -------------------------------------------------------------------------------- /.github/workflows/test_cli.yml: -------------------------------------------------------------------------------- 1 | name: CLI tests 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | branches: [develop] 8 | 9 | jobs: 10 | build: 11 | timeout-minutes: 20 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.10"] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install libsndfile 24 | if: matrix.os == 'ubuntu-latest' 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install libsndfile1 28 | - name: Install pyannote.audio 29 | run: | 30 | pip install -e .[dev,testing,cli] 31 | - name: Test with pytest 32 | run: | 33 | pytest tests/test_cli.py 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | 
.Python 11 | env/ 12 | .env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | #Ipython Notebook 63 | .ipynb_checkpoints 64 | 65 | notebooks 66 | 67 | experiments 68 | *~ 69 | 70 | *.npy 71 | *.pt 72 | *events.out.tfevents* 73 | *.csv 74 | 75 | # PyCharm 76 | .idea/ 77 | 78 | gh-pages 79 | gh-pages.pub 80 | 81 | *.zip 82 | .mypy_cache/ 83 | .vscode/ 84 | 85 | **/lightning_logs/** 86 | 87 | # Version Output 88 | pyannote/audio/version.py 89 | 90 | # vim 91 | .vim 92 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tutorials/AMI-diarization-setup"] 2 | path = tutorials/AMI-diarization-setup 3 | url = https://github.com/pyannote/AMI-diarization-setup.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | # # Clean Notebooks 5 | # - repo: https://github.com/kynan/nbstripout 6 | # rev: master 7 | # hooks: 8 | # - id: nbstripout 9 | # Format Code 10 | - repo: https://github.com/ambv/black 11 | rev: 22.3.0 12 | hooks: 13 | - id: black 14 | 15 | # Sort imports 16 | - repo: https://github.com/PyCQA/isort 17 | rev: 5.12.0 18 | hooks: 19 | - id: isort 20 | args: ["--profile", "black"] 21 | 22 | # Formatting, Whitespace, etc 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v2.2.3 25 | hooks: 26 | - id: trailing-whitespace 27 | - id: check-added-large-files 28 | args: ['--maxkb=1000'] 29 | - id: check-ast 30 | - id: check-json 31 | - id: check-merge-conflict 32 | - id: check-xml 33 | - id: check-yaml 34 | - id: debug-statements 35 | - id: end-of-file-fixer 36 | - id: requirements-txt-fixer 37 | - id: mixed-line-ending 38 | args: ['--fix=no'] 39 | - id: flake8 40 | args: ['--ignore=E203,E501,F811,E712,W503'] 41 | -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | - [Can I apply pretrained pipelines on audio already loaded in memory?](#can-i-apply-pretrained-pipelines-on-audio-already-loaded-in-memory) 4 | - [Can I use gated models (and pipelines) offline?](#can-i-use-gated-models-and-pipelines-offline) 5 | - [Does pyannote support streaming speaker diarization?](#does-pyannote-support-streaming-speaker-diarization) 6 | - [How can I improve performance?](#how-can-i-improve-performance) 7 | - [How does one spell and pronounce pyannote.audio?](#how-does-one-spell-and-pronounce-pyannoteaudio) 8 | 9 | 10 | ## Can I apply pretrained
pipelines on audio already loaded in memory? 11 | 12 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end. 13 | 14 | 15 | ## Can I use gated models (and pipelines) offline? 16 | 17 | **Short answer:** yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 18 | 19 | **Long answer:** gating models and pipelines allows [me](https://herve.niderb.fr) to know a bit more about the `pyannote.audio` user base and helps me write grant proposals to make `pyannote.audio` even better. So, please fill in the gating forms as precisely as possible. 20 | 21 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open source libraries is time-consuming. 22 | 23 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 24 | 25 | 26 | ## Does pyannote support streaming speaker diarization? 27 | 28 | **Short answer:** not out of the box, no. 29 | 30 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get to a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`. 31 | 32 | 33 | ## How can I improve performance? 34 | 35 | **Long answer:** 36 | 37 | 1. Manually annotate dozens of conversations as precisely as possible. 38 | 2. Separate them into train (80%), development (10%) and test (10%) subsets. 39 | 3. Set up the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization). 40 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb). 41 | 5. Enjoy. 42 | 43 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that. 44 | 45 | 46 | ## How does one spell and pronounce pyannote.audio? 47 | 48 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic). 49 | 📢 Pronounced like the French verb `pianoter`. `pi` like in `pi`ano, not `py` like in `py`thon. 50 | 🎹 `pianoter` means to play the piano (hence the logo 🤯). 51 | 52 |
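As a complement to the first answer above, here is a minimal sketch of the in-memory usage it refers to. It assumes the `{"waveform": ..., "sample_rate": ...}` mapping accepted by pretrained pipelines, and `YOUR_HF_TOKEN` / `conversation.wav` are placeholders; the linked tutorial remains the reference:

```python
import torchaudio

from pyannote.audio import Pipeline

# gated pipeline: requires a (free) Hugging Face access token
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token="YOUR_HF_TOKEN",  # placeholder
)

# audio already loaded in memory, as a (channel, time) tensor
waveform, sample_rate = torchaudio.load("conversation.wav")

# pass a mapping instead of a file path
diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate})

# iterate over speaker turns
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s to {turn.end:.1f}s: {speaker}")
```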
53 | 54 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 CNRS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include pyannote *.py 2 | recursive-include pyannote *.yaml 3 | recursive-include pyannote *.wav 4 | recursive-include pyannote *.rttm 5 | global-exclude *.pyc 6 | global-exclude __pycache__ 7 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | patch: 4 | default: 5 | enabled: false 6 | -------------------------------------------------------------------------------- /doc/gen_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will generate the rst docs for the api 3 | """ 4 | 5 | import os 6 | from os import path 7 | 8 | bp = breakpoint 9 | 10 | 11 | def capitalise(s): 12 | news = "" 13 | for word in s.split("_"): 14 | news += word.capitalize() 15 | return news 16 | 17 | 18 | def process_dir(level, p): 19 | md = "" 20 | basename = path.basename(p) 21 | 22 | title = capitalise(basename) 23 | md += f"{'#'*level} {title}\n\n" 24 | subdirs = os.listdir(p) 25 | 26 | for f in subdirs: 27 | m = path.join(p, f)  # join against the current directory, not the top-level one 28 | if path.isdir(m): 29 | md += process_dir(level + 1, m) 30 | else: 31 | if "__" in f: 32 | continue 33 | module = m[3:].replace("/", ".")[:-3] 34 | md += f""" 35 | ```eval_rst 36 | .. 
automodule:: {module} 37 | :members: 38 | 39 | ``` 40 | 41 | """ 42 | return md 43 | 44 | 45 | DIR = "../pyannote/audio" 46 | 47 | for module in os.listdir(DIR): 48 | # Each folder will become an rst file 49 | # Each file/folder will have a # prepended to it 50 | # Recursively we will add another # at each level 51 | 52 | # Initialise Markdown 53 | md = "" 54 | 55 | subdir = path.join(DIR, module) 56 | 57 | # Skip if not a directory 58 | if not path.isdir(subdir) or "__" in module: 59 | continue 60 | 61 | md += process_dir(1, subdir) 62 | with open(f"./source/api/{module}.md", "w") as f: 63 | f.write(md) 64 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython==8.10.0 2 | recommonmark 3 | Sphinx==3.0.4 4 | sphinx_rtd_theme==0.4.3 5 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | pyannote.audio 3 | ############## 4 | 5 | `pyannote.audio` is an open-source Python library that provides neural building blocks for speaker diarization. 6 | 7 | Installation 8 | ============ 9 | 10 | :: 11 | 12 | $ conda create -n pyannote python=3.10 13 | $ conda activate pyannote 14 | $ pip install pyannote.audio 15 | 16 | 17 | API documentation 18 | ================= 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: pyannote-audio 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python==3.8.5 7 | - libsndfile==1.0.28 8 | - pip>=20.2 9 | - pip: 10 | - -r requirements.txt 11 | -------------------------------------------------------------------------------- /faq.yml: -------------------------------------------------------------------------------- 1 | # FAQtory settings 2 | 3 | faq_url: "https://github.com/pyannote/pyannote-audio/blob/develop/FAQ.md" # Replace this with the URL to your FAQ.md!
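# The files under `questions_path` (see below) are plain markdown documents.
# A hedged sketch of one such `*.question.md` file, assuming FAQtory's
# front-matter format (with `title` used for the FAQ heading and slug, and
# `alt_titles` helping issue matching; the exact schema may differ):
#
#   ---
#   title: "Does pyannote support streaming speaker diarization?"
#   alt_titles:
#     - "real-time speaker diarization"
#   ---
#
#   **Short answer:** not out of the box, no.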
4 | 5 | questions_path: "./questions" # Where questions should be stored 6 | output_path: "./FAQ.md" # Where FAQ.md should be generated 7 | templates_path: ".faq" # Path to templates 8 | -------------------------------------------------------------------------------- /notebook/augmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# get a 5s excerpt of the first test file\n", 10 | "from pyannote.database import get_protocol, FileFinder\n", 11 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 12 | " preprocessors={\"audio\": FileFinder()})\n", 13 | "\n", 14 | "from pyannote.audio.core.io import Audio\n", 15 | "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", 16 | "file = next(protocol.test())\n", 17 | "\n", 18 | "from pyannote.core import Segment\n", 19 | "waveform, sample_rate = audio.crop(file, Segment(5, 10))\n", 20 | "\n", 21 | "import torch\n", 22 | "waveforms = torch.tensor(waveform)[None, :]" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# play the excerpt\n", 32 | "from IPython.display import Audio as Play\n", 33 | "Play(waveforms.squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# define a model that simply returns the waveform\n", 43 | "from pyannote.audio.core.model import Model\n", 44 | "class Passthrough(Model):\n", 45 | " def forward(self, waveforms):\n", 46 | " return waveforms\n", 47 | " \n", 48 | "identity = Passthrough()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# pass the waveform through this \"identity\" model\n", 58 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# add one torch_audiomentations waveform transform to the model\n", 68 | "from pyannote.audio.augmentation.registry import register_augmentation\n", 69 | "from torch_audiomentations import Gain\n", 70 | "gain = Gain(\n", 71 | " min_gain_in_db=-15.0,\n", 72 | " max_gain_in_db=5.0,\n", 73 | " p=0.5)\n", 74 | "register_augmentation(gain, identity, when='input')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# pass the waveform through the \"augmented\" model\n", 84 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.7.9" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /notebook/freeze.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyannote.database import get_protocol, FileFinder\n", 10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 11 | " preprocessors={\"audio\": FileFinder()})" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from pyannote.audio.tasks import VoiceActivityDetection\n", 21 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n", 22 | "import pytorch_lightning as pl" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=16, num_workers=4)\n", 32 | "model = SimpleSegmentationModel(task=vad)\n", 33 | "trainer = pl.Trainer(max_epochs=1)\n", 34 | "_ = trainer.fit(model)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "summary = model.summarize('full')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "model.freeze_up_to('lstm')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "model.unfreeze_up_to('mfcc.MelSpectrogram.spectrogram')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model.freeze_by_name(['lstm', 'activation'])" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.8.5" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 4 95 | } 96 | -------------------------------------------------------------------------------- /notebook/sharing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyannote.database import get_protocol, FileFinder\n", 10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 11 | " preprocessors={\"audio\": FileFinder()})" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Train a model" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyannote.audio.tasks import VoiceActivityDetection\n", 28 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n", 29 | "import pytorch_lightning as pl\n", 30 | "\n", 31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)\n", 32 | "model = SimpleSegmentationModel(task=vad)\n", 33 | "trainer = pl.Trainer(max_epochs=1, default_root_dir='sharing/')\n", 34 | "_ = trainer.fit(model)" 35 | ] 36 | }, 37 | { 
38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Load a model without knowing its class" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from pyannote.audio import Model\n", 51 | "model = Model.from_pretrained('sharing/lightning_logs/version_0/checkpoints/epoch=0-step=3.ckpt')\n", 52 | "assert isinstance(model, SimpleSegmentationModel)\n", 53 | "\n", 54 | "# checkpoint should work with a URL as well (it relies on pl_load)" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.8.5" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /pyannote/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | __import__("pkg_resources").declare_namespace(__name__) 24 | -------------------------------------------------------------------------------- /pyannote/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | try: 24 | from .version import __version__, git_version # noqa: F401 25 | except ImportError: 26 | pass 27 | 28 | 29 | from .core.inference import Inference 30 | from .core.io import Audio 31 | from .core.model import Model 32 | from .core.pipeline import Pipeline 33 | 34 | __all__ = ["Audio", "Model", "Inference", "Pipeline"] 35 | -------------------------------------------------------------------------------- /pyannote/audio/augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .mix import MixSpeakerDiarization 25 | 26 | __all__ = ["MixSpeakerDiarization"] 27 | -------------------------------------------------------------------------------- /pyannote/audio/augmentation/mix.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import torch 27 | from torch import Tensor 28 | from torch_audiomentations import Mix 29 | 30 | 31 | class MixSpeakerDiarization(Mix): 32 | """ 33 | Create a new sample by mixing it with another random sample from the same batch 34 | 35 | Signal-to-noise ratio (where "noise" is the second random sample) is selected 36 | randomly between `min_snr_in_db` and `max_snr_in_db`. 37 | 38 | Parameters 39 | ---------- 40 | min_snr_in_db : float, optional 41 | Defaults to 0.0 42 | max_snr_in_db : float, optional 43 | Defaults to 5.0 44 | max_num_speakers: int, optional 45 | Maximum number of speakers in mixtures. Defaults to actual maximum number 46 | of speakers in each batch. 47 | """ 48 | 49 | supported_modes = {"per_example", "per_channel"} 50 | 51 | supports_multichannel = True 52 | requires_sample_rate = False 53 | 54 | supports_target = True 55 | requires_target = True 56 | 57 | def __init__( 58 | self, 59 | min_snr_in_db: float = 0.0, 60 | max_snr_in_db: float = 5.0, 61 | mode: str = "per_example", 62 | p: float = 0.5, 63 | p_mode: Optional[str] = None, 64 | sample_rate: Optional[int] = None, 65 | target_rate: Optional[int] = None, 66 | max_num_speakers: Optional[int] = None, 67 | output_type: str = "tensor", 68 | ): 69 | super().__init__( 70 | min_snr_in_db=min_snr_in_db, 71 | max_snr_in_db=max_snr_in_db, 72 | mode=mode, 73 | p=p, 74 | p_mode=p_mode, 75 | sample_rate=sample_rate, 76 | target_rate=target_rate, 77 | output_type=output_type, 78 | ) 79 | self.max_num_speakers = max_num_speakers 80 | 81 | def randomize_parameters( 82 | self, 83 | samples: Optional[Tensor] = None, 84 | sample_rate: Optional[int] = None, 85 | targets: Optional[Tensor] = None, 86 | target_rate: Optional[int] = None, 87 | ): 88 | 89 | batch_size, num_channels, num_samples = samples.shape 90 | snr_distribution = torch.distributions.Uniform( 91 | low=torch.tensor( 92 | self.min_snr_in_db, 93 | dtype=torch.float32, 94 | device=samples.device, 95 | ), 96 | high=torch.tensor( 97 | self.max_snr_in_db, 98 | dtype=torch.float32, 99 | device=samples.device, 100 | ), 101 | validate_args=True, 102 | ) 103 | 104 | # randomize SNRs 105 | self.transform_parameters["snr_in_db"] = snr_distribution.sample( 106 | sample_shape=(batch_size,) 107 | ) 108 | 109 | # count number of active speakers per sample 110 | num_speakers: torch.Tensor = torch.sum(torch.any(targets, dim=-2), dim=-1) 111 | max_num_speakers = self.max_num_speakers or torch.max(num_speakers) 112 | 113 | # randomize index of second sample, constrained by the fact that the 114 | # resulting mixture should have at most max_num_speakers speakers 115 | self.transform_parameters["sample_idx"] = torch.arange( 116 | batch_size, dtype=torch.int64 117 | ) 118 | for n in range(max_num_speakers + 1): 119 | 120 | # indices of samples with exactly n speakers 121 | samples_with_n_speakers = torch.where(num_speakers == n)[0] 122 | num_samples_with_n_speakers = len(samples_with_n_speakers) 123 | if num_samples_with_n_speakers == 0: 124 | continue 125 | 126 | # indices of candidate samples for mixing (i.e. samples that
would keep the mixture within max_num_speakers) 127 | candidates = torch.where(num_speakers + n <= max_num_speakers)[0] 128 | num_candidates = len(candidates) 129 | if num_candidates == 0: 130 | continue 131 | 132 | # sample uniformly from candidate samples 133 | selected_candidates = candidates[ 134 | torch.randint( 135 | 0, 136 | num_candidates, 137 | (num_samples_with_n_speakers,), 138 | device=samples.device, 139 | ) 140 | ] 141 | self.transform_parameters["sample_idx"][ 142 | samples_with_n_speakers 143 | ] = selected_candidates 144 | -------------------------------------------------------------------------------- /pyannote/audio/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .pretrained import pretrained 24 | 25 | __all__ = [ 26 | "pretrained", 27 | ] 28 | -------------------------------------------------------------------------------- /pyannote/audio/cli/config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-train 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-train protocol={protocol_name} 26 | task={task} task.param=... 27 | model={model} model.param=... 28 | optimizer={optimizer} optimizer.param=... 29 | scheduler={scheduler} scheduler.param=...
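    for example (illustrative values; group names mirror the
    train_config/ sub-directories: task/, model/, optimizer/, scheduler/):

    pyannote-audio-train protocol=Debug.SpeakerDiarization.Debug
                         task=VoiceActivityDetection model=PyanNet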
30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import hydra 27 | from omegaconf import DictConfig 28 | from pyannote.database import FileFinder, ProtocolFile, registry 29 | from rich.progress import Progress 30 | 31 | from pyannote.audio import Inference, Model 32 | from pyannote.audio.pipelines.utils import get_devices 33 | from pyannote.audio.utils.metric import DiscreteDiarizationErrorRate 34 | from pyannote.audio.utils.signal import binarize 35 | 36 | 37 | @hydra.main(config_path="evaluate_config", config_name="config") 38 | def evaluate(cfg: DictConfig) -> Optional[float]: 39 | 40 | # load pretrained model 41 | (device,) = get_devices(needs=1) 42 | model = Model.from_pretrained(cfg.model, device=device) 43 | 44 | # load databases into registry if it was specified 45 | if "registry" in cfg: 46 | for database_yml in cfg.registry.split(","): 47 | registry.load_database(database_yml) 48 | 49 | # load evaluation files 50 | protocol = registry.get_protocol( 51 | cfg.protocol, preprocessors={"audio": FileFinder()} 52 | ) 53 | 54 | files = list(getattr(protocol, cfg.subset)()) 55 | 56 | # load evaluation metric 57 | metric = DiscreteDiarizationErrorRate() 58 | 59 | with Progress() as progress: 60 | 61 | main_task = progress.add_task(protocol.name, total=len(files)) 62 | file_task = progress.add_task("Processing", total=1.0) 63 | 64 | def progress_hook(completed: Optional[int] = None, total: Optional[int] = None): 65 | progress.update(file_task, completed=completed / total) 66 | 67 | inference = Inference(model, device=device) 68 | warm_up = cfg.warm_up / inference.duration 69 | 70 | def hypothesis(file: ProtocolFile): 71 | return Inference.trim( 72 | binarize(inference(file, hook=progress_hook)), 73 | warm_up=(warm_up, warm_up), 74 | ) 75 | 76 | for file in files: 77 | progress.update(file_task, description=file["uri"]) 78 | reference = file["annotation"] 79 | uem = file["annotated"] 80 | _ = metric(reference, hypothesis(file), uem=uem) 81 | progress.advance(main_task) 82 | 83 | report = metric.report(display=False) 84 | 85 | with open("report.txt", "w") as f: 86 | 87 | f.write(f"# 
Model: {cfg.model}\n") 88 | f.write(f"# Protocol: {protocol.name}\n") 89 | f.write(f"# Subset: {cfg.subset}\n") 90 | f.write("\n") 91 | report = report.to_string( 92 | index=True, 93 | sparsify=False, 94 | justify="right", 95 | float_format=lambda f: "{0:.2f}".format(f), 96 | ) 97 | f.write(f"{report}") 98 | 99 | 100 | if __name__ == "__main__": 101 | evaluate() 102 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/config.yaml: -------------------------------------------------------------------------------- 1 | model: ??? 2 | protocol: ??? 
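# `???` marks mandatory fields (OmegaConf convention). A typical invocation,
# sketched from the help template in hydra/default.yaml (the protocol name is
# only an illustrative example; any pyannote.database protocol works):
#
#   pyannote-audio-eval model=/path/to/checkpoint.ckpt \
#       protocol=MyDatabase.SpeakerDiarization.MyProtocol subset=test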
3 | warm_up: 0.0 4 | subset: test 5 | 6 | defaults: 7 | - hydra: default 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-eval 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-eval registry={path_to_database.yml} 26 | protocol={protocol_name} 27 | subset={test | development | train} 28 | model={path_to_pretrained_model} 29 | warm_up={warm_up_duration_in_seconds} 30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/CosineAnnealingWarmRestarts.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional 24 | 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import ( 27 | CosineAnnealingWarmRestarts as _CosineAnnealingWarmRestarts, 28 | ) 29 | 30 | 31 | def CosineAnnealingWarmRestarts( 32 | optimizer: Optimizer, 33 | min_lr: float = 1e-8, 34 | max_lr: float = 1e-3, 35 | patience: int = 1, 36 | num_batches_per_epoch: Optional[int] = None, 37 | **kwargs, 38 | ): 39 | """Wrapper around CosineAnnealingWarmRestarts 40 | 41 | Parameters 42 | ---------- 43 | optimizer : Optimizer 44 | Optimizer 45 | min_lr : float, optional 46 | Defaults to 1e-8. 47 | max_lr : float, optional 48 | Defaults to 1e-3 49 | patience : int, optional 50 | Number of epochs per cycle. Defaults to 1. 51 | num_batches_per_epoch : int, optional 52 | Number of batches per epoch. 
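    Returns
    -------
    config : dict
        Lightning-style scheduler configuration; a sketch of the intended
        usage, assuming it is returned from a LightningModule's
        ``configure_optimizers``::

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            lr_scheduler = CosineAnnealingWarmRestarts(
                optimizer, min_lr=1e-8, max_lr=1e-3,
                patience=1, num_batches_per_epoch=1000,
            )
            # -> {"scheduler": <scheduler>, "interval": "step"}, i.e.
            #    Lightning steps the scheduler after every batch rather
            #    than after every epoch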
53 | """ 54 | 55 | # initialize optimizer lr to max_lr 56 | for g in optimizer.param_groups: 57 | g["lr"] = max_lr 58 | 59 | num_steps = patience * num_batches_per_epoch 60 | 61 | return { 62 | "scheduler": _CosineAnnealingWarmRestarts( 63 | optimizer, num_steps, eta_min=min_lr, T_mult=2 64 | ), 65 | "interval": "step", 66 | } 67 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/CyclicLR.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional 24 | 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import CyclicLR as _CyclicLR 27 | 28 | 29 | def CyclicLR( 30 | optimizer: Optimizer, 31 | min_lr: float = 1e-8, 32 | max_lr: float = 1e-3, 33 | mode: str = "triangular2", 34 | patience: int = 50, 35 | num_batches_per_epoch: Optional[int] = None, 36 | **kwargs, 37 | ): 38 | """Wrapper around CyclicLR learning rate scheduler 39 | 40 | Parameters 41 | ---------- 42 | optimizer : Optimizer 43 | Optimizer 44 | min_lr : float, optional 45 | Defaults to 1e-8. 46 | max_lr : float, optional 47 | Defaults to 1e-3 48 | patience : int, optional 49 | Number of epochs per cycle. Defaults to 50. 50 | num_batches_per_epoch : int, optional 51 | Number of batches per epoch. 52 | mode : {"triangular", "triangular2"}, optional 53 | Defaults to "triangular2". 
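    Usage
    -----
    A hedged sketch (not in the original docstring): as with the other wrappers
    in this module, the returned dictionary plugs into Lightning's
    configure_optimizers(); `num_batches_per_epoch` is required to convert
    `patience` (expressed in epochs) into `step_size_up` (expressed in steps).

    >>> optimizer = torch.optim.Adam(model.parameters())
    >>> lr_scheduler = CyclicLR(optimizer, num_batches_per_epoch=1000)
    >>> # in configure_optimizers:
    >>> # return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}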
54 | """ 55 | 56 | step_size_up = int(0.5 * patience * num_batches_per_epoch) 57 | 58 | return { 59 | "scheduler": _CyclicLR( 60 | optimizer, 61 | base_lr=min_lr, 62 | max_lr=max_lr, 63 | step_size_up=step_size_up, 64 | mode=mode, 65 | cycle_momentum=False, 66 | ), 67 | "interval": "step", 68 | } 69 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/ReduceLROnPlateau.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional, Text 25 | 26 | from torch.optim import Optimizer 27 | from torch.optim.lr_scheduler import ReduceLROnPlateau as _ReduceLROnPlateau 28 | 29 | 30 | def ReduceLROnPlateau( 31 | optimizer: Optimizer, 32 | monitor: Optional[Text] = None, 33 | direction: Optional[Text] = "min", 34 | min_lr: float = 1e-8, 35 | max_lr: float = 1e-3, 36 | factor: float = 0.5, 37 | patience: int = 50, 38 | **kwargs, 39 | ): 40 | """Wrapper around ReduceLROnPlateau learning rate scheduler 41 | 42 | Parameters 43 | ---------- 44 | optimizer : Optimizer 45 | Optimizer 46 | min_lr : float, optional 47 | Defaults to 1e-8. 48 | max_lr : float, optional 49 | Defaults to 1e-3 50 | factor : float, optional 51 | Defaults to 0.5 52 | patience : int, optional 53 | Wait that many epochs with no improvement before reducing the learning rate. 54 | Defaults to 50. 55 | monitor : str, optional 56 | Value to monitor 57 | direction : {"min", "max"}, optional 58 | "min" (resp. "max") means smaller (resp. larger) is better. 
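    Usage
    -----
    A hedged sketch (not in the original docstring): `monitor` must name a metric
    logged by the model, and `direction` must say whether that metric should be
    minimized or maximized.

    >>> optimizer = torch.optim.Adam(model.parameters())
    >>> lr_scheduler = ReduceLROnPlateau(optimizer, monitor="loss/val", direction="min")
    >>> # in configure_optimizers:
    >>> # return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}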
59 | """ 60 | 61 | # initialize optimizer lr to max_lr 62 | for g in optimizer.param_groups: 63 | g["lr"] = max_lr 64 | 65 | return { 66 | "scheduler": _ReduceLROnPlateau( 67 | optimizer, 68 | mode=direction, 69 | factor=factor, 70 | patience=patience, 71 | threshold=0.0001, 72 | threshold_mode="rel", 73 | cooldown=0, 74 | min_lr=min_lr, 75 | eps=1e-08, 76 | verbose=False, 77 | ), 78 | "interval": "epoch", 79 | "monitor": monitor, 80 | "strict": True, 81 | } 82 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .CosineAnnealingWarmRestarts import CosineAnnealingWarmRestarts 25 | from .CyclicLR import CyclicLR 26 | from .ReduceLROnPlateau import ReduceLROnPlateau 27 | 28 | __all__ = ["ReduceLROnPlateau", "CyclicLR", "CosineAnnealingWarmRestarts"] 29 | -------------------------------------------------------------------------------- /pyannote/audio/cli/pretrained.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
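# Editorial note (a sketch, not in the original file): this helper is the Hydra
# `_target_` behind the `model=Pretrained` training config, i.e. something like
#
#     pyannote-audio-train ... model=Pretrained model.checkpoint=/path/to/checkpoint.ckpt
#
# (the exact override syntax is assumed, not confirmed by this file). The
# `map_location` below keeps every tensor on CPU at loading time, so that
# Lightning can later move the model to whichever device training runs on.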
22 | 23 | 24 | from typing import Text 25 | from pyannote.audio import Model 26 | 27 | 28 | def pretrained(checkpoint: Text): 29 | return Model.from_pretrained(checkpoint, map_location=lambda storage, loc: storage) 30 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/config.yaml: -------------------------------------------------------------------------------- 1 | protocol: ??? 2 | 3 | defaults: 4 | - task: SpeakerDiarization 5 | - model: PyanNet 6 | - optimizer: Adam 7 | - scheduler: CosineAnnealingWarmRestarts 8 | - trainer: default 9 | - hydra: default 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-train 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-train protocol={protocol_name} 26 | +task={task} task.param=... 27 | +model={model} model.param=... 28 | optimizer={optimizer} optimizer.param=... 29 | scheduler={scheduler} scheduler.param=... 
30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/DebugEmbedding.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.debug.SimpleEmbeddingModel 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/DebugSegmentation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.debug.SimpleSegmentationModel 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/Pretrained.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.pretrained 3 | checkpoint: ??? 4 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/PyanNet.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.PyanNet 3 | sincnet: 4 | stride: 10 5 | lstm: 6 | hidden_size: 128 7 | num_layers: 2 8 | bidirectional: true 9 | monolithic: true 10 | dropout: 0.5 11 | linear: 12 | hidden_size: 128 13 | num_layers: 2 -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/SSeRiouSS.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.SSeRiouSS 3 | wav2vec: WAVLM_BASE 4 | wav2vec_layer: -1 5 | lstm: 6 | hidden_size: 128 7 | num_layers: 4 8 | bidirectional: true 9 | monolithic: true 10 | dropout: 0.5 11 | linear: 12 | hidden_size: 128 13 | num_layers: 2 14 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/XVectorMFCC.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.XVectorMFCC 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/XVectorSincNet.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.XVectorSincNet 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/Adam.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torch.optim.Adam 3 | lr: 1e-3 4 | betas: [0.9, 0.999] 5 | eps: 1e-08 6 | weight_decay: 0 7 | amsgrad: False 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/AdamW.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torch.optim.AdamW 3 | lr: 1e-3 4 | betas: [0.9, 0.999] 5 | eps: 1e-08 6 | weight_decay: 0.01 7 | amsgrad: False 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/Adan.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: adan_pytorch.Adan 3 | lr: 1e-3 4 | betas: [0.1, 0.1, 0.001] 5 | weight_decay: 0.0 6 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/preprocessor/LowerTemporalResolution.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.utils.preprocessors.LowerTemporalResolution 3 | resolution: 0.1 4 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/CosineAnnealingWarmRestarts.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.CosineAnnealingWarmRestarts 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | patience: 1 6 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/CyclicLR.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.CyclicLR 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | mode: triangular2 6 | patience: 50 7 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/ReduceLROnPlateau.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.ReduceLROnPlateau 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | factor: 0.5 6 | patience: 50 7 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/MultiLabelSegmentation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.MultiLabelSegmentation 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | weight: null 7 | batch_size: 32 8 | num_workers: null 9 | pin_memory: False 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/OverlappedSpeechDetection.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.OverlappedSpeechDetection 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | overlap: 7 | probability: 0.5 8 | snr_min: 0.0 9 | snr_max: 10.0 10 | weight: null 11 | batch_size: 32 12 | num_workers: null 13 | pin_memory: False 14 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/SpeakerDiarization.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.SpeakerDiarization 3 | duration: 5.0 4 | max_speakers_per_chunk: 3 5 | max_speakers_per_frame: 2 6 | batch_size: 32 7 | num_workers: 10 8 | pin_memory: False 9 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/SpeakerEmbedding.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.SupervisedRepresentationLearningWithArcFace 3 | min_duration: 2.0 4 | duration: 5.0 5 | num_classes_per_batch: 512 6 | 
num_chunks_per_class: 1 7 | margin: 2.0 8 | scale: 12.0 9 | num_workers: null 10 | pin_memory: False 11 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/VoiceActivityDetection.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.VoiceActivityDetection 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | weight: null 7 | batch_size: 32 8 | num_workers: null 9 | pin_memory: False 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorch_lightning.Trainer 3 | accelerator: auto 4 | accumulate_grad_batches: 1 5 | benchmark: null # TODO: automatically set to True when using fixed duration chunks 6 | deterministic: False 7 | check_val_every_n_epoch: 1 8 | devices: auto 9 | detect_anomaly: False 10 | enable_checkpointing: True 11 | enable_model_summary: True 12 | enable_progress_bar: True 13 | fast_dev_run: False 14 | gradient_clip_val: null 15 | gradient_clip_algorithm: norm 16 | limit_predict_batches: 1.0 17 | limit_test_batches: 1.0 18 | limit_train_batches: 1.0 19 | limit_val_batches: 1.0 20 | log_every_n_steps: 50 21 | max_epochs: 1000 22 | max_steps: -1 23 | max_time: null 24 | min_epochs: 1 25 | min_steps: null 26 | num_nodes: 1 27 | num_sanity_val_steps: 2 28 | overfit_batches: 0.0 29 | precision: 32 30 | profiler: null 31 | reload_dataloaders_every_n_epochs: 0 32 | use_distributed_sampler: True # TODO: check what this does exactly 33 | strategy: auto 34 | sync_batchnorm: False 35 | val_check_interval: 1.0 36 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/trainer/fast_dev_run.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorch_lightning.Trainer 3 | fast_dev_run: True 4 | -------------------------------------------------------------------------------- /pyannote/audio/core/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/core/callback.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from typing import List, Mapping, Optional, Text, Union
24 | 
25 | from pytorch_lightning import Callback, Trainer
26 | from pytorch_lightning.utilities.model_summary import ModelSummary
27 | 
28 | from pyannote.audio import Model
29 | 
30 | 
31 | class GraduallyUnfreeze(Callback):
32 |     """Gradually unfreeze layers
33 | 
34 |     1. Start training with all layers frozen, except those that depend on the task
35 |        (i.e. those instantiated in model.build() and task.setup_loss_func())
36 |     2. Train for a few epochs and unfreeze a few more layers
37 |     3. Repeat
38 | 
39 |     Parameters
40 |     ----------
41 |     schedule:
42 |         See examples for supported format.
43 |     epochs_per_stage : int, optional
44 |         Number of epochs between each stage. Defaults to 1.
45 |         Has no effect if schedule is provided as a {layer_name: epoch} dictionary.
46 | 
47 |     Usage
48 |     -----
49 |     >>> callback = GraduallyUnfreeze()
50 |     >>> Trainer(callbacks=[callback]).fit(model)
51 | 
52 |     Examples
53 |     --------
54 |     # for a model with PyanNet architecture (sincnet > lstm > linear > task_specific),
55 |     # those are equivalent and will unfreeze 'linear' at epoch 1, 'lstm' at epoch 2,
56 |     # and 'sincnet' at epoch 3.
57 |     GraduallyUnfreeze()
58 |     GraduallyUnfreeze(schedule=['linear', 'lstm', 'sincnet'])
59 |     GraduallyUnfreeze(schedule={'linear': 1, 'lstm': 2, 'sincnet': 3})
60 | 
61 |     # the following syntax is also possible (with its dict-based equivalent just below):
62 |     GraduallyUnfreeze(schedule=['linear', ['lstm', 'sincnet']], epochs_per_stage=10)
63 |     GraduallyUnfreeze(schedule={'linear': 10, 'lstm': 20, 'sincnet': 20})
64 |     # will unfreeze 'linear' at epoch 10, and both 'lstm' and 'sincnet' at epoch 20.
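    # with the default (reversed backbone) schedule, `epochs_per_stage` alone
    # spaces the stages evenly. A hedged sketch (not in the original docstring):
    GraduallyUnfreeze(epochs_per_stage=10)
    # will unfreeze 'linear' at epoch 10, 'lstm' at epoch 20, and 'sincnet' at epoch 30.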
65 | """ 66 | 67 | def __init__( 68 | self, 69 | schedule: Union[Mapping[Text, int], List[Union[List[Text], Text]]] = None, 70 | epochs_per_stage: Optional[int] = None, 71 | ): 72 | super().__init__() 73 | 74 | if ( 75 | (schedule is None) or (isinstance(schedule, List)) 76 | ) and epochs_per_stage is None: 77 | epochs_per_stage = 1 78 | 79 | self.epochs_per_stage = epochs_per_stage 80 | self.schedule = schedule 81 | 82 | def on_fit_start(self, trainer: Trainer, model: Model): 83 | 84 | schedule = self.schedule 85 | 86 | task_specific_layers = model.task_dependent 87 | backbone_layers = [ 88 | layer 89 | for layer, _ in reversed(ModelSummary(model, max_depth=1).named_modules) 90 | if layer not in task_specific_layers 91 | ] 92 | 93 | if schedule is None: 94 | schedule = backbone_layers 95 | 96 | if isinstance(schedule, List): 97 | _schedule = dict() 98 | for depth, layers in enumerate(schedule): 99 | layers = layers if isinstance(layers, List) else [layers] 100 | for layer in layers: 101 | _schedule[layer] = (depth + 1) * self.epochs_per_stage 102 | schedule = _schedule 103 | 104 | self.schedule = schedule 105 | 106 | # freeze all but task specific layers 107 | for layer in backbone_layers: 108 | model.freeze_by_name(layer) 109 | 110 | def on_train_epoch_start(self, trainer: Trainer, model: Model): 111 | for layer, epoch in self.schedule.items(): 112 | if epoch == trainer.current_epoch: 113 | model.unfreeze_by_name(layer) 114 | -------------------------------------------------------------------------------- /pyannote/audio/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/blocks/pooling.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import warnings
24 | from typing import Optional
25 | 
26 | import torch
27 | import torch.nn as nn
28 | import torch.nn.functional as F
29 | 
30 | 
31 | def _pool(sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
32 |     """Helper function to compute statistics pooling
33 | 
34 |     Assumes that weights are already interpolated to match the number of frames
35 |     in sequences and that they encode the activation of only one speaker.
36 | 
37 |     Parameters
38 |     ----------
39 |     sequences : (batch, features, frames) torch.Tensor
40 |         Sequences of features.
41 |     weights : (batch, frames) torch.Tensor
42 |         (Already interpolated) weights.
43 | 
44 |     Returns
45 |     -------
46 |     output : (batch, 2 * features) torch.Tensor
47 |         Concatenation of mean and (unbiased) standard deviation.
48 |     """
49 | 
50 |     weights = weights.unsqueeze(dim=1)
51 |     # (batch, 1, frames)
52 | 
53 |     v1 = weights.sum(dim=2) + 1e-8
54 |     mean = torch.sum(sequences * weights, dim=2) / v1
55 | 
56 |     dx2 = torch.square(sequences - mean.unsqueeze(2))
57 |     v2 = torch.square(weights).sum(dim=2)
58 | 
59 |     var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8)
60 |     std = torch.sqrt(var)
61 | 
62 |     return torch.cat([mean, std], dim=1)
63 | 
64 | 
65 | class StatsPool(nn.Module):
66 |     """Statistics pooling
67 | 
68 |     Computes temporal mean and (unbiased) standard deviation
69 |     and returns their concatenation.
70 | 
71 |     Reference
72 |     ---------
73 |     https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
74 | 
75 |     """
76 | 
77 |     def forward(
78 |         self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
79 |     ) -> torch.Tensor:
80 |         """Forward pass
81 | 
82 |         Parameters
83 |         ----------
84 |         sequences : (batch, features, frames) torch.Tensor
85 |             Sequences of features.
86 |         weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
87 |             Compute weighted mean and standard deviation, using provided `weights`.
88 | 
89 |         Note
90 |         ----
91 |         `sequences` and `weights` might use a different number of frames, in which case `weights`
92 |         are interpolated (with nearest-neighbor interpolation) to reach the number of frames in `sequences`.
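        For instance, an illustrative sketch (not from the original docstring):

        >>> pool = StatsPool()
        >>> sequences = torch.randn(2, 512, 100)  # (batch, features, frames)
        >>> weights = torch.rand(2, 3, 100)       # (batch, speakers, frames)
        >>> pool(sequences, weights).shape
        torch.Size([2, 3, 1024])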
93 | 
94 |         Returns
95 |         -------
96 |         output : (batch, 2 * features) or (batch, speakers, 2 * features) torch.Tensor
97 |             Concatenation of mean and (unbiased) standard deviation. When `weights` are
98 |             provided with the `speakers` dimension, `output` is computed for each speaker
99 |             separately and returned as a (batch, speakers, 2 * features)-shaped tensor.
100 |         """
101 | 
102 |         if weights is None:
103 |             mean = sequences.mean(dim=-1)
104 |             std = sequences.std(dim=-1, correction=1)
105 |             return torch.cat([mean, std], dim=-1)
106 | 
107 |         if weights.dim() == 2:
108 |             has_speaker_dimension = False
109 |             weights = weights.unsqueeze(dim=1)
110 |             # (batch, frames) -> (batch, 1, frames)
111 |         else:
112 |             has_speaker_dimension = True
113 | 
114 |         # interpolate weights if needed
115 |         _, _, num_frames = sequences.size()
116 |         _, num_speakers, num_weights = weights.size()
117 |         if num_frames != num_weights:
118 |             warnings.warn(
119 |                 f"Mismatch between number of frames ({num_frames}) and number of weights ({num_weights})."
120 |             )
121 |             weights = F.interpolate(weights, size=num_frames, mode="nearest")
122 | 
123 |         output = torch.stack(
124 |             [
125 |                 _pool(sequences, weights[:, speaker, :])
126 |                 for speaker in range(num_speakers)
127 |             ],
128 |             dim=1,
129 |         )
130 | 
131 |         if not has_speaker_dimension:
132 |             return output.squeeze(dim=1)
133 | 
134 |         return output
135 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/__init__.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
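# Typical usage of these embedding models (a hedged sketch, not in the original
# file): wrap one in `Inference` with `window="whole"` to get a single embedding
# per audio file:
#
#     from pyannote.audio import Inference
#     inference = Inference(model, window="whole")
#     embedding = inference("audio.wav")  # (1, dimension)-shaped array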
22 | 23 | 24 | from .wespeaker import ( 25 | WeSpeakerResNet34, 26 | WeSpeakerResNet152, 27 | WeSpeakerResNet221, 28 | WeSpeakerResNet293, 29 | ) 30 | from .xvector import XVectorMFCC, XVectorSincNet 31 | 32 | __all__ = [ 33 | "XVectorSincNet", 34 | "XVectorMFCC", 35 | "WeSpeakerResNet34", 36 | "WeSpeakerResNet152", 37 | "WeSpeakerResNet221", 38 | "WeSpeakerResNet293", 39 | ] 40 | -------------------------------------------------------------------------------- /pyannote/audio/models/embedding/debug.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | 24 | from functools import lru_cache 25 | from typing import Optional 26 | 27 | import torch 28 | import torch.nn as nn 29 | from einops import rearrange, reduce 30 | from torchaudio.transforms import MFCC 31 | 32 | from pyannote.audio.core.model import Model 33 | from pyannote.audio.core.task import Task 34 | 35 | 36 | class SimpleEmbeddingModel(Model): 37 | def __init__( 38 | self, 39 | sample_rate: int = 16000, 40 | num_channels: int = 1, 41 | task: Optional[Task] = None, 42 | ): 43 | super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) 44 | 45 | self.mfcc = MFCC( 46 | sample_rate=self.hparams.sample_rate, 47 | n_mfcc=40, 48 | dct_type=2, 49 | norm="ortho", 50 | log_mels=False, 51 | ) 52 | 53 | self.lstm = nn.LSTM( 54 | self.mfcc.n_mfcc * self.hparams.num_channels, 55 | 32, 56 | num_layers=1, 57 | batch_first=True, 58 | bidirectional=True, 59 | ) 60 | 61 | @lru_cache 62 | def num_frames(self, num_samples: int) -> int: 63 | """Compute number of output frames for a given number of input samples 64 | 65 | Parameters 66 | ---------- 67 | num_samples : int 68 | Number of input samples 69 | 70 | Returns 71 | ------- 72 | num_frames : int 73 | Number of output frames 74 | 75 | Source 76 | ------ 77 | https://pytorch.org/docs/stable/generated/torch.stft.html#torch.stft 78 | 79 | """ 80 | 81 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length 82 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft 83 | center = self.mfcc.MelSpectrogram.spectrogram.center 84 | 85 | if center: 86 | return 1 + num_samples // hop_length 87 | else: 88 | return 1 + (num_samples - n_fft) // hop_length 89 | 90 | def receptive_field_size(self, num_frames: int = 1) -> int: 91 | """Compute size of receptive field 92 | 93 | Parameters 94 | ---------- 95 | num_frames : int, optional 96 | Number of frames in the output signal 97 | 98 | Returns 99 | ------- 100 | receptive_field_size : int 101 | Receptive field size. 102 | """ 103 | 104 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length 105 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft 106 | return n_fft + (num_frames - 1) * hop_length 107 | 108 | def receptive_field_center(self, frame: int = 0) -> int: 109 | """Compute center of receptive field 110 | 111 | Parameters 112 | ---------- 113 | frame : int, optional 114 | Frame index 115 | 116 | Returns 117 | ------- 118 | receptive_field_center : int 119 | Index of receptive field center. 
120 |         """
121 | 
122 |         hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length
123 |         n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft
124 |         center = self.mfcc.MelSpectrogram.spectrogram.center
125 | 
126 |         if center:
127 |             return frame * hop_length
128 |         else:
129 |             return frame * hop_length + n_fft // 2
130 | 
131 |     @property
132 |     def dimension(self) -> int:
133 |         """Dimension of output"""
134 |         return 64
135 | 
136 |     def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
137 |         """
138 | 
139 |         Parameters
140 |         ----------
141 |         waveforms : (batch, channel, time)
142 | 
143 |         Returns
144 |         -------
145 |         embedding : (batch, dimension)
146 |         """
147 | 
148 |         mfcc = self.mfcc(waveforms)
149 |         output, hidden = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))
150 |         # mean temporal pooling
151 |         return reduce(output, "b t f -> b f", "mean")
152 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker: --------------------------------------------------------------------------------
1 | Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
2 |                2022 Zhengyang Chen (chenzhengyang117@gmail.com)
3 |                2023 Bing Han (hanbing97@sjtu.edu.cn)
4 | 
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | 
9 |     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 
17 | File `resnet.py` has been borrowed from WeSpeaker that is available under the Apache License, Version 2.0.
18 | 
19 | The original file is available at https://github.com/wenet-e2e/wespeaker/blob/c20d765295359e681321625fbefc1a02e8794163/wespeaker/models/resnet.py
20 | 
21 | Neither Shuai Wang (@wsstriving on Github) nor myself (Hervé Bredin, or @hbredin on Github) are lawyers, but we both agreed that putting this license file in this directory is enough to comply with the license. See https://github.com/pyannote/pyannote-audio/issues/1537#issuecomment-1808029836. If you know better about this potential MIT/Apache 2.0 compatibility issue, please let us know.
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/wespeaker/convert.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Script used to convert from WeSpeaker to pyannote.audio 24 | 25 | import sys 26 | from pathlib import Path 27 | 28 | import pytorch_lightning as pl 29 | import torch 30 | 31 | import pyannote.audio.models.embedding.wespeaker as wespeaker 32 | from pyannote.audio import Model 33 | from pyannote.audio.core.task import Problem, Resolution, Specifications 34 | 35 | wespeaker_checkpoint_dir = sys.argv[1] # /path/to/wespeaker_cnceleb-resnet34-LM 36 | 37 | wespeaker_checkpoint = Path(wespeaker_checkpoint_dir) / "wespeaker.pt" 38 | 39 | depth = Path(wespeaker_checkpoint_dir).parts[-1].split("-")[-2][6:] # '34' 40 | Klass = getattr(wespeaker, f"WeSpeakerResNet{depth}") # WeSpeakerResNet34 41 | 42 | duration = 5.0 # whatever 43 | specifications = Specifications( 44 | problem=Problem.REPRESENTATION, resolution=Resolution.CHUNK, duration=duration 45 | ) 46 | 47 | state_dict = torch.load(wespeaker_checkpoint, map_location=torch.device("cpu")) 48 | state_dict.pop("projection.weight") 49 | 50 | model = Klass() 51 | model.resnet.load_state_dict(state_dict, strict=True) 52 | model.specifications = specifications 53 | 54 | checkpoint = {"state_dict": model.state_dict()} 55 | model.on_save_checkpoint(checkpoint) 56 | checkpoint["pytorch-lightning_version"] = pl.__version__ 57 | 58 | pyannote_checkpoint = Path(wespeaker_checkpoint_dir) / "pytorch_model.bin" 59 | torch.save(checkpoint, pyannote_checkpoint) 60 | 61 | model = Model.from_pretrained(pyannote_checkpoint) 62 | print(model) 63 | -------------------------------------------------------------------------------- /pyannote/audio/models/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
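# Instantiation sketch (editorial, not in the original file), mirroring the
# hyper-parameters of the PyanNet.yaml training config shown earlier:
#
#     from pyannote.audio.models.segmentation import PyanNet
#     model = PyanNet(
#         sincnet={"stride": 10},
#         lstm={"hidden_size": 128, "num_layers": 2, "bidirectional": True},
#         linear={"hidden_size": 128, "num_layers": 2},
#     )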
22 | 23 | from .PyanNet import PyanNet 24 | from .SSeRiouSS import SSeRiouSS 25 | 26 | __all__ = ["PyanNet", "SSeRiouSS"] 27 | -------------------------------------------------------------------------------- /pyannote/audio/models/separation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .ToTaToNet import ToTaToNet 24 | 25 | __all__ = ["ToTaToNet"] 26 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2022 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
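# Usage sketch (editorial, not in the original file), assuming a trained
# segmentation checkpoint:
#
#     from pyannote.audio.pipelines import VoiceActivityDetection
#     pipeline = VoiceActivityDetection(segmentation="path/to/checkpoint.ckpt")
#     pipeline.instantiate({"onset": 0.5, "offset": 0.5,
#                           "min_duration_on": 0.0, "min_duration_off": 0.0})
#     speech = pipeline("audio.wav")  # pyannote.core.Annotation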
22 | 23 | from .multilabel import MultiLabelSegmentation 24 | from .overlapped_speech_detection import OverlappedSpeechDetection 25 | from .resegmentation import Resegmentation 26 | from .speaker_diarization import SpeakerDiarization 27 | from .speech_separation import SpeechSeparation 28 | from .voice_activity_detection import VoiceActivityDetection 29 | 30 | __all__ = [ 31 | "VoiceActivityDetection", 32 | "OverlappedSpeechDetection", 33 | "SpeakerDiarization", 34 | "Resegmentation", 35 | "MultiLabelSegmentation", 36 | "SpeechSeparation", 37 | ] 38 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .diarization import SpeakerDiarizationMixin 24 | from .getter import ( 25 | PipelineAugmentation, 26 | PipelineInference, 27 | PipelineModel, 28 | get_augmentation, 29 | get_devices, 30 | get_inference, 31 | get_model, 32 | ) 33 | from .oracle import oracle_segmentation 34 | 35 | __all__ = [ 36 | "SpeakerDiarizationMixin", 37 | "oracle_segmentation", 38 | "get_augmentation", 39 | "PipelineAugmentation", 40 | "get_devices", 41 | "get_inference", 42 | "PipelineInference", 43 | "get_model", 44 | "PipelineModel", 45 | ] 46 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/utils/oracle.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional, Union 24 | 25 | import numpy as np 26 | from pyannote.core import Annotation, Segment, SlidingWindow, SlidingWindowFeature 27 | 28 | from pyannote.audio.core.io import Audio, AudioFile 29 | 30 | 31 | def oracle_segmentation( 32 | file: AudioFile, 33 | window: SlidingWindow, 34 | frames: Union[SlidingWindow, float], 35 | num_speakers: Optional[int] = None, 36 | ) -> SlidingWindowFeature: 37 | """Oracle speaker segmentation 38 | 39 | Simulates inference based on an (imaginary) oracle segmentation model: 40 | 41 | >>> oracle = Model.from_pretrained("oracle") 42 | >>> assert frames == oracle.receptive_field 43 | >>> inference = Inference(oracle, duration=window.duration, step=window.step, skip_aggregation=True) 44 | >>> oracle_segmentation = inference(file) 45 | 46 | Parameters 47 | ---------- 48 | file : AudioFile 49 | Audio file with "annotation". 50 | window : SlidingWindow 51 | Sliding window used for inference (see above) 52 | frames : SlidingWindow or float 53 | Output resolution of the oracle model (see above) 54 | num_speakers : int, optional 55 | Override the number of speakers returned by the oracle segmentation model 56 | Defaults to the actual number of speakers in the whole file 57 | 58 | Returns 59 | ------- 60 | oracle_segmentation : (num_chunks, num_frames, num_speakers) SlidingWindowFeature 61 | Oracle segmentation. 
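    Example
    -------
    An illustrative sketch (not from the original docstring), using 5s chunks
    with a 500ms step and 20ms output frames:

    >>> window = SlidingWindow(duration=5.0, step=0.5)
    >>> segmentation = oracle_segmentation(file, window, frames=0.020)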
62 | """ 63 | 64 | if "duration" not in file: 65 | duration = Audio(mono="downmix").get_duration(file) 66 | else: 67 | duration: float = file["duration"] 68 | reference: Annotation = file["annotation"] 69 | 70 | if not isinstance(frames, SlidingWindow): 71 | frames = SlidingWindow(start=0.0, step=frames, duration=frames) 72 | 73 | labels = reference.labels() 74 | actual_num_speakers = len(labels) 75 | if num_speakers is None: 76 | num_speakers = actual_num_speakers 77 | 78 | if num_speakers > actual_num_speakers: 79 | num_missing = num_speakers - actual_num_speakers 80 | labels += [ 81 | f"FakeSpeakerForOracleSegmentationInference{i:d}" 82 | for i in range(num_missing) 83 | ] 84 | 85 | window = SlidingWindow(start=0.0, duration=window.duration, step=window.step) 86 | 87 | segmentations = [] 88 | for chunk in window(Segment(0.0, duration)): 89 | chunk_segmentation: SlidingWindowFeature = reference.discretize( 90 | chunk, 91 | resolution=frames, 92 | labels=labels, 93 | duration=window.duration, 94 | ) 95 | 96 | if num_speakers < actual_num_speakers: 97 | # keep `num_speakers` most talkative speakers 98 | most_talkative_index = np.argsort(-np.sum(chunk_segmentation, axis=0))[ 99 | :num_speakers 100 | ] 101 | chunk_segmentation = chunk_segmentation[:, most_talkative_index] 102 | 103 | segmentations.append(chunk_segmentation) 104 | 105 | return SlidingWindowFeature(np.float32(np.stack(segmentations)), window) 106 | -------------------------------------------------------------------------------- /pyannote/audio/sample/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
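# Usage sketch (editorial, not in the original file): SAMPLE_FILE, built below,
# follows the AudioFile protocol and can be fed directly to models, pipelines,
# or Inference, e.g.
#
#     from pyannote.audio.sample import SAMPLE_FILE
#     diarization = pipeline(SAMPLE_FILE)  # assuming `pipeline` is already instantiated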
22 | 23 | 24 | from pathlib import Path 25 | 26 | from pyannote.core import Annotation, Segment, Timeline 27 | from pyannote.database.util import load_rttm 28 | 29 | from pyannote.audio.core.io import Audio, AudioFile 30 | 31 | 32 | def _sample() -> AudioFile: 33 | sample_wav = Path(__file__).parent / "sample.wav" 34 | uri = "sample" 35 | 36 | audio = Audio() 37 | waveform, sample_rate = audio(sample_wav) 38 | 39 | sample_rttm = Path(__file__).parent / "sample.rttm" 40 | 41 | annotation: Annotation = load_rttm(sample_rttm)[uri] 42 | duration = audio.get_duration(sample_wav) 43 | 44 | annotated: Timeline = Timeline([Segment(0.0, duration)], uri=uri) 45 | 46 | return { 47 | "audio": sample_wav, 48 | "uri": "sample", 49 | "waveform": waveform, 50 | "sample_rate": sample_rate, 51 | "annotation": annotation, 52 | "annotated": annotated, 53 | } 54 | 55 | 56 | SAMPLE_FILE = _sample() 57 | -------------------------------------------------------------------------------- /pyannote/audio/sample/sample.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER sample 1 6.690 0.430 speaker90 2 | SPEAKER sample 1 7.550 0.800 speaker91 3 | SPEAKER sample 1 8.320 1.700 speaker90 4 | SPEAKER sample 1 9.920 1.110 speaker91 5 | SPEAKER sample 1 10.570 4.130 speaker90 6 | SPEAKER sample 1 14.490 3.430 speaker91 7 | SPEAKER sample 1 18.050 3.440 speaker90 8 | SPEAKER sample 1 18.150 0.440 speaker91 9 | SPEAKER sample 1 21.780 6.720 speaker91 10 | SPEAKER sample 1 27.850 2.150 speaker90 11 | -------------------------------------------------------------------------------- /pyannote/audio/sample/sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/pyannote/audio/sample/sample.wav -------------------------------------------------------------------------------- /pyannote/audio/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
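# Usage sketch (editorial, not in the original file): a task wraps a
# pyannote.database protocol and is attached to a model before training, e.g.
#
#     from pyannote.database import registry
#     from pyannote.audio.tasks import VoiceActivityDetection
#     from pyannote.audio.models.segmentation import PyanNet
#
#     protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")  # hypothetical protocol name
#     task = VoiceActivityDetection(protocol, duration=2.0)
#     model = PyanNet(task=task)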
22 | 23 | from .segmentation.multilabel import MultiLabelSegmentation # isort:skip 24 | from .segmentation.speaker_diarization import SpeakerDiarization # isort:skip 25 | from .separation.PixIT import PixIT # isort:skip 26 | from .segmentation.voice_activity_detection import VoiceActivityDetection # isort:skip 27 | from .segmentation.overlapped_speech_detection import ( # isort:skip 28 | OverlappedSpeechDetection, 29 | ) 30 | from .embedding.arcface import SupervisedRepresentationLearningWithArcFace # isort:skip 31 | 32 | # Segmentation has been renamed to SpeakerDiarization but we keep Segmentation here for backward compatibility 33 | Segmentation = SpeakerDiarization 34 | 35 | # SpeakerEmbedding is more human-friendly 36 | SpeakerEmbedding = SupervisedRepresentationLearningWithArcFace 37 | 38 | __all__ = [ 39 | "SpeakerDiarization", 40 | "VoiceActivityDetection", 41 | "OverlappedSpeechDetection", 42 | "MultiLabelSegmentation", 43 | "SpeakerEmbedding", 44 | "Segmentation", 45 | "PixIT", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/embedding/arcface.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from __future__ import annotations 25 | 26 | from typing import Dict, Optional, Sequence, Union 27 | 28 | import pytorch_metric_learning.losses 29 | from pyannote.database import Protocol 30 | from torch_audiomentations.core.transforms_interface import BaseWaveformTransform 31 | from torchmetrics import Metric 32 | 33 | from pyannote.audio.core.task import Task 34 | 35 | from .mixins import SupervisedRepresentationLearningTaskMixin 36 | 37 | 38 | class SupervisedRepresentationLearningWithArcFace( 39 | SupervisedRepresentationLearningTaskMixin, 40 | Task, 41 | ): 42 | """Supervised representation learning with ArcFace loss 43 | 44 | Representation learning is the task of ... 45 | 46 | Parameters 47 | ---------- 48 | protocol : Protocol 49 | pyannote.database protocol 50 | duration : float, optional 51 | Chunk duration in seconds. Defaults to two seconds (2.). 52 | min_duration : float, optional 53 | Sample training chunk duration uniformly between `min_duration` 54 | and `duration`. Defaults to `duration` (i.e. fixed-length chunks). 55 | num_classes_per_batch : int, optional 56 | Number of classes per batch. Defaults to 32. 57 | num_chunks_per_class : int, optional 58 | Number of chunks per class. Defaults to 1. 59 | margin : float, optional 60 | Margin. Defaults to 28.6. 61 | scale : float, optional 62 | Scale. Defaults to 64. 63 | num_workers : int, optional 64 | Number of workers used for generating training samples. 65 | Defaults to multiprocessing.cpu_count() // 2. 66 | pin_memory : bool, optional 67 | If True, data loaders will copy tensors into CUDA pinned 68 | memory before returning them. See pytorch documentation 69 | for more details. Defaults to False. 70 | augmentation : BaseWaveformTransform, optional 71 | torch_audiomentations waveform transform, used by dataloader 72 | during training. 73 | metric : optional 74 | Validation metric(s). Can be anything supported by torchmetrics.MetricCollection. 75 | Defaults to AUROC (area under the ROC curve). 76 | """ 77 | 78 | # TODO: add a ".metric" property that tells how speaker embeddings trained with this approach 79 | # should be compared. could be a string like "cosine" or "euclidean" or a pdist/cdist-like 80 | # callable. this ".metric" property should be propagated all the way to Inference (via the model).
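Before the constructor below, a minimal instantiation sketch. It reuses the `Debug.SpeakerDiarization.Debug` protocol defined in `tests/data/database.yml` later in this tree; note that `batch_size` is not a constructor argument — the mixin presumably derives it from `num_classes_per_batch` × `num_chunks_per_class` (an assumption; see `self.batch_size` in `__init__` below and `mixins.py`):

```python
from pyannote.database import registry
from pyannote.audio.tasks import SpeakerEmbedding  # alias for this class (tasks/__init__.py)

registry.load_database("tests/data/database.yml")
protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")

task = SpeakerEmbedding(
    protocol,
    duration=2.0,              # fixed-length 2 s chunks (min_duration left unset)
    num_classes_per_batch=32,  # together with num_chunks_per_class, sets the batch size
    num_chunks_per_class=1,
    margin=28.6,
    scale=64.0,
)
```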
81 | 82 | def __init__( 83 | self, 84 | protocol: Protocol, 85 | min_duration: Optional[float] = None, 86 | duration: float = 2.0, 87 | num_classes_per_batch: int = 32, 88 | num_chunks_per_class: int = 1, 89 | margin: float = 28.6, 90 | scale: float = 64.0, 91 | num_workers: Optional[int] = None, 92 | pin_memory: bool = False, 93 | augmentation: Optional[BaseWaveformTransform] = None, 94 | metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None, 95 | ): 96 | 97 | self.num_chunks_per_class = num_chunks_per_class 98 | self.num_classes_per_batch = num_classes_per_batch 99 | 100 | self.margin = margin 101 | self.scale = scale 102 | 103 | super().__init__( 104 | protocol, 105 | duration=duration, 106 | min_duration=min_duration, 107 | batch_size=self.batch_size, 108 | num_workers=num_workers, 109 | pin_memory=pin_memory, 110 | augmentation=augmentation, 111 | metric=metric, 112 | ) 113 | 114 | def setup_loss_func(self): 115 | 116 | _, embedding_size = self.model(self.model.example_input_array).shape 117 | 118 | self.model.loss_func = pytorch_metric_learning.losses.ArcFaceLoss( 119 | len(self.specifications.classes), 120 | embedding_size, 121 | margin=self.margin, 122 | scale=self.scale, 123 | ) 124 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/separation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .audio.diarization_error_rate import ( 25 | DiarizationErrorRate, 26 | FalseAlarmRate, 27 | MissedDetectionRate, 28 | OptimalDiarizationErrorRate, 29 | OptimalDiarizationErrorRateThreshold, 30 | OptimalFalseAlarmRate, 31 | OptimalMissedDetectionRate, 32 | OptimalSpeakerConfusionRate, 33 | SpeakerConfusionRate, 34 | ) 35 | 36 | __all__ = [ 37 | "DiarizationErrorRate", 38 | "FalseAlarmRate", 39 | "MissedDetectionRate", 40 | "SpeakerConfusionRate", 41 | "OptimalDiarizationErrorRate", 42 | "OptimalFalseAlarmRate", 43 | "OptimalMissedDetectionRate", 44 | "OptimalSpeakerConfusionRate", 45 | "OptimalDiarizationErrorRateThreshold", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .diarization_error_rate import ( 25 | DiarizationErrorRate, 26 | FalseAlarmRate, 27 | MissedDetectionRate, 28 | OptimalDiarizationErrorRate, 29 | OptimalDiarizationErrorRateThreshold, 30 | OptimalFalseAlarmRate, 31 | OptimalMissedDetectionRate, 32 | OptimalSpeakerConfusionRate, 33 | SpeakerConfusionRate, 34 | ) 35 | 36 | __all__ = [ 37 | "DiarizationErrorRate", 38 | "SpeakerConfusionRate", 39 | "MissedDetectionRate", 40 | "FalseAlarmRate", 41 | "OptimalDiarizationErrorRate", 42 | "OptimalSpeakerConfusionRate", 43 | "OptimalMissedDetectionRate", 44 | "OptimalFalseAlarmRate", 45 | "OptimalDiarizationErrorRateThreshold", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
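These metrics follow the standard torchmetrics accumulate-then-compute protocol, so they can be plugged into a validation loop directly. A minimal sketch — the `(batch, speakers, frames)` tensor layout used here is an assumption and should be double-checked against `audio/diarization_error_rate.py`:

```python
import torch
from pyannote.audio.torchmetrics import DiarizationErrorRate

der = DiarizationErrorRate()

# hypothetical speaker activations, assumed (batch, speakers, frames) layout
preds = torch.rand(2, 3, 100)                   # continuous scores in [0, 1]
target = (torch.rand(2, 3, 100) > 0.5).float()  # binary reference
der.update(preds, target)
print(der.compute())  # DER accumulated over all update() calls
```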
22 | 23 | 24 | from .equal_error_rate import EqualErrorRate 25 | 26 | __all__ = [ 27 | "EqualErrorRate", 28 | ] 29 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/classification/equal_error_rate.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import torch 27 | from pyannote.metrics.binary_classification import det_curve 28 | from torchmetrics import Metric 29 | from torchmetrics.utilities.data import dim_zero_cat 30 | 31 | 32 | class EqualErrorRate(Metric): 33 | 34 | is_differentiable: Optional[bool] = False 35 | higher_is_better: Optional[bool] = False 36 | full_state_update: bool = True 37 | 38 | def __init__(self, distances: bool = True, compute_on_cpu: bool = True, **kwargs): 39 | super().__init__(compute_on_cpu=compute_on_cpu, **kwargs) 40 | self.distances = distances 41 | self.add_state("scores", default=[], dist_reduce_fx="cat") 42 | self.add_state("y_true", default=[], dist_reduce_fx="cat") 43 | 44 | def update(self, scores: torch.Tensor, y_true: torch.Tensor) -> None: 45 | self.scores.append(scores) 46 | self.y_true.append(y_true) 47 | 48 | def compute(self) -> torch.Tensor: 49 | scores = dim_zero_cat(self.scores) 50 | y_true = dim_zero_cat(self.y_true) 51 | _, _, _, eer = det_curve(y_true.cpu(), scores.cpu(), distances=self.distances) 52 | return torch.tensor(eer) 53 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/functional/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/functional/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
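A usage sketch for the `EqualErrorRate` metric implemented above: scores and binary labels accumulate across `update()` calls, and `compute()` runs the DET-curve analysis on CPU. Toy values for illustration:

```python
import torch
from pyannote.audio.torchmetrics.classification import EqualErrorRate

eer = EqualErrorRate(distances=False)  # scores are similarities; keep the default True for distances
eer.update(torch.tensor([0.9, 0.8, 0.3, 0.1]),  # scores
           torch.tensor([1, 1, 0, 0]))          # binary ground truth
print(eer.compute())  # perfectly separable toy scores, so EER should be ~0
```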
22 | -------------------------------------------------------------------------------- /pyannote/audio/utils/multi_task.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Any, Callable, Tuple, Union 25 | 26 | from pyannote.audio.core.model import Specifications 27 | 28 | 29 | def map_with_specifications( 30 | specifications: Union[Specifications, Tuple[Specifications]], 31 | func: Callable, 32 | *iterables, 33 | ) -> Union[Any, Tuple[Any]]: 34 | """Compute the function using arguments from each of the iterables 35 | 36 | Returns a tuple if provided `specifications` is a tuple, 37 | otherwise returns the function return value. 38 | 39 | Parameters 40 | ---------- 41 | specifications : (tuple of) Specifications 42 | Specifications or tuple of specifications 43 | func : callable 44 | Function called for each specification with 45 | `func(*iterables[i], specifications=specifications[i])` 46 | *iterables : 47 | List of iterables with same length as `specifications`. 48 | 49 | Returns 50 | ------- 51 | output : (tuple of) `func` return value(s) 52 | """ 53 | 54 | if isinstance(specifications, Specifications): 55 | return func(*iterables, specifications=specifications) 56 | 57 | return tuple( 58 | func(*i, specifications=s) for s, *i in zip(specifications, *iterables) 59 | ) 60 | -------------------------------------------------------------------------------- /pyannote/audio/utils/params.py: -------------------------------------------------------------------------------- 1 | # TODO - make it depth-recursive 2 | # TODO - switch to Omegaconf maybe? 
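A sketch of how `map_with_specifications` (defined just above in `multi_task.py`) dispatches a function over single- vs. multi-task specifications. The `Specifications` constructor fields and enum locations shown here are assumptions — double-check against `core/task.py`:

```python
import torch
from pyannote.audio.core.model import Specifications
from pyannote.audio.core.task import Problem, Resolution  # assumed location of the enums
from pyannote.audio.utils.multi_task import map_with_specifications

def mse_loss(target, prediction, specifications=None):
    # per-task loss; `specifications` receives one Specifications instance
    return ((target - prediction) ** 2).mean()

specs = Specifications(
    problem=Problem.MULTI_LABEL_CLASSIFICATION,  # assumed field names
    resolution=Resolution.FRAME,
    duration=2.0,
    classes=["speech"],
)

target, prediction = torch.rand(4, 100, 1), torch.rand(4, 100, 1)
loss = map_with_specifications(specs, mse_loss, target, prediction)  # single scalar
losses = map_with_specifications(  # tuple of specs -> tuple of per-task scalars
    (specs, specs), mse_loss, [target, target], [prediction, prediction]
)
```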
3 | 4 | from typing import Optional 5 | 6 | 7 | def merge_dict(defaults: dict, custom: Optional[dict] = None): 8 | params = dict(defaults) 9 | if custom is not None: 10 | params.update(custom) 11 | return params 12 | -------------------------------------------------------------------------------- /pyannote/audio/utils/probe.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from functools import partial 25 | from typing import Callable, Dict, Set, Text 26 | 27 | import torch.nn as nn 28 | 29 | 30 | def probe(trunk: nn.Module, branches: Dict[Text, Text]) -> Callable: 31 | """Add probing branches to a trunk module 32 | 33 | Parameters 34 | ---------- 35 | trunk : nn.Module 36 | Multi-layer trunk. 37 | branches : {branch_name: layer_name} dict or [layer_name] list 38 | Indicate where to plug a probing branch. 39 | 40 | Returns 41 | ------- 42 | revert : Callable 43 | Callable that, when called, removes probing branches. 44 | 45 | Usage 46 | ----- 47 | 48 | Define a trunk made out of three consecutive layers 49 | 50 | >>> import torch.nn as nn 51 | >>> class Trunk(nn.Module): 52 | ... 53 | ... def __init__(self): 54 | ... super().__init__() 55 | ... self.layer1 = nn.Linear(1, 2) 56 | ... self.layer2 = nn.Linear(2, 3) 57 | ... self.layer3 = nn.Linear(3, 4) 58 | ... 59 | ... def forward(self, x): 60 | ... 
return self.layer3(self.layer2(self.layer1(x))) 61 | 62 | >>> trunk = Trunk() 63 | >>> x = torch.tensor((0.,)) 64 | >>> trunk(x) 65 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=) 66 | 67 | Add two probing branches: 68 | - first one is called "probe1" and probes the output of "layer1" 69 | - second one is called "probe2" and probes the output of "layer3" 70 | 71 | >>> revert = probe(trunk, {"probe1": "layer1", "probe2": "layer3"}) 72 | >>> trunk(x) 73 | # {'probe1': tensor([ 0.5854, -0.9685], grad_fn=), 74 | # 'probe2': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=)} 75 | 76 | Use callback returned by `probe` to revert its effect 77 | 78 | >>> revert() 79 | >>> trunk(x) 80 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=) 81 | 82 | For convenience, one can also define probes as a list of layers: 83 | 84 | >>> revert = probe(trunk, ['layer1', 'layer3']) 85 | >>> trunk(x) 86 | # {'layer1': tensor([ 0.5854, -0.9685], grad_fn=), 87 | # 'layer3': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=)} 88 | """ 89 | 90 | def remove(): 91 | del trunk.__probe 92 | for handle in trunk.__probe_handles: 93 | handle.remove() 94 | del trunk.__probe_handles 95 | 96 | if hasattr(trunk, "__probe"): 97 | remove() 98 | 99 | trunk.__probe_handles = [] 100 | 101 | def __probe_init(module, input): 102 | trunk.__probe = dict() 103 | 104 | handle = trunk.register_forward_pre_hook(__probe_init) 105 | trunk.__probe_handles.append(handle) 106 | 107 | def __probe_append(branch_name, module, input, output): 108 | trunk.__probe[branch_name] = output 109 | 110 | if not isinstance(branches, dict): 111 | branches = {b: b for b in branches} 112 | 113 | sehcnarb: Dict[Text, Set] = dict() 114 | for branch_name, layer_name in branches.items(): 115 | if layer_name not in sehcnarb: 116 | sehcnarb[layer_name] = set() 117 | sehcnarb[layer_name].add(branch_name) 118 | 119 | for layer_name, layer in trunk.named_modules(): 120 | if layer_name not in sehcnarb: 121 | continue 122 | for branch_name in sehcnarb[layer_name]: 123 | handle = layer.register_forward_hook(partial(__probe_append, branch_name)) 124 | trunk.__probe_handles.append(handle) 125 | 126 | def __probe_return(module, input, output): 127 | return trunk.__probe 128 | 129 | handle = trunk.register_forward_hook(__probe_return) 130 | trunk.__probe_handles.append(handle) 131 | 132 | return remove 133 | -------------------------------------------------------------------------------- /pyannote/audio/utils/random.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | import os 25 | import zlib 26 | from random import Random 27 | 28 | import torch 29 | 30 | 31 | def create_rng_for_worker(model) -> Random: 32 | """Create worker-specific random number generator 33 | 34 | This makes sure that 35 | 1. training sample generation is reproducible 36 | 2. every (worker, rank, epoch) combination uses a different seed 37 | 38 | Parameters 39 | ---------- 40 | model : Model 41 | Model being trained. Its `local_rank`, `global_rank` and `current_epoch` attributes are mixed into the seed. 42 | """ 43 | 44 | # create random number generator 45 | rng = Random() 46 | 47 | global_seed = os.environ.get("PL_GLOBAL_SEED", "unset") 48 | worker_info = torch.utils.data.get_worker_info() 49 | 50 | if worker_info is None: 51 | worker_id = None 52 | else: 53 | worker_id = worker_info.id 54 | 55 | seed_tuple = ( 56 | global_seed, 57 | worker_id, 58 | model.local_rank, 59 | model.global_rank, 60 | model.current_epoch, 61 | ) 62 | # use adler32 because python's `hash` is not deterministic. 63 | seed = zlib.adler32(str(seed_tuple).encode()) 64 | rng.seed(seed) 65 | 66 | return rng 67 | -------------------------------------------------------------------------------- /pyannote/audio/utils/receptive_field.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
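To see why `create_rng_for_worker` above seeds with `zlib.adler32` rather than Python's built-in `hash` (which is salted per process), here is a small self-contained sketch of the same seeding scheme:

```python
import zlib
from random import Random

# (PL_GLOBAL_SEED, worker_id, local_rank, global_rank, current_epoch)
seed_tuple = ("42", 0, 0, 0, 5)
rng = Random(zlib.adler32(str(seed_tuple).encode()))
print(rng.random())  # identical across runs and machines; hash()-based seeding would not be
```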
22 | 23 | from typing import List 24 | 25 | 26 | def conv1d_num_frames( 27 | num_samples, kernel_size=5, stride=1, padding=0, dilation=1 28 | ) -> int: 29 | """Compute expected number of frames after 1D convolution 30 | 31 | Parameters 32 | ---------- 33 | num_samples : int 34 | Number of samples in the input signal 35 | kernel_size : int 36 | Kernel size 37 | stride : int 38 | Stride 39 | padding : int 40 | Padding 41 | dilation : int 42 | Dilation 43 | 44 | Returns 45 | ------- 46 | num_frames : int 47 | Number of frames in the output signal 48 | 49 | Source 50 | ------ 51 | https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv1d 52 | """ 53 | return 1 + (num_samples + 2 * padding - dilation * (kernel_size - 1) - 1) // stride 54 | 55 | 56 | def multi_conv_num_frames( 57 | num_samples: int, 58 | kernel_size: List[int] = None, 59 | stride: List[int] = None, 60 | padding: List[int] = None, 61 | dilation: List[int] = None, 62 | ) -> int: 63 | num_frames = num_samples 64 | for k, s, p, d in zip(kernel_size, stride, padding, dilation): 65 | num_frames = conv1d_num_frames( 66 | num_frames, kernel_size=k, stride=s, padding=p, dilation=d 67 | ) 68 | 69 | return num_frames 70 | 71 | 72 | def conv1d_receptive_field_size( 73 | num_frames=1, kernel_size=5, stride=1, padding=0, dilation=1 74 | ): 75 | """Compute size of receptive field 76 | 77 | Parameters 78 | ---------- 79 | num_frames : int, optional 80 | Number of frames in the output signal 81 | kernel_size : int 82 | Kernel size 83 | stride : int 84 | Stride 85 | padding : int 86 | Padding 87 | dilation : int 88 | Dilation 89 | 90 | Returns 91 | ------- 92 | size : int 93 | Receptive field size 94 | """ 95 | 96 | effective_kernel_size = 1 + (kernel_size - 1) * dilation 97 | return effective_kernel_size + (num_frames - 1) * stride - 2 * padding 98 | 99 | 100 | def multi_conv_receptive_field_size( 101 | num_frames: int, 102 | kernel_size: List[int] = None, 103 | stride: List[int] = None, 104 | padding: List[int] = None, 105 | dilation: List[int] = None, 106 | ) -> int: 107 | receptive_field_size = num_frames 108 | 109 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): 110 | receptive_field_size = conv1d_receptive_field_size( 111 | num_frames=receptive_field_size, 112 | kernel_size=k, 113 | stride=s, 114 | padding=p, 115 | dilation=d, 116 | ) 117 | return receptive_field_size 118 | 119 | 120 | def conv1d_receptive_field_center( 121 | frame=0, kernel_size=5, stride=1, padding=0, dilation=1 122 | ) -> int: 123 | """Compute center of receptive field 124 | 125 | Parameters 126 | ---------- 127 | frame : int 128 | Frame index 129 | kernel_size : int 130 | Kernel size 131 | stride : int 132 | Stride 133 | padding : int 134 | Padding 135 | dilation : int 136 | Dilation 137 | 138 | Returns 139 | ------- 140 | center : int 141 | Index of receptive field center 142 | """ 143 | 144 | effective_kernel_size = 1 + (kernel_size - 1) * dilation 145 | return frame * stride + (effective_kernel_size - 1) // 2 - padding 146 | 147 | 148 | def multi_conv_receptive_field_center( 149 | frame: int, 150 | kernel_size: List[int] = None, 151 | stride: List[int] = None, 152 | padding: List[int] = None, 153 | dilation: List[int] = None, 154 | ) -> int: 155 | receptive_field_center = frame 156 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): 157 | receptive_field_center = conv1d_receptive_field_center( 158 | frame=receptive_field_center, 159 | kernel_size=k, 160 | stride=s, 161 | padding=p, 162 
| dilation=d, 163 | ) 164 | 165 | return receptive_field_center 166 | -------------------------------------------------------------------------------- /pyannote/audio/utils/reproducibility.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Context: https://github.com/pyannote/pyannote-audio/issues/1370 24 | 25 | import warnings 26 | 27 | import torch 28 | 29 | 30 | class ReproducibilityError(Exception): 31 | ... 32 | 33 | 34 | class ReproducibilityWarning(UserWarning): 35 | ... 36 | 37 | 38 | def raise_reproducibility(device: torch.device): 39 | if (device.type == "cuda") and ( 40 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 41 | ): 42 | raise ReproducibilityError( 43 | "Please disable TensorFloat-32 (TF32) by calling\n" 44 | " >>> import torch\n" 45 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n" 46 | " >>> torch.backends.cudnn.allow_tf32 = False\n" 47 | "or you might face reproducibility issues and obtain lower accuracy.\n" 48 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details." 49 | ) 50 | 51 | 52 | def warn_reproducibility(device: torch.device): 53 | if (device.type == "cuda") and ( 54 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 55 | ): 56 | warnings.warn( 57 | ReproducibilityWarning( 58 | "Please disable TensorFloat-32 (TF32) by calling\n" 59 | " >>> import torch\n" 60 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n" 61 | " >>> torch.backends.cudnn.allow_tf32 = False\n" 62 | "or you might face reproducibility issues and obtain lower accuracy.\n" 63 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details." 
64 | ) 65 | ) 66 | 67 | 68 | def fix_reproducibility(device: torch.device): 69 | if (device.type == "cuda") and ( 70 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 71 | ): 72 | torch.backends.cuda.matmul.allow_tf32 = False 73 | torch.backends.cudnn.allow_tf32 = False 74 | warnings.warn( 75 | ReproducibilityWarning( 76 | "TensorFloat-32 (TF32) has been disabled as it might lead to reproducibility issues and lower accuracy.\n" 77 | "It can be re-enabled by calling\n" 78 | " >>> import torch\n" 79 | " >>> torch.backends.cuda.matmul.allow_tf32 = True\n" 80 | " >>> torch.backends.cudnn.allow_tf32 = True\n" 81 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.\n" 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /pyannote/audio/utils/version.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Text 24 | 25 | from semver import VersionInfo 26 | 27 | 28 | def check_version(library: Text, theirs: Text, mine: Text, what: Text = "Pipeline"): 29 | 30 | theirs = ".".join(theirs.split(".")[:3]) 31 | mine = ".".join(mine.split(".")[:3]) 32 | 33 | theirs = VersionInfo.parse(theirs) 34 | mine = VersionInfo.parse(mine) 35 | 36 | if theirs.major > mine.major: 37 | print( 38 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 39 | f"Bad things will probably happen unless you upgrade {library} to {theirs.major}.x." 40 | ) 41 | 42 | elif theirs.major < mine.major: 43 | print( 44 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 45 | f"Bad things might happen unless you revert {library} to {theirs.major}.x." 46 | ) 47 | 48 | elif theirs.minor > mine.minor: 49 | print( 50 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 51 | f"This should be OK but you might want to upgrade {library}." 52 | ) 53 | -------------------------------------------------------------------------------- /questions/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Questions 3 | 4 | Your questions should go in this directory. 5 | 6 | Question files should be named with the extension ".question.md". 
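A usage sketch for the TF32 helpers defined in `reproducibility.py` above: `fix_reproducibility` disables TensorFloat-32 matmuls on CUDA devices (emitting a `ReproducibilityWarning`) and is a no-op on CPU, while `raise_reproducibility` / `warn_reproducibility` only report the problem:

```python
import torch
from pyannote.audio.utils.reproducibility import fix_reproducibility

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fix_reproducibility(device)  # flips torch.backends.*.allow_tf32 off on CUDA, with a warning
```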
7 | -------------------------------------------------------------------------------- /questions/bad_performance.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How can I improve performance?" 3 | alt_titles: 4 | - "Pretrained pipelines do not produce good results on my data. What can I do?" 5 | - "It does not work! Help me!" 6 | --- 7 | 8 | **Long answer:** 9 | 10 | 1. Manually annotate dozens of conversations as precisely as possible. 11 | 2. Separate them into train (80%), development (10%) and test (10%) subsets. 12 | 3. Set up the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization). 13 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb). 14 | 5. Enjoy. 15 | 16 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that. 17 | -------------------------------------------------------------------------------- /questions/from_memory.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Can I apply pretrained pipelines on audio already loaded in memory?" 3 | alt_titles: 4 | - "Can I apply models on an audio array?" 5 | --- 6 | 7 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end. 8 | -------------------------------------------------------------------------------- /questions/offline.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Can I use gated models (and pipelines) offline?" 3 | alt_titles: 4 | - "Why does one need to authenticate to access the pretrained models?" 5 | - "Can I use pyannote.audio pretrained pipelines without the Hugging Face token?" 6 | - "How can I solve the permission issue?" 7 | --- 8 | 9 | **Short answer**: yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 10 | 11 | **Long answer**: gating models and pipelines allows [me](https://herve.niderb.fr) to know a bit more about the `pyannote.audio` user base and eventually helps me write grant proposals to make `pyannote.audio` even better. So, please fill in gating forms as precisely as possible. 12 | 13 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open-source libraries is time-consuming. 14 | 15 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 16 | -------------------------------------------------------------------------------- /questions/pyannote.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How does one spell and pronounce pyannote.audio?" 3 | alt_titles: 4 | - "Why the name of the library?" 5 | - "Why the logo of the library?" 6 | --- 7 | 8 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic). 9 | 📢 Pronounced like the French verb `pianoter`.
`pi` like in `pi`ano, not `py` like in `py`thon. 10 | 🎹 `pianoter` means to play the piano (hence the logo 🤯). 11 | -------------------------------------------------------------------------------- /questions/streaming.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Does pyannote support streaming speaker diarization?" 3 | alt_titles: 4 | - "Is it possible to do realtime speaker diarization?" 5 | - "Can it process online audio buffers?" 6 | --- 7 | 8 | **Short answer:** not out of the box, no. 9 | 10 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get from a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`. 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asteroid-filterbanks >=0.4 2 | einops >=0.6.0 3 | huggingface_hub >= 0.13.0 4 | lightning >= 2.0.1 5 | omegaconf >=2.1,<3.0 6 | pyannote.core >= 5.0.0 7 | pyannote.database >= 5.0.1 8 | pyannote.metrics >= 3.2 9 | pyannote.pipeline >= 3.0.1 10 | pytorch_metric_learning >= 2.1.0 11 | rich >= 12.0.0 12 | semver >= 3.0.0 13 | soundfile >= 0.12.1 14 | speechbrain >= 1.0.0 15 | tensorboardX >= 2.6 16 | torch >= 2.0.0 17 | torch_audiomentations >= 0.11.0 18 | torchaudio >= 2.2.0 19 | torchmetrics >= 0.11.0 20 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = pyannote-audio 7 | description = Neural speaker diarization 8 | author = Herve Bredin 9 | author-email = herve.bredin@irit.fr 10 | license = mit 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8; variant=GFM 13 | # Change if running only on Windows, Mac or Linux (comma-separated) 14 | platforms = Linux, Mac 15 | # Add here all kinds of additional classifiers as defined under 16 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 17 | classifiers = 18 | Development Status :: 4 - Beta 19 | Programming Language :: Python 20 | 21 | [options] 22 | zip_safe = False 23 | packages = find: 24 | include_package_data = True 25 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 26 | setup_requires = pyscaffold>=3.2a0,<3.3a0 27 | # Add here dependencies of your project (semicolon/line-separated), e.g. 28 | # install_requires = numpy; scipy 29 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4 30 | python_requires = >=3.9 31 | 32 | [options.packages.find] 33 | where = . 
34 | exclude = 35 | tests 36 | 37 | [options.extras_require] 38 | # Add here additional requirements for extra features, to install with: 39 | # `pip install fastaudio[PDF]` like: 40 | # PDF = ReportLab; RXP 41 | # Add here test requirements (semicolon/line-separated) 42 | testing = 43 | pytest>=6.0 44 | pytest-cov>=2.10 45 | jupyter 46 | papermill 47 | dev = 48 | pre_commit>=2.7 49 | recommonmark>=0.6 50 | black>=22.3.0 51 | cli = 52 | hydra-core >=1.1,<1.2 53 | typer >= 0.4.0,<0.5.0 54 | separation = 55 | transformers >= 4.39.1 56 | asteroid >=0.7.0 57 | 58 | [options.entry_points] 59 | 60 | console_scripts = 61 | pyannote-audio-train=pyannote.audio.cli.train:train 62 | pyannote-audio-eval=pyannote.audio.cli.evaluate:evaluate 63 | 64 | 65 | [test] 66 | # py.test options when running `python setup.py test` 67 | # addopts = --verbose 68 | extras = True 69 | 70 | [tool:pytest] 71 | # Options for py.test: 72 | # Specify command line options as you would do when invoking py.test directly. 73 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 74 | # in order to write a coverage file that can be read by Jenkins. 75 | addopts = 76 | --cov pyannote --cov-report term-missing 77 | --verbose 78 | norecursedirs = 79 | dist 80 | build 81 | .tox 82 | testpaths = tests 83 | 84 | [aliases] 85 | dists = bdist_wheel 86 | 87 | [bdist_wheel] 88 | # Use this option if your package is pure-python 89 | universal = 1 90 | 91 | [build_sphinx] 92 | source_dir = doc 93 | build_dir = build/sphinx 94 | 95 | [devpi:upload] 96 | # Options for the devpi: PyPI server and packaging tool 97 | # VCS export must be deactivated since we are using setuptools-scm 98 | no-vcs = 1 99 | formats = bdist_wheel 100 | 101 | [flake8] 102 | # Some sane defaults for the code style checker flake8 103 | exclude = 104 | .tox 105 | build 106 | dist 107 | .eggs 108 | docs/conf.py 109 | 110 | [pyscaffold] 111 | # PyScaffold's parameters when the project was created. 112 | # This will be used when updating. Do not change! 
113 | version = 3.2.3 114 | package = pyannote-audio 115 | extensions = 116 | markdown 117 | no_skeleton 118 | pre_commit 119 | dsproject 120 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from pkg_resources import VersionConflict, require 6 | from setuptools import find_packages, setup 7 | 8 | with open("README.md", mode="r", encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | with open("requirements.txt", mode="r", encoding="utf-8") as f: 12 | requirements = f.read().splitlines() 13 | 14 | try: 15 | require("setuptools>=38.3") 16 | except VersionConflict: 17 | print("Error: version of setuptools is too old (<38.3)!") 18 | sys.exit(1) 19 | 20 | 21 | ROOT_DIR = Path(__file__).parent.resolve() 22 | # Creating the version file 23 | 24 | with open("version.txt", mode="r", encoding="utf-8") as f: 25 | version = f.read() 26 | 27 | version = version.strip() 28 | sha = "Unknown" 29 | 30 | if os.getenv("BUILD_VERSION"): 31 | version = os.getenv("BUILD_VERSION") 32 | elif sha != "Unknown": 33 | version += "+" + sha[:7] 34 | print("-- Building version " + version) 35 | 36 | version_path = ROOT_DIR / "pyannote" / "audio" / "version.py" 37 | 38 | with open(version_path, mode="w", encoding="utf-8") as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | if __name__ == "__main__": 42 | setup( 43 | name="pyannote.audio", 44 | namespace_packages=["pyannote"], 45 | version=version, 46 | packages=find_packages(), 47 | install_requires=requirements, 48 | description="Neural building blocks for speaker diarization", 49 | long_description=long_description, 50 | long_description_content_type="text/markdown", 51 | author="Hervé Bredin", 52 | author_email="herve.bredin@irit.fr", 53 | url="https://github.com/pyannote/pyannote-audio", 54 | classifiers=[ 55 | "Development Status :: 4 - Beta", 56 | "Intended Audience :: Science/Research", 57 | "License :: OSI Approved :: MIT License", 58 | "Natural Language :: English", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Topic :: Scientific/Engineering", 63 | ], 64 | ) 65 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def pytest_sessionstart(session): 25 | """ 26 | Called after the Session object has been created and 27 | before performing collection and entering the run test loop. 28 | """ 29 | 30 | from pyannote.database import registry 31 | 32 | registry.load_database("tests/data/database.yml") 33 | -------------------------------------------------------------------------------- /tests/data/database.yml: -------------------------------------------------------------------------------- 1 | Protocols: 2 | Debug: 3 | SpeakerDiarization: 4 | Debug: 5 | scope: database 6 | train: 7 | uri: debug.train.lst 8 | annotation: debug.train.rttm 9 | annotated: debug.train.uem 10 | development: 11 | uri: debug.development.lst 12 | annotation: debug.development.rttm 13 | annotated: debug.development.uem 14 | test: 15 | uri: debug.test.lst 16 | annotation: debug.test.rttm 17 | annotated: debug.test.uem 18 | 19 | Databases: 20 | Debug: ./{uri}.wav 21 | -------------------------------------------------------------------------------- /tests/data/debug.development.lst: -------------------------------------------------------------------------------- 1 | dev00 2 | dev01 3 | -------------------------------------------------------------------------------- /tests/data/debug.development.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER dev00 1 1.440 11.872 MEE009 2 | SPEAKER dev00 1 13.152 3.770 MEE012 3 | SPEAKER dev00 1 18.064 0.336 MEE012 4 | SPEAKER dev00 1 18.201 2.439 MEE009 5 | SPEAKER dev00 1 20.560 1.056 MEE012 6 | SPEAKER dev00 1 21.952 4.320 MEE009 7 | SPEAKER dev00 1 23.072 0.736 MEE012 8 | SPEAKER dev00 1 26.192 2.192 MEE012 9 | SPEAKER dev00 1 28.224 1.776 MEE009 10 | SPEAKER dev01 1 4.304 2.448 MEE012 11 | SPEAKER dev01 1 7.024 4.752 MEE009 12 | SPEAKER dev01 1 15.133 4.515 MEE009 13 | SPEAKER dev01 1 16.384 1.168 MEE012 14 | SPEAKER dev01 1 19.568 0.800 MEE012 15 | SPEAKER dev01 1 21.312 1.280 MEE009 16 | SPEAKER dev01 1 22.464 1.456 MEE012 17 | SPEAKER dev01 1 29.072 0.464 MEE012 18 | -------------------------------------------------------------------------------- /tests/data/debug.development.uem: -------------------------------------------------------------------------------- 1 | dev00 NA 0.000 30.000 2 | dev01 NA 0.000 30.000 3 | -------------------------------------------------------------------------------- /tests/data/debug.test.lst: -------------------------------------------------------------------------------- 1 | tst00 2 | tst01 3 | -------------------------------------------------------------------------------- /tests/data/debug.test.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER tst00 1 0.000 1.901 MEE071 2 | SPEAKER tst00 1 0.944 6.124 MEE073 3 | SPEAKER tst00 1 3.492 1.954 FEO072 4 | SPEAKER tst00 1 3.612 8.676 MEE071 5 | SPEAKER tst00 1 3.692 1.887 FEO070 6 | SPEAKER tst00 1 7.891 1.114 FEO070 7 | SPEAKER tst00 1 8.544 3.216 FEO072 8 | SPEAKER tst00 1 12.133 3.301 FEO070 9 | SPEAKER tst00 1 13.120 0.602 FEO072 10 | SPEAKER tst00 1 14.959 0.666 MEE071 11 | SPEAKER tst00 1 15.109 10.155 FEO072 12 | SPEAKER tst00 1 19.006 0.485 FEO070 13 | SPEAKER tst00 1 19.008 4.796 MEE071 14 | SPEAKER tst00 1 20.124 
1.044 MEE073 15 | SPEAKER tst00 1 20.222 1.222 FEO070 16 | SPEAKER tst00 1 21.400 1.928 MEE073 17 | SPEAKER tst00 1 23.490 0.750 FEO070 18 | SPEAKER tst00 1 25.344 4.656 MEE073 19 | SPEAKER tst00 1 25.658 0.550 FEO070 20 | SPEAKER tst00 1 27.792 2.208 MEE071 21 | SPEAKER tst00 1 27.879 2.121 FEO072 22 | SPEAKER tst00 1 28.016 1.984 FEO070 23 | SPEAKER tst01 1 4.390 0.350 FEO072 24 | SPEAKER tst01 1 4.773 0.366 MEE073 25 | SPEAKER tst01 1 16.495 0.540 MEE071 26 | SPEAKER tst01 1 24.159 4.388 FEO070 27 | SPEAKER tst01 1 29.008 0.448 MEE073 28 | -------------------------------------------------------------------------------- /tests/data/debug.test.uem: -------------------------------------------------------------------------------- 1 | tst00 NA 0.000 30.000 2 | tst01 NA 0.000 30.000 3 | -------------------------------------------------------------------------------- /tests/data/debug.train.lst: -------------------------------------------------------------------------------- 1 | trñ00 2 | trn01 3 | trn02 4 | trn03 5 | trn04 6 | trn05 7 | trn06 8 | trn07 9 | trn08 10 | trn09 11 | -------------------------------------------------------------------------------- /tests/data/debug.train.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER trn00 1 3.168 0.800 MÉO069 2 | SPEAKER trn00 1 5.463 0.640 MÉO069 3 | SPEAKER trn00 1 5.496 0.574 MEE068 4 | SPEAKER trn00 1 10.454 0.499 MÉO069 5 | SPEAKER trn00 1 11.040 4.592 MEE068 6 | SPEAKER trn00 1 16.736 1.410 MÉO069 7 | SPEAKER trn00 1 16.980 2.778 MEE067 8 | SPEAKER trn00 1 18.883 0.490 MEE068 9 | SPEAKER trn00 1 18.985 1.831 MÉO069 10 | SPEAKER trn00 1 20.944 0.447 MEE067 11 | SPEAKER trn00 1 21.392 4.465 MEE068 12 | SPEAKER trn00 1 22.928 0.384 MÉO069 13 | SPEAKER trn00 1 25.001 2.471 MÉO069 14 | SPEAKER trn00 1 28.033 1.967 MEE068 15 | SPEAKER trn01 1 2.977 0.391 FEO066 16 | SPEAKER trn01 1 18.705 0.964 MEE068 17 | SPEAKER trn01 1 22.269 0.457 FEO065 18 | SPEAKER trn01 1 28.474 1.526 MÉO069 19 | SPEAKER trn01 1 28.593 1.407 FEO066 20 | SPEAKER trn01 1 28.993 1.007 FEO065 21 | SPEAKER trn02 1 20.704 0.688 FEO066 22 | SPEAKER trn03 1 0.000 1.184 MEE067 23 | SPEAKER trn03 1 1.104 28.896 MÉO069 24 | SPEAKER trn04 1 14.032 1.744 MEE076 25 | SPEAKER trn04 1 14.345 2.471 MEO074 26 | SPEAKER trn04 1 16.736 7.216 MEE075 27 | SPEAKER trn04 1 21.158 0.607 MEO074 28 | SPEAKER trn04 1 25.200 0.736 MEE075 29 | SPEAKER trn04 1 26.992 0.272 MEE075 30 | SPEAKER trn04 1 27.840 2.160 MEE076 31 | SPEAKER trn05 1 0.000 0.384 FEO079 32 | SPEAKER trn05 1 0.000 1.472 FEE078 33 | SPEAKER trn05 1 1.456 0.656 FEE081 34 | SPEAKER trn05 1 5.936 0.342 FEE078 35 | SPEAKER trn05 1 8.016 21.984 FEE078 36 | SPEAKER trn05 1 8.496 0.784 FEE081 37 | SPEAKER trn05 1 19.157 0.424 FEE080 38 | SPEAKER trn06 1 0.000 8.856 FEE083 39 | SPEAKER trn06 1 3.528 3.218 MEO082 40 | SPEAKER trn06 1 10.544 0.648 FEE083 41 | SPEAKER trn06 1 11.419 1.079 FEE085 42 | SPEAKER trn06 1 13.524 16.476 FEE083 43 | SPEAKER trn06 1 21.799 0.557 FEE085 44 | SPEAKER trn07 1 8.275 1.452 FEE087 45 | SPEAKER trn07 1 15.600 2.810 FEE087 46 | SPEAKER trn07 1 19.901 0.559 FEE087 47 | SPEAKER trn07 1 20.277 0.615 MEE089 48 | SPEAKER trn07 1 22.592 2.525 FEE087 49 | SPEAKER trn07 1 23.197 0.782 MEE089 50 | SPEAKER trn07 1 23.502 1.779 FEE088 51 | SPEAKER trn07 1 24.032 0.474 MEO086 52 | SPEAKER trn07 1 26.506 1.689 FEE087 53 | SPEAKER trn07 1 27.182 2.818 MEO086 54 | SPEAKER trn08 1 5.015 1.738 MEE089 55 | SPEAKER trn08 1 5.040 3.568 FEE087 56 | SPEAKER trn08 1 5.491 3.018 
FEE088 57 | SPEAKER trn08 1 6.995 0.547 MEO086 58 | SPEAKER trn08 1 10.099 0.858 FEE087 59 | SPEAKER trn08 1 10.128 0.958 FEE088 60 | SPEAKER trn08 1 12.000 7.664 FEE087 61 | SPEAKER trn08 1 12.701 1.871 FEE088 62 | SPEAKER trn08 1 14.912 1.008 MEE089 63 | SPEAKER trn08 1 15.003 1.289 MEO086 64 | SPEAKER trn08 1 17.164 0.314 FEE088 65 | SPEAKER trn08 1 18.522 5.414 FEE088 66 | SPEAKER trn08 1 21.168 0.969 FEE087 67 | SPEAKER trn08 1 26.848 1.339 MEE089 68 | SPEAKER trn08 1 27.040 1.648 FEE088 69 | SPEAKER trn08 1 27.107 0.582 FEE087 70 | SPEAKER trn09 1 0.000 1.854 FEE083 71 | SPEAKER trn09 1 0.000 6.045 MEE094 72 | SPEAKER trn09 1 1.854 28.146 FEE083 73 | SPEAKER trn09 1 12.857 0.485 MEE094 74 | SPEAKER trn09 1 14.201 4.023 MEE094 75 | SPEAKER trn09 1 15.726 0.823 MEE095 76 | SPEAKER trn09 1 24.992 2.358 MEE094 77 | SPEAKER trn09 1 29.687 0.313 MEE094 78 | -------------------------------------------------------------------------------- /tests/data/debug.train.uem: -------------------------------------------------------------------------------- 1 | trn00 NA 0.000 30.000 2 | trn01 NA 0.000 30.000 3 | trn02 NA 0.000 30.000 4 | trn03 NA 0.000 30.000 5 | trn04 NA 0.000 30.000 6 | trn05 NA 0.000 30.000 7 | trn06 NA 0.000 30.000 8 | trn07 NA 0.000 30.000 9 | trn08 NA 0.000 30.000 10 | trn09 NA 0.000 30.000 11 | -------------------------------------------------------------------------------- /tests/data/dev00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/dev00.wav -------------------------------------------------------------------------------- /tests/data/dev01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/dev01.wav -------------------------------------------------------------------------------- /tests/data/empty.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/empty.wav -------------------------------------------------------------------------------- /tests/data/trn01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn01.wav -------------------------------------------------------------------------------- /tests/data/trn02.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn02.wav -------------------------------------------------------------------------------- /tests/data/trn03.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn03.wav -------------------------------------------------------------------------------- /tests/data/trn04.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn04.wav -------------------------------------------------------------------------------- 
/tests/data/trn05.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn05.wav -------------------------------------------------------------------------------- /tests/data/trn06.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn06.wav -------------------------------------------------------------------------------- /tests/data/trn07.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn07.wav -------------------------------------------------------------------------------- /tests/data/trn08.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn08.wav -------------------------------------------------------------------------------- /tests/data/trn09.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn09.wav -------------------------------------------------------------------------------- /tests/data/trñ00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trñ00.wav -------------------------------------------------------------------------------- /tests/data/tst00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/tst00.wav -------------------------------------------------------------------------------- /tests/data/tst01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/tst01.wav -------------------------------------------------------------------------------- /tests/inference_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import pytorch_lightning as pl 4 | from pyannote.core import SlidingWindowFeature 5 | from pyannote.database import FileFinder, get_protocol 6 | 7 | from pyannote.audio import Inference, Model 8 | from pyannote.audio.core.task import Resolution 9 | from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel 10 | from pyannote.audio.tasks import VoiceActivityDetection 11 | 12 | HF_SAMPLE_MODEL_ID = "pyannote/ci-segmentation" 13 | 14 | 15 | def test_hf_download_inference(): 16 | inference = Inference(HF_SAMPLE_MODEL_ID, device="cpu") 17 | assert isinstance(inference, Inference) 18 | 19 | 20 | def test_hf_download_model(): 21 | model = Model.from_pretrained(HF_SAMPLE_MODEL_ID) 22 | assert isinstance(model, Model) 23 | 24 | 25 | @pytest.fixture() 26 | def trained(): 27 | protocol = get_protocol( 28 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 29 | ) 30 | vad = VoiceActivityDetection(protocol, duration=2.0, 
batch_size=16, num_workers=4) 31 | model = SimpleSegmentationModel(task=vad) 32 | trainer = pl.Trainer(fast_dev_run=True, accelerator="cpu") 33 | trainer.fit(model) 34 | return protocol, model 35 | 36 | 37 | @pytest.fixture() 38 | def pretrained_model(): 39 | return Model.from_pretrained(HF_SAMPLE_MODEL_ID) 40 | 41 | 42 | @pytest.fixture() 43 | def dev_file(): 44 | protocol = get_protocol( 45 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 46 | ) 47 | return next(protocol.development()) 48 | 49 | 50 | def test_duration_warning(trained): 51 | protocol, model = trained 52 | with pytest.warns(UserWarning): 53 | duration = model.specifications.duration 54 | new_duration = duration + 1 55 | Inference(model, duration=new_duration, step=0.1, batch_size=128) 56 | 57 | 58 | def test_step_check_warning(trained): 59 | protocol, model = trained 60 | with pytest.raises(ValueError): 61 | duration = model.specifications.duration 62 | Inference(model, step=duration + 1, batch_size=128) 63 | 64 | 65 | def test_invalid_window_fails(trained): 66 | protocol, model = trained 67 | with pytest.raises(ValueError): 68 | Inference(model, window="unknown") 69 | 70 | 71 | def test_invalid_resolution_fails(trained): 72 | protocol, model = trained 73 | with pytest.warns(UserWarning): 74 | model.specifications.resolution = Resolution.FRAME 75 | Inference(model, window="whole", batch_size=128) 76 | 77 | 78 | def test_whole_window_slide(trained): 79 | protocol, model = trained 80 | inference = Inference(model, window="whole", batch_size=128) 81 | dev_file = next(protocol.development()) 82 | output = inference(dev_file) 83 | assert isinstance(output, np.ndarray) 84 | 85 | 86 | def test_on_file_path(trained): 87 | protocol, model = trained 88 | inference = Inference(model, batch_size=128) 89 | output = inference("tests/data/dev00.wav") 90 | assert isinstance(output, SlidingWindowFeature) 91 | 92 | 93 | def test_skip_aggregation(pretrained_model, dev_file): 94 | inference = Inference(pretrained_model, skip_aggregation=True) 95 | scores = inference(dev_file) 96 | assert len(scores.data.shape) == 3 97 | -------------------------------------------------------------------------------- /tests/io_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from pyannote.core import Segment 4 | from torch import Tensor 5 | 6 | from pyannote.audio.core.io import Audio 7 | 8 | 9 | def test_audio_resample(): 10 | "Audio is correctly resampled when it is not at the requested sample rate" 11 | test_file = "tests/data/dev00.wav" 12 | info = torchaudio.info(test_file) 13 | old_sr = info.sample_rate 14 | loader = Audio(sample_rate=old_sr // 2, mono="downmix") 15 | wav, sr = loader(test_file) 16 | assert isinstance(wav, Tensor) 17 | assert sr == old_sr // 2 18 | 19 | 20 | def test_basic_load_with_defaults(): 21 | test_file = "tests/data/dev00.wav" 22 | loader = Audio(mono="downmix") 23 | wav, sr = loader(test_file) 24 | assert isinstance(wav, Tensor) 25 | 26 | 27 | def test_correct_audio_channel(): 28 | "When we specify an audio channel, it is chosen correctly" 29 | waveform = torch.rand(2, 16000 * 2) 30 | loader = Audio(mono="downmix") 31 | wav, sr = loader({"waveform": waveform, "sample_rate": 16000, "channel": 1}) 32 | assert torch.equal(wav, waveform[1:2]) 33 | assert sr == 16000 34 | 35 | 36 | def test_can_load_with_waveform(): 37 | "We can load a raw waveform" 38 | waveform = torch.rand(2, 16000 * 2) 39 | loader = Audio(mono="downmix") 40 |
wav, sr = loader({"waveform": waveform, "sample_rate": 16000}) 41 | assert isinstance(wav, Tensor) 42 | assert sr == 16000 43 | 44 | 45 | def test_can_crop(): 46 | "Cropping works when we give a Segment" 47 | test_file = "tests/data/dev00.wav" 48 | loader = Audio(mono="downmix") 49 | segment = Segment(0.2, 0.7) 50 | wav, sr = loader.crop(test_file, segment) 51 | assert wav.shape[1] / sr == 0.5 52 | 53 | 54 | def test_can_crop_waveform(): 55 | "Cropping works on raw waveforms" 56 | waveform = torch.rand(1, 16000 * 2) 57 | loader = Audio(mono="downmix") 58 | segment = Segment(0.2, 0.7) 59 | wav, sr = loader.crop({"waveform": waveform, "sample_rate": 16000}, segment) 60 | assert isinstance(wav, Tensor) 61 | assert sr == 16000 62 | 63 | 64 | # File Like Object Tests 65 | def test_can_load_from_file_like(): 66 | "Load entire wav of file like" 67 | loader = Audio(mono="downmix") 68 | 69 | with open("tests/data/dev00.wav", "rb") as f: 70 | wav, sr = loader(f) 71 | 72 | assert isinstance(wav, Tensor) 73 | assert sr == 16000 74 | 75 | 76 | def test_can_crop_from_file_like(): 77 | "Load cropped sections from file like objects" 78 | loader = Audio(mono="downmix") 79 | 80 | with open("tests/data/dev00.wav", "rb") as f: 81 | segment = Segment(0.2, 0.7) 82 | wav, sr = loader.crop(f, segment) 83 | 84 | assert isinstance(wav, Tensor) 85 | assert sr == 16000 86 | assert wav.shape[1] == 0.5 * 16000 87 | -------------------------------------------------------------------------------- /tests/tasks/test_reproducibility.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lightning.pytorch import seed_everything 3 | from pyannote.database import FileFinder, get_protocol 4 | 5 | from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel 6 | from pyannote.audio.tasks import VoiceActivityDetection 7 | 8 | 9 | def setup_tasks(task): 10 | protocol = get_protocol( 11 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 12 | ) 13 | vad = task(protocol, duration=0.2, batch_size=32, num_workers=4) 14 | return protocol, vad 15 | 16 | 17 | def create_dl(model, task): 18 | m = model(task=task) 19 | m.prepare_data() 20 | m.setup() 21 | return task.train_dataloader() 22 | 23 | 24 | def get_next5(dl): 25 | last5 = [] 26 | it = iter(dl) 27 | for i in range(5): 28 | last5.append(next(it)) 29 | return last5 30 | 31 | 32 | def test_seeding_ensures_data_loaders(): 33 | "Setting a global seed for the dataloaders ensures that we get data back in the same order" 34 | 35 | seed_everything(1) 36 | protocol, vad = setup_tasks(VoiceActivityDetection) 37 | dl = create_dl(SimpleSegmentationModel, vad) 38 | last5a = get_next5(dl) 39 | 40 | seed_everything(1) 41 | protocol, vad = setup_tasks(VoiceActivityDetection) 42 | dl = create_dl(SimpleSegmentationModel, vad) 43 | last5b = get_next5(dl) 44 | 45 | for i in range(len(last5b)): 46 | assert torch.equal(last5a[i]["X"], last5b[i]["X"]) 47 | 48 | 49 | def test_different_seeds(): 50 | "Changing the global seed will change the order of the data that loads" 51 | 52 | protocol, vad = setup_tasks(VoiceActivityDetection) 53 | seed_everything(4) 54 | dl = create_dl(SimpleSegmentationModel, vad) 55 | last5a = get_next5(dl) 56 | 57 | protocol, vad = setup_tasks(VoiceActivityDetection) 58 | seed_everything(5) 59 | dl = create_dl(SimpleSegmentationModel, vad) 60 | last5b = get_next5(dl) 61 | 62 | for i in range(5): 63 | assert not torch.equal(last5a[i]["X"], last5b[i]["X"]) 64 | 
-------------------------------------------------------------------------------- /tests/tasks/test_specifications.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyannote.database import FileFinder, get_protocol 3 | 4 | from pyannote.audio.core.model import Model 5 | from pyannote.audio.core.task import UnknownSpecificationsError 6 | from pyannote.audio.tasks import SpeakerDiarization 7 | 8 | 9 | @pytest.fixture() 10 | def protocol(): 11 | return get_protocol( 12 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 13 | ) 14 | 15 | 16 | def test_unknown_specifications_error_raised_on_non_setup_task(protocol): 17 | task = SpeakerDiarization(protocol=protocol) 18 | with pytest.raises(UnknownSpecificationsError): 19 | _ = task.specifications 20 | 21 | 22 | def test_unknown_specifications_error_raised_on_non_setup_model_task(protocol): 23 | task = SpeakerDiarization(protocol=protocol) 24 | model = Model.from_pretrained("pyannote/ci-segmentation") 25 | model.task = task 26 | with pytest.raises(UnknownSpecificationsError): 27 | _ = model.specifications 28 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
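#
# The tests below drive the Hydra-based command line interface by spawning the
# `pyannote-audio-train` and `pyannote-audio-eval` console scripts in a
# subprocess. Each call is equivalent to a shell command along these lines
# (a sketch; the registry, protocol and model values come from the fixtures
# defined below):
#
#   pyannote-audio-train model=DebugSegmentation task=VoiceActivityDetection \
#       +registry=./tests/data/database.yml \
#       protocol=Debug.SpeakerDiarization.Debug trainer=fast_dev_run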
22 | 23 | import subprocess 24 | 25 | import pytest 26 | from pyannote.database import FileFinder, get_protocol 27 | 28 | 29 | @pytest.fixture() 30 | def protocol(): 31 | return get_protocol( 32 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 33 | ) 34 | 35 | 36 | @pytest.fixture() 37 | def database(): 38 | return "./tests/data/database.yml" 39 | 40 | 41 | @pytest.fixture() 42 | def model(): 43 | return "pyannote/ci-segmentation" 44 | 45 | 46 | def test_cli_train_vad(database, protocol): 47 | res = subprocess.run( 48 | [ 49 | "pyannote-audio-train", 50 | "model=DebugSegmentation", 51 | "task=VoiceActivityDetection", 52 | f"+registry={database}", 53 | f"protocol={protocol.name}", 54 | "trainer=fast_dev_run", 55 | "hydra.run.dir=.", # run hydra app in current directory 56 | "hydra.output_subdir=null", # disable hydra outputs 57 | "hydra/hydra_logging=disabled", 58 | "hydra/job_logging=disabled", 59 | ] 60 | ) 61 | assert res.returncode == 0 62 | 63 | 64 | def test_cli_train_segmentation(database, protocol): 65 | res = subprocess.run( 66 | [ 67 | "pyannote-audio-train", 68 | "model=DebugSegmentation", 69 | "task=SpeakerDiarization", 70 | f"+registry={database}", 71 | f"protocol={protocol.name}", 72 | "trainer=fast_dev_run", 73 | "hydra.run.dir=.", # run hydra app in current directory 74 | "hydra.output_subdir=null", # disable hydra outputs 75 | "hydra/hydra_logging=disabled", 76 | "hydra/job_logging=disabled", 77 | ] 78 | ) 79 | assert res.returncode == 0 80 | 81 | 82 | def test_cli_train_osd(database, protocol): 83 | res = subprocess.run( 84 | [ 85 | "pyannote-audio-train", 86 | "model=DebugSegmentation", 87 | "task=OverlappedSpeechDetection", 88 | f"+registry={database}", 89 | f"protocol={protocol.name}", 90 | "trainer=fast_dev_run", 91 | "hydra.run.dir=.", # run hydra app in current directory 92 | "hydra.output_subdir=null", # disable hydra outputs 93 | "hydra/hydra_logging=disabled", 94 | "hydra/job_logging=disabled", 95 | ] 96 | ) 97 | assert res.returncode == 0 98 | 99 | 100 | def test_cli_train_supervised_representation_with_arcface(database, protocol): 101 | res = subprocess.run( 102 | [ 103 | "pyannote-audio-train", 104 | "model=DebugEmbedding", 105 | "task=SpeakerEmbedding", 106 | f"+registry={database}", 107 | f"protocol={protocol.name}", 108 | "trainer=fast_dev_run", 109 | "hydra.run.dir=.", # run hydra app in current directory 110 | "hydra.output_subdir=null", # disable hydra outputs 111 | "hydra/hydra_logging=disabled", 112 | "hydra/job_logging=disabled", 113 | ] 114 | ) 115 | assert res.returncode == 0 116 | 117 | 118 | def test_cli_train_segmentation_with_pyannet(database, protocol): 119 | res = subprocess.run( 120 | [ 121 | "pyannote-audio-train", 122 | "model=PyanNet", 123 | "task=SpeakerDiarization", 124 | f"+registry={database}", 125 | f"protocol={protocol.name}", 126 | "trainer=fast_dev_run", 127 | "hydra.run.dir=.", # run hydra app in current directory 128 | "hydra.output_subdir=null", # disable hydra outputs 129 | "hydra/hydra_logging=disabled", 130 | "hydra/job_logging=disabled", 131 | ] 132 | ) 133 | assert res.returncode == 0 134 | 135 | 136 | def test_cli_eval_segmentation_model(database, protocol, model): 137 | res = subprocess.run( 138 | [ 139 | "pyannote-audio-eval", 140 | f"model={model}", 141 | f"+registry={database}", 142 | f"protocol={protocol.name}", 143 | "hydra.run.dir=.", # run hydra app in current directory 144 | "hydra.output_subdir=null", # disable hydra outputs 145 | "hydra/hydra_logging=disabled", 146 | 
"hydra/job_logging=disabled", 147 | ] 148 | ) 149 | assert res.returncode == 0 150 | -------------------------------------------------------------------------------- /tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyannote.audio.pipelines.clustering import AgglomerativeClustering 4 | 5 | 6 | def test_agglomerative_clustering_num_cluster(): 7 | """ 8 | Make sure AgglomerativeClustering doesn't "over-merge" clusters when initial 9 | clustering already matches target num_clusters, cf 10 | https://github.com/pyannote/pyannote-audio/issues/1525 11 | """ 12 | 13 | # 2 embeddings different enough 14 | embeddings = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 1.0, 2.0]]) 15 | 16 | # clustering with params that should yield 1 cluster per embedding 17 | clustering = AgglomerativeClustering().instantiate( 18 | { 19 | "method": "centroid", 20 | "min_cluster_size": 0, 21 | "threshold": 0.0, 22 | } 23 | ) 24 | 25 | # request 2 clusters 26 | clusters = clustering.cluster( 27 | embeddings=embeddings, min_clusters=2, max_clusters=2, num_clusters=2 28 | ) 29 | assert np.array_equal(clusters, np.array([0, 1])) 30 | -------------------------------------------------------------------------------- /tests/test_import_lib.py: -------------------------------------------------------------------------------- 1 | from pyannote.audio.core.model import Model 2 | 3 | 4 | def test_import_lib(): 5 | """This is a dummy test, just to check 6 | if the lib can be successfully imported. 7 | """ 8 | assert Model is not None 9 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import pytest 24 | import torch 25 | 26 | from pyannote.audio.torchmetrics.functional.audio.diarization_error_rate import ( 27 | _der_update, 28 | diarization_error_rate, 29 | ) 30 | 31 | 32 | @pytest.fixture 33 | def target(): 34 | chunk1 = [[0, 0], [1, 0], [1, 0], [1, 1], [1, 1], [0, 1], [0, 1]] 35 | chunk2 = [[0, 0], [0, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 0]] 36 | return torch.tensor([chunk1, chunk2], dtype=torch.float32).transpose(2, 1) 37 | 38 | 39 | @pytest.fixture 40 | def prediction(): 41 | chunk1 = [[0, 0], [1, 0], [0, 0], [1, 1], [0, 1], [1, 1], [1, 0]] 42 | chunk2 = [[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 1], [1, 0]] 43 | return torch.tensor([chunk1, chunk2], dtype=torch.float32).transpose(2, 1) 44 | 45 | 46 | def test_frame_reduction(target, prediction): 47 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 48 | prediction, target, reduce="frame" 49 | ) 50 | 51 | torch.testing.assert_close( 52 | false_alarm, 53 | torch.Tensor( 54 | [[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]] 55 | ), 56 | ) 57 | 58 | torch.testing.assert_close( 59 | missed_detection, 60 | torch.Tensor( 61 | [ 62 | [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0], 63 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 64 | ] 65 | ), 66 | ) 67 | 68 | torch.testing.assert_close( 69 | speaker_confusion, 70 | torch.Tensor( 71 | [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 72 | ), 73 | ) 74 | 75 | torch.testing.assert_close( 76 | speech_total, 77 | torch.Tensor( 78 | [[0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]] 79 | ), 80 | ) 81 | 82 | 83 | def test_chunk_reduction(target, prediction): 84 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 85 | prediction, target, reduce="chunk" 86 | ) 87 | 88 | torch.testing.assert_close( 89 | false_alarm, 90 | torch.Tensor([1.0, 2.0]), 91 | ) 92 | 93 | torch.testing.assert_close( 94 | missed_detection, 95 | torch.Tensor([2.0, 0.0]), 96 | ) 97 | 98 | torch.testing.assert_close( 99 | speaker_confusion, 100 | torch.Tensor([1.0, 0.0]), 101 | ) 102 | 103 | torch.testing.assert_close( 104 | speech_total, 105 | torch.Tensor([8.0, 4.0]), 106 | ) 107 | 108 | 109 | def test_batch_reduction(target, prediction): 110 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 111 | prediction, target, reduce="batch" 112 | ) 113 | torch.testing.assert_close(false_alarm.item(), 3.0) 114 | torch.testing.assert_close(missed_detection.item(), 2.0) 115 | torch.testing.assert_close(speaker_confusion.item(), 1.0) 116 | torch.testing.assert_close(speech_total.item(), 12.0) 117 | 118 | 119 | def test_batch_der(target, prediction): 120 | der = diarization_error_rate(prediction, target, reduce="batch") 121 | torch.testing.assert_close(der.item(), (3.0 + 2.0 + 1.0) / 12.0) 122 | 123 | 124 | def test_batch_der_with_components(target, prediction): 125 | der, ( 126 | false_alarm, 127 | missed_detection, 128 | speaker_confusion, 129 | speech_total, 130 | ) = diarization_error_rate( 131 | prediction, target, reduce="batch", return_components=True 132 | ) 133 | torch.testing.assert_close(der.item(), (3.0 + 2.0 + 1.0) / 12.0) 134 | torch.testing.assert_close(false_alarm.item(), 3.0) 135 | torch.testing.assert_close(missed_detection.item(), 2.0) 136 | torch.testing.assert_close(speaker_confusion.item(), 1.0) 137 | torch.testing.assert_close(speech_total.item(), 12.0) 138 | 139 | 140 | def test_chunk_der(target, prediction): 141 | der = 
diarization_error_rate(prediction, target, reduce="chunk") 142 | torch.testing.assert_close(der, torch.Tensor([4.0 / 8.0, 2.0 / 4.0])) 143 | -------------------------------------------------------------------------------- /tests/test_run_notebooks.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | import papermill as pm 4 | 5 | 6 | def test_can_run_notebooks(): 7 | # Recursively search for all notebooks under any notebook/ directory 8 | notebooks = glob("**/notebook/**/*.ipynb", recursive=True) 9 | for nb in notebooks: 10 | try: 11 | pm.execute_notebook( 12 | nb, "/dev/null", progress_bar=False, kernel_name="python" 13 | ) 14 | except Exception as e: 15 | # re-raise, recording which notebook caused the error 16 | raise RuntimeError(f"failed to execute notebook {nb}") from e 17 | -------------------------------------------------------------------------------- /tests/test_sample.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def test_sample(): 25 | from pyannote.audio.sample import SAMPLE_FILE 26 | 27 | assert "annotation" in SAMPLE_FILE 28 | assert "annotated" in SAMPLE_FILE 29 | -------------------------------------------------------------------------------- /tests/test_speechbrain.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import pytest 3 | from speechbrain.inference import EncoderClassifier 4 | 5 | 6 | @pytest.fixture() 7 | def cache(): 8 | return tempfile.mkdtemp() 9 | 10 | def test_import_speechbrain_encoder_classifier(cache): 11 | """This is a simple test that checks if speechbrain 12 | EncoderClassifier can be imported. It does not check 13 | if the model is working properly.
14 | """ 15 | 16 | model = EncoderClassifier.from_hparams( 17 | source="speechbrain/spkrec-ecapa-voxceleb", 18 | savedir=cache, 19 | ) 20 | assert isinstance(model, EncoderClassifier) 21 | -------------------------------------------------------------------------------- /tests/test_stats_pool.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | 25 | from pyannote.audio.models.blocks.pooling import StatsPool 26 | 27 | 28 | def test_stats_pool_weightless(): 29 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 30 | # (batch = 2, features = 2, frames = 2) 31 | 32 | stats_pool = StatsPool() 33 | 34 | y = stats_pool(x) 35 | # (batch = 2, features = 4) 36 | 37 | assert torch.equal( 38 | torch.round(y, decimals=4), 39 | torch.Tensor([[3.0, 3.0, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]), 40 | ) 41 | 42 | 43 | def test_stats_pool_one_speaker(): 44 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 45 | # (batch = 2, features = 2, frames = 2) 46 | 47 | w = torch.Tensor( 48 | [ 49 | [0.5, 0.01], 50 | [0.2, 0.1], 51 | ] 52 | ) 53 | # (batch = 2, frames = 2) 54 | 55 | stats_pool = StatsPool() 56 | 57 | y = stats_pool(x, weights=w) 58 | # (batch = 2, features = 4) 59 | 60 | assert torch.equal( 61 | torch.round(y, decimals=4), 62 | torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]), 63 | ) 64 | 65 | 66 | def test_stats_pool_multi_speaker(): 67 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 68 | # (batch = 2, features = 2, frames = 2) 69 | 70 | w = torch.Tensor([[[0.1, 0.2], [0.2, 0.3]], [[0.001, 0.001], [0.2, 0.3]]]) 71 | # (batch = 2, speakers = 2, frames = 2) 72 | 73 | stats_pool = StatsPool() 74 | 75 | y = stats_pool(x, weights=w) 76 | # (batch = 2, speakers = 2, features = 4) 77 | 78 | assert torch.equal( 79 | torch.round(y, decimals=4), 80 | torch.Tensor( 81 | [ 82 | [[3.3333, 3.3333, 1.4142, 1.4142], [3.2, 3.2, 1.4142, 1.4142]], 83 | [[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]], 84 | ] 85 | ), 86 | ) 87 | 88 | 89 | def test_stats_pool_frame_mismatch(): 90 | x = torch.Tensor([[[2.0, 2.0], [2.0, 2.0]], [[1.0, 1.0], [1.0, 1.0]]]) 91 | # (batch = 2, features = 2, frames = 2) 92 | 93 | stats_pool = StatsPool() 94 | w = torch.Tensor( 95 | [ 96 | [0.5, 0.5, 0.0], 97 | [0.0, 0.5, 0.5], 98 
| ] 99 | ) 100 | # (batch = 2, frames = 3) 101 | 102 | y = stats_pool(x, weights=w) 103 | # (batch = 2, features = 4) 104 | 105 | assert torch.equal( 106 | torch.round(y, decimals=4), 107 | torch.Tensor([[2.0, 2.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]), 108 | ) 109 | 110 | 111 | def test_stats_pool_all_zero_weights(): 112 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 113 | # (batch = 2, features = 2, frames = 2) 114 | 115 | w = torch.Tensor( 116 | [ 117 | [0.5, 0.01], 118 | [0.0, 0.0], # all zero weights 119 | ] 120 | ) 121 | # (batch = 2, frames = 2) 122 | 123 | stats_pool = StatsPool() 124 | 125 | y = stats_pool(x, weights=w) 126 | # (batch = 2, features = 4) 127 | 128 | assert torch.equal( 129 | torch.round(y, decimals=4), 130 | torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [0.0, 0.0, 0.0, 0.0]]), 131 | ) 132 | -------------------------------------------------------------------------------- /tests/utils/preview.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from IPython.display import Audio 3 | 4 | from pyannote.audio.utils.preview import listen 5 | from pyannote.core import Segment 6 | from pyannote.database import FileFinder, get_protocol 7 | 8 | 9 | def test_file(): 10 | protocol = get_protocol( 11 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 12 | ) 13 | return next(protocol.train()) 14 | 15 | 16 | def test_returns_audio_object(): 17 | audio_file = test_file() 18 | ipython_audio = listen(audio_file) 19 | assert isinstance(ipython_audio, Audio) 20 | 21 | 22 | def test_can_crop(): 23 | audio_file = test_file() 24 | listen(audio_file, Segment(0, 1)) 25 | 26 | 27 | def test_fail_crop_too_large(): 28 | with pytest.raises(ValueError): 29 | audio_file = test_file() 30 | duration = audio_file.duration 31 | listen(audio_file, Segment(0, duration * 2)) 32 | -------------------------------------------------------------------------------- /tests/utils/probe_util_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from pyannote.audio.utils.probe import probe 5 | 6 | 7 | class Trunk(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.layer1 = nn.Linear(1, 2) 11 | self.layer2 = nn.Linear(2, 3) 12 | self.layer3 = nn.Linear(3, 4) 13 | 14 | def forward(self, x): 15 | return self.layer3(self.layer2(self.layer1(x))) 16 | 17 | 18 | def test_probe_dict(): 19 | trunk = Trunk() 20 | probe(trunk, {"probe1": "layer1"}) 21 | out = trunk( 22 | torch.ones( 23 | 1, 24 | ) 25 | ) 26 | assert isinstance(out, dict) 27 | assert len(out.keys()) == 1 28 | assert isinstance(out["probe1"], torch.Tensor) 29 | 30 | 31 | def test_probe_output(): 32 | trunk = Trunk() 33 | probe(trunk, {"probe1": "layer3"}) 34 | out = trunk( 35 | torch.ones( 36 | 1, 37 | ) 38 | ) 39 | out = out["probe1"] 40 | tout = trunk.layer3( 41 | trunk.layer2( 42 | trunk.layer1( 43 | torch.ones( 44 | 1, 45 | ) 46 | ) 47 | ) 48 | ) 49 | assert torch.equal(tout, out) 50 | 51 | 52 | def test_probe_revert(): 53 | trunk = Trunk() 54 | revert = probe(trunk, {"probe1": "layer3"}) 55 | out = trunk( 56 | torch.ones( 57 | 1, 58 | ) 59 | ) 60 | assert isinstance(out, dict) 61 | revert() 62 | out = trunk( 63 | torch.ones( 64 | 1, 65 | ) 66 | ) 67 | assert isinstance(out, torch.Tensor) 68 | 69 | 70 | def test_probe_array(): 71 | trunk = Trunk() 72 | probe(trunk, ["layer3"]) 73 | out = trunk( 74 | torch.ones( 75 | 1, 76 | ) 77 | ) 78 | assert 
isinstance(out, dict) 79 | -------------------------------------------------------------------------------- /tests/utils/test_permutation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from pyannote.audio.utils.permutation import permutate 5 | 6 | 7 | def test_permutate_torch(): 8 | 9 | num_frames, num_speakers = 10, 3 10 | 11 | actual_permutations = [ 12 | (0, 1, 2), 13 | (0, 2, 1), 14 | (1, 0, 2), 15 | (1, 2, 0), 16 | (2, 0, 1), 17 | (2, 1, 0), 18 | ] 19 | batch_size = len(actual_permutations) 20 | 21 | y2 = torch.randn((num_frames, num_speakers)) 22 | y1 = torch.zeros((batch_size, num_frames, num_speakers)) 23 | 24 | for p, permutation in enumerate(actual_permutations): 25 | y1[p] = y2[:, permutation] 26 | 27 | permutated_y2, permutations = permutate(y1, y2) 28 | assert actual_permutations == permutations 29 | 30 | for p, permutation in enumerate(actual_permutations): 31 | np.testing.assert_allclose(permutated_y2[p], y2[:, permutation]) 32 | 33 | 34 | def test_permutate_numpy(): 35 | 36 | num_frames, num_speakers = 10, 3 37 | 38 | actual_permutations = [ 39 | (0, 1, 2), 40 | (0, 2, 1), 41 | (1, 0, 2), 42 | (1, 2, 0), 43 | (2, 0, 1), 44 | (2, 1, 0), 45 | ] 46 | batch_size = len(actual_permutations) 47 | 48 | y2 = np.random.randn(num_frames, num_speakers) 49 | y1 = np.zeros((batch_size, num_frames, num_speakers)) 50 | 51 | for p, permutation in enumerate(actual_permutations): 52 | y1[p] = y2[:, permutation] 53 | 54 | permutated_y2, permutations = permutate(y1, y2) 55 | assert actual_permutations == permutations 56 | 57 | for p, permutation in enumerate(actual_permutations): 58 | np.testing.assert_allclose(permutated_y2[p], y2[:, permutation]) 59 | 60 | 61 | def test_permutate_less_speakers(): 62 | 63 | num_frames = 10 64 | 65 | actual_permutations = [ 66 | (0, 1, None), 67 | (0, None, 1), 68 | (1, 0, None), 69 | (1, None, 0), 70 | (None, 0, 1), 71 | (None, 1, 0), 72 | ] 73 | batch_size = len(actual_permutations) 74 | 75 | y2 = np.random.randn(num_frames, 2) 76 | y1 = np.zeros((batch_size, num_frames, 3)) 77 | 78 | for p, permutation in enumerate(actual_permutations): 79 | for i, j in enumerate(permutation): 80 | if j is not None: 81 | y1[p, :, i] = y2[:, j] 82 | 83 | permutated_y2, permutations = permutate(y1, y2) 84 | 85 | assert permutations == actual_permutations 86 | 87 | 88 | def test_permutate_more_speakers(): 89 | 90 | num_frames = 10 91 | 92 | actual_permutations = [ 93 | (0, 1), 94 | (0, 2), 95 | (1, 0), 96 | (1, 2), 97 | (2, 0), 98 | (2, 1), 99 | ] 100 | batch_size = len(actual_permutations) 101 | 102 | y2 = np.random.randn(num_frames, 3) 103 | y1 = np.zeros((batch_size, num_frames, 2)) 104 | 105 | for p, permutation in enumerate(actual_permutations): 106 | for i, j in enumerate(permutation): 107 | y1[p, :, i] = y2[:, j] 108 | 109 | permutated_y2, permutations = permutate(y1, y2) 110 | 111 | assert permutations == actual_permutations 112 | np.testing.assert_allclose(permutated_y2, y1) 113 | -------------------------------------------------------------------------------- /tests/utils/test_powerset.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | import torch 25 | 26 | from pyannote.audio.utils.powerset import Powerset 27 | 28 | 29 | def test_roundtrip(): 30 | for num_classes in range(2, 5): 31 | for max_set_size in range(1, num_classes + 1): 32 | powerset = Powerset(num_classes, max_set_size) 33 | 34 | # simulate a sequence where each frame is assigned to a different powerset class 35 | one_sequence = [ 36 | [0] * powerset.num_powerset_classes 37 | for _ in range(powerset.num_powerset_classes) 38 | ] 39 | for i in range(powerset.num_powerset_classes): 40 | one_sequence[i][i] = 1.0 41 | 42 | # make a batch out of this sequence and the same sequence in reverse order 43 | batch_powerset = torch.tensor([one_sequence, one_sequence[::-1]]) 44 | 45 | # convert from powerset to multi-label 46 | batch_multilabel = powerset.to_multilabel(batch_powerset) 47 | 48 | # convert batch back to powerset 49 | reconstruction = powerset.to_powerset(batch_multilabel) 50 | 51 | assert torch.equal(batch_powerset, reconstruction) 52 | 53 | 54 | def test_permutate_powerset(): 55 | for num_classes in range(1, 6): 56 | for max_set_size in range(1, num_classes + 1): 57 | powerset = Powerset(num_classes, max_set_size) 58 | 59 | # create (num_powerset_class, num_powerset_class)-shaped tensor, where each frame is assigned to a different powerset class 60 | # and convert it to its multi-label equivalent 61 | t1 = torch.nn.functional.one_hot( 62 | torch.arange(powerset.num_powerset_classes), 63 | powerset.num_powerset_classes, 64 | ) 65 | t1_ml = powerset.to_multilabel(t1) 66 | 67 | # then permutate the powerset class in powerset space AND the multilabel equivalent in its native space 68 | # and check it has the same result. 
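# For example, with num_classes=2 and max_set_size=2, the powerset classes
# are {}, {0}, {1} and {0, 1}; swapping speakers 0 and 1 in multi-label space
# must map powerset class {0} to {1} (and conversely) while leaving {} and
# {0, 1} unchanged. This is the correspondence that `permutation_mapping`
# encodes.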
69 | # perm = torch.randperm(num_classes) 70 | perm = tuple(torch.randperm(num_classes).tolist()) 71 | t1_ml_perm = t1_ml[:, perm] 72 | perm_ps = powerset.permutation_mapping[perm] 73 | t1_ps_perm = t1[..., perm_ps] 74 | t1_ps_perm_ml = powerset.to_multilabel(t1_ps_perm) 75 | 76 | assert t1_ml_perm.equal(t1_ps_perm_ml) 77 | -------------------------------------------------------------------------------- /tutorials/assets/download-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/download-model.png -------------------------------------------------------------------------------- /tutorials/assets/download-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/download-pipeline.png -------------------------------------------------------------------------------- /tutorials/assets/prodigy-pyannote.audio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/prodigy-pyannote.audio.png -------------------------------------------------------------------------------- /tutorials/assets/pyannote.diff.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/pyannote.diff.PNG -------------------------------------------------------------------------------- /tutorials/assets/pyannote.review.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/pyannote.review.PNG -------------------------------------------------------------------------------- /tutorials/assets/sample.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER sample 1 6.690 0.430 speaker90 2 | SPEAKER sample 1 7.550 0.800 speaker91 3 | SPEAKER sample 1 8.320 1.700 speaker90 4 | SPEAKER sample 1 9.920 1.110 speaker91 5 | SPEAKER sample 1 10.570 4.130 speaker90 6 | SPEAKER sample 1 14.490 3.430 speaker91 7 | SPEAKER sample 1 18.050 3.440 speaker90 8 | SPEAKER sample 1 18.150 0.440 speaker91 9 | SPEAKER sample 1 21.780 6.720 speaker91 10 | SPEAKER sample 1 27.850 2.150 speaker90 11 | -------------------------------------------------------------------------------- /tutorials/assets/sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/sample.wav -------------------------------------------------------------------------------- /tutorials/speaker_verification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Speaker verification\n", 9 | "\n", 10 | "```python\n", 11 | "import torch\n", 12 | "from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding\n", 13 | "model = PretrainedSpeakerEmbedding(\n", 14 | " 
\"speechbrain/spkrec-ecapa-voxceleb\",\n", 15 | " device=torch.device(\"cuda\"))\n", 16 | "\n", 17 | "from pyannote.audio import Audio\n", 18 | "from pyannote.core import Segment\n", 19 | "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", 20 | "\n", 21 | "# extract embedding for a speaker speaking between t=3s and t=6s\n", 22 | "speaker1 = Segment(3., 6.)\n", 23 | "waveform1, sample_rate = audio.crop(\"audio.wav\", speaker1)\n", 24 | "embedding1 = model(waveform1[None])\n", 25 | "\n", 26 | "# extract embedding for a speaker speaking between t=7s and t=12s\n", 27 | "speaker2 = Segment(7., 12.)\n", 28 | "waveform2, sample_rate = audio.crop(\"audio.wav\", speaker2)\n", 29 | "embedding2 = model(waveform2[None])\n", 30 | "\n", 31 | "# compare embeddings using \"cosine\" distance\n", 32 | "from scipy.spatial.distance import cdist\n", 33 | "distance = cdist(embedding1, embedding2, metric=\"cosine\")\n", 34 | "```\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [] 41 | } 42 | ], 43 | "metadata": { 44 | "interpreter": { 45 | "hash": "41379f2c2a4eb17f5ac9a1f5014f4b793a0ead0b6469d8877f81a91eb030f53e" 46 | }, 47 | "kernelspec": { 48 | "display_name": "Python 3.8.2 64-bit ('pyannote': conda)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "name": "python", 54 | "version": "3.8.2" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 3.3.2 2 | --------------------------------------------------------------------------------