├── .faq
│   ├── FAQ.md
│   └── suggest.md
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yml
│   │   └── config.yml
│   ├── stale.yml
│   └── workflows
│       ├── doc.yml
│       ├── pypi.yml
│       ├── test.yml
│       └── test_cli.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── FAQ.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── doc
│   ├── gen_docs.py
│   ├── requirements.txt
│   └── source
│       └── index.rst
├── environment.yaml
├── faq.yml
├── notebook
│   ├── augmentation.ipynb
│   ├── example.ipynb
│   ├── freeze.ipynb
│   ├── inference.ipynb
│   └── sharing.ipynb
├── pyannote
│   ├── __init__.py
│   └── audio
│       ├── __init__.py
│       ├── augmentation
│       │   ├── __init__.py
│       │   ├── mix.py
│       │   └── registry.py
│       ├── cli
│       │   ├── __init__.py
│       │   ├── config
│       │   │   └── hydra
│       │   │       └── default.yaml
│       │   ├── evaluate.py
│       │   ├── evaluate_config
│       │   │   ├── __init__.py
│       │   │   ├── config.yaml
│       │   │   └── hydra
│       │   │       └── default.yaml
│       │   ├── lr_schedulers
│       │   │   ├── CosineAnnealingWarmRestarts.py
│       │   │   ├── CyclicLR.py
│       │   │   ├── ReduceLROnPlateau.py
│       │   │   └── __init__.py
│       │   ├── pretrained.py
│       │   ├── train.py
│       │   └── train_config
│       │       ├── __init__.py
│       │       ├── config.yaml
│       │       ├── hydra
│       │       │   └── default.yaml
│       │       ├── model
│       │       │   ├── DebugEmbedding.yaml
│       │       │   ├── DebugSegmentation.yaml
│       │       │   ├── Pretrained.yaml
│       │       │   ├── PyanNet.yaml
│       │       │   ├── SSeRiouSS.yaml
│       │       │   ├── XVectorMFCC.yaml
│       │       │   └── XVectorSincNet.yaml
│       │       ├── optimizer
│       │       │   ├── Adam.yaml
│       │       │   ├── AdamW.yaml
│       │       │   └── Adan.yaml
│       │       ├── preprocessor
│       │       │   └── LowerTemporalResolution.yaml
│       │       ├── scheduler
│       │       │   ├── CosineAnnealingWarmRestarts.yaml
│       │       │   ├── CyclicLR.yaml
│       │       │   └── ReduceLROnPlateau.yaml
│       │       ├── task
│       │       │   ├── MultiLabelSegmentation.yaml
│       │       │   ├── OverlappedSpeechDetection.yaml
│       │       │   ├── SpeakerDiarization.yaml
│       │       │   ├── SpeakerEmbedding.yaml
│       │       │   └── VoiceActivityDetection.yaml
│       │       └── trainer
│       │           ├── default.yaml
│       │           └── fast_dev_run.yaml
│       ├── core
│       │   ├── __init__.py
│       │   ├── callback.py
│       │   ├── inference.py
│       │   ├── io.py
│       │   ├── model.py
│       │   ├── pipeline.py
│       │   └── task.py
│       ├── models
│       │   ├── __init__.py
│       │   ├── blocks
│       │   │   ├── pooling.py
│       │   │   └── sincnet.py
│       │   ├── embedding
│       │   │   ├── __init__.py
│       │   │   ├── debug.py
│       │   │   ├── wespeaker
│       │   │   │   ├── LICENSE.WeSpeaker
│       │   │   │   ├── __init__.py
│       │   │   │   ├── convert.py
│       │   │   │   └── resnet.py
│       │   │   └── xvector.py
│       │   ├── segmentation
│       │   │   ├── PyanNet.py
│       │   │   ├── SSeRiouSS.py
│       │   │   ├── __init__.py
│       │   │   └── debug.py
│       │   └── separation
│       │       ├── ToTaToNet.py
│       │       └── __init__.py
│       ├── pipelines
│       │   ├── __init__.py
│       │   ├── clustering.py
│       │   ├── multilabel.py
│       │   ├── overlapped_speech_detection.py
│       │   ├── resegmentation.py
│       │   ├── speaker_diarization.py
│       │   ├── speaker_verification.py
│       │   ├── speech_separation.py
│       │   ├── utils
│       │   │   ├── __init__.py
│       │   │   ├── diarization.py
│       │   │   ├── getter.py
│       │   │   ├── hook.py
│       │   │   └── oracle.py
│       │   └── voice_activity_detection.py
│       ├── sample
│       │   ├── __init__.py
│       │   ├── sample.rttm
│       │   └── sample.wav
│       ├── tasks
│       │   ├── __init__.py
│       │   ├── embedding
│       │   │   ├── __init__.py
│       │   │   ├── arcface.py
│       │   │   └── mixins.py
│       │   ├── segmentation
│       │   │   ├── __init__.py
│       │   │   ├── mixins.py
│       │   │   ├── multilabel.py
│       │   │   ├── overlapped_speech_detection.py
│       │   │   ├── speaker_diarization.py
│       │   │   └── voice_activity_detection.py
│       │   └── separation
│       │       ├── PixIT.py
│       │       └── __init__.py
│       ├── torchmetrics
│       │   ├── __init__.py
│       │   ├── audio
│       │   │   ├── __init__.py
│       │   │   └── diarization_error_rate.py
│       │   ├── classification
│       │   │   ├── __init__.py
│       │   │   └── equal_error_rate.py
│       │   └── functional
│       │       ├── __init__.py
│       │       └── audio
│       │           ├── __init__.py
│       │           └── diarization_error_rate.py
│       └── utils
│           ├── __init__.py
│           ├── loss.py
│           ├── metric.py
│           ├── multi_task.py
│           ├── params.py
│           ├── permutation.py
│           ├── powerset.py
│           ├── preprocessors.py
│           ├── preview.py
│           ├── probe.py
│           ├── protocol.py
│           ├── random.py
│           ├── receptive_field.py
│           ├── reproducibility.py
│           ├── signal.py
│           └── version.py
├── questions
│   ├── README.md
│   ├── bad_performance.question.md
│   ├── from_memory.question.md
│   ├── offline.question.md
│   ├── pyannote.question.md
│   └── streaming.question.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
│   ├── conftest.py
│   ├── data
│   │   ├── database.yml
│   │   ├── debug.development.lst
│   │   ├── debug.development.rttm
│   │   ├── debug.development.uem
│   │   ├── debug.test.lst
│   │   ├── debug.test.rttm
│   │   ├── debug.test.uem
│   │   ├── debug.train.lst
│   │   ├── debug.train.rttm
│   │   ├── debug.train.uem
│   │   ├── dev00.wav
│   │   ├── dev01.wav
│   │   ├── empty.wav
│   │   ├── trn01.wav
│   │   ├── trn02.wav
│   │   ├── trn03.wav
│   │   ├── trn04.wav
│   │   ├── trn05.wav
│   │   ├── trn06.wav
│   │   ├── trn07.wav
│   │   ├── trn08.wav
│   │   ├── trn09.wav
│   │   ├── trñ00.wav
│   │   ├── tst00.wav
│   │   └── tst01.wav
│   ├── inference_test.py
│   ├── io_test.py
│   ├── tasks
│   │   ├── test_reproducibility.py
│   │   └── test_specifications.py
│   ├── test_cli.py
│   ├── test_clustering.py
│   ├── test_import_lib.py
│   ├── test_metrics.py
│   ├── test_run_notebooks.py
│   ├── test_sample.py
│   ├── test_speechbrain.py
│   ├── test_stats_pool.py
│   ├── test_train.py
│   └── utils
│       ├── preview.py
│       ├── probe_util_test.py
│       ├── test_permutation.py
│       └── test_powerset.py
├── tutorials
│   ├── MRE_template.ipynb
│   ├── adapting_pretrained_pipeline.ipynb
│   ├── add_your_own_model.ipynb
│   ├── add_your_own_task.ipynb
│   ├── applying_a_model.ipynb
│   ├── applying_a_pipeline.ipynb
│   ├── assets
│   │   ├── download-model.png
│   │   ├── download-pipeline.png
│   │   ├── prodigy-pyannote.audio.png
│   │   ├── pyannote.diff.PNG
│   │   ├── pyannote.review.PNG
│   │   ├── sample.rttm
│   │   └── sample.wav
│   ├── community
│   │   └── offline_usage_speaker_diarization.ipynb
│   ├── intro.ipynb
│   ├── overlapped_speech_detection.ipynb
│   ├── speaker_verification.ipynb
│   ├── training_a_model.ipynb
│   ├── training_with_cli.md
│   └── voice_activity_detection.ipynb
└── version.txt
/.faq/FAQ.md:
/.faq/FAQ.md:
--------------------------------------------------------------------------------
1 |
2 | # Frequently Asked Questions
3 |
4 | {%- for question in questions %}
5 | - [{{ question.title }}](#{{ question.slug }})
6 | {%- endfor %}
7 |
8 |
9 | {%- for question in questions %}
10 |
11 |
12 | ## {{ question.title }}
13 |
14 | {{ question.body }}
15 |
16 | {%- endfor %}
17 |
18 |
19 |
20 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory)
21 |
--------------------------------------------------------------------------------
/.faq/suggest.md:
--------------------------------------------------------------------------------
1 | Thank you for your issue.
2 |
3 | {%- if questions -%}
4 | {% if questions|length == 1 %}
5 | We found the following entry in the [FAQ]({{ faq_url }}) which you may find helpful:
6 | {%- else %}
7 | We found the following entries in the [FAQ]({{ faq_url }}) which you may find helpful:
8 | {%- endif %}
9 |
10 | {% for question in questions %}
11 | - [{{ question.title }}]({{ faq_url }}#{{ question.slug }})
12 | {%- endfor %}
13 |
14 | {%- else -%}
15 | You might want to check the [FAQ]({{ faq_url }}) if you haven't done so already.
16 | {%- endif %}
17 |
18 | Feel free to close this issue if you found an answer in the FAQ.
19 |
20 | If your issue is a feature request, please read [this](https://xyproblem.info/) first and update your request accordingly, if needed.
21 |
22 | If your issue is a bug report, please provide a [minimum reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) as a link to a self-contained [Google Colab](https://colab.research.google.com/) notebook containing everything needed to reproduce the bug:
23 | - installation
24 | - data preparation
25 | - model download
26 | - etc.
27 |
28 | Providing an MRE will increase your chance of getting an answer from the community (either maintainers or other power users).
29 |
30 | Companies relying on `pyannote.audio` in production may contact [me](https://herve.niderb.fr) via email regarding:
31 | * paid scientific consulting around speaker diarization and speech processing in general;
32 | * custom models and tailored features (via the local tech transfer office).
33 |
34 | > This is an automated reply, generated by [FAQtory](https://github.com/willmcgugan/faqtory)
35 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | pyannote/audio/_version.py export-subst
2 | notebooks/* linguist-documentation
3 | tutorials/* linguist-documentation
4 | versioneer.py linguist-vendored
5 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [hbredin]
4 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: Bug report
2 | description: Report a bug in pyannote.audio
3 | body:
4 |
5 | - type: markdown
6 | attributes:
7 | value: |
8 | When reporting bugs, please follow the guidelines in this template. This helps identify the problem precisely and thus enables contributors to fix it faster.
9 | - Write a descriptive issue title above.
10 | - The golden rule is to **always open *one* issue for *one* bug**. If you notice several bugs and want to report them, make sure to create one new issue for each of them.
11 | - Search [open](https://github.com/pyannote/pyannote-audio/issues) and [closed](https://github.com/pyannote/pyannote-audio/issues?q=is%3Aissue+is%3Aclosed) issues to ensure it has not already been reported. If you don't find a relevant match or if you're unsure, don't hesitate to **open a new issue**. The bugsquad will handle it from there if it's a duplicate.
12 | - Please always check if your issue is reproducible in the latest version – it may already have been fixed!
13 | - If you use a custom build, please test if your issue is reproducible in official releases too.
14 |
15 | - type: textarea
16 | attributes:
17 | label: Tested versions
18 | description: |
19 | To properly fix a bug, we need to identify if the bug was recently introduced in the engine, or if it was always present.
20 | - Please specify the pyannote.audio version you found the issue in, including the **Git commit hash** if using a development build.
21 | - If you can, **please test earlier pyannote.audio versions** and, if applicable, newer versions (development branch). Mention whether the bug is reproducible or not in the versions you tested.
22 | - The aim is for us to identify whether a bug is a **regression**, i.e. an issue that didn't exist in a previous version, but was introduced later on, breaking existing functionality. For example, if a bug is reproducible in 3.2 but not in 3.0, we would like you to test intermediate 3.1 to find which version is the first one where the issue can be reproduced.
23 | placeholder: |
24 | - Reproducible in: 3.1, 3.2, and later
25 | - Not reproducible in: 3.0
26 | validations:
27 | required: true
28 |
29 | - type: input
30 | attributes:
31 | label: System information
32 | description: |
33 | - Specify the OS version, and when relevant hardware information.
34 | - For issues that are likely OS-specific and/or GPU-related, please specify the GPU model and architecture.
35 | - **Bug reports not including the required information may be closed at the maintainers' discretion.** If in doubt, always include all the requested information; it's better to include too much information than not enough information.
36 | placeholder: macOS 13.6 - pyannote.audio 3.1.1 - M1 Pro
37 | validations:
38 | required: true
39 |
40 | - type: textarea
41 | attributes:
42 | label: Issue description
43 | description: |
44 | Describe your issue briefly. What doesn't work, and how do you expect it to work instead?
45 |         You can include audio, images or videos with drag and drop, and format code blocks or logs with ``` tags.
46 | validations:
47 | required: true
48 |
49 | - type: input
50 | attributes:
51 | label: Minimal reproduction example (MRE)
52 | description: |
53 | Having reproducible issues is a prerequisite for contributors to be able to solve them.
54 | Include a link to minimal reproduction example using [this Google Colab notebook](https://colab.research.google.com/github/pyannote/pyannote-audio/blob/develop/tutorials/MRE_template.ipynb) as a starting point.
55 | validations:
56 | required: true
57 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 |
3 | contact_links:
4 |
5 | - name: Feature request
6 | url: https://github.com/pyannote/pyannote-audio/discussions
7 | about: Suggest an idea for this project.
8 |
9 | - name: Consulting
10 | url: https://herve.niderb.fr/consulting
11 | about: Using pyannote.audio in production? Make the most of it thanks to our consulting services.
12 |
13 | - name: Premium models
14 | url: https://forms.office.com/e/GdqwVgkZ5C
15 | about: We are considering selling premium models, extensions, or services around pyannote.audio.
16 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 180
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 30
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 | - pinned
8 | - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 | This issue has been automatically marked as stale because it has not had
14 | recent activity. It will be closed if no further activity occurs. Thank you
15 | for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 |
--------------------------------------------------------------------------------
/.github/workflows/doc.yml:
--------------------------------------------------------------------------------
1 | name: Documentation
2 | on:
3 | push:
4 | branches:
5 | - master
6 |
7 | jobs:
8 | build-and-deploy:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | max-parallel: 4
12 | matrix:
13 | python-version: ["3.9"]
14 |
15 | steps:
16 | - uses: actions/checkout@v1
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v1
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install .
25 | pip install -r doc/requirements.txt
26 | - name: Build documentation
27 | run: |
28 | make --directory=doc html
29 | touch ./doc/build/html/.nojekyll
30 | - name: Deploy
31 | env:
32 | ACTIONS_DEPLOY_KEY: ${{ secrets.ACTIONS_DEPLOY_KEY }}
33 | PUBLISH_BRANCH: gh-pages
34 | PUBLISH_DIR: ./doc/build/html
35 | SCRIPT_MODE: true
36 | run: |
37 | wget https://raw.githubusercontent.com/peaceiris/actions-gh-pages/v2/entrypoint.sh
38 | bash ./entrypoint.sh
39 |
--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
1 | name: PyPI
2 |
3 | on:
4 | push:
5 | tags:
6 | - '*'
7 |
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v1
13 | - name: Set up Python
14 | uses: actions/setup-python@v1
15 | with:
16 | python-version: '3.x'
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install setuptools wheel twine
21 | - name: Build and publish
22 | env:
23 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
24 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
25 | run: |
26 | python setup.py sdist bdist_wheel
27 | twine upload dist/*
28 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | branches: [develop]
6 | pull_request:
7 | branches: [develop]
8 |
9 | jobs:
10 | build:
11 | timeout-minutes: 20
12 | runs-on: ${{ matrix.os }}
13 | strategy:
14 | matrix:
15 | os: [ubuntu-latest]
16 | python-version: ["3.9", "3.10", "3.11"]
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Install libsndfile
24 | if: matrix.os == 'ubuntu-latest'
25 | run: |
26 | sudo apt-get update
27 | sudo apt-get install libsndfile1
28 | - name: Install pyannote.audio
29 | run: |
30 | pip install -e .[dev,testing]
31 | - name: Test with pytest
32 | run: |
33 | pytest -k "not test_cli.py"
34 |
--------------------------------------------------------------------------------
/.github/workflows/test_cli.yml:
--------------------------------------------------------------------------------
1 | name: CLI tests
2 |
3 | on:
4 | push:
5 | branches: [develop]
6 | pull_request:
7 | branches: [develop]
8 |
9 | jobs:
10 | build:
11 | timeout-minutes: 20
12 | runs-on: ${{ matrix.os }}
13 | strategy:
14 | matrix:
15 | os: [ubuntu-latest]
16 | python-version: ["3.10"]
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Install libsndfile
24 | if: matrix.os == 'ubuntu-latest'
25 | run: |
26 | sudo apt-get update
27 | sudo apt-get install libsndfile1
28 | - name: Install pyannote.audio
29 | run: |
30 | pip install -e .[dev,testing,cli]
31 | - name: Test with pytest
32 | run: |
33 | pytest tests/test_cli.py
34 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | .env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 |
56 | # Sphinx documentation
57 | docs/_build/
58 |
59 | # PyBuilder
60 | target/
61 |
62 | #Ipython Notebook
63 | .ipynb_checkpoints
64 |
65 | notebooks
66 |
67 | experiments
68 | *~
69 |
70 | *.npy
71 | *.pt
72 | *events.out.tfevents*
73 | *.csv
74 |
75 | # PyCharm
76 | .idea/
77 |
78 | gh-pages
79 | gh-pages.pub
80 |
81 | *.zip
82 | .mypy_cache/
83 | .vscode/
84 |
85 | **/lightning_logs/**
86 |
87 | # Version Output
88 | pyannote/audio/version.py
89 |
90 | # vim
91 | .vim
92 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tutorials/AMI-diarization-setup"]
2 | path = tutorials/AMI-diarization-setup
3 | url = https://github.com/pyannote/AMI-diarization-setup.git
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: '^docs/conf.py'
2 |
3 | repos:
4 | # # Clean Notebooks
5 | # - repo: https://github.com/kynan/nbstripout
6 | # rev: master
7 | # hooks:
8 | # - id: nbstripout
9 | # Format Code
10 | - repo: https://github.com/ambv/black
11 | rev: 22.3.0
12 | hooks:
13 | - id: black
14 |
15 | # Sort imports
16 | - repo: https://github.com/PyCQA/isort
17 | rev: 5.12.0
18 | hooks:
19 | - id: isort
20 | args: ["--profile", "black"]
21 |
22 | # Formatting, Whitespace, etc
23 | - repo: https://github.com/pre-commit/pre-commit-hooks
24 | rev: v2.2.3
25 | hooks:
26 | - id: trailing-whitespace
27 | - id: check-added-large-files
28 | args: ['--maxkb=1000']
29 | - id: check-ast
30 | - id: check-json
31 | - id: check-merge-conflict
32 | - id: check-xml
33 | - id: check-yaml
34 | - id: debug-statements
35 | - id: end-of-file-fixer
36 | - id: requirements-txt-fixer
37 | - id: mixed-line-ending
38 | args: ['--fix=no']
39 | - id: flake8
40 | args: ['--ignore=E203,E501,F811,E712,W503']
41 |
--------------------------------------------------------------------------------
/FAQ.md:
--------------------------------------------------------------------------------
1 |
2 | # Frequently Asked Questions
3 | - [Can I apply pretrained pipelines on audio already loaded in memory?](#can-i-apply-pretrained-pipelines-on-audio-already-loaded-in-memory)
4 | - [Can I use gated models (and pipelines) offline?](#can-i-use-gated-models-and-pipelines-offline)
5 | - [Does pyannote support streaming speaker diarization?](#does-pyannote-support-streaming-speaker-diarization)
6 | - [How can I improve performance?](#how-can-i-improve-performance)
7 | - [How does one spell and pronounce pyannote.audio?](#how-does-one-spell-and-pronounce-pyannoteaudio)
8 |
9 |
10 | ## Can I apply pretrained pipelines on audio already loaded in memory?
11 |
12 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end.
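
In practice, the in-memory path boils down to passing a `{"waveform", "sample_rate"}` mapping instead of a file path. A minimal sketch (assuming access to the gated `pyannote/speaker-diarization` pipeline; the filename is a placeholder):

```python
import torchaudio
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

# load the audio once, then reuse the in-memory waveform across calls
waveform, sample_rate = torchaudio.load("conversation.wav")
diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate})
```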
13 |
14 |
15 | ## Can I use gated models (and pipelines) offline?
16 |
17 | **Short answer**: yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines.
18 |
19 | **Long answer**: gating models and pipelines allows [me](https://herve.niderb.fr) to learn a bit more about the `pyannote.audio` user base, which eventually helps me write grant proposals to make `pyannote.audio` even better. So, please fill in the gating forms as precisely as possible.
20 |
21 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open source libraries is time consuming.
22 |
23 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines.
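
For orientation, the offline workflow covered in those tutorials has roughly this shape (a sketch only; paths are placeholders, and the exact set of files to download depends on the model or pipeline):

```python
from pyannote.audio import Model, Pipeline

# while online and authenticated: download the checkpoint and the pipeline
# config once (e.g. by cloning the Hugging Face repositories)

# afterwards, load everything from local files, without authentication
model = Model.from_pretrained("/local/path/to/pytorch_model.bin")
pipeline = Pipeline.from_pretrained("/local/path/to/config.yaml")
```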
24 |
25 |
26 | ## Does pyannote support streaming speaker diarization?
27 |
28 | **Short answer:** not out of the box, no.
29 |
30 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get to a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`.
31 |
32 |
33 | ## How can I improve performance?
34 |
35 | **Long answer:**
36 |
37 | 1. Manually annotate dozens of conversations as precisely as possible.
38 | 2. Separate them into train (80%), development (10%) and test (10%) subsets.
39 | 3. Set up the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization) (see the sketch after this answer).
40 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb).
41 | 5. Enjoy.
42 |
43 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that.
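
For step 3, a minimal sketch of what the `pyannote.database` setup looks like once your `database.yml` is in place (the protocol name is a placeholder; `pyannote/audio/cli/evaluate.py` in this repository uses the same pattern):

```python
from pyannote.database import FileFinder, registry

# database.yml maps each subset to its audio, RTTM, and UEM files
registry.load_database("database.yml")
protocol = registry.get_protocol(
    "MyDatabase.SpeakerDiarization.MyProtocol",
    preprocessors={"audio": FileFinder()},
)

for file in protocol.train():
    ...  # each file provides "audio", "annotation" and "annotated" keys
```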
44 |
45 |
46 | ## How does one spell and pronounce pyannote.audio?
47 |
48 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic).
49 | 📢 Pronounced like the French verb `pianoter`. `pi` like in `pi`ano, not `py` like in `py`thon.
50 | 🎹 `pianoter` means to play the piano (hence the logo 🤯).
51 |
52 |
53 |
54 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory)
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 CNRS
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include pyannote *.py
2 | recursive-include pyannote *.yaml
3 | recursive-include pyannote *.wav
4 | recursive-include pyannote *.rttm
5 | global-exclude *.pyc
6 | global-exclude __pycache__
7 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | patch:
4 | default:
5 | enabled: false
6 |
--------------------------------------------------------------------------------
/doc/gen_docs.py:
--------------------------------------------------------------------------------
1 | """
2 | This script generates the rst docs for the API.
3 | """
4 |
5 | import os
6 | from os import path
7 |
8 | bp = breakpoint
9 |
10 |
11 | def capitalise(s):
12 | news = ""
13 | for word in s.split("_"):
14 | news += word.capitalize()
15 | return news
16 |
17 |
18 | def process_dir(level, p):
19 | md = ""
20 | basename = path.basename(p)
21 |
22 | title = capitalise(basename)
23 | md += f"{'#'*level} {title}\n\n"
24 | subdirs = os.listdir(p)
25 |
26 | for f in subdirs:
27 |         m = path.join(p, f)  # join against the directory being processed, not the global `subdir`
28 | if path.isdir(m):
29 | md += process_dir(level + 1, path.join(p, f))
30 | else:
31 | if "__" in f:
32 | continue
33 | module = m[3:].replace("/", ".")[:-3]
34 | md += f"""
35 | ```eval_rst
36 | .. automodule:: {module}
37 | :members:
38 |
39 | ```
40 |
41 | """
42 | return md
43 |
44 |
45 | DIR = "../pyannote/audio"
46 |
47 | for module in os.listdir(DIR):
48 |     # Each folder will become an rst file
49 | # Each file/folder will have a # prepended to it
50 | # Recursively we will add another # each level
51 |
52 | # Initialise Markdown
53 | md = ""
54 |
55 | subdir = path.join(DIR, module)
56 |
57 | # Skip if not directory
58 | if not path.isdir(subdir) or "__" in module:
59 | continue
60 |
61 | md += process_dir(1, subdir)
62 | with open(f"./source/api/{module}.md", "w") as f:
63 | f.write(md)
64 |
--------------------------------------------------------------------------------
/doc/requirements.txt:
--------------------------------------------------------------------------------
1 | ipython==8.10.0
2 | recommonmark
3 | Sphinx==3.0.4
4 | sphinx_rtd_theme==0.4.3
5 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | ##############
2 | pyannote.audio
3 | ##############
4 |
5 | `pyannote.audio` is an open-source Python library that provides neural building blocks for speaker diarization.
6 |
7 | Installation
8 | ============
9 |
10 | ::
11 |
12 | $ conda create -n pyannote python=3.10
13 | $ conda activate pyannote
14 | $ pip install pyannote.audio
15 |
16 |
17 | API documentation
18 | =================
19 |
20 | .. toctree::
21 | :maxdepth: 2
22 |
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: pyannote-audio
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python==3.8.5
7 | - libsndfile==1.0.28
8 | - pip>=20.2
9 | - pip:
10 | - -r requirements.txt
11 |
--------------------------------------------------------------------------------
/faq.yml:
--------------------------------------------------------------------------------
1 | # FAQtory settings
2 |
3 | faq_url: "https://github.com/pyannote/pyannote-audio/blob/develop/FAQ.md" # Replace this with the URL to your FAQ.md!
4 |
5 | questions_path: "./questions" # Where questions should be stored
6 | output_path: "./FAQ.md" # Where FAQ.md should be generated
7 | templates_path: ".faq" # Path to templates
8 |
--------------------------------------------------------------------------------
/notebook/augmentation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# gett a 5s excerpt of first test file\n",
10 | "from pyannote.database import get_protocol, FileFinder\n",
11 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
12 | " preprocessors={\"audio\": FileFinder()})\n",
13 | "\n",
14 | "from pyannote.audio.core.io import Audio\n",
15 | "audio = Audio(sample_rate=16000, mono=\"downmix\")\n",
16 | "file = next(protocol.test())\n",
17 | "\n",
18 | "from pyannote.core import Segment\n",
19 | "waveform, sample_rate = audio.crop(file, Segment(5, 10))\n",
20 | "\n",
21 | "import torch\n",
22 | "waveforms = torch.tensor(waveform)[None, :]"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# play the excerpt\n",
32 | "from IPython.display import Audio as Play\n",
33 | "Play(waveforms.squeeze(), rate=sample_rate, normalize=False, autoplay=True)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# define a model that simply returns the waveform\n",
43 | "from pyannote.audio.core.model import Model\n",
44 | "class Passthrough(Model):\n",
45 | " def forward(self, waveforms):\n",
46 | " return waveforms\n",
47 | " \n",
48 | "identity = Passthrough()"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# pass the waveform through this \"identity\" model\n",
58 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# add one torch_audiomentations waveform transform to the model\n",
68 | "from pyannote.audio.augmentation.registry import register_augmentation\n",
69 | "from torch_audiomentations import Gain\n",
70 | "gain = Gain(\n",
71 | " min_gain_in_db=-15.0,\n",
72 | " max_gain_in_db=5.0,\n",
73 | " p=0.5)\n",
74 | "register_augmentation(gain, identity, when='input')"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "# pass the waveform through the \"augmented\" model\n",
84 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)"
85 | ]
86 | }
87 | ],
88 | "metadata": {
89 | "kernelspec": {
90 | "display_name": "Python 3",
91 | "language": "python",
92 | "name": "python3"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.7.9"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 4
109 | }
110 |
--------------------------------------------------------------------------------
/notebook/freeze.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyannote.database import get_protocol, FileFinder\n",
10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
11 | " preprocessors={\"audio\": FileFinder()})"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from pyannote.audio.tasks import VoiceActivityDetection\n",
21 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n",
22 | "import pytorch_lightning as pl"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=16, num_workers=4)\n",
32 | "model = SimpleSegmentationModel(task=vad)\n",
33 | "trainer = pl.Trainer(max_epochs=1)\n",
34 | "_ = trainer.fit(model)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "summary = model.summarize('full')"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "model.freeze_up_to('lstm')"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "model.unfreeze_up_to('mfcc.MelSpectrogram.spectrogram')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "model.freeze_by_name(['lstm', 'activation'])"
71 | ]
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python 3",
77 | "language": "python",
78 | "name": "python3"
79 | },
80 | "language_info": {
81 | "codemirror_mode": {
82 | "name": "ipython",
83 | "version": 3
84 | },
85 | "file_extension": ".py",
86 | "mimetype": "text/x-python",
87 | "name": "python",
88 | "nbconvert_exporter": "python",
89 | "pygments_lexer": "ipython3",
90 | "version": "3.8.5"
91 | }
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 4
95 | }
96 |
--------------------------------------------------------------------------------
/notebook/sharing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pyannote.database import get_protocol, FileFinder\n",
10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n",
11 | " preprocessors={\"audio\": FileFinder()})"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Train a model"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "from pyannote.audio.tasks import VoiceActivityDetection\n",
28 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n",
29 | "import pytorch_lightning as pl\n",
30 | "\n",
31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)\n",
32 | "model = SimpleSegmentationModel(task=vad)\n",
33 | "trainer = pl.Trainer(max_epochs=1, default_root_dir='sharing/')\n",
34 | "_ = trainer.fit(model)"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Load a model without knowing its class"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "from pyannote.audio import Model\n",
51 | "model = Model.from_pretrained('sharing/lightning_logs/version_0/checkpoints/epoch=0-step=3.ckpt')\n",
52 | "assert isinstance(model, SimpleSegmentationModel)\n",
53 | "\n",
54 | "# checkpoint should work with a URL as well (it relies on pl_load)"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.8.5"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/pyannote/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | __import__("pkg_resources").declare_namespace(__name__)
24 |
--------------------------------------------------------------------------------
/pyannote/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | try:
24 | from .version import __version__, git_version # noqa: F401
25 | except ImportError:
26 | pass
27 |
28 |
29 | from .core.inference import Inference
30 | from .core.io import Audio
31 | from .core.model import Model
32 | from .core.pipeline import Pipeline
33 |
34 | __all__ = ["Audio", "Model", "Inference", "Pipeline"]
35 |
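# A quick orientation sketch for the four entry points above (the checkpoint
# path is a placeholder; the calls mirror usage found elsewhere in this
# repository):
#
#     from pyannote.audio import Audio, Inference, Model, Pipeline
#
#     audio = Audio(sample_rate=16000, mono="downmix")          # load/crop waveforms (core/io.py)
#     model = Model.from_pretrained("path/to/checkpoint.ckpt")  # trained model (core/model.py)
#     inference = Inference(model)                              # sliding-window inference (core/inference.py)
#     # Pipeline chains models and post-processing into end-to-end systems (core/pipeline.py)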
--------------------------------------------------------------------------------
/pyannote/audio/augmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .mix import MixSpeakerDiarization
25 |
26 | __all__ = ["MixSpeakerDiarization"]
27 |
--------------------------------------------------------------------------------
/pyannote/audio/augmentation/mix.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Optional
25 |
26 | import torch
27 | from torch import Tensor
28 | from torch_audiomentations import Mix
29 |
30 |
31 | class MixSpeakerDiarization(Mix):
32 | """
33 | Create a new sample by mixing it with another random sample from the same batch
34 |
35 | Signal-to-noise ratio (where "noise" is the second random sample) is selected
36 | randomly between `min_snr_in_db` and `max_snr_in_db`.
37 |
38 | Parameters
39 | ----------
40 | min_snr_in_db : float, optional
41 | Defaults to 0.0
42 | max_snr_in_db : float, optional
43 | Defaults to 5.0
44 | max_num_speakers: int, optional
45 | Maximum number of speakers in mixtures. Defaults to actual maximum number
46 | of speakers in each batch.
47 | """
48 |
49 | supported_modes = {"per_example", "per_channel"}
50 |
51 | supports_multichannel = True
52 | requires_sample_rate = False
53 |
54 | supports_target = True
55 | requires_target = True
56 |
57 | def __init__(
58 | self,
59 | min_snr_in_db: float = 0.0,
60 | max_snr_in_db: float = 5.0,
61 | mode: str = "per_example",
62 | p: float = 0.5,
63 | p_mode: Optional[str] = None,
64 | sample_rate: Optional[int] = None,
65 | target_rate: Optional[int] = None,
66 | max_num_speakers: Optional[int] = None,
67 | output_type: str = "tensor",
68 | ):
69 | super().__init__(
70 | min_snr_in_db=min_snr_in_db,
71 | max_snr_in_db=max_snr_in_db,
72 | mode=mode,
73 | p=p,
74 | p_mode=p_mode,
75 | sample_rate=sample_rate,
76 | target_rate=target_rate,
77 | output_type=output_type,
78 | )
79 | self.max_num_speakers = max_num_speakers
80 |
81 | def randomize_parameters(
82 | self,
83 | samples: Optional[Tensor] = None,
84 | sample_rate: Optional[int] = None,
85 | targets: Optional[Tensor] = None,
86 | target_rate: Optional[int] = None,
87 | ):
88 |
89 | batch_size, num_channels, num_samples = samples.shape
90 | snr_distribution = torch.distributions.Uniform(
91 | low=torch.tensor(
92 | self.min_snr_in_db,
93 | dtype=torch.float32,
94 | device=samples.device,
95 | ),
96 | high=torch.tensor(
97 | self.max_snr_in_db,
98 | dtype=torch.float32,
99 | device=samples.device,
100 | ),
101 | validate_args=True,
102 | )
103 |
104 | # randomize SNRs
105 | self.transform_parameters["snr_in_db"] = snr_distribution.sample(
106 | sample_shape=(batch_size,)
107 | )
108 |
109 | # count number of active speakers per sample
110 | num_speakers: torch.Tensor = torch.sum(torch.any(targets, dim=-2), dim=-1)
111 | max_num_speakers = self.max_num_speakers or torch.max(num_speakers)
112 |
113 | # randomize index of second sample, constrained by the fact that the
114 | # resulting mixture should have less than max_num_speakers
115 | self.transform_parameters["sample_idx"] = torch.arange(
116 | batch_size, dtype=torch.int64
117 | )
118 | for n in range(max_num_speakers + 1):
119 |
120 | # indices of samples with exactly n speakers
121 | samples_with_n_speakers = torch.where(num_speakers == n)[0]
122 | num_samples_with_n_speakers = len(samples_with_n_speakers)
123 | if num_samples_with_n_speakers == 0:
124 | continue
125 |
126 |             # indices of candidate samples for mixing (i.e. samples that would not push the mixture above max_num_speakers)
127 | candidates = torch.where(num_speakers + n <= max_num_speakers)[0]
128 | num_candidates = len(candidates)
129 | if num_candidates == 0:
130 | continue
131 |
132 | # sample uniformly from candidate samples
133 | selected_candidates = candidates[
134 | torch.randint(
135 | 0,
136 | num_candidates,
137 | (num_samples_with_n_speakers,),
138 | device=samples.device,
139 | )
140 | ]
141 | self.transform_parameters["sample_idx"][
142 | samples_with_n_speakers
143 | ] = selected_candidates
144 |
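# Usage sketch (an assumption, not repository code): shapes follow the
# conventions used in `randomize_parameters` above, and the keyword-based call
# follows torch_audiomentations' transform interface.
#
#     import torch
#     from pyannote.audio.augmentation import MixSpeakerDiarization
#
#     mix = MixSpeakerDiarization(p=1.0, max_num_speakers=3, output_type="dict")
#     waveforms = torch.randn(16, 1, 32000)  # (batch, channels, samples)
#     targets = torch.randint(0, 2, (16, 1, 200, 3)).float()  # (batch, channels, frames, speakers)
#     mixed = mix(samples=waveforms, sample_rate=16000, targets=targets, target_rate=100)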
--------------------------------------------------------------------------------
/pyannote/audio/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .pretrained import pretrained
24 |
25 | __all__ = [
26 | "pretrained",
27 | ]
28 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/config/hydra/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | run:
4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
5 |
6 | sweep:
7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
8 | subdir: ${hydra.job.num}
9 |
10 | output_subdir: ""
11 |
12 | help:
13 | app_name: pyannote-audio-train
14 |
15 | # Help header, customize to describe your app to your users
16 | header: == ${hydra.help.app_name} ==
17 |
18 | footer: |-
19 | Powered by Hydra (https://hydra.cc)
20 | Use --hydra-help to view Hydra specific help
21 |
22 | template: |-
23 | ${hydra.help.header}
24 |
25 | pyannote-audio-train protocol={protocol_name}
26 | task={task} task.param=...
27 | model={model} model.param=...
28 | optimizer={optimizer} optimizer.param=...
29 | scheduler={scheduler} scheduler.param=...
30 |
31 | ${hydra.help.footer}
32 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/evaluate.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Optional
25 |
26 | import hydra
27 | from omegaconf import DictConfig
28 | from pyannote.database import FileFinder, ProtocolFile, registry
29 | from rich.progress import Progress
30 |
31 | from pyannote.audio import Inference, Model
32 | from pyannote.audio.pipelines.utils import get_devices
33 | from pyannote.audio.utils.metric import DiscreteDiarizationErrorRate
34 | from pyannote.audio.utils.signal import binarize
35 |
36 |
37 | @hydra.main(config_path="evaluate_config", config_name="config")
38 | def evaluate(cfg: DictConfig) -> Optional[float]:
39 |
40 | # load pretrained model
41 | (device,) = get_devices(needs=1)
42 | model = Model.from_pretrained(cfg.model, device=device)
43 |
44 | # load databases into registry if it was specified
45 | if "registry" in cfg:
46 | for database_yml in cfg.registry.split(","):
47 | registry.load_database(database_yml)
48 |
49 | # load evaluation files
50 | protocol = registry.get_protocol(
51 | cfg.protocol, preprocessors={"audio": FileFinder()}
52 | )
53 |
54 | files = list(getattr(protocol, cfg.subset)())
55 |
56 | # load evaluation metric
57 | metric = DiscreteDiarizationErrorRate()
58 |
59 | with Progress() as progress:
60 |
61 | main_task = progress.add_task(protocol.name, total=len(files))
62 | file_task = progress.add_task("Processing", total=1.0)
63 |
64 |         def progress_hook(completed: Optional[int] = None, total: Optional[int] = None):
65 |             if completed is not None and total:  # guard against the None defaults
66 |                 progress.update(file_task, completed=completed / total)
66 |
67 | inference = Inference(model, device=device)
68 | warm_up = cfg.warm_up / inference.duration
69 |
70 | def hypothesis(file: ProtocolFile):
71 | return Inference.trim(
72 | binarize(inference(file, hook=progress_hook)),
73 | warm_up=(warm_up, warm_up),
74 | )
75 |
76 | for file in files:
77 | progress.update(file_task, description=file["uri"])
78 | reference = file["annotation"]
79 | uem = file["annotated"]
80 | _ = metric(reference, hypothesis(file), uem=uem)
81 | progress.advance(main_task)
82 |
83 | report = metric.report(display=False)
84 |
85 | with open("report.txt", "w") as f:
86 |
87 | f.write(f"# Model: {cfg.model}\n")
88 | f.write(f"# Protocol: {protocol.name}\n")
89 | f.write(f"# Subset: {cfg.subset}\n")
90 | f.write("\n")
91 | report = report.to_string(
92 | index=True,
93 | sparsify=False,
94 | justify="right",
95 | float_format=lambda f: "{0:.2f}".format(f),
96 | )
97 | f.write(f"{report}")
98 |
99 |
100 | if __name__ == "__main__":
101 | evaluate()
102 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/evaluate_config/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/evaluate_config/config.yaml:
--------------------------------------------------------------------------------
1 | model: ???
2 | protocol: ???
3 | warm_up: 0.0
4 | subset: test
5 |
6 | defaults:
7 | - hydra: default
8 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/evaluate_config/hydra/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | run:
4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
5 |
6 | sweep:
7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
8 | subdir: ${hydra.job.num}
9 |
10 | output_subdir: ""
11 |
12 | help:
13 | app_name: pyannote-audio-eval
14 |
15 | # Help header, customize to describe your app to your users
16 | header: == ${hydra.help.app_name} ==
17 |
18 | footer: |-
19 | Powered by Hydra (https://hydra.cc)
20 | Use --hydra-help to view Hydra specific help
21 |
22 | template: |-
23 | ${hydra.help.header}
24 |
25 | pyannote-audio-eval registry={path_to_database.yml}
26 | protocol={protocol_name}
27 | subset={test | development | train}
28 | model={path_to_pretrained_model}
29 | warm_up={warm_up_duration_in_seconds}
30 |
31 | ${hydra.help.footer}
32 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/lr_schedulers/CosineAnnealingWarmRestarts.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import Optional
24 |
25 | from torch.optim import Optimizer
26 | from torch.optim.lr_scheduler import (
27 | CosineAnnealingWarmRestarts as _CosineAnnealingWarmRestarts,
28 | )
29 |
30 |
31 | def CosineAnnealingWarmRestarts(
32 | optimizer: Optimizer,
33 | min_lr: float = 1e-8,
34 | max_lr: float = 1e-3,
35 | patience: int = 1,
36 | num_batches_per_epoch: Optional[int] = None,
37 | **kwargs,
38 | ):
39 | """Wrapper around CosineAnnealingWarmRestarts
40 |
41 | Parameters
42 | ----------
43 | optimizer : Optimizer
44 | Optimizer
45 | min_lr : float, optional
46 | Defaults to 1e-8.
47 | max_lr : float, optional
48 |         Initial learning rate. Defaults to 1e-3.
49 |     patience : int, optional
50 |         Number of epochs per cycle. Defaults to 1.
51 |     num_batches_per_epoch : int, optional
52 |         Number of batches per epoch. Required to convert `patience` from epochs to steps.
53 | """
54 |
55 | # initialize optimizer lr to max_lr
56 | for g in optimizer.param_groups:
57 | g["lr"] = max_lr
58 |
59 | num_steps = patience * num_batches_per_epoch
60 |
61 | return {
62 | "scheduler": _CosineAnnealingWarmRestarts(
63 | optimizer, num_steps, eta_min=min_lr, T_mult=2
64 | ),
65 | "interval": "step",
66 | }
67 |
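For reference, a hedged sketch of how this wrapper is typically consumed from a LightningModule (the module class and numbers below are illustrative, not the library's actual training code):

    import pytorch_lightning as pl
    import torch

    class MyModel(pl.LightningModule):  # hypothetical module
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(1, 1)

        def configure_optimizers(self):
            optimizer = torch.optim.Adam(self.parameters())
            lr_scheduler = CosineAnnealingWarmRestarts(
                optimizer, min_lr=1e-8, max_lr=1e-3,
                patience=1, num_batches_per_epoch=1000,
            )
            # "interval": "step" makes Lightning call scheduler.step()
            # after every batch rather than once per epoch.
            return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}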
--------------------------------------------------------------------------------
/pyannote/audio/cli/lr_schedulers/CyclicLR.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import Optional
24 |
25 | from torch.optim import Optimizer
26 | from torch.optim.lr_scheduler import CyclicLR as _CyclicLR
27 |
28 |
29 | def CyclicLR(
30 | optimizer: Optimizer,
31 | min_lr: float = 1e-8,
32 | max_lr: float = 1e-3,
33 | mode: str = "triangular2",
34 | patience: int = 50,
35 | num_batches_per_epoch: Optional[int] = None,
36 | **kwargs,
37 | ):
38 | """Wrapper around CyclicLR learning rate scheduler
39 |
40 | Parameters
41 | ----------
42 | optimizer : Optimizer
43 | Optimizer
44 | min_lr : float, optional
45 | Defaults to 1e-8.
46 | max_lr : float, optional
47 |         Defaults to 1e-3.
48 |     patience : int, optional
49 |         Number of epochs per cycle. Defaults to 50.
50 |     num_batches_per_epoch : int, optional
51 |         Number of batches per epoch. Required to convert `patience` from epochs to steps.
52 | mode : {"triangular", "triangular2"}, optional
53 | Defaults to "triangular2".
54 | """
55 |
56 | step_size_up = int(0.5 * patience * num_batches_per_epoch)
57 |
58 | return {
59 | "scheduler": _CyclicLR(
60 | optimizer,
61 | base_lr=min_lr,
62 | max_lr=max_lr,
63 | step_size_up=step_size_up,
64 | mode=mode,
65 | cycle_momentum=False,
66 | ),
67 | "interval": "step",
68 | }
69 |
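A worked example of the step arithmetic, with illustrative numbers: patience=50 epochs at 100 batches per epoch gives step_size_up = int(0.5 * 50 * 100) = 2500 steps, so one full triangular cycle (up then down) spans 5000 steps, i.e. exactly 50 epochs:

    import torch

    optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
    config = CyclicLR(optimizer, min_lr=1e-8, max_lr=1e-3,
                      patience=50, num_batches_per_epoch=100)
    scheduler = config["scheduler"]
    assert scheduler.total_size == 5000  # step_size_up + step_size_down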
--------------------------------------------------------------------------------
/pyannote/audio/cli/lr_schedulers/ReduceLROnPlateau.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Optional, Text
25 |
26 | from torch.optim import Optimizer
27 | from torch.optim.lr_scheduler import ReduceLROnPlateau as _ReduceLROnPlateau
28 |
29 |
30 | def ReduceLROnPlateau(
31 | optimizer: Optimizer,
32 | monitor: Optional[Text] = None,
33 | direction: Optional[Text] = "min",
34 | min_lr: float = 1e-8,
35 | max_lr: float = 1e-3,
36 | factor: float = 0.5,
37 | patience: int = 50,
38 | **kwargs,
39 | ):
40 | """Wrapper around ReduceLROnPlateau learning rate scheduler
41 |
42 | Parameters
43 | ----------
44 | optimizer : Optimizer
45 | Optimizer
46 | min_lr : float, optional
47 | Defaults to 1e-8.
48 | max_lr : float, optional
49 |         Initial learning rate. Defaults to 1e-3.
50 |     factor : float, optional
51 |         Factor by which the learning rate is reduced. Defaults to 0.5.
52 | patience : int, optional
53 | Wait that many epochs with no improvement before reducing the learning rate.
54 | Defaults to 50.
55 | monitor : str, optional
56 | Value to monitor
57 | direction : {"min", "max"}, optional
58 | "min" (resp. "max") means smaller (resp. larger) is better.
59 | """
60 |
61 | # initialize optimizer lr to max_lr
62 | for g in optimizer.param_groups:
63 | g["lr"] = max_lr
64 |
65 | return {
66 | "scheduler": _ReduceLROnPlateau(
67 | optimizer,
68 | mode=direction,
69 | factor=factor,
70 | patience=patience,
71 | threshold=0.0001,
72 | threshold_mode="rel",
73 | cooldown=0,
74 | min_lr=min_lr,
75 | eps=1e-08,
76 | verbose=False,
77 | ),
78 | "interval": "epoch",
79 | "monitor": monitor,
80 | "strict": True,
81 | }
82 |
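The extra "monitor" and "strict" keys are what make this scheduler usable from Lightning: they tell the trainer which logged metric drives the plateau detection and to fail loudly if that metric is missing. A hedged sketch (the metric name is illustrative):

    import torch

    optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))])
    config = ReduceLROnPlateau(optimizer, monitor="loss/val", direction="min")
    print(config["interval"], config["monitor"])  # epoch loss/val
    config["scheduler"].step(0.5)  # manual stepping needs the monitored value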
--------------------------------------------------------------------------------
/pyannote/audio/cli/lr_schedulers/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .CosineAnnealingWarmRestarts import CosineAnnealingWarmRestarts
25 | from .CyclicLR import CyclicLR
26 | from .ReduceLROnPlateau import ReduceLROnPlateau
27 |
28 | __all__ = ["ReduceLROnPlateau", "CyclicLR", "CosineAnnealingWarmRestarts"]
29 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/pretrained.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Text
25 | from pyannote.audio import Model
26 |
27 |
28 | def pretrained(checkpoint: Text):
29 | return Model.from_pretrained(checkpoint, map_location=lambda storage, loc: storage)
30 |
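The `map_location` lambda keeps every tensor on CPU at load time, so GPU-trained checkpoints open fine on CPU-only machines; the model can be moved afterwards. Usage sketch (the checkpoint path is illustrative):

    model = pretrained("/path/to/checkpoint.ckpt")  # loaded on CPU
    model.to("cuda")  # optional: move to GPU once loaded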
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/config.yaml:
--------------------------------------------------------------------------------
1 | protocol: ???
2 |
3 | defaults:
4 | - task: SpeakerDiarization
5 | - model: PyanNet
6 | - optimizer: Adam
7 | - scheduler: CosineAnnealingWarmRestarts
8 | - trainer: default
9 | - hydra: default
10 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/hydra/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 |
3 | run:
4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
5 |
6 | sweep:
7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ}
8 | subdir: ${hydra.job.num}
9 |
10 | output_subdir: ""
11 |
12 | help:
13 | app_name: pyannote-audio-train
14 |
15 | # Help header, customize to describe your app to your users
16 | header: == ${hydra.help.app_name} ==
17 |
18 | footer: |-
19 | Powered by Hydra (https://hydra.cc)
20 | Use --hydra-help to view Hydra specific help
21 |
22 | template: |-
23 | ${hydra.help.header}
24 |
25 | pyannote-audio-train protocol={protocol_name}
26 | +task={task} task.param=...
27 | +model={model} model.param=...
28 | optimizer={optimizer} optimizer.param=...
29 | scheduler={scheduler} scheduler.param=...
30 |
31 | ${hydra.help.footer}
32 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/DebugEmbedding.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.embedding.debug.SimpleEmbeddingModel
3 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/DebugSegmentation.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.segmentation.debug.SimpleSegmentationModel
3 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/Pretrained.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.cli.pretrained
3 | checkpoint: ???
4 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/PyanNet.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.segmentation.PyanNet
3 | sincnet:
4 | stride: 10
5 | lstm:
6 | hidden_size: 128
7 | num_layers: 2
8 | bidirectional: true
9 | monolithic: true
10 | dropout: 0.5
11 | linear:
12 | hidden_size: 128
13 | num_layers: 2
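For reference, a hedged sketch of what Hydra does with such a group config at instantiation time (assumes hydra-core; the file is loaded directly here, whereas the CLI composes it into the full training config first):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("PyanNet.yaml")
    model = instantiate(cfg)  # calls PyanNet(sincnet={...}, lstm={...}, linear={...})
    print(type(model).__name__)  # PyanNet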
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/SSeRiouSS.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.segmentation.SSeRiouSS
3 | wav2vec: WAVLM_BASE
4 | wav2vec_layer: -1
5 | lstm:
6 | hidden_size: 128
7 | num_layers: 4
8 | bidirectional: true
9 | monolithic: true
10 | dropout: 0.5
11 | linear:
12 | hidden_size: 128
13 | num_layers: 2
14 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/XVectorMFCC.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.embedding.XVectorMFCC
3 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/model/XVectorSincNet.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.models.embedding.XVectorSincNet
3 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/optimizer/Adam.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: torch.optim.Adam
3 | lr: 1e-3
4 | betas: [0.9, 0.999]
5 | eps: 1e-08
6 | weight_decay: 0
7 | amsgrad: False
8 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/optimizer/AdamW.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: torch.optim.AdamW
3 | lr: 1e-3
4 | betas: [0.9, 0.999]
5 | eps: 1e-08
6 | weight_decay: 0.01
7 | amsgrad: False
8 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/optimizer/Adan.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: adan_pytorch.Adan
3 | lr: 1e-3
4 | betas: [0.1, 0.1, 0.001]
5 | weight_decay: 0.0
6 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/preprocessor/LowerTemporalResolution.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.utils.preprocessors.LowerTemporalResolution
3 | resolution: 0.1
4 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/scheduler/CosineAnnealingWarmRestarts.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.cli.lr_schedulers.CosineAnnealingWarmRestarts
3 | min_lr: 1e-8
4 | max_lr: 1e-3
5 | patience: 1
6 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/scheduler/CyclicLR.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.cli.lr_schedulers.CyclicLR
3 | min_lr: 1e-8
4 | max_lr: 1e-3
5 | mode: triangular2
6 | patience: 50
7 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/scheduler/ReduceLROnPlateau.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.cli.lr_schedulers.ReduceLROnPlateau
3 | min_lr: 1e-8
4 | max_lr: 1e-3
5 | factor: 0.5
6 | patience: 50
7 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/task/MultiLabelSegmentation.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.tasks.MultiLabelSegmentation
3 | duration: 3.0
4 | warm_up: 0.0
5 | balance: null
6 | weight: null
7 | batch_size: 32
8 | num_workers: null
9 | pin_memory: False
10 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/task/OverlappedSpeechDetection.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.tasks.OverlappedSpeechDetection
3 | duration: 3.0
4 | warm_up: 0.0
5 | balance: null
6 | overlap:
7 | probability: 0.5
8 | snr_min: 0.0
9 | snr_max: 10.0
10 | weight: null
11 | batch_size: 32
12 | num_workers: null
13 | pin_memory: False
14 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/task/SpeakerDiarization.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.tasks.SpeakerDiarization
3 | duration: 5.0
4 | max_speakers_per_chunk: 3
5 | max_speakers_per_frame: 2
6 | batch_size: 32
7 | num_workers: 10
8 | pin_memory: False
9 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/task/SpeakerEmbedding.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.tasks.SupervisedRepresentationLearningWithArcFace
3 | min_duration: 2.0
4 | duration: 5.0
5 | num_classes_per_batch: 512
6 | num_chunks_per_class: 1
7 | margin: 2.0
8 | scale: 12.0
9 | num_workers: null
10 | pin_memory: False
11 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/task/VoiceActivityDetection.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pyannote.audio.tasks.VoiceActivityDetection
3 | duration: 3.0
4 | warm_up: 0.0
5 | balance: null
6 | weight: null
7 | batch_size: 32
8 | num_workers: null
9 | pin_memory: False
10 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/trainer/default.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pytorch_lightning.Trainer
3 | accelerator: auto
4 | accumulate_grad_batches: 1
5 | benchmark: null # TODO: automatically set to True when using fixed duration chunks
6 | deterministic: False
7 | check_val_every_n_epoch: 1
8 | devices: auto
9 | detect_anomaly: False
10 | enable_checkpointing: True
11 | enable_model_summary: True
12 | enable_progress_bar: True
13 | fast_dev_run: False
14 | gradient_clip_val: null
15 | gradient_clip_algorithm: norm
16 | limit_predict_batches: 1.0
17 | limit_test_batches: 1.0
18 | limit_train_batches: 1.0
19 | limit_val_batches: 1.0
20 | log_every_n_steps: 50
21 | max_epochs: 1000
22 | max_steps: -1
23 | max_time: null
24 | min_epochs: 1
25 | min_steps: null
26 | num_nodes: 1
27 | num_sanity_val_steps: 2
28 | overfit_batches: 0.0
29 | precision: 32
30 | profiler: null
31 | reload_dataloaders_every_n_epochs: 0
32 | use_distributed_sampler: True # TODO: check what this does exactly
33 | strategy: auto
34 | sync_batchnorm: False
35 | val_check_interval: 1.0
36 |
--------------------------------------------------------------------------------
/pyannote/audio/cli/train_config/trainer/fast_dev_run.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: pytorch_lightning.Trainer
3 | fast_dev_run: True
4 |
--------------------------------------------------------------------------------
/pyannote/audio/core/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/core/callback.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import List, Mapping, Optional, Text, Union
24 |
25 | from pytorch_lightning import Callback, Trainer
26 | from pytorch_lightning.utilities.model_summary import ModelSummary
27 |
28 | from pyannote.audio import Model
29 |
30 |
31 | class GraduallyUnfreeze(Callback):
32 | """Gradually unfreeze layers
33 |
34 |     1. Start training with all layers frozen, except those that depend on the task
35 |        (i.e. those instantiated in model.build() and task.setup_loss_func())
36 | 2. Train for a few epochs and unfreeze a few more layers
37 | 3. Repeat
38 |
39 | Parameters
40 | ----------
41 | schedule:
42 | See examples for supported format.
43 | epochs_per_stage : int, optional
44 | Number of epochs between each stage. Defaults to 1.
45 | Has no effect if schedule is provided as a {layer_name: epoch} dictionary.
46 |
47 | Usage
48 | -----
49 | >>> callback = GraduallyUnfreeze()
50 | >>> Trainer(callbacks=[callback]).fit(model)
51 |
52 | Examples
53 | --------
54 | # for a model with PyanNet architecture (sincnet > lstm > linear > task_specific),
55 | # those are equivalent and will unfreeze 'linear' at epoch 1, 'lstm' at epoch 2,
56 | # and 'sincnet' at epoch 3.
57 | GraduallyUnfreeze()
58 | GraduallyUnfreeze(schedule=['linear', 'lstm', 'sincnet'])
59 | GraduallyUnfreeze(schedule={'linear': 1, 'lstm': 2, 'sincnet': 3})
60 |
61 | # the following syntax is also possible (with its dict-based equivalent just below):
62 | GraduallyUnfreeze(schedule=['linear', ['lstm', 'sincnet']], epochs_per_stage=10)
63 | GraduallyUnfreeze(schedule={'linear': 10, 'lstm': 20, 'sincnet': 20})
64 | # will unfreeze 'linear' at epoch 10, and both 'lstm' and 'sincnet' at epoch 20.
65 | """
66 |
67 | def __init__(
68 | self,
69 | schedule: Union[Mapping[Text, int], List[Union[List[Text], Text]]] = None,
70 | epochs_per_stage: Optional[int] = None,
71 | ):
72 | super().__init__()
73 |
74 | if (
75 | (schedule is None) or (isinstance(schedule, List))
76 | ) and epochs_per_stage is None:
77 | epochs_per_stage = 1
78 |
79 | self.epochs_per_stage = epochs_per_stage
80 | self.schedule = schedule
81 |
82 | def on_fit_start(self, trainer: Trainer, model: Model):
83 |
84 | schedule = self.schedule
85 |
86 | task_specific_layers = model.task_dependent
87 | backbone_layers = [
88 | layer
89 | for layer, _ in reversed(ModelSummary(model, max_depth=1).named_modules)
90 | if layer not in task_specific_layers
91 | ]
92 |
93 | if schedule is None:
94 | schedule = backbone_layers
95 |
96 | if isinstance(schedule, List):
97 | _schedule = dict()
98 | for depth, layers in enumerate(schedule):
99 | layers = layers if isinstance(layers, List) else [layers]
100 | for layer in layers:
101 | _schedule[layer] = (depth + 1) * self.epochs_per_stage
102 | schedule = _schedule
103 |
104 | self.schedule = schedule
105 |
106 | # freeze all but task specific layers
107 | for layer in backbone_layers:
108 | model.freeze_by_name(layer)
109 |
110 | def on_train_epoch_start(self, trainer: Trainer, model: Model):
111 | for layer, epoch in self.schedule.items():
112 | if epoch == trainer.current_epoch:
113 | model.unfreeze_by_name(layer)
114 |
--------------------------------------------------------------------------------
/pyannote/audio/models/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/models/blocks/pooling.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | import warnings
24 | from typing import Optional
25 |
26 | import torch
27 | import torch.nn as nn
28 | import torch.nn.functional as F
29 |
30 |
31 | def _pool(sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
32 | """Helper function to compute statistics pooling
33 |
34 | Assumes that weights are already interpolated to match the number of frames
35 | in sequences and that they encode the activation of only one speaker.
36 |
37 | Parameters
38 | ----------
39 | sequences : (batch, features, frames) torch.Tensor
40 | Sequences of features.
41 | weights : (batch, frames) torch.Tensor
42 | (Already interpolated) weights.
43 |
44 | Returns
45 | -------
46 | output : (batch, 2 * features) torch.Tensor
47 | Concatenation of mean and (unbiased) standard deviation.
48 | """
49 |
50 | weights = weights.unsqueeze(dim=1)
51 | # (batch, 1, frames)
52 |
53 | v1 = weights.sum(dim=2) + 1e-8
54 | mean = torch.sum(sequences * weights, dim=2) / v1
55 |
56 | dx2 = torch.square(sequences - mean.unsqueeze(2))
57 | v2 = torch.square(weights).sum(dim=2)
58 |
59 | var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8)
60 | std = torch.sqrt(var)
61 |
62 | return torch.cat([mean, std], dim=1)
63 |
64 |
65 | class StatsPool(nn.Module):
66 | """Statistics pooling
67 |
68 |     Compute temporal mean and (unbiased) standard deviation
69 |     and return their concatenation.
70 |
71 | Reference
72 | ---------
73 | https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
74 |
75 | """
76 |
77 | def forward(
78 | self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
79 | ) -> torch.Tensor:
80 | """Forward pass
81 |
82 | Parameters
83 | ----------
84 | sequences : (batch, features, frames) torch.Tensor
85 | Sequences of features.
86 | weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
87 | Compute weighted mean and standard deviation, using provided `weights`.
88 |
89 | Note
90 | ----
91 | `sequences` and `weights` might use a different number of frames, in which case `weights`
92 | are interpolated linearly to reach the number of frames in `sequences`.
93 |
94 | Returns
95 | -------
96 | output : (batch, 2 * features) or (batch, speakers, 2 * features) torch.Tensor
97 | Concatenation of mean and (unbiased) standard deviation. When `weights` are
98 | provided with the `speakers` dimension, `output` is computed for each speaker
99 |             separately and returned as a (batch, speakers, 2 * features)-shaped tensor.
100 | """
101 |
102 | if weights is None:
103 | mean = sequences.mean(dim=-1)
104 | std = sequences.std(dim=-1, correction=1)
105 | return torch.cat([mean, std], dim=-1)
106 |
107 | if weights.dim() == 2:
108 | has_speaker_dimension = False
109 | weights = weights.unsqueeze(dim=1)
110 | # (batch, frames) -> (batch, 1, frames)
111 | else:
112 | has_speaker_dimension = True
113 |
114 | # interpolate weights if needed
115 | _, _, num_frames = sequences.size()
116 | _, num_speakers, num_weights = weights.size()
117 | if num_frames != num_weights:
118 | warnings.warn(
119 | f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers."
120 | )
121 | weights = F.interpolate(weights, size=num_frames, mode="nearest")
122 |
123 | output = torch.stack(
124 | [
125 | _pool(sequences, weights[:, speaker, :])
126 | for speaker in range(num_speakers)
127 | ],
128 | dim=1,
129 | )
130 |
131 | if not has_speaker_dimension:
132 | return output.squeeze(dim=1)
133 |
134 | return output
135 |
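A quick sanity check of the weighted path against the unweighted one: with uniform weights, V1 = T and V2 = T, so the denominator V1 - V2/V1 reduces to T - 1 and both paths compute the same unbiased statistics:

    import torch

    pool = StatsPool()
    x = torch.randn(2, 16, 100)  # (batch, features, frames)
    w = torch.ones(2, 100)       # uniform weights, no speaker dimension
    print(torch.allclose(pool(x, weights=w), pool(x), atol=1e-5))  # True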
--------------------------------------------------------------------------------
/pyannote/audio/models/embedding/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .wespeaker import (
25 | WeSpeakerResNet34,
26 | WeSpeakerResNet152,
27 | WeSpeakerResNet221,
28 | WeSpeakerResNet293,
29 | )
30 | from .xvector import XVectorMFCC, XVectorSincNet
31 |
32 | __all__ = [
33 | "XVectorSincNet",
34 | "XVectorMFCC",
35 | "WeSpeakerResNet34",
36 | "WeSpeakerResNet152",
37 | "WeSpeakerResNet221",
38 | "WeSpeakerResNet293",
39 | ]
40 |
--------------------------------------------------------------------------------
/pyannote/audio/models/embedding/debug.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from functools import lru_cache
25 | from typing import Optional
26 |
27 | import torch
28 | import torch.nn as nn
29 | from einops import rearrange, reduce
30 | from torchaudio.transforms import MFCC
31 |
32 | from pyannote.audio.core.model import Model
33 | from pyannote.audio.core.task import Task
34 |
35 |
36 | class SimpleEmbeddingModel(Model):
37 | def __init__(
38 | self,
39 | sample_rate: int = 16000,
40 | num_channels: int = 1,
41 | task: Optional[Task] = None,
42 | ):
43 | super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)
44 |
45 | self.mfcc = MFCC(
46 | sample_rate=self.hparams.sample_rate,
47 | n_mfcc=40,
48 | dct_type=2,
49 | norm="ortho",
50 | log_mels=False,
51 | )
52 |
53 | self.lstm = nn.LSTM(
54 | self.mfcc.n_mfcc * self.hparams.num_channels,
55 | 32,
56 | num_layers=1,
57 | batch_first=True,
58 | bidirectional=True,
59 | )
60 |
61 | @lru_cache
62 | def num_frames(self, num_samples: int) -> int:
63 | """Compute number of output frames for a given number of input samples
64 |
65 | Parameters
66 | ----------
67 | num_samples : int
68 | Number of input samples
69 |
70 | Returns
71 | -------
72 | num_frames : int
73 | Number of output frames
74 |
75 | Source
76 | ------
77 | https://pytorch.org/docs/stable/generated/torch.stft.html#torch.stft
78 |
79 | """
80 |
81 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length
82 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft
83 | center = self.mfcc.MelSpectrogram.spectrogram.center
84 |
85 | if center:
86 | return 1 + num_samples // hop_length
87 | else:
88 | return 1 + (num_samples - n_fft) // hop_length
89 |
90 | def receptive_field_size(self, num_frames: int = 1) -> int:
91 | """Compute size of receptive field
92 |
93 | Parameters
94 | ----------
95 | num_frames : int, optional
96 | Number of frames in the output signal
97 |
98 | Returns
99 | -------
100 | receptive_field_size : int
101 | Receptive field size.
102 | """
103 |
104 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length
105 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft
106 | return n_fft + (num_frames - 1) * hop_length
107 |
108 | def receptive_field_center(self, frame: int = 0) -> int:
109 | """Compute center of receptive field
110 |
111 | Parameters
112 | ----------
113 | frame : int, optional
114 | Frame index
115 |
116 | Returns
117 | -------
118 | receptive_field_center : int
119 | Index of receptive field center.
120 | """
121 |
122 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length
123 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft
124 | center = self.mfcc.MelSpectrogram.spectrogram.center
125 |
126 | if center:
127 | return frame * hop_length
128 | else:
129 | return frame * hop_length + n_fft // 2
130 |
131 | @property
132 | def dimension(self) -> int:
133 | """Dimension of output"""
134 | return 64
135 |
136 | def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
137 | """
138 |
139 | Parameters
140 | ----------
141 |         waveforms : (batch, channel, time)
142 |
143 | Returns
144 | -------
145 | embedding : (batch, dimension)
146 | """
147 |
148 | mfcc = self.mfcc(waveforms)
149 | output, hidden = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))
150 | # mean temporal pooling
151 | return reduce(output, "b t f -> b f", "mean")
152 |
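A worked example of the frame arithmetic above, assuming torchaudio's MelSpectrogram defaults (n_fft=400, hop_length=200, center=True): one second of 16 kHz audio gives 1 + 16000 // 200 = 81 frames:

    model = SimpleEmbeddingModel()
    print(model.num_frames(16000))  # 81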
--------------------------------------------------------------------------------
/pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker:
--------------------------------------------------------------------------------
1 | Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
2 | 2022 Zhengyang Chen (chenzhengyang117@gmail.com)
3 | 2023 Bing Han (hanbing97@sjtu.edu.cn)
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 |
17 | File `resnet.py` has been borrowed from WeSpeaker that is available under the Apache License, Version 2.0.
18 |
19 | The original file is available at https://github.com/wenet-e2e/wespeaker/blob/c20d765295359e681321625fbefc1a02e8794163/wespeaker/models/resnet.py
20 |
21 | Neither Shuai Wang (@wsstriving on GitHub) nor I (Hervé Bredin, @hbredin on GitHub) am a lawyer, but we both agreed that putting this license file in this directory is enough to comply with the license. See https://github.com/pyannote/pyannote-audio/issues/1537#issuecomment-1808029836. If you know better about this potential MIT/Apache 2.0 compatibility issue, please let us know.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/models/embedding/wespeaker/convert.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | # Script used to convert from WeSpeaker to pyannote.audio
24 |
25 | import sys
26 | from pathlib import Path
27 |
28 | import pytorch_lightning as pl
29 | import torch
30 |
31 | import pyannote.audio.models.embedding.wespeaker as wespeaker
32 | from pyannote.audio import Model
33 | from pyannote.audio.core.task import Problem, Resolution, Specifications
34 |
35 | wespeaker_checkpoint_dir = sys.argv[1] # /path/to/wespeaker_cnceleb-resnet34-LM
36 |
37 | wespeaker_checkpoint = Path(wespeaker_checkpoint_dir) / "wespeaker.pt"
38 |
39 | depth = Path(wespeaker_checkpoint_dir).parts[-1].split("-")[-2][6:] # '34'
40 | Klass = getattr(wespeaker, f"WeSpeakerResNet{depth}") # WeSpeakerResNet34
41 |
42 | duration = 5.0 # whatever
43 | specifications = Specifications(
44 | problem=Problem.REPRESENTATION, resolution=Resolution.CHUNK, duration=duration
45 | )
46 |
47 | state_dict = torch.load(wespeaker_checkpoint, map_location=torch.device("cpu"))
48 | state_dict.pop("projection.weight")
49 |
50 | model = Klass()
51 | model.resnet.load_state_dict(state_dict, strict=True)
52 | model.specifications = specifications
53 |
54 | checkpoint = {"state_dict": model.state_dict()}
55 | model.on_save_checkpoint(checkpoint)
56 | checkpoint["pytorch-lightning_version"] = pl.__version__
57 |
58 | pyannote_checkpoint = Path(wespeaker_checkpoint_dir) / "pytorch_model.bin"
59 | torch.save(checkpoint, pyannote_checkpoint)
60 |
61 | model = Model.from_pretrained(pyannote_checkpoint)
62 | print(model)
63 |
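A worked example of the checkpoint-directory parsing above (the directory name is the one given in the script's own comment): split("-")[-2] selects "resnet34" and [6:] strips the 6-character "resnet" prefix:

    name = "wespeaker_cnceleb-resnet34-LM"
    depth = name.split("-")[-2][6:]   # "resnet34" -> "34"
    print(f"WeSpeakerResNet{depth}")  # WeSpeakerResNet34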
--------------------------------------------------------------------------------
/pyannote/audio/models/segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .PyanNet import PyanNet
24 | from .SSeRiouSS import SSeRiouSS
25 |
26 | __all__ = ["PyanNet", "SSeRiouSS"]
27 |
--------------------------------------------------------------------------------
/pyannote/audio/models/separation/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2024- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .ToTaToNet import ToTaToNet
24 |
25 | __all__ = ["ToTaToNet"]
26 |
--------------------------------------------------------------------------------
/pyannote/audio/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2022 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .multilabel import MultiLabelSegmentation
24 | from .overlapped_speech_detection import OverlappedSpeechDetection
25 | from .resegmentation import Resegmentation
26 | from .speaker_diarization import SpeakerDiarization
27 | from .speech_separation import SpeechSeparation
28 | from .voice_activity_detection import VoiceActivityDetection
29 |
30 | __all__ = [
31 | "VoiceActivityDetection",
32 | "OverlappedSpeechDetection",
33 | "SpeakerDiarization",
34 | "Resegmentation",
35 | "MultiLabelSegmentation",
36 | "SpeechSeparation",
37 | ]
38 |
--------------------------------------------------------------------------------
/pyannote/audio/pipelines/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .diarization import SpeakerDiarizationMixin
24 | from .getter import (
25 | PipelineAugmentation,
26 | PipelineInference,
27 | PipelineModel,
28 | get_augmentation,
29 | get_devices,
30 | get_inference,
31 | get_model,
32 | )
33 | from .oracle import oracle_segmentation
34 |
35 | __all__ = [
36 | "SpeakerDiarizationMixin",
37 | "oracle_segmentation",
38 | "get_augmentation",
39 | "PipelineAugmentation",
40 | "get_devices",
41 | "get_inference",
42 | "PipelineInference",
43 | "get_model",
44 | "PipelineModel",
45 | ]
46 |
--------------------------------------------------------------------------------
/pyannote/audio/pipelines/utils/oracle.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import Optional, Union
24 |
25 | import numpy as np
26 | from pyannote.core import Annotation, Segment, SlidingWindow, SlidingWindowFeature
27 |
28 | from pyannote.audio.core.io import Audio, AudioFile
29 |
30 |
31 | def oracle_segmentation(
32 | file: AudioFile,
33 | window: SlidingWindow,
34 | frames: Union[SlidingWindow, float],
35 | num_speakers: Optional[int] = None,
36 | ) -> SlidingWindowFeature:
37 | """Oracle speaker segmentation
38 |
39 | Simulates inference based on an (imaginary) oracle segmentation model:
40 |
41 | >>> oracle = Model.from_pretrained("oracle")
42 | >>> assert frames == oracle.receptive_field
43 | >>> inference = Inference(oracle, duration=window.duration, step=window.step, skip_aggregation=True)
44 | >>> oracle_segmentation = inference(file)
45 |
46 | Parameters
47 | ----------
48 | file : AudioFile
49 | Audio file with "annotation".
50 | window : SlidingWindow
51 | Sliding window used for inference (see above)
52 | frames : SlidingWindow or float
53 | Output resolution of the oracle model (see above)
54 | num_speakers : int, optional
55 |         Override the number of speakers returned by the oracle segmentation model.
56 |         Defaults to the actual number of speakers in the whole file.
57 |
58 | Returns
59 | -------
60 | oracle_segmentation : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
61 | Oracle segmentation.
62 | """
63 |
64 | if "duration" not in file:
65 | duration = Audio(mono="downmix").get_duration(file)
66 | else:
67 | duration: float = file["duration"]
68 | reference: Annotation = file["annotation"]
69 |
70 | if not isinstance(frames, SlidingWindow):
71 | frames = SlidingWindow(start=0.0, step=frames, duration=frames)
72 |
73 | labels = reference.labels()
74 | actual_num_speakers = len(labels)
75 | if num_speakers is None:
76 | num_speakers = actual_num_speakers
77 |
78 | if num_speakers > actual_num_speakers:
79 | num_missing = num_speakers - actual_num_speakers
80 | labels += [
81 | f"FakeSpeakerForOracleSegmentationInference{i:d}"
82 | for i in range(num_missing)
83 | ]
84 |
85 | window = SlidingWindow(start=0.0, duration=window.duration, step=window.step)
86 |
87 | segmentations = []
88 | for chunk in window(Segment(0.0, duration)):
89 | chunk_segmentation: SlidingWindowFeature = reference.discretize(
90 | chunk,
91 | resolution=frames,
92 | labels=labels,
93 | duration=window.duration,
94 | )
95 |
96 | if num_speakers < actual_num_speakers:
97 | # keep `num_speakers` most talkative speakers
98 | most_talkative_index = np.argsort(-np.sum(chunk_segmentation, axis=0))[
99 | :num_speakers
100 | ]
101 | chunk_segmentation = chunk_segmentation[:, most_talkative_index]
102 |
103 | segmentations.append(chunk_segmentation)
104 |
105 | return SlidingWindowFeature(np.float32(np.stack(segmentations)), window)
106 |
--------------------------------------------------------------------------------
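Note on `oracle_segmentation` above — a minimal usage sketch, assuming the bundled `SAMPLE_FILE` (defined in the sample module below, and carrying an "annotation") and an arbitrary ~16.9ms frame resolution chosen purely for illustration:

    from pyannote.core import SlidingWindow

    from pyannote.audio.pipelines.utils.oracle import oracle_segmentation
    from pyannote.audio.sample import SAMPLE_FILE

    # 5s chunks with a 500ms hop, as a sliding-window inference would use
    window = SlidingWindow(duration=5.0, step=0.5)
    segmentation = oracle_segmentation(SAMPLE_FILE, window, frames=0.016875)
    print(segmentation.data.shape)  # (num_chunks, num_frames, num_speakers)
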
/pyannote/audio/sample/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2024- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from pathlib import Path
25 |
26 | from pyannote.core import Annotation, Segment, Timeline
27 | from pyannote.database.util import load_rttm
28 |
29 | from pyannote.audio.core.io import Audio, AudioFile
30 |
31 |
32 | def _sample() -> AudioFile:
33 | sample_wav = Path(__file__).parent / "sample.wav"
34 | uri = "sample"
35 |
36 | audio = Audio()
37 | waveform, sample_rate = audio(sample_wav)
38 |
39 | sample_rttm = Path(__file__).parent / "sample.rttm"
40 |
41 | annotation: Annotation = load_rttm(sample_rttm)[uri]
42 | duration = audio.get_duration(sample_wav)
43 |
44 | annotated: Timeline = Timeline([Segment(0.0, duration)], uri=uri)
45 |
46 | return {
47 | "audio": sample_wav,
48 | "uri": "sample",
49 | "waveform": waveform,
50 | "sample_rate": sample_rate,
51 | "annotation": annotation,
52 | "annotated": annotated,
53 | }
54 |
55 |
56 | SAMPLE_FILE = _sample()
57 |
--------------------------------------------------------------------------------
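Note on the sample module above — `SAMPLE_FILE` is a ready-made `AudioFile` mapping, handy for quick experiments:

    from pyannote.audio.sample import SAMPLE_FILE

    print(SAMPLE_FILE["uri"])             # "sample"
    print(SAMPLE_FILE["waveform"].shape)  # (channel, sample) torch.Tensor
    print(SAMPLE_FILE["annotation"])      # reference diarization from sample.rttm
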
/pyannote/audio/sample/sample.rttm:
--------------------------------------------------------------------------------
1 | SPEAKER sample 1 6.690 0.430 <NA> <NA> speaker90 <NA> <NA>
2 | SPEAKER sample 1 7.550 0.800 <NA> <NA> speaker91 <NA> <NA>
3 | SPEAKER sample 1 8.320 1.700 <NA> <NA> speaker90 <NA> <NA>
4 | SPEAKER sample 1 9.920 1.110 <NA> <NA> speaker91 <NA> <NA>
5 | SPEAKER sample 1 10.570 4.130 <NA> <NA> speaker90 <NA> <NA>
6 | SPEAKER sample 1 14.490 3.430 <NA> <NA> speaker91 <NA> <NA>
7 | SPEAKER sample 1 18.050 3.440 <NA> <NA> speaker90 <NA> <NA>
8 | SPEAKER sample 1 18.150 0.440 <NA> <NA> speaker91 <NA> <NA>
9 | SPEAKER sample 1 21.780 6.720 <NA> <NA> speaker91 <NA> <NA>
10 | SPEAKER sample 1 27.850 2.150 <NA> <NA> speaker90 <NA> <NA>
11 |
--------------------------------------------------------------------------------
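Note on `sample.rttm` above — each RTTM line reads `SPEAKER <uri> <channel> <start> <duration> <NA> <NA> <speaker> <NA> <NA>`, with times in seconds. A minimal loading sketch (the relative path assumes the file sits in the working directory):

    from pyannote.database.util import load_rttm

    # load_rttm returns a {uri: Annotation} dictionary
    annotation = load_rttm("sample.rttm")["sample"]
    for segment, _, speaker in annotation.itertracks(yield_label=True):
        print(f"{speaker}: {segment.start:.3f}s -> {segment.end:.3f}s")
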
/pyannote/audio/sample/sample.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/pyannote/audio/sample/sample.wav
--------------------------------------------------------------------------------
/pyannote/audio/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from .segmentation.multilabel import MultiLabelSegmentation # isort:skip
24 | from .segmentation.speaker_diarization import SpeakerDiarization # isort:skip
25 | from .separation.PixIT import PixIT # isort:skip
26 | from .segmentation.voice_activity_detection import VoiceActivityDetection # isort:skip
27 | from .segmentation.overlapped_speech_detection import ( # isort:skip
28 | OverlappedSpeechDetection,
29 | )
30 | from .embedding.arcface import SupervisedRepresentationLearningWithArcFace # isort:skip
31 |
32 | # Segmentation has been renamed to SpeakerDiarization but we keep Segmentation here for backward compatibility
33 | Segmentation = SpeakerDiarization
34 |
35 | # SpeakerEmbedding is more human-friendly
36 | SpeakerEmbedding = SupervisedRepresentationLearningWithArcFace
37 |
38 | __all__ = [
39 | "SpeakerDiarization",
40 | "VoiceActivityDetection",
41 | "OverlappedSpeechDetection",
42 | "MultiLabelSegmentation",
43 | "SpeakerEmbedding",
44 | "Segmentation",
45 | "PixIT",
46 | ]
47 |
--------------------------------------------------------------------------------
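Note on the tasks package above — a minimal training-setup sketch; the protocol name is a placeholder for whatever is registered with `pyannote.database` on your side:

    from pyannote.database import registry

    from pyannote.audio.models.segmentation import PyanNet
    from pyannote.audio.tasks import SpeakerDiarization

    protocol = registry.get_protocol("MyDatabase.SpeakerDiarization.MyProtocol")
    task = SpeakerDiarization(protocol, duration=5.0)
    model = PyanNet(task=task)
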
/pyannote/audio/tasks/embedding/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/tasks/embedding/arcface.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from __future__ import annotations
25 |
26 | from typing import Dict, Optional, Sequence, Union
27 |
28 | import pytorch_metric_learning.losses
29 | from pyannote.database import Protocol
30 | from torch_audiomentations.core.transforms_interface import BaseWaveformTransform
31 | from torchmetrics import Metric
32 |
33 | from pyannote.audio.core.task import Task
34 |
35 | from .mixins import SupervisedRepresentationLearningTaskMixin
36 |
37 |
38 | class SupervisedRepresentationLearningWithArcFace(
39 | SupervisedRepresentationLearningTaskMixin,
40 | Task,
41 | ):
42 | """Supervised representation learning with ArcFace loss
43 |
44 | Representation learning is the task of mapping audio chunks to fixed-size embeddings such that chunks from the same speaker end up close to each other while chunks from different speakers end up far apart.
45 |
46 | Parameters
47 | ----------
48 | protocol : Protocol
49 | pyannote.database protocol
50 | duration : float, optional
51 | Chunks duration in seconds. Defaults to two seconds (2.).
52 | min_duration : float, optional
53 | Sample training chunks duration uniformely between `min_duration`
54 | and `duration`. Defaults to `duration` (i.e. fixed length chunks).
55 | num_classes_per_batch : int, optional
56 | Number of classes per batch. Defaults to 32.
57 | num_chunks_per_class : int, optional
58 | Number of chunks per class. Defaults to 1.
59 | margin : float, optional
60 | Margin. Defaults to 28.6.
61 | scale : float, optional
62 | Scale. Defaults to 64.
63 | num_workers : int, optional
64 | Number of workers used for generating training samples.
65 | Defaults to multiprocessing.cpu_count() // 2.
66 | pin_memory : bool, optional
67 | If True, data loaders will copy tensors into CUDA pinned
68 | memory before returning them. See pytorch documentation
69 | for more details. Defaults to False.
70 | augmentation : BaseWaveformTransform, optional
71 | torch_audiomentations waveform transform, used by dataloader
72 | during training.
73 | metric : optional
74 | Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
75 | Defaults to AUROC (area under the ROC curve).
76 | """
77 |
78 | # TODO: add a ".metric" property that tells how speaker embedding trained with this approach
79 | # should be compared. could be a string like "cosine" or "euclidean" or a pdist/cdist-like
80 | # callable. this ".metric" property should be propagated all the way to Inference (via the model).
81 |
82 | def __init__(
83 | self,
84 | protocol: Protocol,
85 | min_duration: Optional[float] = None,
86 | duration: float = 2.0,
87 | num_classes_per_batch: int = 32,
88 | num_chunks_per_class: int = 1,
89 | margin: float = 28.6,
90 | scale: float = 64.0,
91 | num_workers: Optional[int] = None,
92 | pin_memory: bool = False,
93 | augmentation: Optional[BaseWaveformTransform] = None,
94 | metric: Optional[Union[Metric, Sequence[Metric], Dict[str, Metric]]] = None,
95 | ):
96 |
97 | self.num_chunks_per_class = num_chunks_per_class
98 | self.num_classes_per_batch = num_classes_per_batch
99 |
100 | self.margin = margin
101 | self.scale = scale
102 |
103 | super().__init__(
104 | protocol,
105 | duration=duration,
106 | min_duration=min_duration,
107 | batch_size=self.batch_size,
108 | num_workers=num_workers,
109 | pin_memory=pin_memory,
110 | augmentation=augmentation,
111 | metric=metric,
112 | )
113 |
114 | def setup_loss_func(self):
115 |
116 | _, embedding_size = self.model(self.model.example_input_array).shape
117 |
118 | self.model.loss_func = pytorch_metric_learning.losses.ArcFaceLoss(
119 | len(self.specifications.classes),
120 | embedding_size,
121 | margin=self.margin,
122 | scale=self.scale,
123 | )
124 |
--------------------------------------------------------------------------------
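Note on the ArcFace task above — a sketch of instantiating it through its `SpeakerEmbedding` alias; the protocol name is a placeholder, and the effective batch size is `num_classes_per_batch * num_chunks_per_class`:

    from pyannote.database import registry

    from pyannote.audio.tasks import SpeakerEmbedding

    protocol = registry.get_protocol("MyDatabase.SpeakerVerification.MyProtocol")
    task = SpeakerEmbedding(
        protocol,
        duration=2.0,
        num_classes_per_batch=32,  # 32 speakers ...
        num_chunks_per_class=1,    # ... x 1 chunk each = batch of 32
    )
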
/pyannote/audio/tasks/segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/tasks/separation/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2024- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/torchmetrics/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .audio.diarization_error_rate import (
25 | DiarizationErrorRate,
26 | FalseAlarmRate,
27 | MissedDetectionRate,
28 | OptimalDiarizationErrorRate,
29 | OptimalDiarizationErrorRateThreshold,
30 | OptimalFalseAlarmRate,
31 | OptimalMissedDetectionRate,
32 | OptimalSpeakerConfusionRate,
33 | SpeakerConfusionRate,
34 | )
35 |
36 | __all__ = [
37 | "DiarizationErrorRate",
38 | "FalseAlarmRate",
39 | "MissedDetectionRate",
40 | "SpeakerConfusionRate",
41 | "OptimalDiarizationErrorRate",
42 | "OptimalFalseAlarmRate",
43 | "OptimalMissedDetectionRate",
44 | "OptimalSpeakerConfusionRate",
45 | "OptimalDiarizationErrorRateThreshold",
46 | ]
47 |
--------------------------------------------------------------------------------
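Note on the metrics re-exported above — a toy sketch of the torchmetrics-style update/compute cycle, assuming `(batch_size, num_speakers, num_frames)`-shaped tensors with continuous predictions and binary targets:

    import torch

    from pyannote.audio.torchmetrics import DiarizationErrorRate

    metric = DiarizationErrorRate()

    preds = torch.rand(8, 3, 100)                   # speaker activity scores
    target = (torch.rand(8, 3, 100) > 0.5).float()  # binary reference
    metric.update(preds, target)

    print(metric.compute())  # diarization error rate accumulated so far
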
/pyannote/audio/torchmetrics/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .diarization_error_rate import (
25 | DiarizationErrorRate,
26 | FalseAlarmRate,
27 | MissedDetectionRate,
28 | OptimalDiarizationErrorRate,
29 | OptimalDiarizationErrorRateThreshold,
30 | OptimalFalseAlarmRate,
31 | OptimalMissedDetectionRate,
32 | OptimalSpeakerConfusionRate,
33 | SpeakerConfusionRate,
34 | )
35 |
36 | __all__ = [
37 | "DiarizationErrorRate",
38 | "SpeakerConfusionRate",
39 | "MissedDetectionRate",
40 | "FalseAlarmRate",
41 | "OptimalDiarizationErrorRate",
42 | "OptimalSpeakerConfusionRate",
43 | "OptimalMissedDetectionRate",
44 | "OptimalFalseAlarmRate",
45 | "OptimalDiarizationErrorRateThreshold",
46 | ]
47 |
--------------------------------------------------------------------------------
/pyannote/audio/torchmetrics/classification/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from .equal_error_rate import EqualErrorRate
25 |
26 | __all__ = [
27 | "EqualErrorRate",
28 | ]
29 |
--------------------------------------------------------------------------------
/pyannote/audio/torchmetrics/classification/equal_error_rate.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Optional
25 |
26 | import torch
27 | from pyannote.metrics.binary_classification import det_curve
28 | from torchmetrics import Metric
29 | from torchmetrics.utilities.data import dim_zero_cat
30 |
31 |
32 | class EqualErrorRate(Metric):
33 |
34 | is_differentiable: Optional[bool] = False
35 | higher_is_better: Optional[bool] = False
36 | full_state_update: bool = True
37 |
38 | def __init__(self, distances: bool = True, compute_on_cpu: bool = True, **kwargs):
39 | super().__init__(compute_on_cpu=compute_on_cpu, **kwargs)
40 | self.distances = distances
41 | self.add_state("scores", default=[], dist_reduce_fx="cat")
42 | self.add_state("y_true", default=[], dist_reduce_fx="cat")
43 |
44 | def update(self, scores: torch.Tensor, y_true: torch.Tensor) -> None:
45 | self.scores.append(scores)
46 | self.y_true.append(y_true)
47 |
48 | def compute(self) -> torch.Tensor:
49 | scores = dim_zero_cat(self.scores)
50 | y_true = dim_zero_cat(self.y_true)
51 | _, _, _, eer = det_curve(y_true.cpu(), scores.cpu(), distances=self.distances)
52 | return torch.tensor(eer)
53 |
--------------------------------------------------------------------------------
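Note on `EqualErrorRate` above — a toy sanity check with well-separated distance scores (with `distances=True`, positive trials are expected to have *lower* scores):

    import torch

    from pyannote.audio.torchmetrics.classification import EqualErrorRate

    metric = EqualErrorRate(distances=True)

    same = torch.rand(100) * 0.4        # small distances: same-speaker trials
    diff = torch.rand(100) * 0.4 + 0.6  # large distances: different-speaker trials
    metric.update(torch.cat([same, diff]), torch.cat([torch.ones(100), torch.zeros(100)]))

    print(metric.compute())  # ~0.0 on this perfectly separable toy data
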
/pyannote/audio/torchmetrics/functional/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/torchmetrics/functional/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2022- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyannote/audio/utils/multi_task.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Any, Callable, Tuple, Union
25 |
26 | from pyannote.audio.core.model import Specifications
27 |
28 |
29 | def map_with_specifications(
30 | specifications: Union[Specifications, Tuple[Specifications]],
31 | func: Callable,
32 | *iterables,
33 | ) -> Union[Any, Tuple[Any]]:
34 | """Compute the function using arguments from each of the iterables
35 |
36 | Returns a tuple if the provided `specifications` is a tuple,
37 | otherwise returns the function's return value.
38 |
39 | Parameters
40 | ----------
41 | specifications : (tuple of) Specifications
42 | Specifications or tuple of specifications
43 | func : callable
44 | Function called for each specification with
45 | `func(*iterables[i], specifications=specifications[i])`
46 | *iterables :
47 | List of iterables with same length as `specifications`.
48 |
49 | Returns
50 | -------
51 | output : (tuple of) `func` return value(s)
52 | """
53 |
54 | if isinstance(specifications, Specifications):
55 | return func(*iterables, specifications=specifications)
56 |
57 | return tuple(
58 | func(*i, specifications=s) for s, *i in zip(specifications, *iterables)
59 | )
60 |
--------------------------------------------------------------------------------
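Note on `map_with_specifications` above — a toy illustration of the dispatch logic, with plain strings standing in for real `Specifications` objects (any tuple that is not a `Specifications` instance takes the multi-task branch):

    from pyannote.audio.utils.multi_task import map_with_specifications

    def add(x, y, specifications=None):
        return x + y

    # each "task" receives its own slice of every iterable
    print(map_with_specifications(("task1", "task2"), add, (1, 2), (10, 20)))
    # (11, 22)
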
/pyannote/audio/utils/params.py:
--------------------------------------------------------------------------------
1 | # TODO - make it depth-recursive
2 | # TODO - switch to Omegaconf maybe?
3 |
4 | from typing import Optional
5 |
6 |
7 | def merge_dict(defaults: dict, custom: Optional[dict] = None):
8 | params = dict(defaults)
9 | if custom is not None:
10 | params.update(custom)
11 | return params
12 |
--------------------------------------------------------------------------------
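Note on `merge_dict` above — the merge is shallow, as flagged by the TODO. A possible depth-recursive variant (a sketch, not part of the API):

    from typing import Optional

    def merge_dict_recursive(defaults: dict, custom: Optional[dict] = None) -> dict:
        params = dict(defaults)
        for key, value in (custom or {}).items():
            if isinstance(value, dict) and isinstance(params.get(key), dict):
                # recurse into nested dictionaries instead of overwriting them
                params[key] = merge_dict_recursive(params[key], value)
            else:
                params[key] = value
        return params
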
/pyannote/audio/utils/probe.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from functools import partial
25 | from typing import Callable, Dict, List, Set, Text, Union
26 |
27 | import torch.nn as nn
28 |
29 |
30 | def probe(trunk: nn.Module, branches: Union[Dict[Text, Text], List[Text]]) -> Callable:
31 | """Add probing branches to a trunk module
32 |
33 | Parameters
34 | ----------
35 | trunk : nn.Module
36 | Multi-layer trunk.
37 | branches : {branch_name: layer_name} dict or [layer_name] list
38 | Indicate where to plug a probing branch.
39 |
40 | Returns
41 | -------
42 | revert : Callable
43 | Callable that, when called, removes probing branches.
44 |
45 | Usage
46 | -----
47 |
48 | Define a trunk made out of three consecutive layers
49 |
50 | >>> import torch, torch.nn as nn
51 | >>> class Trunk(nn.Module):
52 | ...
53 | ... def __init__(self):
54 | ... super().__init__()
55 | ... self.layer1 = nn.Linear(1, 2)
56 | ... self.layer2 = nn.Linear(2, 3)
57 | ... self.layer3 = nn.Linear(3, 4)
58 | ...
59 | ... def forward(self, x):
60 | ... return self.layer3(self.layer2(self.layer1(x)))
61 |
62 | >>> trunk = Trunk()
63 | >>> x = torch.tensor((0.,))
64 | >>> trunk(x)
65 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=<AddBackward0>)
66 |
67 | Add two probing branches:
68 | - first one is called "probe1" and probes the output of "layer1"
69 | - second one is called "probe2" and probes the output of "layer3"
70 |
71 | >>> revert = probe(trunk, {"probe1": "layer1", "probe2": "layer3"})
72 | >>> trunk(x)
73 | # {'probe1': tensor([ 0.5854, -0.9685], grad_fn=<AddBackward0>),
74 | #  'probe2': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=<AddBackward0>)}
75 |
76 | Use callback returned by `probe` to revert its effect
77 |
78 | >>> revert()
79 | >>> trunk(x)
80 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=<AddBackward0>)
81 |
82 | For convenience, one can also define probes as a list of layers:
83 |
84 | >>> revert = probe(trunk, ['layer1', 'layer3'])
85 | >>> trunk(x)
86 | # {'layer1': tensor([ 0.5854, -0.9685], grad_fn=<AddBackward0>),
87 | #  'layer3': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=<AddBackward0>)}
88 | """
89 |
90 | def remove():
91 | del trunk.__probe
92 | for handle in trunk.__probe_handles:
93 | handle.remove()
94 | del trunk.__probe_handles
95 |
96 | if hasattr(trunk, "__probe"):
97 | remove()
98 |
99 | trunk.__probe_handles = []
100 |
101 | def __probe_init(module, input):
102 | trunk.__probe = dict()
103 |
104 | handle = trunk.register_forward_pre_hook(__probe_init)
105 | trunk.__probe_handles.append(handle)
106 |
107 | def __probe_append(branch_name, module, input, output):
108 | trunk.__probe[branch_name] = output
109 |
110 | if not isinstance(branches, dict):
111 | branches = {b: b for b in branches}
112 |
113 | layer_to_branches: Dict[Text, Set] = dict()  # layer name -> branch names plugged on it
114 | for branch_name, layer_name in branches.items():
115 | if layer_name not in layer_to_branches:
116 | layer_to_branches[layer_name] = set()
117 | layer_to_branches[layer_name].add(branch_name)
118 |
119 | for layer_name, layer in trunk.named_modules():
120 | if layer_name not in layer_to_branches:
121 | continue
122 | for branch_name in layer_to_branches[layer_name]:
123 | handle = layer.register_forward_hook(partial(__probe_append, branch_name))
124 | trunk.__probe_handles.append(handle)
125 |
126 | def __probe_return(module, input, output):
127 | return trunk.__probe
128 |
129 | handle = trunk.register_forward_hook(__probe_return)
130 | trunk.__probe_handles.append(handle)
131 |
132 | return remove
133 |
--------------------------------------------------------------------------------
/pyannote/audio/utils/random.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | import os
25 | import zlib
26 | from random import Random
27 |
28 | import torch
29 |
30 |
31 | def create_rng_for_worker(model) -> Random:
32 | """Create worker-specific random number generator
33 |
34 | This makes sure that
35 | 1. training sample generation is reproducible
36 | 2. every (worker, epoch) combination uses a different seed
37 |
38 | Parameters
39 | ----------
40 | model : Model
41 | Model being trained, used to retrieve its local/global rank and current epoch.
42 | """
43 |
44 | # create random number generator
45 | rng = Random()
46 |
47 | global_seed = os.environ.get("PL_GLOBAL_SEED", "unset")
48 | worker_info = torch.utils.data.get_worker_info()
49 |
50 | if worker_info is None:
51 | worker_id = None
52 | else:
53 | worker_id = worker_info.id
54 |
55 | seed_tuple = (
56 | global_seed,
57 | worker_id,
58 | model.local_rank,
59 | model.global_rank,
60 | model.current_epoch,
61 | )
62 | # use adler32 because python's `hash` is not deterministic.
63 | seed = zlib.adler32(str(seed_tuple).encode())
64 | rng.seed(seed)
65 |
66 | return rng
67 |
--------------------------------------------------------------------------------
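Note on `create_rng_for_worker` above — the seeding idea in isolation: hashing the `(global seed, worker, local rank, global rank, epoch)` tuple with `zlib.adler32` yields a seed that is deterministic yet sensitive to every component (python's built-in `hash` would change across interpreter runs):

    import zlib
    from random import Random

    seed = zlib.adler32(str(("42", 0, 0, 0, 7)).encode())

    # same tuple, same stream
    print(Random(seed).random() == Random(seed).random())  # True

    # bump the epoch, get a different seed (and hence a different stream)
    other = zlib.adler32(str(("42", 0, 0, 0, 8)).encode())
    print(seed != other)  # True
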
/pyannote/audio/utils/receptive_field.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import List
24 |
25 |
26 | def conv1d_num_frames(
27 | num_samples, kernel_size=5, stride=1, padding=0, dilation=1
28 | ) -> int:
29 | """Compute expected number of frames after 1D convolution
30 |
31 | Parameters
32 | ----------
33 | num_samples : int
34 | Number of samples in the input signal
35 | kernel_size : int
36 | Kernel size
37 | stride : int
38 | Stride
39 | padding : int
40 | Padding
41 | dilation : int
42 | Dilation
43 |
44 | Returns
45 | -------
46 | num_frames : int
47 | Number of frames in the output signal
48 |
49 | Source
50 | ------
51 | https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv1d
52 | """
53 | return 1 + (num_samples + 2 * padding - dilation * (kernel_size - 1) - 1) // stride
54 |
55 |
56 | def multi_conv_num_frames(
57 | num_samples: int,
58 | kernel_size: List[int] = None,
59 | stride: List[int] = None,
60 | padding: List[int] = None,
61 | dilation: List[int] = None,
62 | ) -> int:
63 | num_frames = num_samples
64 | for k, s, p, d in zip(kernel_size, stride, padding, dilation):
65 | num_frames = conv1d_num_frames(
66 | num_frames, kernel_size=k, stride=s, padding=p, dilation=d
67 | )
68 |
69 | return num_frames
70 |
71 |
72 | def conv1d_receptive_field_size(
73 | num_frames=1, kernel_size=5, stride=1, padding=0, dilation=1
74 | ):
75 | """Compute size of receptive field
76 |
77 | Parameters
78 | ----------
79 | num_frames : int, optional
80 | Number of frames in the output signal
81 | kernel_size : int
82 | Kernel size
83 | stride : int
84 | Stride
85 | padding : int
86 | Padding
87 | dilation : int
88 | Dilation
89 |
90 | Returns
91 | -------
92 | size : int
93 | Receptive field size
94 | """
95 |
96 | effective_kernel_size = 1 + (kernel_size - 1) * dilation
97 | return effective_kernel_size + (num_frames - 1) * stride - 2 * padding
98 |
99 |
100 | def multi_conv_receptive_field_size(
101 | num_frames: int,
102 | kernel_size: List[int] = None,
103 | stride: List[int] = None,
104 | padding: List[int] = None,
105 | dilation: List[int] = None,
106 | ) -> int:
107 | receptive_field_size = num_frames
108 |
109 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))):
110 | receptive_field_size = conv1d_receptive_field_size(
111 | num_frames=receptive_field_size,
112 | kernel_size=k,
113 | stride=s,
114 | padding=p,
115 | dilation=d,
116 | )
117 | return receptive_field_size
118 |
119 |
120 | def conv1d_receptive_field_center(
121 | frame=0, kernel_size=5, stride=1, padding=0, dilation=1
122 | ) -> int:
123 | """Compute center of receptive field
124 |
125 | Parameters
126 | ----------
127 | frame : int
128 | Frame index
129 | kernel_size : int
130 | Kernel size
131 | stride : int
132 | Stride
133 | padding : int
134 | Padding
135 | dilation : int
136 | Dilation
137 |
138 | Returns
139 | -------
140 | center : int
141 | Index of receptive field center
142 | """
143 |
144 | effective_kernel_size = 1 + (kernel_size - 1) * dilation
145 | return frame * stride + (effective_kernel_size - 1) // 2 - padding
146 |
147 |
148 | def multi_conv_receptive_field_center(
149 | frame: int,
150 | kernel_size: List[int] = None,
151 | stride: List[int] = None,
152 | padding: List[int] = None,
153 | dilation: List[int] = None,
154 | ) -> int:
155 | receptive_field_center = frame
156 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))):
157 | receptive_field_center = conv1d_receptive_field_center(
158 | frame=receptive_field_center,
159 | kernel_size=k,
160 | stride=s,
161 | padding=p,
162 | dilation=d,
163 | )
164 |
165 | return receptive_field_center
166 |
--------------------------------------------------------------------------------
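Note on the receptive field helpers above — a worked example on a hypothetical 3-layer convolution stack (not any specific pyannote model):

    from pyannote.audio.utils.receptive_field import (
        multi_conv_num_frames,
        multi_conv_receptive_field_size,
    )

    kernel_size = [251, 5, 5]
    stride = [80, 1, 1]
    padding = [0, 0, 0]
    dilation = [1, 1, 1]

    # 1 second of 16kHz audio -> number of output frames
    num_frames = multi_conv_num_frames(
        16000, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation
    )

    # number of input samples a single output frame "sees"
    size = multi_conv_receptive_field_size(
        1, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation
    )
    print(num_frames, size)  # 189 891
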
/pyannote/audio/utils/reproducibility.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | # Context: https://github.com/pyannote/pyannote-audio/issues/1370
24 |
25 | import warnings
26 |
27 | import torch
28 |
29 |
30 | class ReproducibilityError(Exception):
31 | ...
32 |
33 |
34 | class ReproducibilityWarning(UserWarning):
35 | ...
36 |
37 |
38 | def raise_reproducibility(device: torch.device):
39 | if (device.type == "cuda") and (
40 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32
41 | ):
42 | raise ReproducibilityError(
43 | "Please disable TensorFloat-32 (TF32) by calling\n"
44 | " >>> import torch\n"
45 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n"
46 | " >>> torch.backends.cudnn.allow_tf32 = False\n"
47 | "or you might face reproducibility issues and obtain lower accuracy.\n"
48 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details."
49 | )
50 |
51 |
52 | def warn_reproducibility(device: torch.device):
53 | if (device.type == "cuda") and (
54 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32
55 | ):
56 | warnings.warn(
57 | ReproducibilityWarning(
58 | "Please disable TensorFloat-32 (TF32) by calling\n"
59 | " >>> import torch\n"
60 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n"
61 | " >>> torch.backends.cudnn.allow_tf32 = False\n"
62 | "or you might face reproducibility issues and obtain lower accuracy.\n"
63 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details."
64 | )
65 | )
66 |
67 |
68 | def fix_reproducibility(device: torch.device):
69 | if (device.type == "cuda") and (
70 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32
71 | ):
72 | torch.backends.cuda.matmul.allow_tf32 = False
73 | torch.backends.cudnn.allow_tf32 = False
74 | warnings.warn(
75 | ReproducibilityWarning(
76 | "TensorFloat-32 (TF32) has been disabled as it might lead to reproducibility issues and lower accuracy.\n"
77 | "It can be re-enabled by calling\n"
78 | " >>> import torch\n"
79 | " >>> torch.backends.cuda.matmul.allow_tf32 = True\n"
80 | " >>> torch.backends.cudnn.allow_tf32 = True\n"
81 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.\n"
82 | )
83 | )
84 |
--------------------------------------------------------------------------------
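Note on the reproducibility helpers above — a typical call-site sketch, placed before running inference:

    import torch

    from pyannote.audio.utils.reproducibility import fix_reproducibility

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # disables TF32 (emitting a ReproducibilityWarning) on CUDA devices;
    # no-op on CPU
    fix_reproducibility(device)
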
/pyannote/audio/utils/version.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from typing import Text
24 |
25 | from semver import VersionInfo
26 |
27 |
28 | def check_version(library: Text, theirs: Text, mine: Text, what: Text = "Pipeline"):
29 |
30 | theirs = ".".join(theirs.split(".")[:3])
31 | mine = ".".join(mine.split(".")[:3])
32 |
33 | theirs = VersionInfo.parse(theirs)
34 | mine = VersionInfo.parse(mine)
35 |
36 | if theirs.major > mine.major:
37 | print(
38 | f"{what} was trained with {library} {theirs}, yours is {mine}. "
39 | f"Bad things will probably happen unless you upgrade {library} to {theirs.major}.x."
40 | )
41 |
42 | elif theirs.major < mine.major:
43 | print(
44 | f"{what} was trained with {library} {theirs}, yours is {mine}. "
45 | f"Bad things might happen unless you revert {library} to {theirs.major}.x."
46 | )
47 |
48 | elif theirs.minor > mine.minor:
49 | print(
50 | f"{what} was trained with {library} {theirs}, yours is {mine}. "
51 | f"This should be OK but you might want to upgrade {library}."
52 | )
53 |
--------------------------------------------------------------------------------
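Note on `check_version` above — it compares only the first three version components and prints (rather than raises) on mismatch; the version strings below are illustrative:

    from pyannote.audio.utils.version import check_version

    # major versions differ: prints a "bad things might happen" warning
    check_version("pyannote.audio", theirs="2.1.1", mine="3.0.0", what="Pipeline")
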
/questions/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Questions
3 |
4 | Your questions should go in this directory.
5 |
6 | Question files should be named with the extension ".question.md".
7 |
--------------------------------------------------------------------------------
/questions/bad_performance.question.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "How can I improve performance?"
3 | alt_titles:
4 | - "Pretrained pipelines do not produce good results on my data. What can I do?"
5 | - "It does not work! Help me!"
6 | ---
7 |
8 | **Long answer:**
9 |
10 | 1. Manually annotate dozens of conversations as precisely as possible.
11 | 2. Separate them into train (80%), development (10%) and test (10%) subsets.
12 | 3. Setup the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization).
13 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb).
14 | 5. Enjoy.
15 |
16 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that.
17 |
--------------------------------------------------------------------------------
/questions/from_memory.question.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Can I apply pretrained pipelines on audio already loaded in memory?"
3 | alt_titles:
4 | - "Can I apply models on an audio array?"
5 | ---
6 |
7 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end.
8 |
--------------------------------------------------------------------------------
/questions/offline.question.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Can I use gated models (and pipelines) offline?"
3 | alt_titles:
4 | - "Why does one need to authenticate to access the pretrained models?"
5 | - "Can I use pyannote.audio pretrained pipelines without the Hugginface token?"
6 | - "How can I solve the permission issue?"
7 | ---
8 |
9 | **Short answer**: yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines.
10 |
11 | **Long answer**: gating models and pipelines allows [me](https://herve.niderb.fr) to learn a bit more about the `pyannote.audio` user base, which in turn helps me write grant proposals to make `pyannote.audio` even better. So, please fill in the gating forms as precisely as possible.
12 |
13 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open-source libraries is time-consuming.
14 |
15 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines.
16 |
--------------------------------------------------------------------------------
/questions/pyannote.question.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "How does one spell and pronounce pyannote.audio?"
3 | alt_titles:
4 | - "Why the name of the library?"
5 | - "Why the logo of the library?"
6 | ---
7 |
8 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic).
9 | 📢 Pronounced like the French verb `pianoter`. `pi` like in `pi`ano, not `py` like in `py`thon.
10 | 🎹 `pianoter` means to play the piano (hence the logo 🤯).
11 |
--------------------------------------------------------------------------------
/questions/streaming.question.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Does pyannote support streaming speaker diarization?"
3 | alt_titles:
4 | - "Is it possible to do realtime speaker diarization?"
5 | - "Can it process online audio buffers?"
6 | ---
7 |
8 | **Short answer:** not out of the box, no.
9 |
10 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get to a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`.
11 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | asteroid-filterbanks >=0.4
2 | einops >=0.6.0
3 | huggingface_hub >= 0.13.0
4 | lightning >= 2.0.1
5 | omegaconf >=2.1,<3.0
6 | pyannote.core >= 5.0.0
7 | pyannote.database >= 5.0.1
8 | pyannote.metrics >= 3.2
9 | pyannote.pipeline >= 3.0.1
10 | pytorch_metric_learning >= 2.1.0
11 | rich >= 12.0.0
12 | semver >= 3.0.0
13 | soundfile >= 0.12.1
14 | speechbrain >= 1.0.0
15 | tensorboardX >= 2.6
16 | torch >= 2.0.0
17 | torch_audiomentations >= 0.11.0
18 | torchaudio >= 2.2.0
19 | torchmetrics >= 0.11.0
20 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # This file is used to configure your project.
2 | # Read more about the various options under:
3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files
4 |
5 | [metadata]
6 | name = pyannote-audio
7 | description = Neural speaker diarization
8 | author = Herve Bredin
9 | author-email = herve.bredin@irit.fr
10 | license = mit
11 | long-description = file: README.md
12 | long-description-content-type = text/markdown; charset=UTF-8; variant=GFM
13 | # Change if running only on Windows, Mac or Linux (comma-separated)
14 | platforms = Linux, Mac
15 | # Add here all kinds of additional classifiers as defined under
16 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers
17 | classifiers =
18 | Development Status :: 4 - Beta
19 | Programming Language :: Python
20 |
21 | [options]
22 | zip_safe = False
23 | packages = find:
24 | include_package_data = True
25 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
26 | setup_requires = pyscaffold>=3.2a0,<3.3a0
27 | # Add here dependencies of your project (semicolon/line-separated), e.g.
28 | # install_requires = numpy; scipy
29 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4
30 | python_requires = >=3.9
31 |
32 | [options.packages.find]
33 | where = .
34 | exclude =
35 | tests
36 |
37 | [options.extras_require]
38 | # Add here additional requirements for extra features, to install with:
39 | # `pip install pyannote-audio[testing]` like:
40 | # PDF = ReportLab; RXP
41 | # Add here test requirements (semicolon/line-separated)
42 | testing =
43 | pytest>=6.0
44 | pytest-cov>=2.10
45 | jupyter
46 | papermill
47 | dev =
48 | pre_commit>=2.7
49 | recommonmark>=0.6
50 | black>=22.3.0
51 | cli =
52 | hydra-core >=1.1,<1.2
53 | typer >= 0.4.0,<0.5.0
54 | separation =
55 | transformers >= 4.39.1
56 | asteroid >=0.7.0
57 |
58 | [options.entry_points]
59 |
60 | console_scripts =
61 | pyannote-audio-train=pyannote.audio.cli.train:train
62 | pyannote-audio-eval=pyannote.audio.cli.evaluate:evaluate
63 |
64 |
65 | [test]
66 | # py.test options when running `python setup.py test`
67 | # addopts = --verbose
68 | extras = True
69 |
70 | [tool:pytest]
71 | # Options for py.test:
72 | # Specify command line options as you would do when invoking py.test directly.
73 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
74 | # in order to write a coverage file that can be read by Jenkins.
75 | addopts =
76 | --cov pyannote --cov-report term-missing
77 | --verbose
78 | norecursedirs =
79 | dist
80 | build
81 | .tox
82 | testpaths = tests
83 |
84 | [aliases]
85 | dists = bdist_wheel
86 |
87 | [bdist_wheel]
88 | # Use this option if your package is pure-python
89 | universal = 1
90 |
91 | [build_sphinx]
92 | source_dir = doc
93 | build_dir = build/sphinx
94 |
95 | [devpi:upload]
96 | # Options for the devpi: PyPI server and packaging tool
97 | # VCS export must be deactivated since we are using setuptools-scm
98 | no-vcs = 1
99 | formats = bdist_wheel
100 |
101 | [flake8]
102 | # Some sane defaults for the code style checker flake8
103 | exclude =
104 | .tox
105 | build
106 | dist
107 | .eggs
108 | docs/conf.py
109 |
110 | [pyscaffold]
111 | # PyScaffold's parameters when the project was created.
112 | # This will be used when updating. Do not change!
113 | version = 3.2.3
114 | package = pyannote-audio
115 | extensions =
116 | markdown
117 | no_skeleton
118 | pre_commit
119 | dsproject
120 |
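The `[options.extras_require]` section above defines the optional installs (`pip install pyannote.audio[cli]`, `[separation]`, ...) and `[options.entry_points]` wires the `pyannote-audio-train` / `pyannote-audio-eval` commands to their CLI modules. An editorial sketch (not shipped with the repo) for inspecting both without installing anything:

```python
# Editorial sketch: read extras and console scripts back from setup.cfg.
from configparser import ConfigParser

cfg = ConfigParser(interpolation=None)
cfg.read("setup.cfg")

for name, deps in cfg["options.extras_require"].items():
    pins = [d.strip() for d in deps.splitlines() if d.strip()]
    print(f"extra '{name}': {', '.join(pins)}")

# e.g. pyannote-audio-train=pyannote.audio.cli.train:train
print(cfg["options.entry_points"]["console_scripts"].strip())
```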
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from pathlib import Path
4 |
5 | from pkg_resources import VersionConflict, require
6 | from setuptools import find_packages, setup
7 |
8 | with open("README.md", mode="r", encoding="utf-8") as f:
9 | long_description = f.read()
10 |
11 | with open("requirements.txt", mode="r", encoding="utf-8") as f:
12 | requirements = f.read().splitlines()
13 |
14 | try:
15 | require("setuptools>=38.3")
16 | except VersionConflict:
17 | print("Error: version of setuptools is too old (<38.3)!")
18 | sys.exit(1)
19 |
20 |
21 | ROOT_DIR = Path(__file__).parent.resolve()
22 | # Resolve the package version, then write pyannote/audio/version.py
23 |
24 | with open("version.txt", mode="r", encoding="utf-8") as f:
25 | version = f.read()
26 |
27 | version = version.strip()
28 | sha = "Unknown"
29 |
30 | if os.getenv("BUILD_VERSION"):
31 | version = os.getenv("BUILD_VERSION")
32 | elif sha != "Unknown":
33 | version += "+" + sha[:7]
34 | print("-- Building version " + version)
35 |
36 | version_path = ROOT_DIR / "pyannote" / "audio" / "version.py"
37 |
38 | with open(version_path, mode="w", encoding="utf-8") as f:
39 | f.write("__version__ = '{}'\n".format(version))
40 |
41 | if __name__ == "__main__":
42 | setup(
43 | name="pyannote.audio",
44 | namespace_packages=["pyannote"],
45 | version=version,
46 | packages=find_packages(),
47 | install_requires=requirements,
48 | description="Neural building blocks for speaker diarization",
49 | long_description=long_description,
50 | long_description_content_type="text/markdown",
51 | author="Hervé Bredin",
52 | author_email="herve.bredin@irit.fr",
53 | url="https://github.com/pyannote/pyannote-audio",
54 | classifiers=[
55 | "Development Status :: 4 - Beta",
56 | "Intended Audience :: Science/Research",
57 | "License :: OSI Approved :: MIT License",
58 | "Natural Language :: English",
59 | "Programming Language :: Python :: 3.9",
60 | "Programming Language :: Python :: 3.10",
61 | "Programming Language :: Python :: 3.11",
62 | "Topic :: Scientific/Engineering",
63 | ],
64 | )
65 |
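Note that `sha` above is hardcoded to `"Unknown"`, so the `version += "+" + sha[:7]` branch can never run as written; the build scripts this pattern is usually borrowed from populate `sha` from git first. A hedged sketch of that missing step (an assumption about intent, not the repo's actual behavior):

```python
# Editorial sketch: fill in `sha` so the "+<short-sha>" local version
# suffix can actually be appended for non-release builds.
import subprocess

try:
    sha = (
        subprocess.check_output(["git", "rev-parse", "HEAD"])
        .decode("ascii")
        .strip()
    )
except (OSError, subprocess.CalledProcessError):
    sha = "Unknown"
```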
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | def pytest_sessionstart(session):
25 | """
26 | Called after the Session object has been created and
27 | before performing collection and entering the run test loop.
28 | """
29 |
30 | from pyannote.database import registry
31 |
32 | registry.load_database("tests/data/database.yml")
33 |
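Because `pytest_sessionstart` runs before collection, every test in the suite can fetch the protocol straight from the registry. A hedged example of what such a test could look like (illustrative, not one of the repo's actual tests):

```python
# Editorial sketch: a test relying on the registry populated above.
from pyannote.database import registry


def test_debug_protocol_lists_ten_training_files():
    protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")
    uris = [file["uri"] for file in protocol.train()]
    assert len(uris) == 10  # matches tests/data/debug.train.lst
```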
--------------------------------------------------------------------------------
/tests/data/database.yml:
--------------------------------------------------------------------------------
1 | Protocols:
2 | Debug:
3 | SpeakerDiarization:
4 | Debug:
5 | scope: database
6 | train:
7 | uri: debug.train.lst
8 | annotation: debug.train.rttm
9 | annotated: debug.train.uem
10 | development:
11 | uri: debug.development.lst
12 | annotation: debug.development.rttm
13 | annotated: debug.development.uem
14 | test:
15 | uri: debug.test.lst
16 | annotation: debug.test.rttm
17 | annotated: debug.test.uem
18 |
19 | Databases:
20 | Debug: ./{uri}.wav
21 |
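Each subset block above points at a `.lst` of uris, an `.rttm` reference annotation, and a `.uem` of annotated regions, while the `Databases` entry turns a uri into an audio path (`./{uri}.wav`, relative to this file). A hedged sketch of how the file is consumed:

```python
# Editorial sketch: load the Debug protocol and walk one subset.
from pyannote.database import registry

registry.load_database("tests/data/database.yml")
protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")

for file in protocol.development():
    # "annotation" is the .rttm reference, "annotated" the .uem regions
    print(file["uri"], file["annotation"], file["annotated"])
```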
--------------------------------------------------------------------------------
/tests/data/debug.development.lst:
--------------------------------------------------------------------------------
1 | dev00
2 | dev01
3 |
--------------------------------------------------------------------------------
/tests/data/debug.development.rttm:
--------------------------------------------------------------------------------
1 | SPEAKER dev00 1 1.440 11.872 MEE009
2 | SPEAKER dev00 1 13.152 3.770 MEE012
3 | SPEAKER dev00 1 18.064 0.336 MEE012
4 | SPEAKER dev00 1 18.201 2.439 MEE009
5 | SPEAKER dev00 1 20.560 1.056 MEE012
6 | SPEAKER dev00 1 21.952 4.320 MEE009
7 | SPEAKER dev00 1 23.072 0.736 MEE012
8 | SPEAKER dev00 1 26.192 2.192 MEE012
9 | SPEAKER dev00 1 28.224 1.776 MEE009
10 | SPEAKER dev01 1 4.304 2.448 MEE012
11 | SPEAKER dev01 1 7.024 4.752 MEE009
12 | SPEAKER dev01 1 15.133 4.515 MEE009
13 | SPEAKER dev01 1 16.384 1.168 MEE012
14 | SPEAKER dev01 1 19.568 0.800 MEE012
15 | SPEAKER dev01 1 21.312 1.280 MEE009
16 | SPEAKER dev01 1 22.464 1.456 MEE012
17 | SPEAKER dev01 1 29.072 0.464 MEE012
18 |
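The columns above are `<type> <uri> <channel> <onset> <duration> <speaker>`, with times in seconds (full RTTM pads extra `<NA>` fields; the companion `.uem` files use `<uri> <channel> <start> <end>`). A hedged sketch of parsing one file's turns into a `pyannote.core.Annotation` by hand:

```python
# Editorial sketch: parse the compact RTTM above into an Annotation.
from pyannote.core import Annotation, Segment

annotation = Annotation(uri="dev00")
with open("tests/data/debug.development.rttm", encoding="utf-8") as f:
    for line in f:
        _, uri, _, onset, duration, speaker = line.split()
        if uri != "dev00":
            continue
        start = float(onset)
        annotation[Segment(start, start + float(duration))] = speaker

print(annotation)  # speaker turns for dev00, overlaps included
```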
--------------------------------------------------------------------------------
/tests/data/debug.development.uem:
--------------------------------------------------------------------------------
1 | dev00 NA 0.000 30.000
2 | dev01 NA 0.000 30.000
3 |
--------------------------------------------------------------------------------
/tests/data/debug.test.lst:
--------------------------------------------------------------------------------
1 | tst00
2 | tst01
3 |
--------------------------------------------------------------------------------
/tests/data/debug.test.rttm:
--------------------------------------------------------------------------------
1 | SPEAKER tst00 1 0.000 1.901 MEE071
2 | SPEAKER tst00 1 0.944 6.124 MEE073
3 | SPEAKER tst00 1 3.492 1.954 FEO072
4 | SPEAKER tst00 1 3.612 8.676 MEE071
5 | SPEAKER tst00 1 3.692 1.887 FEO070
6 | SPEAKER tst00 1 7.891 1.114 FEO070
7 | SPEAKER tst00 1 8.544 3.216 FEO072
8 | SPEAKER tst00 1 12.133 3.301 FEO070
9 | SPEAKER tst00 1 13.120 0.602 FEO072
10 | SPEAKER tst00 1 14.959 0.666 MEE071
11 | SPEAKER tst00 1 15.109 10.155 FEO072
12 | SPEAKER tst00 1 19.006 0.485 FEO070
13 | SPEAKER tst00 1 19.008 4.796 MEE071
14 | SPEAKER tst00 1 20.124 1.044 MEE073
15 | SPEAKER tst00 1 20.222 1.222 FEO070
16 | SPEAKER tst00 1 21.400 1.928 MEE073
17 | SPEAKER tst00 1 23.490 0.750 FEO070
18 | SPEAKER tst00 1 25.344 4.656 MEE073
19 | SPEAKER tst00 1 25.658 0.550 FEO070
20 | SPEAKER tst00 1 27.792 2.208 MEE071
21 | SPEAKER tst00 1 27.879 2.121 FEO072
22 | SPEAKER tst00 1 28.016 1.984 FEO070
23 | SPEAKER tst01 1 4.390 0.350 FEO072
24 | SPEAKER tst01 1 4.773 0.366 MEE073
25 | SPEAKER tst01 1 16.495 0.540 MEE071
26 | SPEAKER tst01 1 24.159 4.388 FEO070
27 | SPEAKER tst01 1 29.008 0.448 MEE073
28 |
--------------------------------------------------------------------------------
/tests/data/debug.test.uem:
--------------------------------------------------------------------------------
1 | tst00 NA 0.000 30.000
2 | tst01 NA 0.000 30.000
3 |
--------------------------------------------------------------------------------
/tests/data/debug.train.lst:
--------------------------------------------------------------------------------
1 | trn00
2 | trn01
3 | trn02
4 | trn03
5 | trn04
6 | trn05
7 | trn06
8 | trn07
9 | trn08
10 | trn09
11 |
--------------------------------------------------------------------------------
/tests/data/debug.train.rttm:
--------------------------------------------------------------------------------
1 | SPEAKER trn00 1 3.168 0.800 MÉO069
2 | SPEAKER trn00 1 5.463 0.640 MÉO069
3 | SPEAKER trn00 1 5.496 0.574 MEE068
4 | SPEAKER trn00 1 10.454 0.499 MÉO069
5 | SPEAKER trn00 1 11.040 4.592 MEE068
6 | SPEAKER trn00 1 16.736 1.410 MÉO069
7 | SPEAKER trn00 1 16.980 2.778 MEE067
8 | SPEAKER trn00 1 18.883 0.490 MEE068