├── .faq
├── FAQ.md
└── suggest.md
├── .gitattributes
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.yml
│ └── config.yml
├── stale.yml
└── workflows
│ ├── doc.yml
│ ├── pypi.yml
│ ├── test.yml
│ └── test_cli.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── FAQ.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── codecov.yml
├── doc
├── gen_docs.py
├── requirements.txt
└── source
│ └── index.rst
├── environment.yaml
├── faq.yml
├── notebook
├── augmentation.ipynb
├── example.ipynb
├── freeze.ipynb
├── inference.ipynb
└── sharing.ipynb
├── pyannote
├── __init__.py
└── audio
│ ├── __init__.py
│ ├── augmentation
│ ├── __init__.py
│ ├── mix.py
│ └── registry.py
│ ├── cli
│ ├── __init__.py
│ ├── config
│ │ └── hydra
│ │ │ └── default.yaml
│ ├── evaluate.py
│ ├── evaluate_config
│ │ ├── __init__.py
│ │ ├── config.yaml
│ │ └── hydra
│ │ │ └── default.yaml
│ ├── lr_schedulers
│ │ ├── CosineAnnealingWarmRestarts.py
│ │ ├── CyclicLR.py
│ │ ├── ReduceLROnPlateau.py
│ │ └── __init__.py
│ ├── pretrained.py
│ ├── train.py
│ └── train_config
│ │ ├── __init__.py
│ │ ├── config.yaml
│ │ ├── hydra
│ │ └── default.yaml
│ │ ├── model
│ │ ├── DebugEmbedding.yaml
│ │ ├── DebugSegmentation.yaml
│ │ ├── Pretrained.yaml
│ │ ├── PyanNet.yaml
│ │ ├── SSeRiouSS.yaml
│ │ ├── XVectorMFCC.yaml
│ │ └── XVectorSincNet.yaml
│ │ ├── optimizer
│ │ ├── Adam.yaml
│ │ ├── AdamW.yaml
│ │ └── Adan.yaml
│ │ ├── preprocessor
│ │ └── LowerTemporalResolution.yaml
│ │ ├── scheduler
│ │ ├── CosineAnnealingWarmRestarts.yaml
│ │ ├── CyclicLR.yaml
│ │ └── ReduceLROnPlateau.yaml
│ │ ├── task
│ │ ├── MultiLabelSegmentation.yaml
│ │ ├── OverlappedSpeechDetection.yaml
│ │ ├── SpeakerDiarization.yaml
│ │ ├── SpeakerEmbedding.yaml
│ │ └── VoiceActivityDetection.yaml
│ │ └── trainer
│ │ ├── default.yaml
│ │ └── fast_dev_run.yaml
│ ├── core
│ ├── __init__.py
│ ├── callback.py
│ ├── inference.py
│ ├── io.py
│ ├── model.py
│ ├── pipeline.py
│ └── task.py
│ ├── models
│ ├── __init__.py
│ ├── blocks
│ │ ├── pooling.py
│ │ └── sincnet.py
│ ├── embedding
│ │ ├── __init__.py
│ │ ├── debug.py
│ │ ├── wespeaker
│ │ │ ├── LICENSE.WeSpeaker
│ │ │ ├── __init__.py
│ │ │ ├── convert.py
│ │ │ └── resnet.py
│ │ └── xvector.py
│ ├── segmentation
│ │ ├── PyanNet.py
│ │ ├── SSeRiouSS.py
│ │ ├── __init__.py
│ │ └── debug.py
│ └── separation
│ │ ├── ToTaToNet.py
│ │ └── __init__.py
│ ├── pipelines
│ ├── __init__.py
│ ├── clustering.py
│ ├── multilabel.py
│ ├── overlapped_speech_detection.py
│ ├── resegmentation.py
│ ├── speaker_diarization.py
│ ├── speaker_verification.py
│ ├── speech_separation.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── diarization.py
│ │ ├── getter.py
│ │ ├── hook.py
│ │ └── oracle.py
│ └── voice_activity_detection.py
│ ├── sample
│ ├── __init__.py
│ ├── sample.rttm
│ └── sample.wav
│ ├── tasks
│ ├── __init__.py
│ ├── embedding
│ │ ├── __init__.py
│ │ ├── arcface.py
│ │ └── mixins.py
│ ├── segmentation
│ │ ├── __init__.py
│ │ ├── mixins.py
│ │ ├── multilabel.py
│ │ ├── overlapped_speech_detection.py
│ │ ├── speaker_diarization.py
│ │ └── voice_activity_detection.py
│ └── separation
│ │ ├── PixIT.py
│ │ └── __init__.py
│ ├── torchmetrics
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ └── diarization_error_rate.py
│ ├── classification
│ │ ├── __init__.py
│ │ └── equal_error_rate.py
│ └── functional
│ │ ├── __init__.py
│ │ └── audio
│ │ ├── __init__.py
│ │ └── diarization_error_rate.py
│ └── utils
│ ├── __init__.py
│ ├── loss.py
│ ├── metric.py
│ ├── multi_task.py
│ ├── params.py
│ ├── permutation.py
│ ├── powerset.py
│ ├── preprocessors.py
│ ├── preview.py
│ ├── probe.py
│ ├── protocol.py
│ ├── random.py
│ ├── receptive_field.py
│ ├── reproducibility.py
│ ├── signal.py
│ └── version.py
├── questions
├── README.md
├── bad_performance.question.md
├── from_memory.question.md
├── offline.question.md
├── pyannote.question.md
└── streaming.question.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
├── conftest.py
├── data
│ ├── database.yml
│ ├── debug.development.lst
│ ├── debug.development.rttm
│ ├── debug.development.uem
│ ├── debug.test.lst
│ ├── debug.test.rttm
│ ├── debug.test.uem
│ ├── debug.train.lst
│ ├── debug.train.rttm
│ ├── debug.train.uem
│ ├── dev00.wav
│ ├── dev01.wav
│ ├── empty.wav
│ ├── trn01.wav
│ ├── trn02.wav
│ ├── trn03.wav
│ ├── trn04.wav
│ ├── trn05.wav
│ ├── trn06.wav
│ ├── trn07.wav
│ ├── trn08.wav
│ ├── trn09.wav
│ ├── trñ00.wav
│ ├── tst00.wav
│ └── tst01.wav
├── inference_test.py
├── io_test.py
├── tasks
│ ├── test_reproducibility.py
│ └── test_specifications.py
├── test_cli.py
├── test_clustering.py
├── test_import_lib.py
├── test_metrics.py
├── test_run_notebooks.py
├── test_sample.py
├── test_speechbrain.py
├── test_stats_pool.py
├── test_train.py
└── utils
│ ├── preview.py
│ ├── probe_util_test.py
│ ├── test_permutation.py
│ └── test_powerset.py
├── tutorials
├── MRE_template.ipynb
├── adapting_pretrained_pipeline.ipynb
├── add_your_own_model.ipynb
├── add_your_own_task.ipynb
├── applying_a_model.ipynb
├── applying_a_pipeline.ipynb
├── assets
│ ├── download-model.png
│ ├── download-pipeline.png
│ ├── prodigy-pyannote.audio.png
│ ├── pyannote.diff.PNG
│ ├── pyannote.review.PNG
│ ├── sample.rttm
│ └── sample.wav
├── community
│ └── offline_usage_speaker_diarization.ipynb
├── intro.ipynb
├── overlapped_speech_detection.ipynb
├── speaker_verification.ipynb
├── training_a_model.ipynb
├── training_with_cli.md
└── voice_activity_detection.ipynb
└── version.txt
/.faq/FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | 4 | {%- for question in questions %} 5 | - [{{ question.title }}](#{{ question.slug }}) 6 | {%- endfor %} 7 | 8 | 9 | {%- for question in questions %} 10 | 11 | 12 | ## {{ question.title }} 13 | 14 | {{ question.body }} 15 | 16 | {%- endfor %} 17 | 18 |
19 | 20 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory) 21 | -------------------------------------------------------------------------------- /.faq/suggest.md: -------------------------------------------------------------------------------- 1 | Thank you for your issue. 2 | 3 | {%- if questions -%} 4 | {% if questions|length == 1 %} 5 | We found the following entry in the [FAQ]({{ faq_url }}) which you may find helpful: 6 | {%- else %} 7 | We found the following entries in the [FAQ]({{ faq_url }}) which you may find helpful: 8 | {%- endif %} 9 | 10 | {% for question in questions %} 11 | - [{{ question.title }}]({{ faq_url }}#{{ question.slug }}) 12 | {%- endfor %} 13 | 14 | {%- else -%} 15 | You might want to check the [FAQ]({{ faq_url }}) if you haven't done so already. 16 | {%- endif %} 17 | 18 | Feel free to close this issue if you found an answer in the FAQ. 19 | 20 | If your issue is a feature request, please read [this](https://xyproblem.info/) first and update your request accordingly, if needed. 21 | 22 | If your issue is a bug report, please provide a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) as a link to a self-contained [Google Colab](https://colab.research.google.com/) notebook containing everything needed to reproduce the bug: 23 | - installation 24 | - data preparation 25 | - model download 26 | - etc. 27 | 28 | Providing an MRE will increase your chances of getting an answer from the community (either maintainers or other power users). 29 | 30 | Companies relying on `pyannote.audio` in production may contact [me](https://herve.niderb.fr) via email regarding: 31 | * paid scientific consulting around speaker diarization and speech processing in general; 32 | * custom models and tailored features (via the local tech transfer office). 33 | 34 | > This is an automated reply, generated by [FAQtory](https://github.com/willmcgugan/faqtory) 35 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | pyannote/audio/_version.py export-subst 2 | notebooks/* linguist-documentation 3 | tutorials/* linguist-documentation 4 | versioneer.py linguist-vendored 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [hbredin] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Report a bug in pyannote.audio 3 | body: 4 | 5 | - type: markdown 6 | attributes: 7 | value: | 8 | When reporting bugs, please follow the guidelines in this template. This helps identify the problem precisely and thus enables contributors to fix it faster. 9 | - Write a descriptive issue title above. 10 | - The golden rule is to **always open *one* issue for *one* bug**. If you notice several bugs and want to report them, make sure to create one new issue for each of them. 11 | - Search [open](https://github.com/pyannote/pyannote-audio/issues) and [closed](https://github.com/pyannote/pyannote-audio/issues?q=is%3Aissue+is%3Aclosed) issues to ensure it has not already been reported.
If you don't find a relevant match or if you're unsure, don't hesitate to **open a new issue**. The bugsquad will handle it from there if it's a duplicate. 12 | - Please always check if your issue is reproducible in the latest version – it may already have been fixed! 13 | - If you use a custom build, please test if your issue is reproducible in official releases too. 14 | 15 | - type: textarea 16 | attributes: 17 | label: Tested versions 18 | description: | 19 | To properly fix a bug, we need to identify if the bug was recently introduced in the library, or if it was always present. 20 | - Please specify the pyannote.audio version you found the issue in, including the **Git commit hash** if using a development build. 21 | - If you can, **please test earlier pyannote.audio versions** and, if applicable, newer versions (development branch). Mention whether the bug is reproducible or not in the versions you tested. 22 | - The aim is for us to identify whether a bug is a **regression**, i.e. an issue that didn't exist in a previous version, but was introduced later on, breaking existing functionality. For example, if a bug is reproducible in 3.2 but not in 3.0, we would like you to test intermediate 3.1 to find which version is the first one where the issue can be reproduced. 23 | placeholder: | 24 | - Reproducible in: 3.1, 3.2, and later 25 | - Not reproducible in: 3.0 26 | validations: 27 | required: true 28 | 29 | - type: input 30 | attributes: 31 | label: System information 32 | description: | 33 | - Specify the OS version, and when relevant hardware information. 34 | - For issues that are likely OS-specific and/or GPU-related, please specify the GPU model and architecture. 35 | - **Bug reports not including the required information may be closed at the maintainers' discretion.** If in doubt, always include all the requested information; it's better to include too much information than not enough. 36 | placeholder: macOS 13.6 - pyannote.audio 3.1.1 - M1 Pro 37 | validations: 38 | required: true 39 | 40 | - type: textarea 41 | attributes: 42 | label: Issue description 43 | description: | 44 | Describe your issue briefly. What doesn't work, and how do you expect it to work instead? 45 | You can include audio, images or videos with drag and drop, and format code blocks or logs with ``` tags. 46 | validations: 47 | required: true 48 | 49 | - type: input 50 | attributes: 51 | label: Minimal reproduction example (MRE) 52 | description: | 53 | Having reproducible issues is a prerequisite for contributors to be able to solve them. 54 | Include a link to a minimal reproduction example using [this Google Colab notebook](https://colab.research.google.com/github/pyannote/pyannote-audio/blob/develop/tutorials/MRE_template.ipynb) as a starting point. 55 | validations: 56 | required: true 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | 3 | contact_links: 4 | 5 | - name: Feature request 6 | url: https://github.com/pyannote/pyannote-audio/discussions 7 | about: Suggest an idea for this project. 8 | 9 | - name: Consulting 10 | url: https://herve.niderb.fr/consulting 11 | about: Using pyannote.audio in production? Make the most of it thanks to our consulting services.
12 | 13 | - name: Premium models 14 | url: https://forms.office.com/e/GdqwVgkZ5C 15 | about: We are considering selling premium models, extensions, or services around pyannote.audio. 16 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 180 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 30 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/doc.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | on: 3 | push: 4 | branches: 5 | - master 6 | 7 | jobs: 8 | build-and-deploy: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | max-parallel: 4 12 | matrix: 13 | python-version: ["3.9"] 14 | 15 | steps: 16 | - uses: actions/checkout@v1 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install . 
25 | pip install -r doc/requirements.txt 26 | - name: Build documentation 27 | run: | 28 | make --directory=doc html 29 | touch ./doc/build/html/.nojekyll 30 | - name: Deploy 31 | env: 32 | ACTIONS_DEPLOY_KEY: ${{ secrets.ACTIONS_DEPLOY_KEY }} 33 | PUBLISH_BRANCH: gh-pages 34 | PUBLISH_DIR: ./doc/build/html 35 | SCRIPT_MODE: true 36 | run: | 37 | wget https://raw.githubusercontent.com/peaceiris/actions-gh-pages/v2/entrypoint.sh 38 | bash ./entrypoint.sh 39 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | with: 16 | python-version: '3.x' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install setuptools wheel twine 21 | - name: Build and publish 22 | env: 23 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 24 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine upload dist/* 28 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | branches: [develop] 8 | 9 | jobs: 10 | build: 11 | timeout-minutes: 20 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.9", "3.10", "3.11"] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install libsndfile 24 | if: matrix.os == 'ubuntu-latest' 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install libsndfile1 28 | - name: Install pyannote.audio 29 | run: | 30 | pip install -e .[dev,testing] 31 | - name: Test with pytest 32 | run: | 33 | pytest -k "not test_cli.py" 34 | -------------------------------------------------------------------------------- /.github/workflows/test_cli.yml: -------------------------------------------------------------------------------- 1 | name: CLI tests 2 | 3 | on: 4 | push: 5 | branches: [develop] 6 | pull_request: 7 | branches: [develop] 8 | 9 | jobs: 10 | build: 11 | timeout-minutes: 20 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.10"] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install libsndfile 24 | if: matrix.os == 'ubuntu-latest' 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install libsndfile1 28 | - name: Install pyannote.audio 29 | run: | 30 | pip install -e .[dev,testing,cli] 31 | - name: Test with pytest 32 | run: | 33 | pytest tests/test_cli.py 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | 
.Python 11 | env/ 12 | .env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | #Ipython Notebook 63 | .ipynb_checkpoints 64 | 65 | notebooks 66 | 67 | experiments 68 | *~ 69 | 70 | *.npy 71 | *.pt 72 | *events.out.tfevents* 73 | *.csv 74 | 75 | # PyCharm 76 | .idea/ 77 | 78 | gh-pages 79 | gh-pages.pub 80 | 81 | *.zip 82 | .mypy_cache/ 83 | .vscode/ 84 | 85 | **/lightning_logs/** 86 | 87 | # Version Output 88 | pyannote/audio/version.py 89 | 90 | # vim 91 | .vim 92 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tutorials/AMI-diarization-setup"] 2 | path = tutorials/AMI-diarization-setup 3 | url = https://github.com/pyannote/AMI-diarization-setup.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '^docs/conf.py' 2 | 3 | repos: 4 | # # Clean Notebooks 5 | # - repo: https://github.com/kynan/nbstripout 6 | # rev: master 7 | # hooks: 8 | # - id: nbstripout 9 | # Format Code 10 | - repo: https://github.com/ambv/black 11 | rev: 22.3.0 12 | hooks: 13 | - id: black 14 | 15 | # Sort imports 16 | - repo: https://github.com/PyCQA/isort 17 | rev: 5.12.0 18 | hooks: 19 | - id: isort 20 | args: ["--profile", "black"] 21 | 22 | # Formatting, Whitespace, etc 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v2.2.3 25 | hooks: 26 | - id: trailing-whitespace 27 | - id: check-added-large-files 28 | args: ['--maxkb=1000'] 29 | - id: check-ast 30 | - id: check-json 31 | - id: check-merge-conflict 32 | - id: check-xml 33 | - id: check-yaml 34 | - id: debug-statements 35 | - id: end-of-file-fixer 36 | - id: requirements-txt-fixer 37 | - id: mixed-line-ending 38 | args: ['--fix=no'] 39 | - id: flake8 40 | args: ['--ignore=E203,E501,F811,E712,W503'] 41 | -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | # Frequently Asked Questions 3 | - [Can I apply pretrained pipelines on audio already loaded in memory?](#can-i-apply-pretrained-pipelines-on-audio-already-loaded-in-memory) 4 | - [Can I use gated models (and pipelines) offline?](#can-i-use-gated-models-and-pipelines-offline) 5 | - [Does pyannote support streaming speaker diarization?](#does-pyannote-support-streaming-speaker-diarization) 6 | - [How can I improve performance?](#how-can-i-improve-performance) 7 | - [How does one spell and pronounce pyannote.audio?](#how-does-one-spell-and-pronounce-pyannoteaudio) 8 | 9 | 10 | ## Can I apply pretrained
pipelines on audio already loaded in memory? 11 | 12 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end. 13 | 14 | 15 | ## Can I use gated models (and pipelines) offline? 16 | 17 | **Short answer:** yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 18 | 19 | **Long answer:** gating models and pipelines allows [me](https://herve.niderb.fr) to know a bit more about the `pyannote.audio` user base and helps me write grant proposals to make `pyannote.audio` even better. So, please fill in the gating forms as precisely as possible. 20 | 21 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open source libraries is time-consuming. 22 | 23 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 24 | 25 | 26 | ## Does pyannote support streaming speaker diarization? 27 | 28 | **Short answer:** not out of the box, no. 29 | 30 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get to a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`. 31 | 32 | 33 | ## How can I improve performance? 34 | 35 | **Long answer:** 36 | 37 | 1. Manually annotate dozens of conversations as precisely as possible. 38 | 2. Separate them into train (80%), development (10%) and test (10%) subsets. 39 | 3. Set up the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization). 40 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb). 41 | 5. Enjoy. 42 | 43 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that. 44 | 45 | 46 | ## How does one spell and pronounce pyannote.audio? 47 | 48 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic). 49 | 📢 Pronounced like the French verb `pianoter`. `pi` like in `pi`ano, not `py` like in `py`thon. 50 | 🎹 `pianoter` means to play the piano (hence the logo 🤯). 51 | 52 |
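As a complement to the first answer above, here is a minimal sketch of the in-memory usage it refers to. It assumes the `{"waveform": ..., "sample_rate": ...}` mapping accepted by pretrained pipelines, and `YOUR_HF_TOKEN` / `conversation.wav` are placeholders; the linked tutorial remains the reference:

```python
import torchaudio

from pyannote.audio import Pipeline

# gated pipeline: requires a (free) Hugging Face access token
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token="YOUR_HF_TOKEN",  # placeholder
)

# audio already loaded in memory, as a (channel, time) tensor
waveform, sample_rate = torchaudio.load("conversation.wav")

# pass a mapping instead of a file path
diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate})

# iterate over speaker turns
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s to {turn.end:.1f}s: {speaker}")
```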
53 | 54 | Generated by [FAQtory](https://github.com/willmcgugan/faqtory) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 CNRS 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include pyannote *.py 2 | recursive-include pyannote *.yaml 3 | recursive-include pyannote *.wav 4 | recursive-include pyannote *.rttm 5 | global-exclude *.pyc 6 | global-exclude __pycache__ 7 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | patch: 4 | default: 5 | enabled: false 6 | -------------------------------------------------------------------------------- /doc/gen_docs.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will generate the rst docs for the api 3 | """ 4 | 5 | import os 6 | from os import path 7 | 8 | bp = breakpoint 9 | 10 | 11 | def capitalise(s): 12 | news = "" 13 | for word in s.split("_"): 14 | news += word.capitalize() 15 | return news 16 | 17 | 18 | def process_dir(level, p): 19 | md = "" 20 | basename = path.basename(p) 21 | 22 | title = capitalise(basename) 23 | md += f"{'#'*level} {title}\n\n" 24 | subdirs = os.listdir(p) 25 | 26 | for f in subdirs: 27 | m = path.join(p, f)  # join against the current directory, not the top-level one 28 | if path.isdir(m): 29 | md += process_dir(level + 1, m) 30 | else: 31 | if "__" in f: 32 | continue 33 | module = m[3:].replace("/", ".")[:-3] 34 | md += f""" 35 | ```eval_rst 36 | .. 
automodule:: {module} 37 | :members: 38 | 39 | ``` 40 | 41 | """ 42 | return md 43 | 44 | 45 | DIR = "../pyannote/audio" 46 | 47 | for module in os.listdir(DIR): 48 | # Each folder will become an rst file 49 | # Each file/folder will have a # prepended to it 50 | # Recursively we will add another # at each level 51 | 52 | # Initialise Markdown 53 | md = "" 54 | 55 | subdir = path.join(DIR, module) 56 | 57 | # Skip if not a directory 58 | if not path.isdir(subdir) or "__" in module: 59 | continue 60 | 61 | md += process_dir(1, subdir) 62 | with open(f"./source/api/{module}.md", "w") as f: 63 | f.write(md) 64 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython==8.10.0 2 | recommonmark 3 | Sphinx==3.0.4 4 | sphinx_rtd_theme==0.4.3 5 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | pyannote.audio 3 | ############## 4 | 5 | `pyannote.audio` is an open-source Python library that provides neural building blocks for speaker diarization. 6 | 7 | Installation 8 | ============ 9 | 10 | :: 11 | 12 | $ conda create -n pyannote python=3.10 13 | $ conda activate pyannote 14 | $ pip install pyannote.audio 15 | 16 | 17 | API documentation 18 | ================= 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: pyannote-audio 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python==3.8.5 7 | - libsndfile==1.0.28 8 | - pip>=20.2 9 | - pip: 10 | - -r requirements.txt 11 | -------------------------------------------------------------------------------- /faq.yml: -------------------------------------------------------------------------------- 1 | # FAQtory settings 2 | 3 | faq_url: "https://github.com/pyannote/pyannote-audio/blob/develop/FAQ.md" # Replace this with the URL to your FAQ.md!
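# The files under `questions_path` (see below) are plain markdown documents.
# A hedged sketch of one such `*.question.md` file, assuming FAQtory's
# front-matter format (with `title` used for the FAQ heading and slug, and
# `alt_titles` helping issue matching; the exact schema may differ):
#
#   ---
#   title: "Does pyannote support streaming speaker diarization?"
#   alt_titles:
#     - "real-time speaker diarization"
#   ---
#
#   **Short answer:** not out of the box, no.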
4 | 5 | questions_path: "./questions" # Where questions should be stored 6 | output_path: "./FAQ.md" # Where FAQ.md should be generated 7 | templates_path: ".faq" # Path to templates 8 | -------------------------------------------------------------------------------- /notebook/augmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# get a 5s excerpt of the first test file\n", 10 | "from pyannote.database import get_protocol, FileFinder\n", 11 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 12 | " preprocessors={\"audio\": FileFinder()})\n", 13 | "\n", 14 | "from pyannote.audio.core.io import Audio\n", 15 | "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", 16 | "file = next(protocol.test())\n", 17 | "\n", 18 | "from pyannote.core import Segment\n", 19 | "waveform, sample_rate = audio.crop(file, Segment(5, 10))\n", 20 | "\n", 21 | "import torch\n", 22 | "waveforms = torch.tensor(waveform)[None, :]" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# play the excerpt\n", 32 | "from IPython.display import Audio as Play\n", 33 | "Play(waveforms.squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# define a model that simply returns the waveform\n", 43 | "from pyannote.audio.core.model import Model\n", 44 | "class Passthrough(Model):\n", 45 | " def forward(self, waveforms):\n", 46 | " return waveforms\n", 47 | " \n", 48 | "identity = Passthrough()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# pass the waveform through this \"identity\" model\n", 58 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# add one torch_audiomentations waveform transform to the model\n", 68 | "from pyannote.audio.augmentation.registry import register_augmentation\n", 69 | "from torch_audiomentations import Gain\n", 70 | "gain = Gain(\n", 71 | " min_gain_in_db=-15.0,\n", 72 | " max_gain_in_db=5.0,\n", 73 | " p=0.5)\n", 74 | "register_augmentation(gain, identity, when='input')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# pass the waveform through the \"augmented\" model\n", 84 | "Play(identity(waveforms).squeeze(), rate=sample_rate, normalize=False, autoplay=True)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.7.9" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /notebook/freeze.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyannote.database import get_protocol, FileFinder\n", 10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 11 | " preprocessors={\"audio\": FileFinder()})" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from pyannote.audio.tasks import VoiceActivityDetection\n", 21 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n", 22 | "import pytorch_lightning as pl" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=16, num_workers=4)\n", 32 | "model = SimpleSegmentationModel(task=vad)\n", 33 | "trainer = pl.Trainer(max_epochs=1)\n", 34 | "_ = trainer.fit(model)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "summary = model.summarize('full')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "model.freeze_up_to('lstm')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "model.unfreeze_up_to('mfcc.MelSpectrogram.spectrogram')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model.freeze_by_name(['lstm', 'activation'])" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.8.5" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 4 95 | } 96 | -------------------------------------------------------------------------------- /notebook/sharing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyannote.database import get_protocol, FileFinder\n", 10 | "protocol = get_protocol('Debug.SpeakerDiarization.Debug', \n", 11 | " preprocessors={\"audio\": FileFinder()})" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Train a model" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyannote.audio.tasks import VoiceActivityDetection\n", 28 | "from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel\n", 29 | "import pytorch_lightning as pl\n", 30 | "\n", 31 | "vad = VoiceActivityDetection(protocol, duration=2., batch_size=32, num_workers=4)\n", 32 | "model = SimpleSegmentationModel(task=vad)\n", 33 | "trainer = pl.Trainer(max_epochs=1, default_root_dir='sharing/')\n", 34 | "_ = trainer.fit(model)" 35 | ] 36 | }, 37 | { 
38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Load a model without knowing its class" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from pyannote.audio import Model\n", 51 | "model = Model.from_pretrained('sharing/lightning_logs/version_0/checkpoints/epoch=0-step=3.ckpt')\n", 52 | "assert isinstance(model, SimpleSegmentationModel)\n", 53 | "\n", 54 | "# checkpoint should work with a URL as well (it relies on pl_load)" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.8.5" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /pyannote/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | __import__("pkg_resources").declare_namespace(__name__) 24 | -------------------------------------------------------------------------------- /pyannote/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | try: 24 | from .version import __version__, git_version # noqa: F401 25 | except ImportError: 26 | pass 27 | 28 | 29 | from .core.inference import Inference 30 | from .core.io import Audio 31 | from .core.model import Model 32 | from .core.pipeline import Pipeline 33 | 34 | __all__ = ["Audio", "Model", "Inference", "Pipeline"] 35 | -------------------------------------------------------------------------------- /pyannote/audio/augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .mix import MixSpeakerDiarization 25 | 26 | __all__ = ["MixSpeakerDiarization"] 27 | -------------------------------------------------------------------------------- /pyannote/audio/augmentation/mix.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import torch 27 | from torch import Tensor 28 | from torch_audiomentations import Mix 29 | 30 | 31 | class MixSpeakerDiarization(Mix): 32 | """ 33 | Create a new sample by mixing it with another random sample from the same batch 34 | 35 | Signal-to-noise ratio (where "noise" is the second random sample) is selected 36 | randomly between `min_snr_in_db` and `max_snr_in_db`. 37 | 38 | Parameters 39 | ---------- 40 | min_snr_in_db : float, optional 41 | Defaults to 0.0 42 | max_snr_in_db : float, optional 43 | Defaults to 5.0 44 | max_num_speakers: int, optional 45 | Maximum number of speakers in mixtures. Defaults to actual maximum number 46 | of speakers in each batch. 47 | """ 48 | 49 | supported_modes = {"per_example", "per_channel"} 50 | 51 | supports_multichannel = True 52 | requires_sample_rate = False 53 | 54 | supports_target = True 55 | requires_target = True 56 | 57 | def __init__( 58 | self, 59 | min_snr_in_db: float = 0.0, 60 | max_snr_in_db: float = 5.0, 61 | mode: str = "per_example", 62 | p: float = 0.5, 63 | p_mode: Optional[str] = None, 64 | sample_rate: Optional[int] = None, 65 | target_rate: Optional[int] = None, 66 | max_num_speakers: Optional[int] = None, 67 | output_type: str = "tensor", 68 | ): 69 | super().__init__( 70 | min_snr_in_db=min_snr_in_db, 71 | max_snr_in_db=max_snr_in_db, 72 | mode=mode, 73 | p=p, 74 | p_mode=p_mode, 75 | sample_rate=sample_rate, 76 | target_rate=target_rate, 77 | output_type=output_type, 78 | ) 79 | self.max_num_speakers = max_num_speakers 80 | 81 | def randomize_parameters( 82 | self, 83 | samples: Optional[Tensor] = None, 84 | sample_rate: Optional[int] = None, 85 | targets: Optional[Tensor] = None, 86 | target_rate: Optional[int] = None, 87 | ): 88 | 89 | batch_size, num_channels, num_samples = samples.shape 90 | snr_distribution = torch.distributions.Uniform( 91 | low=torch.tensor( 92 | self.min_snr_in_db, 93 | dtype=torch.float32, 94 | device=samples.device, 95 | ), 96 | high=torch.tensor( 97 | self.max_snr_in_db, 98 | dtype=torch.float32, 99 | device=samples.device, 100 | ), 101 | validate_args=True, 102 | ) 103 | 104 | # randomize SNRs 105 | self.transform_parameters["snr_in_db"] = snr_distribution.sample( 106 | sample_shape=(batch_size,) 107 | ) 108 | 109 | # count number of active speakers per sample 110 | num_speakers: torch.Tensor = torch.sum(torch.any(targets, dim=-2), dim=-1) 111 | max_num_speakers = self.max_num_speakers or torch.max(num_speakers) 112 | 113 | # randomize index of second sample, constrained by the fact that the 114 | # resulting mixture should have at most max_num_speakers speakers 115 | self.transform_parameters["sample_idx"] = torch.arange( 116 | batch_size, dtype=torch.int64 117 | ) 118 | for n in range(max_num_speakers + 1): 119 | 120 | # indices of samples with exactly n speakers 121 | samples_with_n_speakers = torch.where(num_speakers == n)[0] 122 | num_samples_with_n_speakers = len(samples_with_n_speakers) 123 | if num_samples_with_n_speakers == 0: 124 | continue 125 | 126 | # indices of candidate samples for mixing (i.e. samples that
would keep the mixture within max_num_speakers) 127 | candidates = torch.where(num_speakers + n <= max_num_speakers)[0] 128 | num_candidates = len(candidates) 129 | if num_candidates == 0: 130 | continue 131 | 132 | # sample uniformly from candidate samples 133 | selected_candidates = candidates[ 134 | torch.randint( 135 | 0, 136 | num_candidates, 137 | (num_samples_with_n_speakers,), 138 | device=samples.device, 139 | ) 140 | ] 141 | self.transform_parameters["sample_idx"][ 142 | samples_with_n_speakers 143 | ] = selected_candidates 144 | -------------------------------------------------------------------------------- /pyannote/audio/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .pretrained import pretrained 24 | 25 | __all__ = [ 26 | "pretrained", 27 | ] 28 | -------------------------------------------------------------------------------- /pyannote/audio/cli/config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-train 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-train protocol={protocol_name} 26 | task={task} task.param=... 27 | model={model} model.param=... 28 | optimizer={optimizer} optimizer.param=... 29 | scheduler={scheduler} scheduler.param=...
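    for example (illustrative values; group names mirror the
    train_config/ sub-directories: task/, model/, optimizer/, scheduler/):

    pyannote-audio-train protocol=Debug.SpeakerDiarization.Debug
                         task=VoiceActivityDetection model=PyanNet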
30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import hydra 27 | from omegaconf import DictConfig 28 | from pyannote.database import FileFinder, ProtocolFile, registry 29 | from rich.progress import Progress 30 | 31 | from pyannote.audio import Inference, Model 32 | from pyannote.audio.pipelines.utils import get_devices 33 | from pyannote.audio.utils.metric import DiscreteDiarizationErrorRate 34 | from pyannote.audio.utils.signal import binarize 35 | 36 | 37 | @hydra.main(config_path="evaluate_config", config_name="config") 38 | def evaluate(cfg: DictConfig) -> Optional[float]: 39 | 40 | # load pretrained model 41 | (device,) = get_devices(needs=1) 42 | model = Model.from_pretrained(cfg.model, device=device) 43 | 44 | # load databases into registry if it was specified 45 | if "registry" in cfg: 46 | for database_yml in cfg.registry.split(","): 47 | registry.load_database(database_yml) 48 | 49 | # load evaluation files 50 | protocol = registry.get_protocol( 51 | cfg.protocol, preprocessors={"audio": FileFinder()} 52 | ) 53 | 54 | files = list(getattr(protocol, cfg.subset)()) 55 | 56 | # load evaluation metric 57 | metric = DiscreteDiarizationErrorRate() 58 | 59 | with Progress() as progress: 60 | 61 | main_task = progress.add_task(protocol.name, total=len(files)) 62 | file_task = progress.add_task("Processing", total=1.0) 63 | 64 | def progress_hook(completed: Optional[int] = None, total: Optional[int] = None): 65 | progress.update(file_task, completed=completed / total) 66 | 67 | inference = Inference(model, device=device) 68 | warm_up = cfg.warm_up / inference.duration 69 | 70 | def hypothesis(file: ProtocolFile): 71 | return Inference.trim( 72 | binarize(inference(file, hook=progress_hook)), 73 | warm_up=(warm_up, warm_up), 74 | ) 75 | 76 | for file in files: 77 | progress.update(file_task, description=file["uri"]) 78 | reference = file["annotation"] 79 | uem = file["annotated"] 80 | _ = metric(reference, hypothesis(file), uem=uem) 81 | progress.advance(main_task) 82 | 83 | report = metric.report(display=False) 84 | 85 | with open("report.txt", "w") as f: 86 | 87 | f.write(f"# 
Model: {cfg.model}\n") 88 | f.write(f"# Protocol: {protocol.name}\n") 89 | f.write(f"# Subset: {cfg.subset}\n") 90 | f.write("\n") 91 | report = report.to_string( 92 | index=True, 93 | sparsify=False, 94 | justify="right", 95 | float_format=lambda f: "{0:.2f}".format(f), 96 | ) 97 | f.write(f"{report}") 98 | 99 | 100 | if __name__ == "__main__": 101 | evaluate() 102 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/config.yaml: -------------------------------------------------------------------------------- 1 | model: ??? 2 | protocol: ??? 
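# `???` marks mandatory fields (OmegaConf convention). A typical invocation,
# sketched from the help template in hydra/default.yaml (the protocol name is
# only an illustrative example; any pyannote.database protocol works):
#
#   pyannote-audio-eval model=/path/to/checkpoint.ckpt \
#       protocol=MyDatabase.SpeakerDiarization.MyProtocol subset=test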
3 | warm_up: 0.0 4 | subset: test 5 | 6 | defaults: 7 | - hydra: default 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/evaluate_config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-eval 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-eval registry={path_to_database.yml} 26 | protocol={protocol_name} 27 | subset={test | development | train} 28 | model={path_to_pretrained_model} 29 | warm_up={warm_up_duration_in_seconds} 30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/CosineAnnealingWarmRestarts.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional 24 | 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import ( 27 | CosineAnnealingWarmRestarts as _CosineAnnealingWarmRestarts, 28 | ) 29 | 30 | 31 | def CosineAnnealingWarmRestarts( 32 | optimizer: Optimizer, 33 | min_lr: float = 1e-8, 34 | max_lr: float = 1e-3, 35 | patience: int = 1, 36 | num_batches_per_epoch: Optional[int] = None, 37 | **kwargs, 38 | ): 39 | """Wrapper around CosineAnnealingWarmRestarts 40 | 41 | Parameters 42 | ---------- 43 | optimizer : Optimizer 44 | Optimizer 45 | min_lr : float, optional 46 | Defaults to 1e-8. 47 | max_lr : float, optional 48 | Defaults to 1e-3 49 | patience : int, optional 50 | Number of epochs per cycle. Defaults to 1. 51 | num_batches_per_epoch : int, optional 52 | Number of batches per epoch. 
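    Returns
    -------
    config : dict
        Lightning-style scheduler configuration; a sketch of the intended
        usage, assuming it is returned from a LightningModule's
        ``configure_optimizers``::

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            lr_scheduler = CosineAnnealingWarmRestarts(
                optimizer, min_lr=1e-8, max_lr=1e-3,
                patience=1, num_batches_per_epoch=1000,
            )
            # -> {"scheduler": <scheduler>, "interval": "step"}, i.e.
            #    Lightning steps the scheduler after every batch rather
            #    than after every epoch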
53 | """ 54 | 55 | # initialize optimizer lr to max_lr 56 | for g in optimizer.param_groups: 57 | g["lr"] = max_lr 58 | 59 | num_steps = patience * num_batches_per_epoch 60 | 61 | return { 62 | "scheduler": _CosineAnnealingWarmRestarts( 63 | optimizer, num_steps, eta_min=min_lr, T_mult=2 64 | ), 65 | "interval": "step", 66 | } 67 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/CyclicLR.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional 24 | 25 | from torch.optim import Optimizer 26 | from torch.optim.lr_scheduler import CyclicLR as _CyclicLR 27 | 28 | 29 | def CyclicLR( 30 | optimizer: Optimizer, 31 | min_lr: float = 1e-8, 32 | max_lr: float = 1e-3, 33 | mode: str = "triangular2", 34 | patience: int = 50, 35 | num_batches_per_epoch: Optional[int] = None, 36 | **kwargs, 37 | ): 38 | """Wrapper around CyclicLR learning rate scheduler 39 | 40 | Parameters 41 | ---------- 42 | optimizer : Optimizer 43 | Optimizer 44 | min_lr : float, optional 45 | Defaults to 1e-8. 46 | max_lr : float, optional 47 | Defaults to 1e-3 48 | patience : int, optional 49 | Number of epochs per cycle. Defaults to 50. 50 | num_batches_per_epoch : int, optional 51 | Number of batches per epoch. 52 | mode : {"triangular", "triangular2"}, optional 53 | Defaults to "triangular2". 
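    Usage
    -----
    A hedged sketch (not in the original docstring): as with the other wrappers
    in this module, the returned dictionary plugs into Lightning's
    configure_optimizers(); `num_batches_per_epoch` is required to convert
    `patience` (expressed in epochs) into `step_size_up` (expressed in steps).

    >>> optimizer = torch.optim.Adam(model.parameters())
    >>> lr_scheduler = CyclicLR(optimizer, num_batches_per_epoch=1000)
    >>> # in configure_optimizers:
    >>> # return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}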
54 | """ 55 | 56 | step_size_up = int(0.5 * patience * num_batches_per_epoch) 57 | 58 | return { 59 | "scheduler": _CyclicLR( 60 | optimizer, 61 | base_lr=min_lr, 62 | max_lr=max_lr, 63 | step_size_up=step_size_up, 64 | mode=mode, 65 | cycle_momentum=False, 66 | ), 67 | "interval": "step", 68 | } 69 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/ReduceLROnPlateau.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional, Text 25 | 26 | from torch.optim import Optimizer 27 | from torch.optim.lr_scheduler import ReduceLROnPlateau as _ReduceLROnPlateau 28 | 29 | 30 | def ReduceLROnPlateau( 31 | optimizer: Optimizer, 32 | monitor: Optional[Text] = None, 33 | direction: Optional[Text] = "min", 34 | min_lr: float = 1e-8, 35 | max_lr: float = 1e-3, 36 | factor: float = 0.5, 37 | patience: int = 50, 38 | **kwargs, 39 | ): 40 | """Wrapper around ReduceLROnPlateau learning rate scheduler 41 | 42 | Parameters 43 | ---------- 44 | optimizer : Optimizer 45 | Optimizer 46 | min_lr : float, optional 47 | Defaults to 1e-8. 48 | max_lr : float, optional 49 | Defaults to 1e-3 50 | factor : float, optional 51 | Defaults to 0.5 52 | patience : int, optional 53 | Wait that many epochs with no improvement before reducing the learning rate. 54 | Defaults to 50. 55 | monitor : str, optional 56 | Value to monitor 57 | direction : {"min", "max"}, optional 58 | "min" (resp. "max") means smaller (resp. larger) is better. 
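    Usage
    -----
    A hedged sketch (not in the original docstring): `monitor` must name a metric
    logged by the model, and `direction` must say whether that metric should be
    minimized or maximized.

    >>> optimizer = torch.optim.Adam(model.parameters())
    >>> lr_scheduler = ReduceLROnPlateau(optimizer, monitor="loss/val", direction="min")
    >>> # in configure_optimizers:
    >>> # return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}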
59 | """ 60 | 61 | # initialize optimizer lr to max_lr 62 | for g in optimizer.param_groups: 63 | g["lr"] = max_lr 64 | 65 | return { 66 | "scheduler": _ReduceLROnPlateau( 67 | optimizer, 68 | mode=direction, 69 | factor=factor, 70 | patience=patience, 71 | threshold=0.0001, 72 | threshold_mode="rel", 73 | cooldown=0, 74 | min_lr=min_lr, 75 | eps=1e-08, 76 | verbose=False, 77 | ), 78 | "interval": "epoch", 79 | "monitor": monitor, 80 | "strict": True, 81 | } 82 | -------------------------------------------------------------------------------- /pyannote/audio/cli/lr_schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .CosineAnnealingWarmRestarts import CosineAnnealingWarmRestarts 25 | from .CyclicLR import CyclicLR 26 | from .ReduceLROnPlateau import ReduceLROnPlateau 27 | 28 | __all__ = ["ReduceLROnPlateau", "CyclicLR", "CosineAnnealingWarmRestarts"] 29 | -------------------------------------------------------------------------------- /pyannote/audio/cli/pretrained.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
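# Editorial note (a sketch, not in the original file): this helper is the Hydra
# `_target_` behind the `model=Pretrained` training config, i.e. something like
#
#     pyannote-audio-train ... model=Pretrained model.checkpoint=/path/to/checkpoint.ckpt
#
# (the exact override syntax is assumed, not confirmed by this file). The
# `map_location` below keeps every tensor on CPU at loading time, so that
# Lightning can later move the model to whichever device training runs on.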
22 | 23 | 24 | from typing import Text 25 | from pyannote.audio import Model 26 | 27 | 28 | def pretrained(checkpoint: Text): 29 | return Model.from_pretrained(checkpoint, map_location=lambda storage, loc: storage) 30 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/config.yaml: -------------------------------------------------------------------------------- 1 | protocol: ??? 2 | 3 | defaults: 4 | - task: SpeakerDiarization 5 | - model: PyanNet 6 | - optimizer: Adam 7 | - scheduler: CosineAnnealingWarmRestarts 8 | - trainer: default 9 | - hydra: default 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | run: 4 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 5 | 6 | sweep: 7 | dir: ${protocol}/${now:%Y-%m-%dT%H:%M:%S.%fZ} 8 | subdir: ${hydra.job.num} 9 | 10 | output_subdir: "" 11 | 12 | help: 13 | app_name: pyannote-audio-train 14 | 15 | # Help header, customize to describe your app to your users 16 | header: == ${hydra.help.app_name} == 17 | 18 | footer: |- 19 | Powered by Hydra (https://hydra.cc) 20 | Use --hydra-help to view Hydra specific help 21 | 22 | template: |- 23 | ${hydra.help.header} 24 | 25 | pyannote-audio-train protocol={protocol_name} 26 | +task={task} task.param=... 27 | +model={model} model.param=... 28 | optimizer={optimizer} optimizer.param=... 29 | scheduler={scheduler} scheduler.param=... 
30 | 31 | ${hydra.help.footer} 32 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/DebugEmbedding.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.debug.SimpleEmbeddingModel 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/DebugSegmentation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.debug.SimpleSegmentationModel 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/Pretrained.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.pretrained 3 | checkpoint: ??? 4 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/PyanNet.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.PyanNet 3 | sincnet: 4 | stride: 10 5 | lstm: 6 | hidden_size: 128 7 | num_layers: 2 8 | bidirectional: true 9 | monolithic: true 10 | dropout: 0.5 11 | linear: 12 | hidden_size: 128 13 | num_layers: 2 -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/SSeRiouSS.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.segmentation.SSeRiouSS 3 | wav2vec: WAVLM_BASE 4 | wav2vec_layer: -1 5 | lstm: 6 | hidden_size: 128 7 | num_layers: 4 8 | bidirectional: true 9 | monolithic: true 10 | dropout: 0.5 11 | linear: 12 | hidden_size: 128 13 | num_layers: 2 14 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/XVectorMFCC.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.XVectorMFCC 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/model/XVectorSincNet.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.models.embedding.XVectorSincNet 3 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/Adam.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torch.optim.Adam 3 | lr: 1e-3 4 | betas: [0.9, 0.999] 5 | eps: 1e-08 6 | weight_decay: 0 7 | amsgrad: False 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/AdamW.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torch.optim.AdamW 3 | lr: 1e-3 4 | betas: [0.9, 0.999] 5 | eps: 1e-08 6 | weight_decay: 0.01 7 | amsgrad: False 8 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/optimizer/Adan.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: adan_pytorch.Adan 3 | lr: 1e-3 4 | betas: [0.1, 0.1, 0.001] 5 | weight_decay: 0.0 6 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/preprocessor/LowerTemporalResolution.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.utils.preprocessors.LowerTemporalResolution 3 | resolution: 0.1 4 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/CosineAnnealingWarmRestarts.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.CosineAnnealingWarmRestarts 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | patience: 1 6 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/CyclicLR.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.CyclicLR 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | mode: triangular2 6 | patience: 50 7 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/scheduler/ReduceLROnPlateau.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.cli.lr_schedulers.ReduceLROnPlateau 3 | min_lr: 1e-8 4 | max_lr: 1e-3 5 | factor: 0.5 6 | patience: 50 7 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/MultiLabelSegmentation.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.MultiLabelSegmentation 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | weight: null 7 | batch_size: 32 8 | num_workers: null 9 | pin_memory: False 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/OverlappedSpeechDetection.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.OverlappedSpeechDetection 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | overlap: 7 | probability: 0.5 8 | snr_min: 0.0 9 | snr_max: 10.0 10 | weight: null 11 | batch_size: 32 12 | num_workers: null 13 | pin_memory: False 14 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/SpeakerDiarization.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.SpeakerDiarization 3 | duration: 5.0 4 | max_speakers_per_chunk: 3 5 | max_speakers_per_frame: 2 6 | batch_size: 32 7 | num_workers: 10 8 | pin_memory: False 9 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/SpeakerEmbedding.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.SupervisedRepresentationLearningWithArcFace 3 | min_duration: 2.0 4 | duration: 5.0 5 | num_classes_per_batch: 512 6 | 
num_chunks_per_class: 1 7 | margin: 2.0 8 | scale: 12.0 9 | num_workers: null 10 | pin_memory: False 11 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/task/VoiceActivityDetection.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pyannote.audio.tasks.VoiceActivityDetection 3 | duration: 3.0 4 | warm_up: 0.0 5 | balance: null 6 | weight: null 7 | batch_size: 32 8 | num_workers: null 9 | pin_memory: False 10 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorch_lightning.Trainer 3 | accelerator: auto 4 | accumulate_grad_batches: 1 5 | benchmark: null # TODO: automatically set to True when using fixed duration chunks 6 | deterministic: False 7 | check_val_every_n_epoch: 1 8 | devices: auto 9 | detect_anomaly: False 10 | enable_checkpointing: True 11 | enable_model_summary: True 12 | enable_progress_bar: True 13 | fast_dev_run: False 14 | gradient_clip_val: null 15 | gradient_clip_algorithm: norm 16 | limit_predict_batches: 1.0 17 | limit_test_batches: 1.0 18 | limit_train_batches: 1.0 19 | limit_val_batches: 1.0 20 | log_every_n_steps: 50 21 | max_epochs: 1000 22 | max_steps: -1 23 | max_time: null 24 | min_epochs: 1 25 | min_steps: null 26 | num_nodes: 1 27 | num_sanity_val_steps: 2 28 | overfit_batches: 0.0 29 | precision: 32 30 | profiler: null 31 | reload_dataloaders_every_n_epochs: 0 32 | use_distributed_sampler: True # TODO: check what this does exactly 33 | strategy: auto 34 | sync_batchnorm: False 35 | val_check_interval: 1.0 36 | -------------------------------------------------------------------------------- /pyannote/audio/cli/train_config/trainer/fast_dev_run.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: pytorch_lightning.Trainer 3 | fast_dev_run: True 4 | -------------------------------------------------------------------------------- /pyannote/audio/core/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/core/callback.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from typing import List, Mapping, Optional, Text, Union
24 | 
25 | from pytorch_lightning import Callback, Trainer
26 | from pytorch_lightning.utilities.model_summary import ModelSummary
27 | 
28 | from pyannote.audio import Model
29 | 
30 | 
31 | class GraduallyUnfreeze(Callback):
32 |     """Gradually unfreeze layers
33 | 
34 |     1. Start training with all layers frozen, except those that depend on the task
35 |        (i.e. those instantiated in model.build() and task.setup_loss_func())
36 |     2. Train for a few epochs and unfreeze a few more layers
37 |     3. Repeat
38 | 
39 |     Parameters
40 |     ----------
41 |     schedule:
42 |         See examples for supported format.
43 |     epochs_per_stage : int, optional
44 |         Number of epochs between each stage. Defaults to 1.
45 |         Has no effect if schedule is provided as a {layer_name: epoch} dictionary.
46 | 
47 |     Usage
48 |     -----
49 |     >>> callback = GraduallyUnfreeze()
50 |     >>> Trainer(callbacks=[callback]).fit(model)
51 | 
52 |     Examples
53 |     --------
54 |     # for a model with PyanNet architecture (sincnet > lstm > linear > task_specific),
55 |     # those are equivalent and will unfreeze 'linear' at epoch 1, 'lstm' at epoch 2,
56 |     # and 'sincnet' at epoch 3.
57 |     GraduallyUnfreeze()
58 |     GraduallyUnfreeze(schedule=['linear', 'lstm', 'sincnet'])
59 |     GraduallyUnfreeze(schedule={'linear': 1, 'lstm': 2, 'sincnet': 3})
60 | 
61 |     # the following syntax is also possible (with its dict-based equivalent just below):
62 |     GraduallyUnfreeze(schedule=['linear', ['lstm', 'sincnet']], epochs_per_stage=10)
63 |     GraduallyUnfreeze(schedule={'linear': 10, 'lstm': 20, 'sincnet': 20})
64 |     # will unfreeze 'linear' at epoch 10, and both 'lstm' and 'sincnet' at epoch 20.
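    # with the default (reversed backbone) schedule, `epochs_per_stage` alone
    # spaces the stages evenly. A hedged sketch (not in the original docstring):
    GraduallyUnfreeze(epochs_per_stage=10)
    # will unfreeze 'linear' at epoch 10, 'lstm' at epoch 20, and 'sincnet' at epoch 30.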
65 | """ 66 | 67 | def __init__( 68 | self, 69 | schedule: Union[Mapping[Text, int], List[Union[List[Text], Text]]] = None, 70 | epochs_per_stage: Optional[int] = None, 71 | ): 72 | super().__init__() 73 | 74 | if ( 75 | (schedule is None) or (isinstance(schedule, List)) 76 | ) and epochs_per_stage is None: 77 | epochs_per_stage = 1 78 | 79 | self.epochs_per_stage = epochs_per_stage 80 | self.schedule = schedule 81 | 82 | def on_fit_start(self, trainer: Trainer, model: Model): 83 | 84 | schedule = self.schedule 85 | 86 | task_specific_layers = model.task_dependent 87 | backbone_layers = [ 88 | layer 89 | for layer, _ in reversed(ModelSummary(model, max_depth=1).named_modules) 90 | if layer not in task_specific_layers 91 | ] 92 | 93 | if schedule is None: 94 | schedule = backbone_layers 95 | 96 | if isinstance(schedule, List): 97 | _schedule = dict() 98 | for depth, layers in enumerate(schedule): 99 | layers = layers if isinstance(layers, List) else [layers] 100 | for layer in layers: 101 | _schedule[layer] = (depth + 1) * self.epochs_per_stage 102 | schedule = _schedule 103 | 104 | self.schedule = schedule 105 | 106 | # freeze all but task specific layers 107 | for layer in backbone_layers: 108 | model.freeze_by_name(layer) 109 | 110 | def on_train_epoch_start(self, trainer: Trainer, model: Model): 111 | for layer, epoch in self.schedule.items(): 112 | if epoch == trainer.current_epoch: 113 | model.unfreeze_by_name(layer) 114 | -------------------------------------------------------------------------------- /pyannote/audio/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/blocks/pooling.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020- CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import warnings
24 | from typing import Optional
25 | 
26 | import torch
27 | import torch.nn as nn
28 | import torch.nn.functional as F
29 | 
30 | 
31 | def _pool(sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
32 |     """Helper function to compute statistics pooling
33 | 
34 |     Assumes that weights are already interpolated to match the number of frames
35 |     in sequences and that they encode the activation of only one speaker.
36 | 
37 |     Parameters
38 |     ----------
39 |     sequences : (batch, features, frames) torch.Tensor
40 |         Sequences of features.
41 |     weights : (batch, frames) torch.Tensor
42 |         (Already interpolated) weights.
43 | 
44 |     Returns
45 |     -------
46 |     output : (batch, 2 * features) torch.Tensor
47 |         Concatenation of mean and (unbiased) standard deviation.
48 |     """
49 | 
50 |     weights = weights.unsqueeze(dim=1)
51 |     # (batch, 1, frames)
52 | 
53 |     v1 = weights.sum(dim=2) + 1e-8
54 |     mean = torch.sum(sequences * weights, dim=2) / v1
55 | 
56 |     dx2 = torch.square(sequences - mean.unsqueeze(2))
57 |     v2 = torch.square(weights).sum(dim=2)
58 | 
59 |     var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8)
60 |     std = torch.sqrt(var)
61 | 
62 |     return torch.cat([mean, std], dim=1)
63 | 
64 | 
65 | class StatsPool(nn.Module):
66 |     """Statistics pooling
67 | 
68 |     Computes temporal mean and (unbiased) standard deviation
69 |     and returns their concatenation.
70 | 
71 |     Reference
72 |     ---------
73 |     https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
74 | 
75 |     """
76 | 
77 |     def forward(
78 |         self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
79 |     ) -> torch.Tensor:
80 |         """Forward pass
81 | 
82 |         Parameters
83 |         ----------
84 |         sequences : (batch, features, frames) torch.Tensor
85 |             Sequences of features.
86 |         weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
87 |             Compute weighted mean and standard deviation, using provided `weights`.
88 | 
89 |         Note
90 |         ----
91 |         `sequences` and `weights` might use a different number of frames, in which case `weights`
92 |         are interpolated (with nearest-neighbor interpolation) to reach the number of frames in `sequences`.
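        For instance, an illustrative sketch (not from the original docstring):

        >>> pool = StatsPool()
        >>> sequences = torch.randn(2, 512, 100)  # (batch, features, frames)
        >>> weights = torch.rand(2, 3, 100)       # (batch, speakers, frames)
        >>> pool(sequences, weights).shape
        torch.Size([2, 3, 1024])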
93 | 
94 |         Returns
95 |         -------
96 |         output : (batch, 2 * features) or (batch, speakers, 2 * features) torch.Tensor
97 |             Concatenation of mean and (unbiased) standard deviation. When `weights` are
98 |             provided with the `speakers` dimension, `output` is computed for each speaker
99 |             separately and returned as a (batch, speakers, 2 * features)-shaped tensor.
100 |         """
101 | 
102 |         if weights is None:
103 |             mean = sequences.mean(dim=-1)
104 |             std = sequences.std(dim=-1, correction=1)
105 |             return torch.cat([mean, std], dim=-1)
106 | 
107 |         if weights.dim() == 2:
108 |             has_speaker_dimension = False
109 |             weights = weights.unsqueeze(dim=1)
110 |             # (batch, frames) -> (batch, 1, frames)
111 |         else:
112 |             has_speaker_dimension = True
113 | 
114 |         # interpolate weights if needed
115 |         _, _, num_frames = sequences.size()
116 |         _, num_speakers, num_weights = weights.size()
117 |         if num_frames != num_weights:
118 |             warnings.warn(
119 |                 f"Mismatch between number of frames ({num_frames}) and number of weights ({num_weights})."
120 |             )
121 |             weights = F.interpolate(weights, size=num_frames, mode="nearest")
122 | 
123 |         output = torch.stack(
124 |             [
125 |                 _pool(sequences, weights[:, speaker, :])
126 |                 for speaker in range(num_speakers)
127 |             ],
128 |             dim=1,
129 |         )
130 | 
131 |         if not has_speaker_dimension:
132 |             return output.squeeze(dim=1)
133 | 
134 |         return output
135 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/__init__.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2020-2021 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
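# Typical usage of these embedding models (a hedged sketch, not in the original
# file): wrap one in `Inference` with `window="whole"` to get a single embedding
# per audio file:
#
#     from pyannote.audio import Inference
#     inference = Inference(model, window="whole")
#     embedding = inference("audio.wav")  # (1, dimension)-shaped array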
22 | 23 | 24 | from .wespeaker import ( 25 | WeSpeakerResNet34, 26 | WeSpeakerResNet152, 27 | WeSpeakerResNet221, 28 | WeSpeakerResNet293, 29 | ) 30 | from .xvector import XVectorMFCC, XVectorSincNet 31 | 32 | __all__ = [ 33 | "XVectorSincNet", 34 | "XVectorMFCC", 35 | "WeSpeakerResNet34", 36 | "WeSpeakerResNet152", 37 | "WeSpeakerResNet221", 38 | "WeSpeakerResNet293", 39 | ] 40 | -------------------------------------------------------------------------------- /pyannote/audio/models/embedding/debug.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | 24 | from functools import lru_cache 25 | from typing import Optional 26 | 27 | import torch 28 | import torch.nn as nn 29 | from einops import rearrange, reduce 30 | from torchaudio.transforms import MFCC 31 | 32 | from pyannote.audio.core.model import Model 33 | from pyannote.audio.core.task import Task 34 | 35 | 36 | class SimpleEmbeddingModel(Model): 37 | def __init__( 38 | self, 39 | sample_rate: int = 16000, 40 | num_channels: int = 1, 41 | task: Optional[Task] = None, 42 | ): 43 | super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) 44 | 45 | self.mfcc = MFCC( 46 | sample_rate=self.hparams.sample_rate, 47 | n_mfcc=40, 48 | dct_type=2, 49 | norm="ortho", 50 | log_mels=False, 51 | ) 52 | 53 | self.lstm = nn.LSTM( 54 | self.mfcc.n_mfcc * self.hparams.num_channels, 55 | 32, 56 | num_layers=1, 57 | batch_first=True, 58 | bidirectional=True, 59 | ) 60 | 61 | @lru_cache 62 | def num_frames(self, num_samples: int) -> int: 63 | """Compute number of output frames for a given number of input samples 64 | 65 | Parameters 66 | ---------- 67 | num_samples : int 68 | Number of input samples 69 | 70 | Returns 71 | ------- 72 | num_frames : int 73 | Number of output frames 74 | 75 | Source 76 | ------ 77 | https://pytorch.org/docs/stable/generated/torch.stft.html#torch.stft 78 | 79 | """ 80 | 81 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length 82 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft 83 | center = self.mfcc.MelSpectrogram.spectrogram.center 84 | 85 | if center: 86 | return 1 + num_samples // hop_length 87 | else: 88 | return 1 + (num_samples - n_fft) // hop_length 89 | 90 | def receptive_field_size(self, num_frames: int = 1) -> int: 91 | """Compute size of receptive field 92 | 93 | Parameters 94 | ---------- 95 | num_frames : int, optional 96 | Number of frames in the output signal 97 | 98 | Returns 99 | ------- 100 | receptive_field_size : int 101 | Receptive field size. 102 | """ 103 | 104 | hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length 105 | n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft 106 | return n_fft + (num_frames - 1) * hop_length 107 | 108 | def receptive_field_center(self, frame: int = 0) -> int: 109 | """Compute center of receptive field 110 | 111 | Parameters 112 | ---------- 113 | frame : int, optional 114 | Frame index 115 | 116 | Returns 117 | ------- 118 | receptive_field_center : int 119 | Index of receptive field center. 
120 |         """
121 | 
122 |         hop_length = self.mfcc.MelSpectrogram.spectrogram.hop_length
123 |         n_fft = self.mfcc.MelSpectrogram.spectrogram.n_fft
124 |         center = self.mfcc.MelSpectrogram.spectrogram.center
125 | 
126 |         if center:
127 |             return frame * hop_length
128 |         else:
129 |             return frame * hop_length + n_fft // 2
130 | 
131 |     @property
132 |     def dimension(self) -> int:
133 |         """Dimension of output"""
134 |         return 64
135 | 
136 |     def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
137 |         """
138 | 
139 |         Parameters
140 |         ----------
141 |         waveforms : (batch, channel, time)
142 | 
143 |         Returns
144 |         -------
145 |         embedding : (batch, dimension)
146 |         """
147 | 
148 |         mfcc = self.mfcc(waveforms)
149 |         output, hidden = self.lstm(rearrange(mfcc, "b c f t -> b t (c f)"))
150 |         # mean temporal pooling
151 |         return reduce(output, "b t f -> b f", "mean")
152 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker: --------------------------------------------------------------------------------
1 | Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
2 |                2022 Zhengyang Chen (chenzhengyang117@gmail.com)
3 |                2023 Bing Han (hanbing97@sjtu.edu.cn)
4 | 
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 | 
9 |     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | 
17 | File `resnet.py` has been borrowed from WeSpeaker that is available under the Apache License, Version 2.0.
18 | 
19 | The original file is available at https://github.com/wenet-e2e/wespeaker/blob/c20d765295359e681321625fbefc1a02e8794163/wespeaker/models/resnet.py
20 | 
21 | Neither Shuai Wang (@wsstriving on Github) nor myself (Hervé Bredin, or @hbredin on Github) are lawyers, but we both agreed that putting this license file in this directory is enough to comply with the license. See https://github.com/pyannote/pyannote-audio/issues/1537#issuecomment-1808029836. If you know better about this potential MIT/Apache 2.0 compatibility issue, please let us know.
22 | 
-------------------------------------------------------------------------------- /pyannote/audio/models/embedding/wespeaker/convert.py: --------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2023 CNRS
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Script used to convert from WeSpeaker to pyannote.audio 24 | 25 | import sys 26 | from pathlib import Path 27 | 28 | import pytorch_lightning as pl 29 | import torch 30 | 31 | import pyannote.audio.models.embedding.wespeaker as wespeaker 32 | from pyannote.audio import Model 33 | from pyannote.audio.core.task import Problem, Resolution, Specifications 34 | 35 | wespeaker_checkpoint_dir = sys.argv[1] # /path/to/wespeaker_cnceleb-resnet34-LM 36 | 37 | wespeaker_checkpoint = Path(wespeaker_checkpoint_dir) / "wespeaker.pt" 38 | 39 | depth = Path(wespeaker_checkpoint_dir).parts[-1].split("-")[-2][6:] # '34' 40 | Klass = getattr(wespeaker, f"WeSpeakerResNet{depth}") # WeSpeakerResNet34 41 | 42 | duration = 5.0 # whatever 43 | specifications = Specifications( 44 | problem=Problem.REPRESENTATION, resolution=Resolution.CHUNK, duration=duration 45 | ) 46 | 47 | state_dict = torch.load(wespeaker_checkpoint, map_location=torch.device("cpu")) 48 | state_dict.pop("projection.weight") 49 | 50 | model = Klass() 51 | model.resnet.load_state_dict(state_dict, strict=True) 52 | model.specifications = specifications 53 | 54 | checkpoint = {"state_dict": model.state_dict()} 55 | model.on_save_checkpoint(checkpoint) 56 | checkpoint["pytorch-lightning_version"] = pl.__version__ 57 | 58 | pyannote_checkpoint = Path(wespeaker_checkpoint_dir) / "pytorch_model.bin" 59 | torch.save(checkpoint, pyannote_checkpoint) 60 | 61 | model = Model.from_pretrained(pyannote_checkpoint) 62 | print(model) 63 | -------------------------------------------------------------------------------- /pyannote/audio/models/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
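# Instantiation sketch (editorial, not in the original file), mirroring the
# hyper-parameters of the PyanNet.yaml training config shown earlier:
#
#     from pyannote.audio.models.segmentation import PyanNet
#     model = PyanNet(
#         sincnet={"stride": 10},
#         lstm={"hidden_size": 128, "num_layers": 2, "bidirectional": True},
#         linear={"hidden_size": 128, "num_layers": 2},
#     )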
22 | 23 | from .PyanNet import PyanNet 24 | from .SSeRiouSS import SSeRiouSS 25 | 26 | __all__ = ["PyanNet", "SSeRiouSS"] 27 | -------------------------------------------------------------------------------- /pyannote/audio/models/separation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .ToTaToNet import ToTaToNet 24 | 25 | __all__ = ["ToTaToNet"] 26 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2022 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
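# Usage sketch (editorial, not in the original file), assuming a trained
# segmentation checkpoint:
#
#     from pyannote.audio.pipelines import VoiceActivityDetection
#     pipeline = VoiceActivityDetection(segmentation="path/to/checkpoint.ckpt")
#     pipeline.instantiate({"onset": 0.5, "offset": 0.5,
#                           "min_duration_on": 0.0, "min_duration_off": 0.0})
#     speech = pipeline("audio.wav")  # pyannote.core.Annotation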
22 | 23 | from .multilabel import MultiLabelSegmentation 24 | from .overlapped_speech_detection import OverlappedSpeechDetection 25 | from .resegmentation import Resegmentation 26 | from .speaker_diarization import SpeakerDiarization 27 | from .speech_separation import SpeechSeparation 28 | from .voice_activity_detection import VoiceActivityDetection 29 | 30 | __all__ = [ 31 | "VoiceActivityDetection", 32 | "OverlappedSpeechDetection", 33 | "SpeakerDiarization", 34 | "Resegmentation", 35 | "MultiLabelSegmentation", 36 | "SpeechSeparation", 37 | ] 38 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from .diarization import SpeakerDiarizationMixin 24 | from .getter import ( 25 | PipelineAugmentation, 26 | PipelineInference, 27 | PipelineModel, 28 | get_augmentation, 29 | get_devices, 30 | get_inference, 31 | get_model, 32 | ) 33 | from .oracle import oracle_segmentation 34 | 35 | __all__ = [ 36 | "SpeakerDiarizationMixin", 37 | "oracle_segmentation", 38 | "get_augmentation", 39 | "PipelineAugmentation", 40 | "get_devices", 41 | "get_inference", 42 | "PipelineInference", 43 | "get_model", 44 | "PipelineModel", 45 | ] 46 | -------------------------------------------------------------------------------- /pyannote/audio/pipelines/utils/oracle.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Optional, Union 24 | 25 | import numpy as np 26 | from pyannote.core import Annotation, Segment, SlidingWindow, SlidingWindowFeature 27 | 28 | from pyannote.audio.core.io import Audio, AudioFile 29 | 30 | 31 | def oracle_segmentation( 32 | file: AudioFile, 33 | window: SlidingWindow, 34 | frames: Union[SlidingWindow, float], 35 | num_speakers: Optional[int] = None, 36 | ) -> SlidingWindowFeature: 37 | """Oracle speaker segmentation 38 | 39 | Simulates inference based on an (imaginary) oracle segmentation model: 40 | 41 | >>> oracle = Model.from_pretrained("oracle") 42 | >>> assert frames == oracle.receptive_field 43 | >>> inference = Inference(oracle, duration=window.duration, step=window.step, skip_aggregation=True) 44 | >>> oracle_segmentation = inference(file) 45 | 46 | Parameters 47 | ---------- 48 | file : AudioFile 49 | Audio file with "annotation". 50 | window : SlidingWindow 51 | Sliding window used for inference (see above) 52 | frames : SlidingWindow or float 53 | Output resolution of the oracle model (see above) 54 | num_speakers : int, optional 55 | Override the number of speakers returned by the oracle segmentation model 56 | Defaults to the actual number of speakers in the whole file 57 | 58 | Returns 59 | ------- 60 | oracle_segmentation : (num_chunks, num_frames, num_speakers) SlidingWindowFeature 61 | Oracle segmentation. 
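    Example
    -------
    An illustrative sketch (not from the original docstring), using 5s chunks
    with a 500ms step and 20ms output frames:

    >>> window = SlidingWindow(duration=5.0, step=0.5)
    >>> segmentation = oracle_segmentation(file, window, frames=0.020)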
62 | """ 63 | 64 | if "duration" not in file: 65 | duration = Audio(mono="downmix").get_duration(file) 66 | else: 67 | duration: float = file["duration"] 68 | reference: Annotation = file["annotation"] 69 | 70 | if not isinstance(frames, SlidingWindow): 71 | frames = SlidingWindow(start=0.0, step=frames, duration=frames) 72 | 73 | labels = reference.labels() 74 | actual_num_speakers = len(labels) 75 | if num_speakers is None: 76 | num_speakers = actual_num_speakers 77 | 78 | if num_speakers > actual_num_speakers: 79 | num_missing = num_speakers - actual_num_speakers 80 | labels += [ 81 | f"FakeSpeakerForOracleSegmentationInference{i:d}" 82 | for i in range(num_missing) 83 | ] 84 | 85 | window = SlidingWindow(start=0.0, duration=window.duration, step=window.step) 86 | 87 | segmentations = [] 88 | for chunk in window(Segment(0.0, duration)): 89 | chunk_segmentation: SlidingWindowFeature = reference.discretize( 90 | chunk, 91 | resolution=frames, 92 | labels=labels, 93 | duration=window.duration, 94 | ) 95 | 96 | if num_speakers < actual_num_speakers: 97 | # keep `num_speakers` most talkative speakers 98 | most_talkative_index = np.argsort(-np.sum(chunk_segmentation, axis=0))[ 99 | :num_speakers 100 | ] 101 | chunk_segmentation = chunk_segmentation[:, most_talkative_index] 102 | 103 | segmentations.append(chunk_segmentation) 104 | 105 | return SlidingWindowFeature(np.float32(np.stack(segmentations)), window) 106 | -------------------------------------------------------------------------------- /pyannote/audio/sample/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
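# Usage sketch (editorial, not in the original file): SAMPLE_FILE, built below,
# follows the AudioFile protocol and can be fed directly to models, pipelines,
# or Inference, e.g.
#
#     from pyannote.audio.sample import SAMPLE_FILE
#     diarization = pipeline(SAMPLE_FILE)  # assuming `pipeline` is already instantiated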
22 | 23 | 24 | from pathlib import Path 25 | 26 | from pyannote.core import Annotation, Segment, Timeline 27 | from pyannote.database.util import load_rttm 28 | 29 | from pyannote.audio.core.io import Audio, AudioFile 30 | 31 | 32 | def _sample() -> AudioFile: 33 | sample_wav = Path(__file__).parent / "sample.wav" 34 | uri = "sample" 35 | 36 | audio = Audio() 37 | waveform, sample_rate = audio(sample_wav) 38 | 39 | sample_rttm = Path(__file__).parent / "sample.rttm" 40 | 41 | annotation: Annotation = load_rttm(sample_rttm)[uri] 42 | duration = audio.get_duration(sample_wav) 43 | 44 | annotated: Timeline = Timeline([Segment(0.0, duration)], uri=uri) 45 | 46 | return { 47 | "audio": sample_wav, 48 | "uri": "sample", 49 | "waveform": waveform, 50 | "sample_rate": sample_rate, 51 | "annotation": annotation, 52 | "annotated": annotated, 53 | } 54 | 55 | 56 | SAMPLE_FILE = _sample() 57 | -------------------------------------------------------------------------------- /pyannote/audio/sample/sample.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER sample 1 6.690 0.430 speaker90 2 | SPEAKER sample 1 7.550 0.800 speaker91 3 | SPEAKER sample 1 8.320 1.700 speaker90 4 | SPEAKER sample 1 9.920 1.110 speaker91 5 | SPEAKER sample 1 10.570 4.130 speaker90 6 | SPEAKER sample 1 14.490 3.430 speaker91 7 | SPEAKER sample 1 18.050 3.440 speaker90 8 | SPEAKER sample 1 18.150 0.440 speaker91 9 | SPEAKER sample 1 21.780 6.720 speaker91 10 | SPEAKER sample 1 27.850 2.150 speaker90 11 | -------------------------------------------------------------------------------- /pyannote/audio/sample/sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/pyannote/audio/sample/sample.wav -------------------------------------------------------------------------------- /pyannote/audio/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020-2021 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
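# Usage sketch (editorial, not in the original file): a task wraps a
# pyannote.database protocol and is attached to a model before training, e.g.
#
#     from pyannote.database import registry
#     from pyannote.audio.tasks import VoiceActivityDetection
#     from pyannote.audio.models.segmentation import PyanNet
#
#     protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")  # hypothetical protocol name
#     task = VoiceActivityDetection(protocol, duration=2.0)
#     model = PyanNet(task=task)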
22 | 23 | from .segmentation.multilabel import MultiLabelSegmentation # isort:skip 24 | from .segmentation.speaker_diarization import SpeakerDiarization # isort:skip 25 | from .separation.PixIT import PixIT # isort:skip 26 | from .segmentation.voice_activity_detection import VoiceActivityDetection # isort:skip 27 | from .segmentation.overlapped_speech_detection import ( # isort:skip 28 | OverlappedSpeechDetection, 29 | ) 30 | from .embedding.arcface import SupervisedRepresentationLearningWithArcFace # isort:skip 31 | 32 | # Segmentation has been renamed to SpeakerDiarization but we keep Segmentation here for backward compatibility 33 | Segmentation = SpeakerDiarization 34 | 35 | # SpeakerEmbedding is more human-friendly 36 | SpeakerEmbedding = SupervisedRepresentationLearningWithArcFace 37 | 38 | __all__ = [ 39 | "SpeakerDiarization", 40 | "VoiceActivityDetection", 41 | "OverlappedSpeechDetection", 42 | "MultiLabelSegmentation", 43 | "SpeakerEmbedding", 44 | "Segmentation", 45 | "PixIT", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/embedding/arcface.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from __future__ import annotations 25 | 26 | from typing import Dict, Optional, Sequence, Union 27 | 28 | import pytorch_metric_learning.losses 29 | from pyannote.database import Protocol 30 | from torch_audiomentations.core.transforms_interface import BaseWaveformTransform 31 | from torchmetrics import Metric 32 | 33 | from pyannote.audio.core.task import Task 34 | 35 | from .mixins import SupervisedRepresentationLearningTaskMixin 36 | 37 | 38 | class SupervisedRepresentationLearningWithArcFace( 39 | SupervisedRepresentationLearningTaskMixin, 40 | Task, 41 | ): 42 | """Supervised representation learning with ArcFace loss 43 | 44 | Representation learning is the task of ... 45 | 46 | Parameters 47 | ---------- 48 | protocol : Protocol 49 | pyannote.database protocol 50 | duration : float, optional 51 | Chunk duration in seconds. Defaults to two seconds (2.). 52 | min_duration : float, optional 53 | Sample training chunk duration uniformly between `min_duration` 54 | and `duration`. Defaults to `duration` (i.e. fixed-length chunks). 55 | num_classes_per_batch : int, optional 56 | Number of classes per batch. Defaults to 32. 57 | num_chunks_per_class : int, optional 58 | Number of chunks per class. Defaults to 1. 59 | margin : float, optional 60 | Margin. Defaults to 28.6. 61 | scale : float, optional 62 | Scale. Defaults to 64. 63 | num_workers : int, optional 64 | Number of workers used for generating training samples. 65 | Defaults to multiprocessing.cpu_count() // 2. 66 | pin_memory : bool, optional 67 | If True, data loaders will copy tensors into CUDA pinned 68 | memory before returning them. See pytorch documentation 69 | for more details. Defaults to False. 70 | augmentation : BaseWaveformTransform, optional 71 | torch_audiomentations waveform transform, used by dataloader 72 | during training. 73 | metric : optional 74 | Validation metric(s). Can be anything supported by torchmetrics.MetricCollection. 75 | Defaults to AUROC (area under the ROC curve). 76 | """ 77 | 78 | # TODO: add a ".metric" property that tells how speaker embeddings trained with this approach 79 | # should be compared. could be a string like "cosine" or "euclidean" or a pdist/cdist-like 80 | # callable. this ".metric" property should be propagated all the way to Inference (via the model).
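Before the constructor below, a minimal instantiation sketch. It reuses the `Debug.SpeakerDiarization.Debug` protocol defined in `tests/data/database.yml` later in this tree; note that `batch_size` is not a constructor argument — the mixin presumably derives it from `num_classes_per_batch` × `num_chunks_per_class` (an assumption; see `self.batch_size` in `__init__` below and `mixins.py`):

```python
from pyannote.database import registry
from pyannote.audio.tasks import SpeakerEmbedding  # alias for this class (tasks/__init__.py)

registry.load_database("tests/data/database.yml")
protocol = registry.get_protocol("Debug.SpeakerDiarization.Debug")

task = SpeakerEmbedding(
    protocol,
    duration=2.0,              # fixed-length 2 s chunks (min_duration left unset)
    num_classes_per_batch=32,  # together with num_chunks_per_class, sets the batch size
    num_chunks_per_class=1,
    margin=28.6,
    scale=64.0,
)
```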
81 | 82 | def __init__( 83 | self, 84 | protocol: Protocol, 85 | min_duration: Optional[float] = None, 86 | duration: float = 2.0, 87 | num_classes_per_batch: int = 32, 88 | num_chunks_per_class: int = 1, 89 | margin: float = 28.6, 90 | scale: float = 64.0, 91 | num_workers: Optional[int] = None, 92 | pin_memory: bool = False, 93 | augmentation: Optional[BaseWaveformTransform] = None, 94 | metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None, 95 | ): 96 | 97 | self.num_chunks_per_class = num_chunks_per_class 98 | self.num_classes_per_batch = num_classes_per_batch 99 | 100 | self.margin = margin 101 | self.scale = scale 102 | 103 | super().__init__( 104 | protocol, 105 | duration=duration, 106 | min_duration=min_duration, 107 | batch_size=self.batch_size, 108 | num_workers=num_workers, 109 | pin_memory=pin_memory, 110 | augmentation=augmentation, 111 | metric=metric, 112 | ) 113 | 114 | def setup_loss_func(self): 115 | 116 | _, embedding_size = self.model(self.model.example_input_array).shape 117 | 118 | self.model.loss_func = pytorch_metric_learning.losses.ArcFaceLoss( 119 | len(self.specifications.classes), 120 | embedding_size, 121 | margin=self.margin, 122 | scale=self.scale, 123 | ) 124 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/tasks/separation/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .audio.diarization_error_rate import ( 25 | DiarizationErrorRate, 26 | FalseAlarmRate, 27 | MissedDetectionRate, 28 | OptimalDiarizationErrorRate, 29 | OptimalDiarizationErrorRateThreshold, 30 | OptimalFalseAlarmRate, 31 | OptimalMissedDetectionRate, 32 | OptimalSpeakerConfusionRate, 33 | SpeakerConfusionRate, 34 | ) 35 | 36 | __all__ = [ 37 | "DiarizationErrorRate", 38 | "FalseAlarmRate", 39 | "MissedDetectionRate", 40 | "SpeakerConfusionRate", 41 | "OptimalDiarizationErrorRate", 42 | "OptimalFalseAlarmRate", 43 | "OptimalMissedDetectionRate", 44 | "OptimalSpeakerConfusionRate", 45 | "OptimalDiarizationErrorRateThreshold", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from .diarization_error_rate import ( 25 | DiarizationErrorRate, 26 | FalseAlarmRate, 27 | MissedDetectionRate, 28 | OptimalDiarizationErrorRate, 29 | OptimalDiarizationErrorRateThreshold, 30 | OptimalFalseAlarmRate, 31 | OptimalMissedDetectionRate, 32 | OptimalSpeakerConfusionRate, 33 | SpeakerConfusionRate, 34 | ) 35 | 36 | __all__ = [ 37 | "DiarizationErrorRate", 38 | "SpeakerConfusionRate", 39 | "MissedDetectionRate", 40 | "FalseAlarmRate", 41 | "OptimalDiarizationErrorRate", 42 | "OptimalSpeakerConfusionRate", 43 | "OptimalMissedDetectionRate", 44 | "OptimalFalseAlarmRate", 45 | "OptimalDiarizationErrorRateThreshold", 46 | ] 47 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
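These metrics follow the standard torchmetrics accumulate-then-compute protocol, so they can be plugged into a validation loop directly. A minimal sketch — the `(batch, speakers, frames)` tensor layout used here is an assumption and should be double-checked against `audio/diarization_error_rate.py`:

```python
import torch
from pyannote.audio.torchmetrics import DiarizationErrorRate

der = DiarizationErrorRate()

# hypothetical speaker activations, assumed (batch, speakers, frames) layout
preds = torch.rand(2, 3, 100)                   # continuous scores in [0, 1]
target = (torch.rand(2, 3, 100) > 0.5).float()  # binary reference
der.update(preds, target)
print(der.compute())  # DER accumulated over all update() calls
```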
22 | 23 | 24 | from .equal_error_rate import EqualErrorRate 25 | 26 | __all__ = [ 27 | "EqualErrorRate", 28 | ] 29 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/classification/equal_error_rate.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Optional 25 | 26 | import torch 27 | from pyannote.metrics.binary_classification import det_curve 28 | from torchmetrics import Metric 29 | from torchmetrics.utilities.data import dim_zero_cat 30 | 31 | 32 | class EqualErrorRate(Metric): 33 | 34 | is_differentiable: Optional[bool] = False 35 | higher_is_better: Optional[bool] = False 36 | full_state_update: bool = True 37 | 38 | def __init__(self, distances: bool = True, compute_on_cpu: bool = True, **kwargs): 39 | super().__init__(compute_on_cpu=compute_on_cpu, **kwargs) 40 | self.distances = distances 41 | self.add_state("scores", default=[], dist_reduce_fx="cat") 42 | self.add_state("y_true", default=[], dist_reduce_fx="cat") 43 | 44 | def update(self, scores: torch.Tensor, y_true: torch.Tensor) -> None: 45 | self.scores.append(scores) 46 | self.y_true.append(y_true) 47 | 48 | def compute(self) -> torch.Tensor: 49 | scores = dim_zero_cat(self.scores) 50 | y_true = dim_zero_cat(self.y_true) 51 | _, _, _, eer = det_curve(y_true.cpu(), scores.cpu(), distances=self.distances) 52 | return torch.tensor(eer) 53 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/functional/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/torchmetrics/functional/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2022- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyannote/audio/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
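A usage sketch for the `EqualErrorRate` metric implemented above: scores and binary labels accumulate across `update()` calls, and `compute()` runs the DET-curve analysis on CPU. Toy values for illustration:

```python
import torch
from pyannote.audio.torchmetrics.classification import EqualErrorRate

eer = EqualErrorRate(distances=False)  # scores are similarities; keep the default True for distances
eer.update(torch.tensor([0.9, 0.8, 0.3, 0.1]),  # scores
           torch.tensor([1, 1, 0, 0]))          # binary ground truth
print(eer.compute())  # perfectly separable toy scores, so EER should be ~0
```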
22 | -------------------------------------------------------------------------------- /pyannote/audio/utils/multi_task.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Any, Callable, Tuple, Union 25 | 26 | from pyannote.audio.core.model import Specifications 27 | 28 | 29 | def map_with_specifications( 30 | specifications: Union[Specifications, Tuple[Specifications]], 31 | func: Callable, 32 | *iterables, 33 | ) -> Union[Any, Tuple[Any]]: 34 | """Compute the function using arguments from each of the iterables 35 | 36 | Returns a tuple if provided `specifications` is a tuple, 37 | otherwise returns the function return value. 38 | 39 | Parameters 40 | ---------- 41 | specifications : (tuple of) Specifications 42 | Specifications or tuple of specifications 43 | func : callable 44 | Function called for each specification with 45 | `func(*iterables[i], specifications=specifications[i])` 46 | *iterables : 47 | List of iterables with same length as `specifications`. 48 | 49 | Returns 50 | ------- 51 | output : (tuple of) `func` return value(s) 52 | """ 53 | 54 | if isinstance(specifications, Specifications): 55 | return func(*iterables, specifications=specifications) 56 | 57 | return tuple( 58 | func(*i, specifications=s) for s, *i in zip(specifications, *iterables) 59 | ) 60 | -------------------------------------------------------------------------------- /pyannote/audio/utils/params.py: -------------------------------------------------------------------------------- 1 | # TODO - make it depth-recursive 2 | # TODO - switch to Omegaconf maybe? 
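A sketch of how `map_with_specifications` (defined just above in `multi_task.py`) dispatches a function over single- vs. multi-task specifications. The `Specifications` constructor fields and enum locations shown here are assumptions — double-check against `core/task.py`:

```python
import torch
from pyannote.audio.core.model import Specifications
from pyannote.audio.core.task import Problem, Resolution  # assumed location of the enums
from pyannote.audio.utils.multi_task import map_with_specifications

def mse_loss(target, prediction, specifications=None):
    # per-task loss; `specifications` receives one Specifications instance
    return ((target - prediction) ** 2).mean()

specs = Specifications(
    problem=Problem.MULTI_LABEL_CLASSIFICATION,  # assumed field names
    resolution=Resolution.FRAME,
    duration=2.0,
    classes=["speech"],
)

target, prediction = torch.rand(4, 100, 1), torch.rand(4, 100, 1)
loss = map_with_specifications(specs, mse_loss, target, prediction)  # single scalar
losses = map_with_specifications(  # tuple of specs -> tuple of per-task scalars
    (specs, specs), mse_loss, [target, target], [prediction, prediction]
)
```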
3 | 4 | from typing import Optional 5 | 6 | 7 | def merge_dict(defaults: dict, custom: Optional[dict] = None): 8 | params = dict(defaults) 9 | if custom is not None: 10 | params.update(custom) 11 | return params 12 | -------------------------------------------------------------------------------- /pyannote/audio/utils/probe.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from functools import partial 25 | from typing import Callable, Dict, Set, Text 26 | 27 | import torch.nn as nn 28 | 29 | 30 | def probe(trunk: nn.Module, branches: Dict[Text, Text]) -> Callable: 31 | """Add probing branches to a trunk module 32 | 33 | Parameters 34 | ---------- 35 | trunk : nn.Module 36 | Multi-layer trunk. 37 | branches : {branch_name: layer_name} dict or [layer_name] list 38 | Indicate where to plug a probing branch. 39 | 40 | Returns 41 | ------- 42 | revert : Callable 43 | Callable that, when called, removes probing branches. 44 | 45 | Usage 46 | ----- 47 | 48 | Define a trunk made out of three consecutive layers 49 | 50 | >>> import torch.nn as nn 51 | >>> class Trunk(nn.Module): 52 | ... 53 | ... def __init__(self): 54 | ... super().__init__() 55 | ... self.layer1 = nn.Linear(1, 2) 56 | ... self.layer2 = nn.Linear(2, 3) 57 | ... self.layer3 = nn.Linear(3, 4) 58 | ... 59 | ... def forward(self, x): 60 | ... 
return self.layer3(self.layer2(self.layer1(x))) 61 | 62 | >>> trunk = Trunk() 63 | >>> x = torch.tensor((0.,)) 64 | >>> trunk(x) 65 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=) 66 | 67 | Add two probing branches: 68 | - first one is called "probe1" and probes the output of "layer1" 69 | - second one is called "probe2" and probes the output of "layer3" 70 | 71 | >>> revert = probe(trunk, {"probe1": "layer1", "probe2": "layer3"}) 72 | >>> trunk(x) 73 | # {'probe1': tensor([ 0.5854, -0.9685], grad_fn=), 74 | # 'probe2': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=)} 75 | 76 | Use callback returned by `probe` to revert its effect 77 | 78 | >>> revert() 79 | >>> trunk(x) 80 | # tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=) 81 | 82 | For convenience, one can also define probes as a list of layers: 83 | 84 | >>> revert = probe(trunk, ['layer1', 'layer3']) 85 | >>> trunk(x) 86 | # {'layer1': tensor([ 0.5854, -0.9685], grad_fn=), 87 | # 'layer3': tensor([ 0.4548, -0.1814, 0.9494, 1.0445], grad_fn=)} 88 | """ 89 | 90 | def remove(): 91 | del trunk.__probe 92 | for handle in trunk.__probe_handles: 93 | handle.remove() 94 | del trunk.__probe_handles 95 | 96 | if hasattr(trunk, "__probe"): 97 | remove() 98 | 99 | trunk.__probe_handles = [] 100 | 101 | def __probe_init(module, input): 102 | trunk.__probe = dict() 103 | 104 | handle = trunk.register_forward_pre_hook(__probe_init) 105 | trunk.__probe_handles.append(handle) 106 | 107 | def __probe_append(branch_name, module, input, output): 108 | trunk.__probe[branch_name] = output 109 | 110 | if not isinstance(branches, dict): 111 | branches = {b: b for b in branches} 112 | 113 | sehcnarb: Dict[Text, Set] = dict() 114 | for branch_name, layer_name in branches.items(): 115 | if layer_name not in sehcnarb: 116 | sehcnarb[layer_name] = set() 117 | sehcnarb[layer_name].add(branch_name) 118 | 119 | for layer_name, layer in trunk.named_modules(): 120 | if layer_name not in sehcnarb: 121 | continue 122 | for branch_name in sehcnarb[layer_name]: 123 | handle = layer.register_forward_hook(partial(__probe_append, branch_name)) 124 | trunk.__probe_handles.append(handle) 125 | 126 | def __probe_return(module, input, output): 127 | return trunk.__probe 128 | 129 | handle = trunk.register_forward_hook(__probe_return) 130 | trunk.__probe_handles.append(handle) 131 | 132 | return remove 133 | -------------------------------------------------------------------------------- /pyannote/audio/utils/random.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | import os 25 | import zlib 26 | from random import Random 27 | 28 | import torch 29 | 30 | 31 | def create_rng_for_worker(model) -> Random: 32 | """Create worker-specific random number generator 33 | 34 | This makes sure that 35 | 1. training sample generation is reproducible 36 | 2. every (worker, rank, epoch) combination uses a different seed 37 | 38 | Parameters 39 | ---------- 40 | model : Model 41 | Model being trained. Its `local_rank`, `global_rank` and `current_epoch` attributes are mixed into the seed. 42 | """ 43 | 44 | # create random number generator 45 | rng = Random() 46 | 47 | global_seed = os.environ.get("PL_GLOBAL_SEED", "unset") 48 | worker_info = torch.utils.data.get_worker_info() 49 | 50 | if worker_info is None: 51 | worker_id = None 52 | else: 53 | worker_id = worker_info.id 54 | 55 | seed_tuple = ( 56 | global_seed, 57 | worker_id, 58 | model.local_rank, 59 | model.global_rank, 60 | model.current_epoch, 61 | ) 62 | # use adler32 because python's `hash` is not deterministic. 63 | seed = zlib.adler32(str(seed_tuple).encode()) 64 | rng.seed(seed) 65 | 66 | return rng 67 | -------------------------------------------------------------------------------- /pyannote/audio/utils/receptive_field.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023 CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
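To see why `create_rng_for_worker` above seeds with `zlib.adler32` rather than Python's built-in `hash` (which is salted per process), here is a small self-contained sketch of the same seeding scheme:

```python
import zlib
from random import Random

# (PL_GLOBAL_SEED, worker_id, local_rank, global_rank, current_epoch)
seed_tuple = ("42", 0, 0, 0, 5)
rng = Random(zlib.adler32(str(seed_tuple).encode()))
print(rng.random())  # identical across runs and machines; hash()-based seeding would not be
```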
22 | 23 | from typing import List 24 | 25 | 26 | def conv1d_num_frames( 27 | num_samples, kernel_size=5, stride=1, padding=0, dilation=1 28 | ) -> int: 29 | """Compute expected number of frames after 1D convolution 30 | 31 | Parameters 32 | ---------- 33 | num_samples : int 34 | Number of samples in the input signal 35 | kernel_size : int 36 | Kernel size 37 | stride : int 38 | Stride 39 | padding : int 40 | Padding 41 | dilation : int 42 | Dilation 43 | 44 | Returns 45 | ------- 46 | num_frames : int 47 | Number of frames in the output signal 48 | 49 | Source 50 | ------ 51 | https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html#torch.nn.Conv1d 52 | """ 53 | return 1 + (num_samples + 2 * padding - dilation * (kernel_size - 1) - 1) // stride 54 | 55 | 56 | def multi_conv_num_frames( 57 | num_samples: int, 58 | kernel_size: List[int] = None, 59 | stride: List[int] = None, 60 | padding: List[int] = None, 61 | dilation: List[int] = None, 62 | ) -> int: 63 | num_frames = num_samples 64 | for k, s, p, d in zip(kernel_size, stride, padding, dilation): 65 | num_frames = conv1d_num_frames( 66 | num_frames, kernel_size=k, stride=s, padding=p, dilation=d 67 | ) 68 | 69 | return num_frames 70 | 71 | 72 | def conv1d_receptive_field_size( 73 | num_frames=1, kernel_size=5, stride=1, padding=0, dilation=1 74 | ): 75 | """Compute size of receptive field 76 | 77 | Parameters 78 | ---------- 79 | num_frames : int, optional 80 | Number of frames in the output signal 81 | kernel_size : int 82 | Kernel size 83 | stride : int 84 | Stride 85 | padding : int 86 | Padding 87 | dilation : int 88 | Dilation 89 | 90 | Returns 91 | ------- 92 | size : int 93 | Receptive field size 94 | """ 95 | 96 | effective_kernel_size = 1 + (kernel_size - 1) * dilation 97 | return effective_kernel_size + (num_frames - 1) * stride - 2 * padding 98 | 99 | 100 | def multi_conv_receptive_field_size( 101 | num_frames: int, 102 | kernel_size: List[int] = None, 103 | stride: List[int] = None, 104 | padding: List[int] = None, 105 | dilation: List[int] = None, 106 | ) -> int: 107 | receptive_field_size = num_frames 108 | 109 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): 110 | receptive_field_size = conv1d_receptive_field_size( 111 | num_frames=receptive_field_size, 112 | kernel_size=k, 113 | stride=s, 114 | padding=p, 115 | dilation=d, 116 | ) 117 | return receptive_field_size 118 | 119 | 120 | def conv1d_receptive_field_center( 121 | frame=0, kernel_size=5, stride=1, padding=0, dilation=1 122 | ) -> int: 123 | """Compute center of receptive field 124 | 125 | Parameters 126 | ---------- 127 | frame : int 128 | Frame index 129 | kernel_size : int 130 | Kernel size 131 | stride : int 132 | Stride 133 | padding : int 134 | Padding 135 | dilation : int 136 | Dilation 137 | 138 | Returns 139 | ------- 140 | center : int 141 | Index of receptive field center 142 | """ 143 | 144 | effective_kernel_size = 1 + (kernel_size - 1) * dilation 145 | return frame * stride + (effective_kernel_size - 1) // 2 - padding 146 | 147 | 148 | def multi_conv_receptive_field_center( 149 | frame: int, 150 | kernel_size: List[int] = None, 151 | stride: List[int] = None, 152 | padding: List[int] = None, 153 | dilation: List[int] = None, 154 | ) -> int: 155 | receptive_field_center = frame 156 | for k, s, p, d in reversed(list(zip(kernel_size, stride, padding, dilation))): 157 | receptive_field_center = conv1d_receptive_field_center( 158 | frame=receptive_field_center, 159 | kernel_size=k, 160 | stride=s, 161 | padding=p, 162 
| dilation=d, 163 | ) 164 | 165 | return receptive_field_center 166 | -------------------------------------------------------------------------------- /pyannote/audio/utils/reproducibility.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # Context: https://github.com/pyannote/pyannote-audio/issues/1370 24 | 25 | import warnings 26 | 27 | import torch 28 | 29 | 30 | class ReproducibilityError(Exception): 31 | ... 32 | 33 | 34 | class ReproducibilityWarning(UserWarning): 35 | ... 36 | 37 | 38 | def raise_reproducibility(device: torch.device): 39 | if (device.type == "cuda") and ( 40 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 41 | ): 42 | raise ReproducibilityError( 43 | "Please disable TensorFloat-32 (TF32) by calling\n" 44 | " >>> import torch\n" 45 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n" 46 | " >>> torch.backends.cudnn.allow_tf32 = False\n" 47 | "or you might face reproducibility issues and obtain lower accuracy.\n" 48 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details." 49 | ) 50 | 51 | 52 | def warn_reproducibility(device: torch.device): 53 | if (device.type == "cuda") and ( 54 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 55 | ): 56 | warnings.warn( 57 | ReproducibilityWarning( 58 | "Please disable TensorFloat-32 (TF32) by calling\n" 59 | " >>> import torch\n" 60 | " >>> torch.backends.cuda.matmul.allow_tf32 = False\n" 61 | " >>> torch.backends.cudnn.allow_tf32 = False\n" 62 | "or you might face reproducibility issues and obtain lower accuracy.\n" 63 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details." 
64 | ) 65 | ) 66 | 67 | 68 | def fix_reproducibility(device: torch.device): 69 | if (device.type == "cuda") and ( 70 | torch.backends.cuda.matmul.allow_tf32 or torch.backends.cudnn.allow_tf32 71 | ): 72 | torch.backends.cuda.matmul.allow_tf32 = False 73 | torch.backends.cudnn.allow_tf32 = False 74 | warnings.warn( 75 | ReproducibilityWarning( 76 | "TensorFloat-32 (TF32) has been disabled as it might lead to reproducibility issues and lower accuracy.\n" 77 | "It can be re-enabled by calling\n" 78 | " >>> import torch\n" 79 | " >>> torch.backends.cuda.matmul.allow_tf32 = True\n" 80 | " >>> torch.backends.cudnn.allow_tf32 = True\n" 81 | "See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.\n" 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /pyannote/audio/utils/version.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from typing import Text 24 | 25 | from semver import VersionInfo 26 | 27 | 28 | def check_version(library: Text, theirs: Text, mine: Text, what: Text = "Pipeline"): 29 | 30 | theirs = ".".join(theirs.split(".")[:3]) 31 | mine = ".".join(mine.split(".")[:3]) 32 | 33 | theirs = VersionInfo.parse(theirs) 34 | mine = VersionInfo.parse(mine) 35 | 36 | if theirs.major > mine.major: 37 | print( 38 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 39 | f"Bad things will probably happen unless you upgrade {library} to {theirs.major}.x." 40 | ) 41 | 42 | elif theirs.major < mine.major: 43 | print( 44 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 45 | f"Bad things might happen unless you revert {library} to {theirs.major}.x." 46 | ) 47 | 48 | elif theirs.minor > mine.minor: 49 | print( 50 | f"{what} was trained with {library} {theirs}, yours is {mine}. " 51 | f"This should be OK but you might want to upgrade {library}." 52 | ) 53 | -------------------------------------------------------------------------------- /questions/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Questions 3 | 4 | Your questions should go in this directory. 5 | 6 | Question files should be named with the extension ".question.md". 
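A usage sketch for the TF32 helpers defined in `reproducibility.py` above: `fix_reproducibility` disables TensorFloat-32 matmuls on CUDA devices (emitting a `ReproducibilityWarning`) and is a no-op on CPU, while `raise_reproducibility` / `warn_reproducibility` only report the problem:

```python
import torch
from pyannote.audio.utils.reproducibility import fix_reproducibility

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fix_reproducibility(device)  # flips torch.backends.*.allow_tf32 off on CUDA, with a warning
```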
7 | -------------------------------------------------------------------------------- /questions/bad_performance.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How can I improve performance?" 3 | alt_titles: 4 | - "Pretrained pipelines do not produce good results on my data. What can I do?" 5 | - "It does not work! Help me!" 6 | --- 7 | 8 | **Long answer:** 9 | 10 | 1. Manually annotate dozens of conversations as precisely as possible. 11 | 2. Separate them into train (80%), development (10%) and test (10%) subsets. 12 | 3. Set up the data for use with [`pyannote.database`](https://github.com/pyannote/pyannote-database#speaker-diarization). 13 | 4. Follow [this recipe](https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb). 14 | 5. Enjoy. 15 | 16 | **Also:** [I am available](https://herve.niderb.fr) for contracting to help you with that. 17 | -------------------------------------------------------------------------------- /questions/from_memory.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Can I apply pretrained pipelines on audio already loaded in memory?" 3 | alt_titles: 4 | - "Can I apply models on an audio array?" 5 | --- 6 | 7 | Yes: read [this tutorial](tutorials/applying_a_pipeline.ipynb) until the end. 8 | -------------------------------------------------------------------------------- /questions/offline.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Can I use gated models (and pipelines) offline?" 3 | alt_titles: 4 | - "Why does one need to authenticate to access the pretrained models?" 5 | - "Can I use pyannote.audio pretrained pipelines without the Hugging Face token?" 6 | - "How can I solve the permission issue?" 7 | --- 8 | 9 | **Short answer**: yes, see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 10 | 11 | **Long answer**: gating models and pipelines allows [me](https://herve.niderb.fr) to know a bit more about the `pyannote.audio` user base and eventually helps me write grant proposals to make `pyannote.audio` even better. So, please fill in gating forms as precisely as possible. 12 | 13 | For instance, before gating `pyannote/speaker-diarization`, I had no idea that so many people were relying on it in production. Hint: sponsors are more than welcome! Maintaining open-source libraries is time-consuming. 14 | 15 | That being said, this whole authentication process does not prevent you from using official `pyannote.audio` models offline (i.e. without going through the authentication process in every `docker run ...` or whatever you are using in production): see [this tutorial](tutorials/applying_a_model.ipynb) for models and [that one](tutorials/applying_a_pipeline.ipynb) for pipelines. 16 | -------------------------------------------------------------------------------- /questions/pyannote.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How does one spell and pronounce pyannote.audio?" 3 | alt_titles: 4 | - "Why the name of the library?" 5 | - "Why the logo of the library?" 6 | --- 7 | 8 | 📝 Written in lower case: `pyannote.audio` (or `pyannote` if you are lazy). Not `PyAnnote` nor `PyAnnotate` (sic). 9 | 📢 Pronounced like the French verb `pianoter`.
`pi` like in `pi`ano, not `py` like in `py`thon. 10 | 🎹 `pianoter` means to play the piano (hence the logo 🤯). 11 | -------------------------------------------------------------------------------- /questions/streaming.question.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Does pyannote support streaming speaker diarization?" 3 | alt_titles: 4 | - "Is it possible to do realtime speaker diarization?" 5 | - "Can it process online audio buffers?" 6 | --- 7 | 8 | **Short answer:** not out of the box, no. 9 | 10 | **Long answer:** [I](https://herve.niderb.fr) am looking for sponsors to add this feature. In the meantime, [`diart`](https://github.com/juanmc2005/StreamingSpeakerDiarization) is the closest you can get from a streaming `pyannote.audio`. You might also be interested in [this blog post](https://herve.niderb.fr/fastpages/2021/08/05/Streaming-voice-activity-detection-with-pyannote.html) about streaming voice activity detection based on `pyannote.audio`. 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asteroid-filterbanks >=0.4 2 | einops >=0.6.0 3 | huggingface_hub >= 0.13.0 4 | lightning >= 2.0.1 5 | omegaconf >=2.1,<3.0 6 | pyannote.core >= 5.0.0 7 | pyannote.database >= 5.0.1 8 | pyannote.metrics >= 3.2 9 | pyannote.pipeline >= 3.0.1 10 | pytorch_metric_learning >= 2.1.0 11 | rich >= 12.0.0 12 | semver >= 3.0.0 13 | soundfile >= 0.12.1 14 | speechbrain >= 1.0.0 15 | tensorboardX >= 2.6 16 | torch >= 2.0.0 17 | torch_audiomentations >= 0.11.0 18 | torchaudio >= 2.2.0 19 | torchmetrics >= 0.11.0 20 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # This file is used to configure your project. 2 | # Read more about the various options under: 3 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 4 | 5 | [metadata] 6 | name = pyannote-audio 7 | description = Neural speaker diarization 8 | author = Herve Bredin 9 | author-email = herve.bredin@irit.fr 10 | license = mit 11 | long-description = file: README.md 12 | long-description-content-type = text/markdown; charset=UTF-8; variant=GFM 13 | # Change if running only on Windows, Mac or Linux (comma-separated) 14 | platforms = Linux, Mac 15 | # Add here all kinds of additional classifiers as defined under 16 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 17 | classifiers = 18 | Development Status :: 4 - Beta 19 | Programming Language :: Python 20 | 21 | [options] 22 | zip_safe = False 23 | packages = find: 24 | include_package_data = True 25 | # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! 26 | setup_requires = pyscaffold>=3.2a0,<3.3a0 27 | # Add here dependencies of your project (semicolon/line-separated), e.g. 28 | # install_requires = numpy; scipy 29 | # Require a specific Python version, e.g. Python 2.7 or >= 3.4 30 | python_requires = >=3.9 31 | 32 | [options.packages.find] 33 | where = . 
34 | exclude = 35 | tests 36 | 37 | [options.extras_require] 38 | # Add here additional requirements for extra features, to install with: 39 | # `pip install fastaudio[PDF]` like: 40 | # PDF = ReportLab; RXP 41 | # Add here test requirements (semicolon/line-separated) 42 | testing = 43 | pytest>=6.0 44 | pytest-cov>=2.10 45 | jupyter 46 | papermill 47 | dev = 48 | pre_commit>=2.7 49 | recommonmark>=0.6 50 | black>=22.3.0 51 | cli = 52 | hydra-core >=1.1,<1.2 53 | typer >= 0.4.0,<0.5.0 54 | separation = 55 | transformers >= 4.39.1 56 | asteroid >=0.7.0 57 | 58 | [options.entry_points] 59 | 60 | console_scripts = 61 | pyannote-audio-train=pyannote.audio.cli.train:train 62 | pyannote-audio-eval=pyannote.audio.cli.evaluate:evaluate 63 | 64 | 65 | [test] 66 | # py.test options when running `python setup.py test` 67 | # addopts = --verbose 68 | extras = True 69 | 70 | [tool:pytest] 71 | # Options for py.test: 72 | # Specify command line options as you would do when invoking py.test directly. 73 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 74 | # in order to write a coverage file that can be read by Jenkins. 75 | addopts = 76 | --cov pyannote --cov-report term-missing 77 | --verbose 78 | norecursedirs = 79 | dist 80 | build 81 | .tox 82 | testpaths = tests 83 | 84 | [aliases] 85 | dists = bdist_wheel 86 | 87 | [bdist_wheel] 88 | # Use this option if your package is pure-python 89 | universal = 1 90 | 91 | [build_sphinx] 92 | source_dir = doc 93 | build_dir = build/sphinx 94 | 95 | [devpi:upload] 96 | # Options for the devpi: PyPI server and packaging tool 97 | # VCS export must be deactivated since we are using setuptools-scm 98 | no-vcs = 1 99 | formats = bdist_wheel 100 | 101 | [flake8] 102 | # Some sane defaults for the code style checker flake8 103 | exclude = 104 | .tox 105 | build 106 | dist 107 | .eggs 108 | docs/conf.py 109 | 110 | [pyscaffold] 111 | # PyScaffold's parameters when the project was created. 112 | # This will be used when updating. Do not change! 
113 | version = 3.2.3 114 | package = pyannote-audio 115 | extensions = 116 | markdown 117 | no_skeleton 118 | pre_commit 119 | dsproject 120 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from pkg_resources import VersionConflict, require 6 | from setuptools import find_packages, setup 7 | 8 | with open("README.md", mode="r", encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | with open("requirements.txt", mode="r", encoding="utf-8") as f: 12 | requirements = f.read().splitlines() 13 | 14 | try: 15 | require("setuptools>=38.3") 16 | except VersionConflict: 17 | print("Error: version of setuptools is too old (<38.3)!") 18 | sys.exit(1) 19 | 20 | 21 | ROOT_DIR = Path(__file__).parent.resolve() 22 | # Creating the version file 23 | 24 | with open("version.txt", mode="r", encoding="utf-8") as f: 25 | version = f.read() 26 | 27 | version = version.strip() 28 | sha = "Unknown" 29 | 30 | if os.getenv("BUILD_VERSION"): 31 | version = os.getenv("BUILD_VERSION") 32 | elif sha != "Unknown": 33 | version += "+" + sha[:7] 34 | print("-- Building version " + version) 35 | 36 | version_path = ROOT_DIR / "pyannote" / "audio" / "version.py" 37 | 38 | with open(version_path, mode="w", encoding="utf-8") as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | if __name__ == "__main__": 42 | setup( 43 | name="pyannote.audio", 44 | namespace_packages=["pyannote"], 45 | version=version, 46 | packages=find_packages(), 47 | install_requires=requirements, 48 | description="Neural building blocks for speaker diarization", 49 | long_description=long_description, 50 | long_description_content_type="text/markdown", 51 | author="Hervé Bredin", 52 | author_email="herve.bredin@irit.fr", 53 | url="https://github.com/pyannote/pyannote-audio", 54 | classifiers=[ 55 | "Development Status :: 4 - Beta", 56 | "Intended Audience :: Science/Research", 57 | "License :: OSI Approved :: MIT License", 58 | "Natural Language :: English", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Topic :: Scientific/Engineering", 63 | ], 64 | ) 65 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def pytest_sessionstart(session): 25 | """ 26 | Called after the Session object has been created and 27 | before performing collection and entering the run test loop. 28 | """ 29 | 30 | from pyannote.database import registry 31 | 32 | registry.load_database("tests/data/database.yml") 33 | -------------------------------------------------------------------------------- /tests/data/database.yml: -------------------------------------------------------------------------------- 1 | Protocols: 2 | Debug: 3 | SpeakerDiarization: 4 | Debug: 5 | scope: database 6 | train: 7 | uri: debug.train.lst 8 | annotation: debug.train.rttm 9 | annotated: debug.train.uem 10 | development: 11 | uri: debug.development.lst 12 | annotation: debug.development.rttm 13 | annotated: debug.development.uem 14 | test: 15 | uri: debug.test.lst 16 | annotation: debug.test.rttm 17 | annotated: debug.test.uem 18 | 19 | Databases: 20 | Debug: ./{uri}.wav 21 | -------------------------------------------------------------------------------- /tests/data/debug.development.lst: -------------------------------------------------------------------------------- 1 | dev00 2 | dev01 3 | -------------------------------------------------------------------------------- /tests/data/debug.development.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER dev00 1 1.440 11.872 MEE009 2 | SPEAKER dev00 1 13.152 3.770 MEE012 3 | SPEAKER dev00 1 18.064 0.336 MEE012 4 | SPEAKER dev00 1 18.201 2.439 MEE009 5 | SPEAKER dev00 1 20.560 1.056 MEE012 6 | SPEAKER dev00 1 21.952 4.320 MEE009 7 | SPEAKER dev00 1 23.072 0.736 MEE012 8 | SPEAKER dev00 1 26.192 2.192 MEE012 9 | SPEAKER dev00 1 28.224 1.776 MEE009 10 | SPEAKER dev01 1 4.304 2.448 MEE012 11 | SPEAKER dev01 1 7.024 4.752 MEE009 12 | SPEAKER dev01 1 15.133 4.515 MEE009 13 | SPEAKER dev01 1 16.384 1.168 MEE012 14 | SPEAKER dev01 1 19.568 0.800 MEE012 15 | SPEAKER dev01 1 21.312 1.280 MEE009 16 | SPEAKER dev01 1 22.464 1.456 MEE012 17 | SPEAKER dev01 1 29.072 0.464 MEE012 18 | -------------------------------------------------------------------------------- /tests/data/debug.development.uem: -------------------------------------------------------------------------------- 1 | dev00 NA 0.000 30.000 2 | dev01 NA 0.000 30.000 3 | -------------------------------------------------------------------------------- /tests/data/debug.test.lst: -------------------------------------------------------------------------------- 1 | tst00 2 | tst01 3 | -------------------------------------------------------------------------------- /tests/data/debug.test.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER tst00 1 0.000 1.901 MEE071 2 | SPEAKER tst00 1 0.944 6.124 MEE073 3 | SPEAKER tst00 1 3.492 1.954 FEO072 4 | SPEAKER tst00 1 3.612 8.676 MEE071 5 | SPEAKER tst00 1 3.692 1.887 FEO070 6 | SPEAKER tst00 1 7.891 1.114 FEO070 7 | SPEAKER tst00 1 8.544 3.216 FEO072 8 | SPEAKER tst00 1 12.133 3.301 FEO070 9 | SPEAKER tst00 1 13.120 0.602 FEO072 10 | SPEAKER tst00 1 14.959 0.666 MEE071 11 | SPEAKER tst00 1 15.109 10.155 FEO072 12 | SPEAKER tst00 1 19.006 0.485 FEO070 13 | SPEAKER tst00 1 19.008 4.796 MEE071 14 | SPEAKER tst00 1 20.124 
1.044 MEE073 15 | SPEAKER tst00 1 20.222 1.222 FEO070 16 | SPEAKER tst00 1 21.400 1.928 MEE073 17 | SPEAKER tst00 1 23.490 0.750 FEO070 18 | SPEAKER tst00 1 25.344 4.656 MEE073 19 | SPEAKER tst00 1 25.658 0.550 FEO070 20 | SPEAKER tst00 1 27.792 2.208 MEE071 21 | SPEAKER tst00 1 27.879 2.121 FEO072 22 | SPEAKER tst00 1 28.016 1.984 FEO070 23 | SPEAKER tst01 1 4.390 0.350 FEO072 24 | SPEAKER tst01 1 4.773 0.366 MEE073 25 | SPEAKER tst01 1 16.495 0.540 MEE071 26 | SPEAKER tst01 1 24.159 4.388 FEO070 27 | SPEAKER tst01 1 29.008 0.448 MEE073 28 | -------------------------------------------------------------------------------- /tests/data/debug.test.uem: -------------------------------------------------------------------------------- 1 | tst00 NA 0.000 30.000 2 | tst01 NA 0.000 30.000 3 | -------------------------------------------------------------------------------- /tests/data/debug.train.lst: -------------------------------------------------------------------------------- 1 | trñ00 2 | trn01 3 | trn02 4 | trn03 5 | trn04 6 | trn05 7 | trn06 8 | trn07 9 | trn08 10 | trn09 11 | -------------------------------------------------------------------------------- /tests/data/debug.train.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER trn00 1 3.168 0.800 MÉO069 2 | SPEAKER trn00 1 5.463 0.640 MÉO069 3 | SPEAKER trn00 1 5.496 0.574 MEE068 4 | SPEAKER trn00 1 10.454 0.499 MÉO069 5 | SPEAKER trn00 1 11.040 4.592 MEE068 6 | SPEAKER trn00 1 16.736 1.410 MÉO069 7 | SPEAKER trn00 1 16.980 2.778 MEE067 8 | SPEAKER trn00 1 18.883 0.490 MEE068 9 | SPEAKER trn00 1 18.985 1.831 MÉO069 10 | SPEAKER trn00 1 20.944 0.447 MEE067 11 | SPEAKER trn00 1 21.392 4.465 MEE068 12 | SPEAKER trn00 1 22.928 0.384 MÉO069 13 | SPEAKER trn00 1 25.001 2.471 MÉO069 14 | SPEAKER trn00 1 28.033 1.967 MEE068 15 | SPEAKER trn01 1 2.977 0.391 FEO066 16 | SPEAKER trn01 1 18.705 0.964 MEE068 17 | SPEAKER trn01 1 22.269 0.457 FEO065 18 | SPEAKER trn01 1 28.474 1.526 MÉO069 19 | SPEAKER trn01 1 28.593 1.407 FEO066 20 | SPEAKER trn01 1 28.993 1.007 FEO065 21 | SPEAKER trn02 1 20.704 0.688 FEO066 22 | SPEAKER trn03 1 0.000 1.184 MEE067 23 | SPEAKER trn03 1 1.104 28.896 MÉO069 24 | SPEAKER trn04 1 14.032 1.744 MEE076 25 | SPEAKER trn04 1 14.345 2.471 MEO074 26 | SPEAKER trn04 1 16.736 7.216 MEE075 27 | SPEAKER trn04 1 21.158 0.607 MEO074 28 | SPEAKER trn04 1 25.200 0.736 MEE075 29 | SPEAKER trn04 1 26.992 0.272 MEE075 30 | SPEAKER trn04 1 27.840 2.160 MEE076 31 | SPEAKER trn05 1 0.000 0.384 FEO079 32 | SPEAKER trn05 1 0.000 1.472 FEE078 33 | SPEAKER trn05 1 1.456 0.656 FEE081 34 | SPEAKER trn05 1 5.936 0.342 FEE078 35 | SPEAKER trn05 1 8.016 21.984 FEE078 36 | SPEAKER trn05 1 8.496 0.784 FEE081 37 | SPEAKER trn05 1 19.157 0.424 FEE080 38 | SPEAKER trn06 1 0.000 8.856 FEE083 39 | SPEAKER trn06 1 3.528 3.218 MEO082 40 | SPEAKER trn06 1 10.544 0.648 FEE083 41 | SPEAKER trn06 1 11.419 1.079 FEE085 42 | SPEAKER trn06 1 13.524 16.476 FEE083 43 | SPEAKER trn06 1 21.799 0.557 FEE085 44 | SPEAKER trn07 1 8.275 1.452 FEE087 45 | SPEAKER trn07 1 15.600 2.810 FEE087 46 | SPEAKER trn07 1 19.901 0.559 FEE087 47 | SPEAKER trn07 1 20.277 0.615 MEE089 48 | SPEAKER trn07 1 22.592 2.525 FEE087 49 | SPEAKER trn07 1 23.197 0.782 MEE089 50 | SPEAKER trn07 1 23.502 1.779 FEE088 51 | SPEAKER trn07 1 24.032 0.474 MEO086 52 | SPEAKER trn07 1 26.506 1.689 FEE087 53 | SPEAKER trn07 1 27.182 2.818 MEO086 54 | SPEAKER trn08 1 5.015 1.738 MEE089 55 | SPEAKER trn08 1 5.040 3.568 FEE087 56 | SPEAKER trn08 1 5.491 3.018 
FEE088 57 | SPEAKER trn08 1 6.995 0.547 MEO086 58 | SPEAKER trn08 1 10.099 0.858 FEE087 59 | SPEAKER trn08 1 10.128 0.958 FEE088 60 | SPEAKER trn08 1 12.000 7.664 FEE087 61 | SPEAKER trn08 1 12.701 1.871 FEE088 62 | SPEAKER trn08 1 14.912 1.008 MEE089 63 | SPEAKER trn08 1 15.003 1.289 MEO086 64 | SPEAKER trn08 1 17.164 0.314 FEE088 65 | SPEAKER trn08 1 18.522 5.414 FEE088 66 | SPEAKER trn08 1 21.168 0.969 FEE087 67 | SPEAKER trn08 1 26.848 1.339 MEE089 68 | SPEAKER trn08 1 27.040 1.648 FEE088 69 | SPEAKER trn08 1 27.107 0.582 FEE087 70 | SPEAKER trn09 1 0.000 1.854 FEE083 71 | SPEAKER trn09 1 0.000 6.045 MEE094 72 | SPEAKER trn09 1 1.854 28.146 FEE083 73 | SPEAKER trn09 1 12.857 0.485 MEE094 74 | SPEAKER trn09 1 14.201 4.023 MEE094 75 | SPEAKER trn09 1 15.726 0.823 MEE095 76 | SPEAKER trn09 1 24.992 2.358 MEE094 77 | SPEAKER trn09 1 29.687 0.313 MEE094 78 | -------------------------------------------------------------------------------- /tests/data/debug.train.uem: -------------------------------------------------------------------------------- 1 | trn00 NA 0.000 30.000 2 | trn01 NA 0.000 30.000 3 | trn02 NA 0.000 30.000 4 | trn03 NA 0.000 30.000 5 | trn04 NA 0.000 30.000 6 | trn05 NA 0.000 30.000 7 | trn06 NA 0.000 30.000 8 | trn07 NA 0.000 30.000 9 | trn08 NA 0.000 30.000 10 | trn09 NA 0.000 30.000 11 | -------------------------------------------------------------------------------- /tests/data/dev00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/dev00.wav -------------------------------------------------------------------------------- /tests/data/dev01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/dev01.wav -------------------------------------------------------------------------------- /tests/data/empty.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/empty.wav -------------------------------------------------------------------------------- /tests/data/trn01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn01.wav -------------------------------------------------------------------------------- /tests/data/trn02.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn02.wav -------------------------------------------------------------------------------- /tests/data/trn03.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn03.wav -------------------------------------------------------------------------------- /tests/data/trn04.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn04.wav -------------------------------------------------------------------------------- 
/tests/data/trn05.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn05.wav -------------------------------------------------------------------------------- /tests/data/trn06.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn06.wav -------------------------------------------------------------------------------- /tests/data/trn07.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn07.wav -------------------------------------------------------------------------------- /tests/data/trn08.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn08.wav -------------------------------------------------------------------------------- /tests/data/trn09.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trn09.wav -------------------------------------------------------------------------------- /tests/data/trñ00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/trñ00.wav -------------------------------------------------------------------------------- /tests/data/tst00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/tst00.wav -------------------------------------------------------------------------------- /tests/data/tst01.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tests/data/tst01.wav -------------------------------------------------------------------------------- /tests/inference_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import pytorch_lightning as pl 4 | from pyannote.core import SlidingWindowFeature 5 | from pyannote.database import FileFinder, get_protocol 6 | 7 | from pyannote.audio import Inference, Model 8 | from pyannote.audio.core.task import Resolution 9 | from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel 10 | from pyannote.audio.tasks import VoiceActivityDetection 11 | 12 | HF_SAMPLE_MODEL_ID = "pyannote/ci-segmentation" 13 | 14 | 15 | def test_hf_download_inference(): 16 | inference = Inference(HF_SAMPLE_MODEL_ID, device="cpu") 17 | assert isinstance(inference, Inference) 18 | 19 | 20 | def test_hf_download_model(): 21 | model = Model.from_pretrained(HF_SAMPLE_MODEL_ID) 22 | assert isinstance(model, Model) 23 | 24 | 25 | @pytest.fixture() 26 | def trained(): 27 | protocol = get_protocol( 28 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 29 | ) 30 | vad = VoiceActivityDetection(protocol, duration=2.0, 
batch_size=16, num_workers=4) 31 | model = SimpleSegmentationModel(task=vad) 32 | trainer = pl.Trainer(fast_dev_run=True, accelerator="cpu") 33 | trainer.fit(model) 34 | return protocol, model 35 | 36 | 37 | @pytest.fixture() 38 | def pretrained_model(): 39 | return Model.from_pretrained(HF_SAMPLE_MODEL_ID) 40 | 41 | 42 | @pytest.fixture() 43 | def dev_file(): 44 | protocol = get_protocol( 45 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 46 | ) 47 | return next(protocol.development()) 48 | 49 | 50 | def test_duration_warning(trained): 51 | protocol, model = trained 52 | with pytest.warns(UserWarning): 53 | duration = model.specifications.duration 54 | new_duration = duration + 1 55 | Inference(model, duration=new_duration, step=0.1, batch_size=128) 56 | 57 | 58 | def test_step_check_warning(trained): 59 | protocol, model = trained 60 | with pytest.raises(ValueError): 61 | duration = model.specifications.duration 62 | Inference(model, step=duration + 1, batch_size=128) 63 | 64 | 65 | def test_invalid_window_fails(trained): 66 | protocol, model = trained 67 | with pytest.raises(ValueError): 68 | Inference(model, window="unknown") 69 | 70 | 71 | def test_invalid_resolution_fails(trained): 72 | protocol, model = trained 73 | with pytest.warns(UserWarning): 74 | model.specifications.resolution = Resolution.FRAME 75 | Inference(model, window="whole", batch_size=128) 76 | 77 | 78 | def test_whole_window_slide(trained): 79 | protocol, model = trained 80 | inference = Inference(model, window="whole", batch_size=128) 81 | dev_file = next(protocol.development()) 82 | output = inference(dev_file) 83 | assert isinstance(output, np.ndarray) 84 | 85 | 86 | def test_on_file_path(trained): 87 | protocol, model = trained 88 | inference = Inference(model, batch_size=128) 89 | output = inference("tests/data/dev00.wav") 90 | assert isinstance(output, SlidingWindowFeature) 91 | 92 | 93 | def test_skip_aggregation(pretrained_model, dev_file): 94 | inference = Inference(pretrained_model, skip_aggregation=True) 95 | scores = inference(dev_file) 96 | assert len(scores.data.shape) == 3 97 | -------------------------------------------------------------------------------- /tests/io_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from pyannote.core import Segment 4 | from torch import Tensor 5 | 6 | from pyannote.audio.core.io import Audio 7 | 8 | 9 | def test_audio_resample(): 10 | "Audio is correctly resampled when it is not at the requested sample rate" 11 | test_file = "tests/data/dev00.wav" 12 | info = torchaudio.info(test_file) 13 | old_sr = info.sample_rate 14 | loader = Audio(sample_rate=old_sr // 2, mono="downmix") 15 | wav, sr = loader(test_file) 16 | assert isinstance(wav, Tensor) 17 | assert sr == old_sr // 2 18 | 19 | 20 | def test_basic_load_with_defaults(): 21 | test_file = "tests/data/dev00.wav" 22 | loader = Audio(mono="downmix") 23 | wav, sr = loader(test_file) 24 | assert isinstance(wav, Tensor) 25 | 26 | 27 | def test_correct_audio_channel(): 28 | "When we specify an audio channel, it is chosen correctly" 29 | waveform = torch.rand(2, 16000 * 2) 30 | loader = Audio(mono="downmix") 31 | wav, sr = loader({"waveform": waveform, "sample_rate": 16000, "channel": 1}) 32 | assert torch.equal(wav, waveform[1:2]) 33 | assert sr == 16000 34 | 35 | 36 | def test_can_load_with_waveform(): 37 | "We can load a raw waveform" 38 | waveform = torch.rand(2, 16000 * 2) 39 | loader = Audio(mono="downmix") 40 |
wav, sr = loader({"waveform": waveform, "sample_rate": 16000}) 41 | assert isinstance(wav, Tensor) 42 | assert sr == 16000 43 | 44 | 45 | def test_can_crop(): 46 | "Cropping works when we give a Segment" 47 | test_file = "tests/data/dev00.wav" 48 | loader = Audio(mono="downmix") 49 | segment = Segment(0.2, 0.7) 50 | wav, sr = loader.crop(test_file, segment) 51 | assert wav.shape[1] / sr == 0.5 52 | 53 | 54 | def test_can_crop_waveform(): 55 | "Cropping works on raw waveforms" 56 | waveform = torch.rand(1, 16000 * 2) 57 | loader = Audio(mono="downmix") 58 | segment = Segment(0.2, 0.7) 59 | wav, sr = loader.crop({"waveform": waveform, "sample_rate": 16000}, segment) 60 | assert isinstance(wav, Tensor) 61 | assert sr == 16000 62 | 63 | 64 | # File Like Object Tests 65 | def test_can_load_from_file_like(): 66 | "Load entire wav of file like" 67 | loader = Audio(mono="downmix") 68 | 69 | with open("tests/data/dev00.wav", "rb") as f: 70 | wav, sr = loader(f) 71 | 72 | assert isinstance(wav, Tensor) 73 | assert sr == 16000 74 | 75 | 76 | def test_can_crop_from_file_like(): 77 | "Load cropped sections from file like objects" 78 | loader = Audio(mono="downmix") 79 | 80 | with open("tests/data/dev00.wav", "rb") as f: 81 | segment = Segment(0.2, 0.7) 82 | wav, sr = loader.crop(f, segment) 83 | 84 | assert isinstance(wav, Tensor) 85 | assert sr == 16000 86 | assert wav.shape[1] == 0.5 * 16000 87 | -------------------------------------------------------------------------------- /tests/tasks/test_reproducibility.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lightning.pytorch import seed_everything 3 | from pyannote.database import FileFinder, get_protocol 4 | 5 | from pyannote.audio.models.segmentation.debug import SimpleSegmentationModel 6 | from pyannote.audio.tasks import VoiceActivityDetection 7 | 8 | 9 | def setup_tasks(task): 10 | protocol = get_protocol( 11 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 12 | ) 13 | vad = task(protocol, duration=0.2, batch_size=32, num_workers=4) 14 | return protocol, vad 15 | 16 | 17 | def create_dl(model, task): 18 | m = model(task=task) 19 | m.prepare_data() 20 | m.setup() 21 | return task.train_dataloader() 22 | 23 | 24 | def get_next5(dl): 25 | last5 = [] 26 | it = iter(dl) 27 | for i in range(5): 28 | last5.append(next(it)) 29 | return last5 30 | 31 | 32 | def test_seeding_ensures_data_loaders(): 33 | "Setting a global seed for the dataloaders ensures that we get data back in the same order" 34 | 35 | seed_everything(1) 36 | protocol, vad = setup_tasks(VoiceActivityDetection) 37 | dl = create_dl(SimpleSegmentationModel, vad) 38 | last5a = get_next5(dl) 39 | 40 | seed_everything(1) 41 | protocol, vad = setup_tasks(VoiceActivityDetection) 42 | dl = create_dl(SimpleSegmentationModel, vad) 43 | last5b = get_next5(dl) 44 | 45 | for i in range(len(last5b)): 46 | assert torch.equal(last5a[i]["X"], last5b[i]["X"]) 47 | 48 | 49 | def test_different_seeds(): 50 | "Changing the global seed will change the order of the data that loads" 51 | 52 | protocol, vad = setup_tasks(VoiceActivityDetection) 53 | seed_everything(4) 54 | dl = create_dl(SimpleSegmentationModel, vad) 55 | last5a = get_next5(dl) 56 | 57 | protocol, vad = setup_tasks(VoiceActivityDetection) 58 | seed_everything(5) 59 | dl = create_dl(SimpleSegmentationModel, vad) 60 | last5b = get_next5(dl) 61 | 62 | for i in range(5): 63 | assert not torch.equal(last5a[i]["X"], last5b[i]["X"]) 64 | 
-------------------------------------------------------------------------------- /tests/tasks/test_specifications.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyannote.database import FileFinder, get_protocol 3 | 4 | from pyannote.audio.core.model import Model 5 | from pyannote.audio.core.task import UnknownSpecificationsError 6 | from pyannote.audio.tasks import SpeakerDiarization 7 | 8 | 9 | @pytest.fixture() 10 | def protocol(): 11 | return get_protocol( 12 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 13 | ) 14 | 15 | 16 | def test_unknown_specifications_error_raised_on_non_setup_task(protocol): 17 | task = SpeakerDiarization(protocol=protocol) 18 | with pytest.raises(UnknownSpecificationsError): 19 | _ = task.specifications 20 | 21 | 22 | def test_unknown_specifications_error_raised_on_non_setup_model_task(protocol): 23 | task = SpeakerDiarization(protocol=protocol) 24 | model = Model.from_pretrained("pyannote/ci-segmentation") 25 | model.task = task 26 | with pytest.raises(UnknownSpecificationsError): 27 | _ = model.specifications 28 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
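#
# The tests below drive the Hydra-based command line interface by spawning the
# `pyannote-audio-train` and `pyannote-audio-eval` console scripts in a
# subprocess. Each call is equivalent to a shell command along these lines
# (a sketch; the registry, protocol and model values come from the fixtures
# defined below):
#
#   pyannote-audio-train model=DebugSegmentation task=VoiceActivityDetection \
#       +registry=./tests/data/database.yml \
#       protocol=Debug.SpeakerDiarization.Debug trainer=fast_dev_run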
22 | 23 | import subprocess 24 | 25 | import pytest 26 | from pyannote.database import FileFinder, get_protocol 27 | 28 | 29 | @pytest.fixture() 30 | def protocol(): 31 | return get_protocol( 32 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 33 | ) 34 | 35 | 36 | @pytest.fixture() 37 | def database(): 38 | return "./tests/data/database.yml" 39 | 40 | 41 | @pytest.fixture() 42 | def model(): 43 | return "pyannote/ci-segmentation" 44 | 45 | 46 | def test_cli_train_vad(database, protocol): 47 | res = subprocess.run( 48 | [ 49 | "pyannote-audio-train", 50 | "model=DebugSegmentation", 51 | "task=VoiceActivityDetection", 52 | f"+registry={database}", 53 | f"protocol={protocol.name}", 54 | "trainer=fast_dev_run", 55 | "hydra.run.dir=.", # run hydra app in current directory 56 | "hydra.output_subdir=null", # disable hydra outputs 57 | "hydra/hydra_logging=disabled", 58 | "hydra/job_logging=disabled", 59 | ] 60 | ) 61 | assert res.returncode == 0 62 | 63 | 64 | def test_cli_train_segmentation(database, protocol): 65 | res = subprocess.run( 66 | [ 67 | "pyannote-audio-train", 68 | "model=DebugSegmentation", 69 | "task=SpeakerDiarization", 70 | f"+registry={database}", 71 | f"protocol={protocol.name}", 72 | "trainer=fast_dev_run", 73 | "hydra.run.dir=.", # run hydra app in current directory 74 | "hydra.output_subdir=null", # disable hydra outputs 75 | "hydra/hydra_logging=disabled", 76 | "hydra/job_logging=disabled", 77 | ] 78 | ) 79 | assert res.returncode == 0 80 | 81 | 82 | def test_cli_train_osd(database, protocol): 83 | res = subprocess.run( 84 | [ 85 | "pyannote-audio-train", 86 | "model=DebugSegmentation", 87 | "task=OverlappedSpeechDetection", 88 | f"+registry={database}", 89 | f"protocol={protocol.name}", 90 | "trainer=fast_dev_run", 91 | "hydra.run.dir=.", # run hydra app in current directory 92 | "hydra.output_subdir=null", # disable hydra outputs 93 | "hydra/hydra_logging=disabled", 94 | "hydra/job_logging=disabled", 95 | ] 96 | ) 97 | assert res.returncode == 0 98 | 99 | 100 | def test_cli_train_supervised_representation_with_arcface(database, protocol): 101 | res = subprocess.run( 102 | [ 103 | "pyannote-audio-train", 104 | "model=DebugEmbedding", 105 | "task=SpeakerEmbedding", 106 | f"+registry={database}", 107 | f"protocol={protocol.name}", 108 | "trainer=fast_dev_run", 109 | "hydra.run.dir=.", # run hydra app in current directory 110 | "hydra.output_subdir=null", # disable hydra outputs 111 | "hydra/hydra_logging=disabled", 112 | "hydra/job_logging=disabled", 113 | ] 114 | ) 115 | assert res.returncode == 0 116 | 117 | 118 | def test_cli_train_segmentation_with_pyannet(database, protocol): 119 | res = subprocess.run( 120 | [ 121 | "pyannote-audio-train", 122 | "model=PyanNet", 123 | "task=SpeakerDiarization", 124 | f"+registry={database}", 125 | f"protocol={protocol.name}", 126 | "trainer=fast_dev_run", 127 | "hydra.run.dir=.", # run hydra app in current directory 128 | "hydra.output_subdir=null", # disable hydra outputs 129 | "hydra/hydra_logging=disabled", 130 | "hydra/job_logging=disabled", 131 | ] 132 | ) 133 | assert res.returncode == 0 134 | 135 | 136 | def test_cli_eval_segmentation_model(database, protocol, model): 137 | res = subprocess.run( 138 | [ 139 | "pyannote-audio-eval", 140 | f"model={model}", 141 | f"+registry={database}", 142 | f"protocol={protocol.name}", 143 | "hydra.run.dir=.", # run hydra app in current directory 144 | "hydra.output_subdir=null", # disable hydra outputs 145 | "hydra/hydra_logging=disabled", 146 | 
"hydra/job_logging=disabled", 147 | ] 148 | ) 149 | assert res.returncode == 0 150 | -------------------------------------------------------------------------------- /tests/test_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyannote.audio.pipelines.clustering import AgglomerativeClustering 4 | 5 | 6 | def test_agglomerative_clustering_num_cluster(): 7 | """ 8 | Make sure AgglomerativeClustering doesn't "over-merge" clusters when initial 9 | clustering already matches target num_clusters, cf 10 | https://github.com/pyannote/pyannote-audio/issues/1525 11 | """ 12 | 13 | # 2 embeddings different enough 14 | embeddings = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 1.0, 2.0]]) 15 | 16 | # clustering with params that should yield 1 cluster per embedding 17 | clustering = AgglomerativeClustering().instantiate( 18 | { 19 | "method": "centroid", 20 | "min_cluster_size": 0, 21 | "threshold": 0.0, 22 | } 23 | ) 24 | 25 | # request 2 clusters 26 | clusters = clustering.cluster( 27 | embeddings=embeddings, min_clusters=2, max_clusters=2, num_clusters=2 28 | ) 29 | assert np.array_equal(clusters, np.array([0, 1])) 30 | -------------------------------------------------------------------------------- /tests/test_import_lib.py: -------------------------------------------------------------------------------- 1 | from pyannote.audio.core.model import Model 2 | 3 | 4 | def test_import_lib(): 5 | """This is a dummy test, just to check 6 | if the lib can be successfully imported. 7 | """ 8 | assert Model is not None 9 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import pytest 24 | import torch 25 | 26 | from pyannote.audio.torchmetrics.functional.audio.diarization_error_rate import ( 27 | _der_update, 28 | diarization_error_rate, 29 | ) 30 | 31 | 32 | @pytest.fixture 33 | def target(): 34 | chunk1 = [[0, 0], [1, 0], [1, 0], [1, 1], [1, 1], [0, 1], [0, 1]] 35 | chunk2 = [[0, 0], [0, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, 0]] 36 | return torch.tensor([chunk1, chunk2], dtype=torch.float32).transpose(2, 1) 37 | 38 | 39 | @pytest.fixture 40 | def prediction(): 41 | chunk1 = [[0, 0], [1, 0], [0, 0], [1, 1], [0, 1], [1, 1], [1, 0]] 42 | chunk2 = [[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 1], [1, 0]] 43 | return torch.tensor([chunk1, chunk2], dtype=torch.float32).transpose(2, 1) 44 | 45 | 46 | def test_frame_reduction(target, prediction): 47 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 48 | prediction, target, reduce="frame" 49 | ) 50 | 51 | torch.testing.assert_close( 52 | false_alarm, 53 | torch.Tensor( 54 | [[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]] 55 | ), 56 | ) 57 | 58 | torch.testing.assert_close( 59 | missed_detection, 60 | torch.Tensor( 61 | [ 62 | [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0], 63 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 64 | ] 65 | ), 66 | ) 67 | 68 | torch.testing.assert_close( 69 | speaker_confusion, 70 | torch.Tensor( 71 | [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 72 | ), 73 | ) 74 | 75 | torch.testing.assert_close( 76 | speech_total, 77 | torch.Tensor( 78 | [[0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]] 79 | ), 80 | ) 81 | 82 | 83 | def test_chunk_reduction(target, prediction): 84 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 85 | prediction, target, reduce="chunk" 86 | ) 87 | 88 | torch.testing.assert_close( 89 | false_alarm, 90 | torch.Tensor([1.0, 2.0]), 91 | ) 92 | 93 | torch.testing.assert_close( 94 | missed_detection, 95 | torch.Tensor([2.0, 0.0]), 96 | ) 97 | 98 | torch.testing.assert_close( 99 | speaker_confusion, 100 | torch.Tensor([1.0, 0.0]), 101 | ) 102 | 103 | torch.testing.assert_close( 104 | speech_total, 105 | torch.Tensor([8.0, 4.0]), 106 | ) 107 | 108 | 109 | def test_batch_reduction(target, prediction): 110 | false_alarm, missed_detection, speaker_confusion, speech_total = _der_update( 111 | prediction, target, reduce="batch" 112 | ) 113 | torch.testing.assert_close(false_alarm.item(), 3.0) 114 | torch.testing.assert_close(missed_detection.item(), 2.0) 115 | torch.testing.assert_close(speaker_confusion.item(), 1.0) 116 | torch.testing.assert_close(speech_total.item(), 12.0) 117 | 118 | 119 | def test_batch_der(target, prediction): 120 | der = diarization_error_rate(prediction, target, reduce="batch") 121 | torch.testing.assert_close(der.item(), (3.0 + 2.0 + 1.0) / 12.0) 122 | 123 | 124 | def test_batch_der_with_components(target, prediction): 125 | der, ( 126 | false_alarm, 127 | missed_detection, 128 | speaker_confusion, 129 | speech_total, 130 | ) = diarization_error_rate( 131 | prediction, target, reduce="batch", return_components=True 132 | ) 133 | torch.testing.assert_close(der.item(), (3.0 + 2.0 + 1.0) / 12.0) 134 | torch.testing.assert_close(false_alarm.item(), 3.0) 135 | torch.testing.assert_close(missed_detection.item(), 2.0) 136 | torch.testing.assert_close(speaker_confusion.item(), 1.0) 137 | torch.testing.assert_close(speech_total.item(), 12.0) 138 | 139 | 140 | def test_chunk_der(target, prediction): 141 | der = 
diarization_error_rate(prediction, target, reduce="chunk") 142 | torch.testing.assert_close(der, torch.Tensor([4.0 / 8.0, 2.0 / 4.0])) 143 | -------------------------------------------------------------------------------- /tests/test_run_notebooks.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | 3 | import papermill as pm 4 | 5 | 6 | def test_can_run_notebooks(): 7 | # Recursively search for all notebooks under any notebook/ directory 8 | notebooks = glob("**/notebook/**/*.ipynb", recursive=True) 9 | for nb in notebooks: 10 | try: 11 | pm.execute_notebook( 12 | nb, "/dev/null", progress_bar=False, kernel_name="python" 13 | ) 14 | except Exception as e: 15 | # re-raise, recording which notebook caused the error 16 | raise RuntimeError(f"failed to execute notebook {nb}") from e 17 | -------------------------------------------------------------------------------- /tests/test_sample.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2024- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def test_sample(): 25 | from pyannote.audio.sample import SAMPLE_FILE 26 | 27 | assert "annotation" in SAMPLE_FILE 28 | assert "annotated" in SAMPLE_FILE 29 | -------------------------------------------------------------------------------- /tests/test_speechbrain.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import pytest 3 | from speechbrain.inference import EncoderClassifier 4 | 5 | 6 | @pytest.fixture() 7 | def cache(): 8 | return tempfile.mkdtemp() 9 | 10 | def test_import_speechbrain_encoder_classifier(cache): 11 | """This is a simple test that checks if speechbrain 12 | EncoderClassifier can be imported. It does not check 13 | if the model is working properly.
14 | """ 15 | 16 | model = EncoderClassifier.from_hparams( 17 | source="speechbrain/spkrec-ecapa-voxceleb", 18 | savedir=cache, 19 | ) 20 | assert isinstance(model, EncoderClassifier) 21 | -------------------------------------------------------------------------------- /tests/test_stats_pool.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import torch 24 | 25 | from pyannote.audio.models.blocks.pooling import StatsPool 26 | 27 | 28 | def test_stats_pool_weightless(): 29 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 30 | # (batch = 2, features = 2, frames = 2) 31 | 32 | stats_pool = StatsPool() 33 | 34 | y = stats_pool(x) 35 | # (batch = 2, features = 4) 36 | 37 | assert torch.equal( 38 | torch.round(y, decimals=4), 39 | torch.Tensor([[3.0, 3.0, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]), 40 | ) 41 | 42 | 43 | def test_stats_pool_one_speaker(): 44 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 45 | # (batch = 2, features = 2, frames = 2) 46 | 47 | w = torch.Tensor( 48 | [ 49 | [0.5, 0.01], 50 | [0.2, 0.1], 51 | ] 52 | ) 53 | # (batch = 2, frames = 2) 54 | 55 | stats_pool = StatsPool() 56 | 57 | y = stats_pool(x, weights=w) 58 | # (batch = 2, features = 4) 59 | 60 | assert torch.equal( 61 | torch.round(y, decimals=4), 62 | torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]), 63 | ) 64 | 65 | 66 | def test_stats_pool_multi_speaker(): 67 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 68 | # (batch = 2, features = 2, frames = 2) 69 | 70 | w = torch.Tensor([[[0.1, 0.2], [0.2, 0.3]], [[0.001, 0.001], [0.2, 0.3]]]) 71 | # (batch = 2, speakers = 2, frames = 2) 72 | 73 | stats_pool = StatsPool() 74 | 75 | y = stats_pool(x, weights=w) 76 | # (batch = 2, speakers = 2, features = 4) 77 | 78 | assert torch.equal( 79 | torch.round(y, decimals=4), 80 | torch.Tensor( 81 | [ 82 | [[3.3333, 3.3333, 1.4142, 1.4142], [3.2, 3.2, 1.4142, 1.4142]], 83 | [[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]], 84 | ] 85 | ), 86 | ) 87 | 88 | 89 | def test_stats_pool_frame_mismatch(): 90 | x = torch.Tensor([[[2.0, 2.0], [2.0, 2.0]], [[1.0, 1.0], [1.0, 1.0]]]) 91 | # (batch = 2, features = 2, frames = 2) 92 | 93 | stats_pool = StatsPool() 94 | w = torch.Tensor( 95 | [ 96 | [0.5, 0.5, 0.0], 97 | [0.0, 0.5, 0.5], 98 
| ] 99 | ) 100 | # (batch = 2, frames = 3) 101 | 102 | y = stats_pool(x, weights=w) 103 | # (batch = 2, features = 4) 104 | 105 | assert torch.equal( 106 | torch.round(y, decimals=4), 107 | torch.Tensor([[2.0, 2.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]), 108 | ) 109 | 110 | 111 | def test_stats_pool_all_zero_weights(): 112 | x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]]) 113 | # (batch = 2, features = 2, frames = 2) 114 | 115 | w = torch.Tensor( 116 | [ 117 | [0.5, 0.01], 118 | [0.0, 0.0], # all zero weights 119 | ] 120 | ) 121 | # (batch = 2, frames = 2) 122 | 123 | stats_pool = StatsPool() 124 | 125 | y = stats_pool(x, weights=w) 126 | # (batch = 2, features = 4) 127 | 128 | assert torch.equal( 129 | torch.round(y, decimals=4), 130 | torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [0.0, 0.0, 0.0, 0.0]]), 131 | ) 132 | -------------------------------------------------------------------------------- /tests/utils/preview.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from IPython.display import Audio 3 | 4 | from pyannote.audio.utils.preview import listen 5 | from pyannote.core import Segment 6 | from pyannote.database import FileFinder, get_protocol 7 | 8 | 9 | def test_file(): 10 | protocol = get_protocol( 11 | "Debug.SpeakerDiarization.Debug", preprocessors={"audio": FileFinder()} 12 | ) 13 | return next(protocol.train()) 14 | 15 | 16 | def test_returns_audio_object(): 17 | audio_file = test_file() 18 | ipython_audio = listen(audio_file) 19 | assert isinstance(ipython_audio, Audio) 20 | 21 | 22 | def test_can_crop(): 23 | audio_file = test_file() 24 | listen(audio_file, Segment(0, 1)) 25 | 26 | 27 | def test_fail_crop_too_large(): 28 | with pytest.raises(ValueError): 29 | audio_file = test_file() 30 | duration = audio_file.duration 31 | listen(audio_file, Segment(0, duration * 2)) 32 | -------------------------------------------------------------------------------- /tests/utils/probe_util_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from pyannote.audio.utils.probe import probe 5 | 6 | 7 | class Trunk(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.layer1 = nn.Linear(1, 2) 11 | self.layer2 = nn.Linear(2, 3) 12 | self.layer3 = nn.Linear(3, 4) 13 | 14 | def forward(self, x): 15 | return self.layer3(self.layer2(self.layer1(x))) 16 | 17 | 18 | def test_probe_dict(): 19 | trunk = Trunk() 20 | probe(trunk, {"probe1": "layer1"}) 21 | out = trunk( 22 | torch.ones( 23 | 1, 24 | ) 25 | ) 26 | assert isinstance(out, dict) 27 | assert len(out.keys()) == 1 28 | assert isinstance(out["probe1"], torch.Tensor) 29 | 30 | 31 | def test_probe_output(): 32 | trunk = Trunk() 33 | probe(trunk, {"probe1": "layer3"}) 34 | out = trunk( 35 | torch.ones( 36 | 1, 37 | ) 38 | ) 39 | out = out["probe1"] 40 | tout = trunk.layer3( 41 | trunk.layer2( 42 | trunk.layer1( 43 | torch.ones( 44 | 1, 45 | ) 46 | ) 47 | ) 48 | ) 49 | assert torch.equal(tout, out) 50 | 51 | 52 | def test_probe_revert(): 53 | trunk = Trunk() 54 | revert = probe(trunk, {"probe1": "layer3"}) 55 | out = trunk( 56 | torch.ones( 57 | 1, 58 | ) 59 | ) 60 | assert isinstance(out, dict) 61 | revert() 62 | out = trunk( 63 | torch.ones( 64 | 1, 65 | ) 66 | ) 67 | assert isinstance(out, torch.Tensor) 68 | 69 | 70 | def test_probe_array(): 71 | trunk = Trunk() 72 | probe(trunk, ["layer3"]) 73 | out = trunk( 74 | torch.ones( 75 | 1, 76 | ) 77 | ) 78 | assert 
isinstance(out, dict) 79 | -------------------------------------------------------------------------------- /tests/utils/test_permutation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from pyannote.audio.utils.permutation import permutate 5 | 6 | 7 | def test_permutate_torch(): 8 | 9 | num_frames, num_speakers = 10, 3 10 | 11 | actual_permutations = [ 12 | (0, 1, 2), 13 | (0, 2, 1), 14 | (1, 0, 2), 15 | (1, 2, 0), 16 | (2, 0, 1), 17 | (2, 1, 0), 18 | ] 19 | batch_size = len(actual_permutations) 20 | 21 | y2 = torch.randn((num_frames, num_speakers)) 22 | y1 = torch.zeros((batch_size, num_frames, num_speakers)) 23 | 24 | for p, permutation in enumerate(actual_permutations): 25 | y1[p] = y2[:, permutation] 26 | 27 | permutated_y2, permutations = permutate(y1, y2) 28 | assert actual_permutations == permutations 29 | 30 | for p, permutation in enumerate(actual_permutations): 31 | np.testing.assert_allclose(permutated_y2[p], y2[:, permutation]) 32 | 33 | 34 | def test_permutate_numpy(): 35 | 36 | num_frames, num_speakers = 10, 3 37 | 38 | actual_permutations = [ 39 | (0, 1, 2), 40 | (0, 2, 1), 41 | (1, 0, 2), 42 | (1, 2, 0), 43 | (2, 0, 1), 44 | (2, 1, 0), 45 | ] 46 | batch_size = len(actual_permutations) 47 | 48 | y2 = np.random.randn(num_frames, num_speakers) 49 | y1 = np.zeros((batch_size, num_frames, num_speakers)) 50 | 51 | for p, permutation in enumerate(actual_permutations): 52 | y1[p] = y2[:, permutation] 53 | 54 | permutated_y2, permutations = permutate(y1, y2) 55 | assert actual_permutations == permutations 56 | 57 | for p, permutation in enumerate(actual_permutations): 58 | np.testing.assert_allclose(permutated_y2[p], y2[:, permutation]) 59 | 60 | 61 | def test_permutate_less_speakers(): 62 | 63 | num_frames = 10 64 | 65 | actual_permutations = [ 66 | (0, 1, None), 67 | (0, None, 1), 68 | (1, 0, None), 69 | (1, None, 0), 70 | (None, 0, 1), 71 | (None, 1, 0), 72 | ] 73 | batch_size = len(actual_permutations) 74 | 75 | y2 = np.random.randn(num_frames, 2) 76 | y1 = np.zeros((batch_size, num_frames, 3)) 77 | 78 | for p, permutation in enumerate(actual_permutations): 79 | for i, j in enumerate(permutation): 80 | if j is not None: 81 | y1[p, :, i] = y2[:, j] 82 | 83 | permutated_y2, permutations = permutate(y1, y2) 84 | 85 | assert permutations == actual_permutations 86 | 87 | 88 | def test_permutate_more_speakers(): 89 | 90 | num_frames = 10 91 | 92 | actual_permutations = [ 93 | (0, 1), 94 | (0, 2), 95 | (1, 0), 96 | (1, 2), 97 | (2, 0), 98 | (2, 1), 99 | ] 100 | batch_size = len(actual_permutations) 101 | 102 | y2 = np.random.randn(num_frames, 3) 103 | y1 = np.zeros((batch_size, num_frames, 2)) 104 | 105 | for p, permutation in enumerate(actual_permutations): 106 | for i, j in enumerate(permutation): 107 | y1[p, :, i] = y2[:, j] 108 | 109 | permutated_y2, permutations = permutate(y1, y2) 110 | 111 | assert permutations == actual_permutations 112 | np.testing.assert_allclose(permutated_y2, y1) 113 | -------------------------------------------------------------------------------- /tests/utils/test_powerset.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2023- CNRS 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | import torch 25 | 26 | from pyannote.audio.utils.powerset import Powerset 27 | 28 | 29 | def test_roundtrip(): 30 | for num_classes in range(2, 5): 31 | for max_set_size in range(1, num_classes + 1): 32 | powerset = Powerset(num_classes, max_set_size) 33 | 34 | # simulate a sequence where each frame is assigned to a different powerset class 35 | one_sequence = [ 36 | [0] * powerset.num_powerset_classes 37 | for _ in range(powerset.num_powerset_classes) 38 | ] 39 | for i in range(powerset.num_powerset_classes): 40 | one_sequence[i][i] = 1.0 41 | 42 | # make a batch out of this sequence and the same sequence in reverse order 43 | batch_powerset = torch.tensor([one_sequence, one_sequence[::-1]]) 44 | 45 | # convert from powerset to multi-label 46 | batch_multilabel = powerset.to_multilabel(batch_powerset) 47 | 48 | # convert batch back to powerset 49 | reconstruction = powerset.to_powerset(batch_multilabel) 50 | 51 | assert torch.equal(batch_powerset, reconstruction) 52 | 53 | 54 | def test_permutate_powerset(): 55 | for num_classes in range(1, 6): 56 | for max_set_size in range(1, num_classes + 1): 57 | powerset = Powerset(num_classes, max_set_size) 58 | 59 | # create (num_powerset_class, num_powerset_class)-shaped tensor, where each frame is assigned to a different powerset class 60 | # and convert it to its multi-label equivalent 61 | t1 = torch.nn.functional.one_hot( 62 | torch.arange(powerset.num_powerset_classes), 63 | powerset.num_powerset_classes, 64 | ) 65 | t1_ml = powerset.to_multilabel(t1) 66 | 67 | # then permutate the powerset class in powerset space AND the multilabel equivalent in its native space 68 | # and check it has the same result. 
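# For example, with num_classes=2 and max_set_size=2, the powerset classes
# are {}, {0}, {1} and {0, 1}; swapping speakers 0 and 1 in multi-label space
# must map powerset class {0} to {1} (and conversely) while leaving {} and
# {0, 1} unchanged. This is the correspondence that `permutation_mapping`
# encodes.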
69 | # perm = torch.randperm(num_classes) 70 | perm = tuple(torch.randperm(num_classes).tolist()) 71 | t1_ml_perm = t1_ml[:, perm] 72 | perm_ps = powerset.permutation_mapping[perm] 73 | t1_ps_perm = t1[..., perm_ps] 74 | t1_ps_perm_ml = powerset.to_multilabel(t1_ps_perm) 75 | 76 | assert t1_ml_perm.equal(t1_ps_perm_ml) 77 | -------------------------------------------------------------------------------- /tutorials/assets/download-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/download-model.png -------------------------------------------------------------------------------- /tutorials/assets/download-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/download-pipeline.png -------------------------------------------------------------------------------- /tutorials/assets/prodigy-pyannote.audio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/prodigy-pyannote.audio.png -------------------------------------------------------------------------------- /tutorials/assets/pyannote.diff.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/pyannote.diff.PNG -------------------------------------------------------------------------------- /tutorials/assets/pyannote.review.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/pyannote.review.PNG -------------------------------------------------------------------------------- /tutorials/assets/sample.rttm: -------------------------------------------------------------------------------- 1 | SPEAKER sample 1 6.690 0.430 speaker90 2 | SPEAKER sample 1 7.550 0.800 speaker91 3 | SPEAKER sample 1 8.320 1.700 speaker90 4 | SPEAKER sample 1 9.920 1.110 speaker91 5 | SPEAKER sample 1 10.570 4.130 speaker90 6 | SPEAKER sample 1 14.490 3.430 speaker91 7 | SPEAKER sample 1 18.050 3.440 speaker90 8 | SPEAKER sample 1 18.150 0.440 speaker91 9 | SPEAKER sample 1 21.780 6.720 speaker91 10 | SPEAKER sample 1 27.850 2.150 speaker90 11 | -------------------------------------------------------------------------------- /tutorials/assets/sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyannote/pyannote-audio/240a7f3ef60bc613169df860b536b10e338dbf3c/tutorials/assets/sample.wav -------------------------------------------------------------------------------- /tutorials/speaker_verification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Speaker verification\n", 9 | "\n", 10 | "```python\n", 11 | "import torch\n", 12 | "from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding\n", 13 | "model = PretrainedSpeakerEmbedding(\n", 14 | " 
\"speechbrain/spkrec-ecapa-voxceleb\",\n", 15 | " device=torch.device(\"cuda\"))\n", 16 | "\n", 17 | "from pyannote.audio import Audio\n", 18 | "from pyannote.core import Segment\n", 19 | "audio = Audio(sample_rate=16000, mono=\"downmix\")\n", 20 | "\n", 21 | "# extract embedding for a speaker speaking between t=3s and t=6s\n", 22 | "speaker1 = Segment(3., 6.)\n", 23 | "waveform1, sample_rate = audio.crop(\"audio.wav\", speaker1)\n", 24 | "embedding1 = model(waveform1[None])\n", 25 | "\n", 26 | "# extract embedding for a speaker speaking between t=7s and t=12s\n", 27 | "speaker2 = Segment(7., 12.)\n", 28 | "waveform2, sample_rate = audio.crop(\"audio.wav\", speaker2)\n", 29 | "embedding2 = model(waveform2[None])\n", 30 | "\n", 31 | "# compare embeddings using \"cosine\" distance\n", 32 | "from scipy.spatial.distance import cdist\n", 33 | "distance = cdist(embedding1, embedding2, metric=\"cosine\")\n", 34 | "```\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [] 41 | } 42 | ], 43 | "metadata": { 44 | "interpreter": { 45 | "hash": "41379f2c2a4eb17f5ac9a1f5014f4b793a0ead0b6469d8877f81a91eb030f53e" 46 | }, 47 | "kernelspec": { 48 | "display_name": "Python 3.8.2 64-bit ('pyannote': conda)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "name": "python", 54 | "version": "3.8.2" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 3.3.2 2 | --------------------------------------------------------------------------------