├── src
└── audioseal
│ ├── py.typed
│ ├── libs
│ ├── __init__.py
│ └── audiocraft
│ │ ├── __init__.py
│ │ └── modules
│ │ ├── __init__.py
│ │ ├── lstm.py
│ │ ├── conv.py
│ │ └── seanet.py
│ ├── __init__.py
│ ├── cards
│ ├── audioseal_detector_16bits.yaml
│ └── audioseal_wm_16bits.yaml
│ ├── builder.py
│ ├── loader.py
│ └── models.py
├── requirements.txt
├── requirements-dev.txt
├── .github
├── pull_request_template.md
└── workflows
│ └── lint_and_test.yaml
├── CHANGELOG.md
├── .pre-commit-config.yaml
├── LICENSE
├── examples
├── notebook.py
├── attacks.py
├── colab.ipynb
└── attack_benchmarking_example.ipynb
├── tests
└── test_models.py
├── pyproject.toml
├── CONTRIBUTING.md
├── .gitignore
├── CODE_OF_CONDUCT.md
└── README.md

/src/audioseal/py.typed:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | omegaconf
3 | julius
4 | torch>=1.13.0
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # For developers wanting to contribute to AudioSeal
2 | func_argparse
3 | torchaudio
4 | soundfile
5 | pytest
6 | mypy
7 | black
8 | isort
9 | flake8
10 | pre-commit
--------------------------------------------------------------------------------
/src/audioseal/libs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | 
8 | from .seanet import SEANetDecoder, SEANetEncoder, SEANetEncoderKeepDimension
9 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Why?
2 | 
3 | Why do we need to implement this feature? What is the use case?
4 | 
5 | ## How?
6 | 
7 | Document the technical decisions you made.
8 | If some parts are WIP, please make that explicit here.
9 | 
10 | ## Test plan
11 | 
12 | How did you test your changes?
13 | Include the full command line to help other people reproduce your results if needed.
14 | 
--------------------------------------------------------------------------------
/src/audioseal/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | """
8 | Watermarking and detection for speech audio.
9 | 
10 | A PyTorch-based localized algorithm for proactive detection
11 | of watermarks in AI-generated audio, with a very fast
12 | detector.
13 | 
14 | """
15 | 
16 | __version__ = "0.1.2"
17 | 
18 | 
19 | from audioseal import builder
20 | from audioseal.loader import AudioSeal
21 | from audioseal.models import AudioSealDetector, AudioSealWM, MsgProcessor
--------------------------------------------------------------------------------
/src/audioseal/cards/audioseal_detector_16bits.yaml:
--------------------------------------------------------------------------------
1 | # @package __global__
2 | 
3 | name: audioseal_detector_16bits
4 | model_type: seanet
5 | checkpoint: "https://dl.fbaipublicfiles.com/audioseal/6edcf62f/detector.pth"
6 | nbits: 16
7 | seanet:
8 |   activation: ELU
9 |   activation_params:
10 |     alpha: 1.0
11 |   causal: false
12 |   channels: 1
13 |   compress: 2
14 |   dilation_base: 2
15 |   dimension: 128
16 |   disable_norm_outer_blocks: 0
17 |   kernel_size: 7
18 |   last_kernel_size: 7
19 |   lstm: 2
20 |   n_filters: 32
21 |   n_residual_layers: 1
22 |   norm: weight_norm
23 |   norm_params: {}
24 |   pad_mode: constant
25 |   ratios:
26 |     - 8
27 |     - 5
28 |     - 4
29 |     - 2
30 |   residual_kernel_size: 3
31 |   true_skip: true
32 | detector:
33 |   output_dim: 32
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | All notable changes to AudioSeal are documented in this file.
4 | 
5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
6 | 
7 | ## [0.1.2] - 2024-02-29
8 | - Add py.typed to make audioseal mypy-friendly
9 | - Add the option to resample the input audio's sample rate to the expected sample rate of the model (https://github.com/facebookresearch/audioseal/pull/18)
10 | - Move `attacks.py` out of the core audioseal code base
11 | - Remove the duplicate `SEANetEncoderKeepDimension` module shared between `audioseal.libs.audiocraft.modules.seanet` and `audioseal.models`
12 | 
13 | ## [0.1.1] - 2024-02-04
14 | 
15 | - Fix an [issue](https://github.com/facebookresearch/audioseal/issues/7) installing audioseal from PyPI due to a conflict with the audiocraft package
16 | - Fix typos in example notebooks
17 | - Update checkpoint to be Windows-compatible
18 | 
19 | ## [0.1.0] - 2024-02-01
20 | 
21 | - Initial release
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/modules/lstm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 | # Vendored from https://github.com/facebookresearch/audiocraft
8 | 
9 | from torch import nn
10 | 
11 | 
12 | class StreamableLSTM(nn.Module):
13 |     """LSTM without worrying about the hidden state, nor the layout of the data.
14 |     Expects input in convolutional layout.
15 |     """
16 | 
17 |     def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
18 |         super().__init__()
19 |         self.skip = skip
20 |         self.lstm = nn.LSTM(dimension, dimension, num_layers)
21 | 
22 |     def forward(self, x):
23 |         x = x.permute(2, 0, 1)
24 |         y, _ = self.lstm(x)
25 |         if self.skip:
26 |             y = y + x
27 |         y = y.permute(1, 2, 0)
28 |         return y
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 |   python: python3
3 | 
4 | repos:
5 |   - repo: https://github.com/pre-commit/pre-commit-hooks
6 |     rev: v4.1.0
7 |     hooks:
8 |       - id: trailing-whitespace
9 |       - id: check-ast
10 |       - id: check-merge-conflict
11 |       - id: check-added-large-files
12 |         args: ["--maxkb=2000"]
13 |       - id: end-of-file-fixer
14 | 
15 |   - repo: https://github.com/psf/black
16 |     rev: 24.1.1
17 |     hooks:
18 |       - id: black
19 |         language_version: python3.8
20 | 
21 |   - repo: https://github.com/pycqa/isort
22 |     rev: 5.12.0
23 |     hooks:
24 |       - id: isort
25 |         exclude: README.md
26 | 
27 |   - repo: https://github.com/pre-commit/mirrors-prettier
28 |     rev: v2.7.1
29 |     hooks:
30 |       - id: prettier
31 | 
32 |   - repo: https://github.com/pre-commit/mirrors-mypy
33 |     rev: v1.8.0
34 |     hooks:
35 |       - id: mypy
36 |         args: [--ignore-missing-imports]
--------------------------------------------------------------------------------
/src/audioseal/cards/audioseal_wm_16bits.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | name: audioseal_wm_16bits
8 | model_type: seanet
9 | checkpoint: "https://dl.fbaipublicfiles.com/audioseal/6edcf62f/generator.pth"
10 | nbits: 16
11 | seanet:
12 |   activation: ELU
13 |   activation_params:
14 |     alpha: 1.0
15 |   causal: false
16 |   channels: 1
17 |   compress: 2
18 |   dilation_base: 2
19 |   dimension: 128
20 |   disable_norm_outer_blocks: 0
21 |   kernel_size: 7
22 |   last_kernel_size: 7
23 |   lstm: 2
24 |   n_filters: 32
25 |   n_residual_layers: 1
26 |   norm: weight_norm
27 |   norm_params: {}
28 |   pad_mode: constant
29 |   ratios:
30 |     - 8
31 |     - 5
32 |     - 4
33 |     - 2
34 |   residual_kernel_size: 3
35 |   true_skip: true
36 | decoder:
37 |   final_activation: null
38 |   final_activation_params: null
39 |   trim_right_ratio: 1.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Meta Platforms, Inc. and affiliates.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/examples/notebook.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 | # Accompanying code for the notebook.
8 | # Requires matplotlib and Jupyter Notebook to be installed beforehand.
9 | 
10 | import IPython.display as ipd
11 | import matplotlib.pyplot as plt
12 | import torch
13 | 
14 | 
15 | def plot_waveform_and_specgram(waveform, sample_rate, title):
16 |     waveform = waveform.squeeze().detach().cpu().numpy()
17 | 
18 |     num_frames = waveform.shape[-1]
19 |     time_axis = torch.arange(0, num_frames) / sample_rate
20 | 
21 |     figure, (ax1, ax2) = plt.subplots(1, 2)
22 | 
23 |     ax1.plot(time_axis, waveform, linewidth=1)
24 |     ax1.grid(True)
25 |     ax2.specgram(waveform, Fs=sample_rate)
26 | 
27 |     figure.suptitle(f"{title} - Waveform and specgram")
28 |     plt.show()
29 | 
30 | 
31 | def play_audio(waveform, sample_rate):
32 |     if waveform.dim() > 2:
33 |         waveform = waveform.squeeze(0)
34 |     waveform = waveform.detach().cpu().numpy()
35 | 
36 |     num_channels, *_ = waveform.shape
37 |     if num_channels == 1:
38 |         ipd.display(ipd.Audio(waveform[0], rate=sample_rate))
39 |     elif num_channels == 2:
40 |         ipd.display(ipd.Audio((waveform[0], waveform[1]), rate=sample_rate))
41 |     else:
42 |         raise ValueError("Waveforms with more than 2 channels are not supported.")
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | 
8 | import urllib.request
9 | 
10 | import pytest
11 | import torch
12 | import torchaudio
13 | 
14 | from audioseal import AudioSeal
15 | 
16 | 
17 | @pytest.fixture
18 | def example_audio(tmp_path):
19 |     url = "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
20 |     with open(tmp_path / "test.wav", "wb") as f:
21 |         resp = urllib.request.urlopen(url)
22 |         f.write(resp.read())
23 | 
24 |     wav, sr = torchaudio.load(tmp_path / "test.wav")
25 | 
26 |     # Add batch dimension
27 |     yield wav.unsqueeze(0), sr
28 | 
29 | 
30 | def test_detector(example_audio):
31 |     audio, sr = example_audio
32 |     model = AudioSeal.load_generator("audioseal_wm_16bits")
33 | 
34 |     secret_message = torch.randint(0, 2, (1, 16), dtype=torch.int32)
35 |     # The forward call already returns the watermarked audio (input + alpha * watermark)
36 |     watermarked_audio = model(audio, sample_rate=sr, message=secret_message, alpha=0.8)
37 | 
38 |     detector = AudioSeal.load_detector("audioseal_detector_16bits")
39 |     result, message = detector.detect_watermark(watermarked_audio, sample_rate=sr)  # noqa
40 | 
41 |     # Due to non-deterministic decoding, the decoded message may differ from the original
42 |     print(f"\nOriginal message: {secret_message}")
43 |     print(f"Decoded message: {message}")
44 |     print(
45 |         "Matching bits in decoded and original messages: "
46 |         f"{torch.count_nonzero(torch.eq(message, secret_message)).item()}\n"
47 |     )
48 |     assert result > 0.5
49 | 
50 |     # Try to detect the unwatermarked audio
51 |     result, _ = detector.detect_watermark(audio, sample_rate=sr)  # noqa
52 |     assert result < 0.5
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["flit_core >=3.2,<4", "packaging~=23.1", "setuptools~=67.8", "wheel~=0.40"]
3 | build-backend = "flit_core.buildapi"
4 | 
5 | [project]
6 | name = "audioseal"
7 | readme = "README.md"
8 | authors = [{name = "Facebook AI Research"}]
9 | requires-python = ">=3.8"
10 | dynamic = ["version", "description"]
11 | 
12 | 
13 | # zip_safe = false
14 | classifiers = [
15 |     "License :: OSI Approved :: MIT License",
16 |     "Topic :: Scientific/Engineering",
17 |     "Development Status :: 4 - Beta",
18 | ]
19 | 
20 | dependencies = [
21 |     "numpy",
22 |     "omegaconf",
23 |     "julius",
24 |     "torch>=1.13.0",
25 | ]
26 | 
27 | [project.urls]
28 | Source = "https://github.com/facebookresearch/audioseal"
29 | Tracker = "https://github.com/facebookresearch/audioseal/issues"
30 | 
31 | [project.optional-dependencies]
32 | dev = [
33 |     "func_argparse",
34 |     "torchaudio",
35 |     "soundfile",
36 |     "pytest",
37 |     "mypy",
38 |     "black",
39 |     "isort",
40 |     "flake8",
41 |     "pre-commit",
42 | ]
43 | 
44 | [tool.setuptools.package-data]
45 | "audioseal" = ["py.typed", "cards/*.yaml"]
46 | 
47 | [tool.flake8]
48 | extend_ignore = ["E", "Y"]  # Black
49 | per-file-ignores = [
50 |     "__init__.py:F401",
51 | ]
52 | 
53 | [tool.isort]
54 | profile = "black"
55 | 
56 | [tool.mypy]
57 | disable_error_code = "type-abstract,typeddict-unknown-key"
58 | disallow_untyped_calls = false
59 | disallow_untyped_defs = false
60 | disallow_untyped_decorators = false
61 | ignore_missing_imports = true
62 | python_version = "3.8"
63 | show_error_codes = true
64 | show_error_context = true
65 | strict = false
66 | warn_unused_configs = false
67 | warn_unused_ignores = false
68 | exclude = ["src/audiocraft", "examples"]
69 | 
70 | [tool.pytest.ini_options]
71 | minversion = "7.1"
72 | testpaths = ["tests"]
73 | filterwarnings = [
74 |     "ignore:Deprecated call to `pkg_resources",
75 |     "ignore:Please use `line_search_wolfe",
76 |     "ignore:Please use `spmatrix",
77 |     "ignore:TypedStorage is deprecated",
78 |     "ignore:distutils Version classes are deprecated",
79 |     "ignore:pkg_resources is deprecated",
80 |     "ignore:torch.nn.utils.weight_norm is deprecated in favor of",
81 | ]
82 | norecursedirs = [
83 |     "examples/*",
84 | ]
85 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to AudioSeal
2 | 
3 | We want to make contributing to AudioSeal as easy as possible. Please make sure
4 | to read this guideline carefully.
5 | 
6 | ## Setting up Development Environment
7 | 
8 | AudioSeal is a lightweight Python library that only relies on PyTorch, Numpy and OmegaConf (for
9 | model card loading). The current minimal PyTorch requirement is 1.13.0, and it is advisable to
10 | keep the constraints on PyTorch as lenient as possible. Please keep both the text file
11 | `requirements.txt` and the project file `pyproject.toml` up-to-date should you change the
12 | third-party library requirements.
13 | 
14 | ```sh
15 | git clone https://github.com/facebookresearch/audioseal.git
16 | ```
17 | 
18 | Then install the package in editable mode with development tools before contributing:
19 | 
20 | ```sh
21 | cd audioseal
22 | pip install -e ".[dev]"
23 | ```
24 | 
25 | Alternatively, you can also install the package and its development tools separately:
26 | 
27 | ```sh
28 | cd audioseal
29 | pip install -e .
30 | pip install -r requirements-dev.txt
31 | ```
32 | 
33 | It is advisable to keep your commits linted and syntax-correct. In AudioSeal we provide a few
34 | [pre-commit](https://pre-commit.com) hooks to support that. Simply install pre-commit:
35 | 
36 | ```sh
37 | pre-commit install
38 | ```
39 | 
40 | ## Pull Requests
41 | 
42 | We actively welcome your pull requests.
43 | 
44 | 1. Fork the repo and create your branch from `main`.
45 | 2. If you've added code that should be tested, add tests.
46 | 3. If you've changed APIs, update the documentation.
47 | 4. Ensure the test suite passes.
48 | 5. Make sure your code lints.
49 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
50 | 
51 | ## Contributor License Agreement ("CLA")
52 | 
53 | In order to accept your pull request, we need you to submit a CLA. You only need
54 | to do this once to work on any of Meta's open source projects.
55 | 
56 | Complete your CLA here: <https://code.facebook.com/cla>
57 | 
58 | ## Issues
59 | 
60 | We use GitHub issues to track public bugs. Please ensure your description is
61 | clear and has sufficient instructions to be able to reproduce the issue.
62 | 
63 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
64 | disclosure of security bugs. In those cases, please go through the process
65 | outlined on that page and do not file a public issue.
66 | 
67 | ## License
68 | 
69 | By contributing to AudioSeal, you agree that your contributions will be licensed
70 | under the LICENSE file in the root directory of this source tree.
71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Sphinx documentation 40 | docs/_build/ 41 | 42 | # PyBuilder 43 | .pybuilder/ 44 | target/ 45 | 46 | # Jupyter Notebook 47 | .ipynb_checkpoints 48 | 49 | # IPython 50 | profile_default/ 51 | ipython_config.py 52 | 53 | # pyenv 54 | # For a library or package, you might want to ignore these files since the code is 55 | # intended to run in multiple environments; otherwise, check them in: 56 | .python-version 57 | 58 | # pipenv 59 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 60 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 61 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 62 | # install all needed dependencies. 63 | Pipfile.lock 64 | 65 | # poetry 66 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 67 | # This is especially recommended for binary packages to ensure reproducibility, and is more 68 | # commonly ignored for libraries. 69 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 70 | poetry.lock 71 | 72 | # pdm 73 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 74 | #pdm.lock 75 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 76 | # in version control. 77 | # https://pdm.fming.dev/#use-with-ide 78 | .pdm.toml 79 | 80 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 81 | __pypackages__/ 82 | 83 | 84 | # mypy 85 | .mypy_cache/ 86 | .dmypy.json 87 | dmypy.json 88 | 89 | # Pyre type checker 90 | .pyre/ 91 | 92 | # pytype static type analyzer 93 | .pytype/ 94 | 95 | # Cython debug symbols 96 | cython_debug/ 97 | 98 | # PyCharm 99 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 100 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 101 | # and can be added to the global gitignore or merged into this file. For a more nuclear 102 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
103 | .idea/ 104 | 105 | # local training outputs 106 | outputs/* 107 | -------------------------------------------------------------------------------- /.github/workflows/lint_and_test.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | branches: ["main"] 11 | 12 | jobs: 13 | linter: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.8"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: "pip" 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt 31 | pip install -r requirements-dev.txt 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: isort 39 | run: isort --check --diff . 40 | - name: mypy 41 | run: mypy --install-types --non-interactive ./ --cache-dir=.mypy_cache/ 42 | 43 | unit_test: 44 | runs-on: ubuntu-latest 45 | timeout-minutes: 20 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | python-version: ["3.8", "3.9", "3.10"] 50 | steps: 51 | - uses: actions/checkout@v3 52 | - name: Set up Python ${{ matrix.python-version }} 53 | uses: actions/setup-python@v3 54 | with: 55 | python-version: ${{ matrix.python-version }} 56 | cache: "pip" 57 | - name: Install dependencies 58 | run: | 59 | sudo apt-get install libsndfile1 60 | python -m pip install --upgrade pip 61 | # We also test that pyproject.toml and requirements*.txt are synced 62 | pip install -r requirements-dev.txt 63 | pip install -e . 64 | - name: pytest_unit 65 | run: pytest -s -v tests/test_models.py 66 | 67 | unit_test_old_torch: 68 | runs-on: ubuntu-latest 69 | timeout-minutes: 20 70 | strategy: 71 | fail-fast: false 72 | matrix: 73 | python-version: ["3.8"] 74 | steps: 75 | - uses: actions/checkout@v3 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v3 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | cache: "pip" 81 | - name: Install dependencies 82 | run: | 83 | sudo apt-get install libsndfile1 84 | python -m pip install --upgrade pip 85 | pip install torch==1.13.0 torchaudio==0.13.0 func_argparse soundfile pytest omegaconf numpy julius 86 | pip install --no-deps -e . 87 | - name: pytest_unit 88 | run: pytest -s -v tests/test_models.py 89 | -------------------------------------------------------------------------------- /src/audioseal/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from dataclasses import dataclass 8 | from typing import Any, Dict, List, Optional 9 | 10 | from torch import device, dtype 11 | from typing_extensions import TypeAlias 12 | 13 | from audioseal.libs import audiocraft 14 | from audioseal.models import AudioSealDetector, AudioSealWM, MsgProcessor 15 | 16 | Device: TypeAlias = device 17 | 18 | DataType: TypeAlias = dtype 19 | 20 | 21 | @dataclass 22 | class SEANetConfig: 23 | """ 24 | Map common hparams of SEANet encoder and decoder. 25 | """ 26 | 27 | channels: int 28 | dimension: int 29 | n_filters: int 30 | n_residual_layers: int 31 | ratios: List[int] 32 | activation: str 33 | activation_params: Dict[str, float] 34 | norm: str 35 | norm_params: Dict[str, Any] 36 | kernel_size: int 37 | last_kernel_size: int 38 | residual_kernel_size: int 39 | dilation_base: int 40 | causal: bool 41 | pad_mode: str 42 | true_skip: bool 43 | compress: int 44 | lstm: int 45 | disable_norm_outer_blocks: int 46 | 47 | 48 | @dataclass 49 | class DecoderConfig: 50 | final_activation: Optional[str] 51 | final_activation_params: Optional[dict] 52 | trim_right_ratio: float 53 | 54 | 55 | @dataclass 56 | class DetectorConfig: 57 | output_dim: int 58 | 59 | 60 | @dataclass 61 | class AudioSealWMConfig: 62 | nbits: int 63 | seanet: SEANetConfig 64 | decoder: DecoderConfig 65 | 66 | 67 | @dataclass 68 | class AudioSealDetectorConfig: 69 | nbits: int 70 | seanet: SEANetConfig 71 | detector: DetectorConfig 72 | 73 | 74 | def create_generator( 75 | config: AudioSealWMConfig, 76 | *, 77 | device: Optional[Device] = None, 78 | dtype: Optional[DataType] = None, 79 | ) -> AudioSealWM: 80 | """Create a generator from hparams""" 81 | 82 | # Currently the encoder hparams are the same as 83 | # SEANet, but this can be changed in the future. 84 | encoder = audiocraft.modules.SEANetEncoder(**config.seanet) # type: ignore[arg-type] 85 | encoder = encoder.to(device=device, dtype=dtype) 86 | 87 | decoder_config = {**config.seanet, **config.decoder} # type: ignore 88 | decoder = audiocraft.modules.SEANetDecoder(**decoder_config) # type: ignore[arg-type] 89 | decoder = decoder.to(device=device, dtype=dtype) 90 | 91 | msgprocessor = MsgProcessor(nbits=config.nbits, hidden_size=config.seanet.dimension) 92 | msgprocessor = msgprocessor.to(device=device, dtype=dtype) 93 | 94 | return AudioSealWM(encoder=encoder, decoder=decoder, msg_processor=msgprocessor) 95 | 96 | 97 | def create_detector( 98 | config: AudioSealDetectorConfig, 99 | *, 100 | device: Optional[Device] = None, 101 | dtype: Optional[DataType] = None, 102 | ) -> AudioSealDetector: 103 | detector_config = {**config.seanet, **config.detector} # type: ignore 104 | detector = AudioSealDetector(nbits=config.nbits, **detector_config) 105 | detector = detector.to(device=device, dtype=dtype) 106 | return detector 107 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /src/audioseal/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | from dataclasses import fields 10 | from hashlib import sha1 11 | from pathlib import Path 12 | from typing import ( # type: ignore[attr-defined] 13 | Any, 14 | Dict, 15 | List, 16 | Optional, 17 | Tuple, 18 | Type, 19 | TypeVar, 20 | Union, 21 | cast, 22 | ) 23 | from urllib.parse import urlparse # noqa: F401 24 | 25 | import torch 26 | from omegaconf import DictConfig, OmegaConf 27 | 28 | from audioseal.builder import ( 29 | AudioSealDetectorConfig, 30 | AudioSealWMConfig, 31 | create_detector, 32 | create_generator, 33 | ) 34 | from audioseal.models import AudioSealDetector, AudioSealWM 35 | 36 | AudioSealT = TypeVar("AudioSealT", AudioSealWMConfig, AudioSealDetectorConfig) 37 | 38 | 39 | class ModelLoadError(RuntimeError): 40 | """Raised when the model loading fails""" 41 | 42 | 43 | def _get_path_from_env(var_name: str) -> Optional[Path]: 44 | pathname = os.getenv(var_name) 45 | if not pathname: 46 | return None 47 | 48 | try: 49 | return Path(pathname) 50 | except ValueError as ex: 51 | raise RuntimeError(f"Expect valid pathname, get '{pathname}'.") from ex 52 | 53 | 54 | def _get_cache_dir(env_names: List[str]): 55 | """Re-use cache dir from a list of existing caches""" 56 | for env in env_names: 57 | cache_dir = _get_path_from_env(env) 58 | if cache_dir: 59 | break 60 | else: 61 | cache_dir = Path("~/.cache").expanduser().resolve() 62 | 63 | # Create a sub-dir to not mess up with existing caches 64 | cache_dir = cache_dir / "audioseal" 65 | cache_dir.mkdir(exist_ok=True, parents=True) 66 | 67 | return cache_dir 68 | 69 | 70 | def load_model_checkpoint( 71 | model_path: Union[Path, str], 72 | device: Union[str, torch.device] = "cpu", 73 | ): 74 | if Path(model_path).is_file(): 75 | return torch.load(model_path, map_location=device) 76 | 77 | cache_dir = _get_cache_dir( 78 | ["AUDIOSEAL_CACHE_DIR", "AUDIOCRAFT_CACHE_DIR", "XDG_CACHE_HOME"] 79 | ) 80 | parts = urlparse(str(model_path)) 81 | if parts.scheme == "https": 82 | 83 | # TODO: Add HF Hub 84 | hash_ = sha1(parts.path.encode()).hexdigest()[:24] 85 | return torch.hub.load_state_dict_from_url( 86 | str(model_path), model_dir=cache_dir, map_location=device, file_name=hash_ 87 | ) 88 | else: 89 | raise ModelLoadError(f"Path or uri {model_path} is unknown or does not exist") 90 | 91 | 92 | def load_local_model_config(model_card: str) -> Optional[DictConfig]: 93 | config_file = Path(__file__).parent / "cards" / (model_card + ".yaml") 94 | if Path(config_file).is_file(): 95 | return cast(DictConfig, OmegaConf.load(config_file.resolve())) 96 | else: 97 | return None 98 | 99 | 100 | class AudioSeal: 101 | 102 | @staticmethod 103 | def _parse_model( 104 | model_card_or_path: str, 105 | model_type: Type[AudioSealT], 106 | nbits: 
Optional[int] = None,
107 |     ) -> Tuple[Dict[str, Any], AudioSealT]:
108 |         """
109 |         Parse the information from the model card or checkpoint path, using
110 |         the schema `model_type` that defines the model type
111 |         """
112 |         # Get the raw checkpoint and config from the local model cards
113 |         config = load_local_model_config(model_card_or_path)
114 | 
115 |         if config:
116 |             assert "checkpoint" in config, f"Checkpoint missing in {model_card_or_path}"
117 |             config_dict = OmegaConf.to_container(config)
118 |             assert isinstance(
119 |                 config_dict, dict
120 |             ), f"Cannot parse config from {model_card_or_path}"
121 |             checkpoint = config_dict.pop("checkpoint")
122 |             checkpoint = load_model_checkpoint(checkpoint)
123 | 
124 |         # Get the raw checkpoint and config from the checkpoint path
125 |         else:
126 |             config_dict = {}
127 |             checkpoint = load_model_checkpoint(model_card_or_path)
128 | 
129 |         # If the checkpoint has a config in it, use it, but give precedence
130 |         # to the info in the model card
131 |         assert isinstance(
132 |             checkpoint, dict
133 |         ), f"Expect loaded checkpoint to be a dictionary, got {type(checkpoint)}"
134 |         assert isinstance(
135 |             config_dict, dict
136 |         ), f"Expect loaded config to be a dictionary, got {type(config_dict)}"
137 |         if "xp.cfg" in checkpoint:
138 |             config = {**checkpoint["xp.cfg"], **config_dict}  # type: ignore
139 |         assert config is not None
140 |         assert (
141 |             "seanet" in config
142 |         ), f"missing seanet backbone config in {model_card_or_path}"
143 | 
144 |         # Patch 1: Resolve the variables in the checkpoint
145 |         config = OmegaConf.create(config)
146 |         OmegaConf.resolve(config)
147 |         config = OmegaConf.to_container(config)  # type: ignore
148 | 
149 |         # Patch 2: Put decoder, encoder and detector outside seanet
150 |         seanet_config = config["seanet"]
151 |         for key_to_patch in ["encoder", "decoder", "detector"]:
152 |             if key_to_patch in seanet_config:
153 |                 config_to_patch = config.get(key_to_patch) or {}
154 |                 config[key_to_patch] = {
155 |                     **config_to_patch,
156 |                     **seanet_config.pop(key_to_patch),
157 |                 }
158 | 
159 |         config["seanet"] = seanet_config
160 | 
161 |         # Patch 3: Put nbits into config if specified
162 |         if nbits and "nbits" not in config:
163 |             config["nbits"] = nbits
164 | 
165 |         if "model" in checkpoint:
166 |             checkpoint = checkpoint["model"]
167 | 
168 |         # remove attributes not related to the model_type
169 |         result_config = {}
170 |         assert config, f"Empty config in {model_card_or_path}"
171 |         for field in fields(model_type):
172 |             if field.name in config:
173 |                 result_config[field.name] = config[field.name]
174 | 
175 |         schema = OmegaConf.structured(model_type)
176 |         schema.merge_with(result_config)
177 |         return checkpoint, schema
178 | 
179 |     @staticmethod
180 |     def load_generator(
181 |         model_card_or_path: str,
182 |         nbits: Optional[int] = None,
183 |     ) -> AudioSealWM:
184 |         """Load the AudioSeal generator from a model card or checkpoint path"""
185 |         checkpoint, config = AudioSeal._parse_model(
186 |             model_card_or_path, AudioSealWMConfig, nbits=nbits,
187 |         )
188 | 
189 |         model = create_generator(config)
190 |         model.load_state_dict(checkpoint)
191 |         return model
192 | 
193 |     @staticmethod
194 |     def load_detector(
195 |         model_card_or_path: str,
196 |         nbits: Optional[int] = None,
197 |     ) -> AudioSealDetector:
198 |         """Load the AudioSeal detector from a model card or checkpoint path"""
199 |         checkpoint, config = AudioSeal._parse_model(
200 |             model_card_or_path, AudioSealDetectorConfig, nbits=nbits,
201 |         )
202 |         model = create_detector(config)
203 |         model.load_state_dict(checkpoint)
204 |         return model
205 | 
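As a quick illustration of the two loading paths handled above, here is a minimal usage sketch (the model card name comes from `src/audioseal/cards`; the local checkpoint path is hypothetical):

```python
from audioseal import AudioSeal

# 1) Load via a bundled model card: the YAML in src/audioseal/cards is parsed and
#    the checkpoint it points to is downloaded and cached (under AUDIOSEAL_CACHE_DIR,
#    AUDIOCRAFT_CACHE_DIR, or XDG_CACHE_HOME, falling back to ~/.cache/audioseal).
generator = AudioSeal.load_generator("audioseal_wm_16bits")

# 2) Load from a local checkpoint file (hypothetical path); nbits supplies the
#    message size if the checkpoint config does not already carry it.
# generator = AudioSeal.load_generator("/path/to/generator.pth", nbits=16)
```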
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # :loud_sound: AudioSeal: Proactive Localized Watermarking
2 | 
3 | Python
4 | Code style: black
5 | 
6 | Inference code for AudioSeal, a method for speech localized watermarking, with state-of-the-art robustness and detector speed (training code coming soon).
7 | More details can be found in the [paper](https://arxiv.org/abs/2401.17264).
8 | 
9 | [[`arXiv`](https://arxiv.org/abs/2401.17264)]
10 | [[`Colab notebook`](https://colab.research.google.com/github/facebookresearch/audioseal/blob/master/examples/colab.ipynb)][[🤗`Hugging Face`](https://huggingface.co/facebook/audioseal)]
11 | 
12 | ![fig](https://github.com/facebookresearch/audioseal/assets/1453243/5d8cd96f-47b5-4c34-a3fa-7af386ed59f2)
13 | 
14 | # Updates:
15 | 
16 | - 2024-04-02: We have updated our license to the full MIT license (including the license for the model weights)! You can now use AudioSeal in commercial applications too!
17 | - 2024-02-29: AudioSeal 0.1.2 is out, with more bug fixes for resampled audios and updated notebooks
18 | 
19 | # Abstract
20 | 
21 | We introduce AudioSeal, a method for speech localized watermarking, with state-of-the-art robustness and detector speed. It jointly trains a generator that embeds a watermark in the audio, and a detector that detects the watermarked fragments in longer audios, even in the presence of editing.
22 | AudioSeal achieves state-of-the-art detection performance of both natural and synthetic speech at the sample level (1/16k second resolution), generates limited alteration of signal quality, and is robust to many types of audio editing.
23 | AudioSeal is designed with a fast, single-pass detector that significantly surpasses existing models in speed, achieving detection up to two orders of magnitude faster and making it ideal for large-scale and real-time applications.
24 | 
25 | # :mate: Installation
26 | 
27 | AudioSeal requires Python >= 3.8, PyTorch >= 1.13.0, [omegaconf](https://omegaconf.readthedocs.io/), [julius](https://pypi.org/project/julius/), and numpy. To install from PyPI:
28 | 
29 | ```
30 | pip install audioseal
31 | ```
32 | 
33 | To install from source: clone this repo and install in editable mode:
34 | 
35 | ```
36 | git clone https://github.com/facebookresearch/audioseal
37 | cd audioseal
38 | pip install -e .
39 | ```
40 | 
41 | # :gear: Models
42 | 
43 | You can find all the model checkpoints on the [Hugging Face Hub](https://huggingface.co/facebook/audioseal). We provide the checkpoints for the following models:
44 | 
45 | - [AudioSeal Generator](src/audioseal/cards/audioseal_wm_16bits.yaml).
46 |   It takes as input an audio signal (as a waveform), and outputs a watermark of the same size as the input, that can be added to the input to watermark it.
47 |   Optionally, it can also take as input a 16-bit secret message that will be encoded in the watermark.
48 | - [AudioSeal Detector](src/audioseal/cards/audioseal_detector_16bits.yaml).
49 |   It takes as input an audio signal (as a waveform), and outputs a probability that the input contains a watermark at each sample of the audio (every 1/16k s).
50 |   Optionally, it may also output the secret message encoded in the watermark.
51 | 
52 | Note that the message is optional and has no influence on the detection output. It may be used to identify a model version for instance (up to $2^{16}=65536$ possible choices).
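For example, a version identifier can be packed into the 16-bit message before watermarking (an illustrative sketch, not a library helper; the version number and the little-endian bit layout are choices made here):

```python
import torch

version = 3  # hypothetical model-version identifier; must fit in 16 bits
bits = [(version >> i) & 1 for i in range(16)]  # little-endian bit layout
msg = torch.tensor(bits, dtype=torch.int32).unsqueeze(0)  # shape: (batch=1, nbits=16)
# msg can then be passed to the generator, e.g. model.get_watermark(wav, sr, message=msg)
```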
53 | 
54 | **Note**: We are working to release the training code for anyone who wants to build their own watermarker. Stay tuned!
55 | 
56 | # :abacus: Usage
57 | 
58 | AudioSeal provides a simple API to watermark audio samples and detect the watermarks in them. Example usage:
59 | 
60 | ```python
61 | 
62 | from audioseal import AudioSeal
63 | 
64 | # model name corresponds to the YAML card file name found in audioseal/cards
65 | model = AudioSeal.load_generator("audioseal_wm_16bits")
66 | 
67 | # Another way is to load directly from a checkpoint path
68 | # model = AudioSeal.load_generator(checkpoint_path, nbits=16)
69 | 
70 | # a torch tensor of shape (batch, channels, samples) and a sample rate
71 | # It is important to process the audio at the same sample rate as the model
72 | # expects. In our case, we support 16 kHz audio
73 | wav, sr = ..., 16000
74 | 
75 | watermark = model.get_watermark(wav, sr)
76 | 
77 | # Optional: you can add a 16-bit message to embed in the watermark
78 | # msg = torch.randint(0, 2, (wav.shape[0], model.msg_processor.nbits), device=wav.device)
79 | # watermark = model.get_watermark(wav, sr, message=msg)
80 | 
81 | watermarked_audio = wav + watermark
82 | 
83 | detector = AudioSeal.load_detector("audioseal_detector_16bits")
84 | 
85 | # To detect the message at the high level:
86 | result, message = detector.detect_watermark(watermarked_audio, sr)
87 | 
88 | print(result)  # result is a float indicating the probability of the audio being watermarked
89 | print(message)  # message is a binary vector of 16 bits
90 | 
91 | 
92 | # To detect the message at the low level:
93 | result, message = detector(watermarked_audio, sr)
94 | 
95 | # result is a tensor of size batch x 2 x frames, indicating the probability (positive and negative) of watermarking for each frame
96 | # A watermarked audio should have result[:, 1, :] > 0.5
97 | print(result[:, 1, :])
98 | 
99 | # message is a tensor of size batch x 16, indicating the probability of each bit being 1.
100 | # message will be a random tensor if the detector detects no watermark in the audio
101 | print(message)
102 | ```
103 | 
104 | # Want to contribute?
105 | 
106 | We welcome Pull Requests with improvements or suggestions.
107 | If you want to flag an issue or propose an improvement, but don't know how to realize it, create a GitHub Issue.
108 | 
109 | # Troubleshooting
110 | 
111 | - If you encounter the error `ValueError: not enough values to unpack (expected 3, got 2)`, this is because we expect a batch of audio tensors as inputs. Add one
112 |   dummy batch dimension to your input (e.g. `wav.unsqueeze(0)`, see the [example notebook for getting started](examples/colab.ipynb)).
113 | 
114 | - On Windows machines, if you encounter the error `KeyError raised while resolving interpolation: "Environment variable 'USER' not found"`: this is due to an old checkpoint
115 |   uploaded to the model hub, which is not compatible with Windows. Try to invalidate the cache by removing the files in `C:\Users\<username>\.cache\audioseal`
116 |   and re-run again.
117 | 
118 | - If you use torchaudio to handle your audios and encounter the error `Couldn't find appropriate backend to handle uri ...`, this is because newer versions of
119 |   torchaudio do not handle the default backend well. Either downgrade your torchaudio to `2.0.1` or earlier, or install `soundfile` as your audio backend.
120 | 
121 | # License
122 | 
123 | - The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
124 | 
125 | # Maintainers:
126 | - [Tuan Tran](https://github.com/antoine-tran)
127 | - [Hady Elsahar](https://github.com/hadyelsahar)
128 | - [Pierre Fernandez](https://github.com/pierrefdz)
129 | - [Robin San Roman](https://github.com/robinsrm)
130 | 
131 | # Citation
132 | 
133 | If you find this repository useful, please consider giving a star :star: and please cite as:
134 | 
135 | ```
136 | @article{sanroman2024proactive,
137 |   title={Proactive Detection of Voice Cloning with Localized Watermarking},
138 |   author={San Roman, Robin and Fernandez, Pierre and Elsahar, Hady and Défossez, Alexandre and Furon, Teddy and Tran, Tuan},
139 |   journal={arXiv preprint},
140 |   year={2024}
141 | }
142 | ```
--------------------------------------------------------------------------------
/src/audioseal/models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import logging
8 | from typing import Optional, Tuple
9 | 
10 | import julius
11 | import torch
12 | 
13 | from audioseal.libs.audiocraft.modules.seanet import SEANetEncoderKeepDimension
14 | 
15 | logger = logging.getLogger("Audioseal")
16 | 
17 | COMPATIBLE_WARNING = """
18 | AudioSeal is designed to work at a sample rate of 16 kHz.
19 | Implicit sampling rate usage is deprecated and will be removed in a future version.
20 | To remove this warning, please add this argument to the function call:
21 | sample_rate = your_sample_rate
22 | """
23 | 
24 | 
25 | class MsgProcessor(torch.nn.Module):
26 |     """
27 |     Apply the secret message to the encoder output.
28 |     Args:
29 |         nbits: Number of bits used to generate the message. Must be non-zero
30 |         hidden_size: Dimension of the encoder output
31 |     """
32 | 
33 |     def __init__(self, nbits: int, hidden_size: int):
34 |         super().__init__()
35 |         assert nbits > 0, "MsgProcessor should not be built in 0bit watermarking"
36 |         self.nbits = nbits
37 |         self.hidden_size = hidden_size
38 |         self.msg_processor = torch.nn.Embedding(2 * nbits, hidden_size)
39 | 
40 |     def forward(self, hidden: torch.Tensor, msg: torch.Tensor) -> torch.Tensor:
41 |         """
42 |         Gather k embeddings from a 2k x h table using the message bits, then sum over the k dimension
43 |         Args:
44 |             hidden: The encoder output, size: batch x hidden x frames
45 |             msg: The secret message, size: batch x k
46 |         """
47 |         # create indices to take from embedding layer
48 |         indices = 2 * torch.arange(msg.shape[-1]).to(msg.device)  # k: 0 2 4 ... 2k
49 |         indices = indices.repeat(msg.shape[0], 1)  # b x k
50 |         indices = (indices + msg).long()
51 |         msg_aux = self.msg_processor(indices)  # b x k -> b x k x h
52 |         msg_aux = msg_aux.sum(dim=-2)  # b x k x h -> b x h
53 |         msg_aux = msg_aux.unsqueeze(-1).repeat(
54 |             1, 1, hidden.shape[2]
55 |         )  # b x h -> b x h x t/f
56 |         hidden = hidden + msg_aux  # -> b x h x t/f
57 |         return hidden
58 | 
59 | 
60 | class AudioSealWM(torch.nn.Module):
61 |     """
62 |     Generate a watermark for a given audio signal
63 |     """
64 | 
65 |     def __init__(
66 |         self,
67 |         encoder: torch.nn.Module,
68 |         decoder: torch.nn.Module,
69 |         msg_processor: Optional[torch.nn.Module] = None,
70 |     ):
71 |         super().__init__()
72 |         self.encoder = encoder
73 |         self.decoder = decoder
74 |         # The builder should take care of validating the dimensions between components
75 |         self.msg_processor = msg_processor
76 |         self._message: Optional[torch.Tensor] = None
77 | 
78 |     @property
79 |     def message(self) -> Optional[torch.Tensor]:
80 |         return self._message
81 | 
82 |     @message.setter
83 |     def message(self, message: torch.Tensor) -> None:
84 |         self._message = message
85 | 
86 |     def get_watermark(
87 |         self,
88 |         x: torch.Tensor,
89 |         sample_rate: Optional[int] = None,
90 |         message: Optional[torch.Tensor] = None,
91 |     ) -> torch.Tensor:
92 |         """
93 |         Get the watermark from an audio tensor and a message.
94 |         If the input message is None, a random message of
95 |         n bits {0,1} will be generated.
96 |         Args:
97 |             x: Audio signal, size: batch x frames
98 |             sample_rate: The sample rate of the input audio (default 16khz as
99 |                 currently supported by the main AudioSeal model)
100 |             message: An optional binary message, size: batch x k
101 |         """
102 |         length = x.size(-1)
103 |         if sample_rate is None:
104 |             logger.warning(COMPATIBLE_WARNING)
105 |             sample_rate = 16_000
106 |         assert sample_rate
107 |         if sample_rate != 16000:
108 |             x = julius.resample_frac(x, old_sr=sample_rate, new_sr=16000)
109 |         hidden = self.encoder(x)
110 | 
111 |         if self.msg_processor is not None:
112 |             if message is None:
113 |                 if self.message is None:
114 |                     self.message = torch.randint(
115 |                         0, 2, (x.shape[0], self.msg_processor.nbits), device=x.device
116 |                     )
117 |                 message = self.message
118 | 
119 |             hidden = self.msg_processor(hidden, message)
120 | 
121 |         watermark = self.decoder(hidden)
122 | 
123 |         if sample_rate != 16000:
124 |             watermark = julius.resample_frac(watermark, old_sr=16000, new_sr=sample_rate)
125 | 
126 |         return watermark[
127 |             ..., :length
128 |         ]  # trim output, cf. encodec codebase
129 | 
130 |     def forward(
131 |         self,
132 |         x: torch.Tensor,
133 |         sample_rate: Optional[int] = None,
134 |         message: Optional[torch.Tensor] = None,
135 |         alpha: float = 1.0,
136 |     ) -> torch.Tensor:
137 |         """Apply the watermark to the audio signal x, scaled by a tune-down ratio alpha (default 1.0)"""
138 |         if sample_rate is None:
139 |             logger.warning(COMPATIBLE_WARNING)
140 |             sample_rate = 16_000
141 |         wm = self.get_watermark(x, sample_rate=sample_rate, message=message)
142 |         return x + alpha * wm
143 | 
144 | 
145 | class AudioSealDetector(torch.nn.Module):
146 |     """
147 |     Detect the watermark in an audio signal
148 |     Args:
149 |         *args, **kwargs: Arguments passed to the SEANetEncoderKeepDimension backbone
150 |         nbits (int): The number of bits in the secret message. The result will have size
151 |             of 2 + nbits, where the first two items indicate the possibilities of the
152 |             audio being watermarked (positive / negative scores), the rest is used to decode
153 |             the secret message. In 0bit watermarking (no secret message), the detector just
154 |             returns 2 values.
155 |     """
156 | 
157 |     def __init__(self, *args, nbits: int = 0, **kwargs):
158 |         super().__init__()
159 |         encoder = SEANetEncoderKeepDimension(*args, **kwargs)
160 |         last_layer = torch.nn.Conv1d(encoder.output_dim, 2 + nbits, 1)
161 |         self.detector = torch.nn.Sequential(encoder, last_layer)
162 |         self.nbits = nbits
163 | 
164 |     def detect_watermark(
165 |         self,
166 |         x: torch.Tensor,
167 |         sample_rate: Optional[int] = None,
168 |         message_threshold: float = 0.5
169 |     ) -> Tuple[float, torch.Tensor]:
170 |         """
171 |         A convenience function that returns the probability of an audio being watermarked,
172 |         together with its message in n-bit (binary) format. If the audio is not watermarked,
173 |         the message will be random.
174 |         Args:
175 |             x: Audio signal, size: batch x frames
176 |             sample_rate: The sample rate of the input audio
177 |             message_threshold: threshold used to convert the watermark output (probability
178 |                 of each bit being 0 or 1) into the binary n-bit message.
179 |         """
180 |         if sample_rate is None:
181 |             logger.warning(COMPATIBLE_WARNING)
182 |             sample_rate = 16_000
183 |         result, message = self.forward(x, sample_rate=sample_rate)  # b x 2+nbits
184 |         detected = torch.count_nonzero(torch.gt(result[:, 1, :], 0.5)) / result.shape[-1]
185 |         detect_prob = detected.cpu().item()  # type: ignore
186 |         message = torch.gt(message, message_threshold).int()
187 |         return detect_prob, message
188 | 
189 |     def decode_message(self, result: torch.Tensor) -> torch.Tensor:
190 |         """
191 |         Decode the message from the watermark result (batch x nbits x frames)
192 |         Args:
193 |             result: watermark result (batch x nbits x frames)
194 |         Returns:
195 |             The message of size batch x nbits, indicating the probability of 1 for each bit
196 |         """
197 |         assert (
198 |             (result.dim() > 2 and result.shape[1] == self.nbits) or
199 |             (result.dim() == 2 and result.shape[0] == self.nbits)
200 |         ), f"Expect message of size [, {self.nbits}, frames] (got {result.size()})"
201 |         decoded_message = result.mean(dim=-1)
202 |         return torch.sigmoid(decoded_message)
203 | 
204 |     def forward(
205 |         self,
206 |         x: torch.Tensor,
207 |         sample_rate: Optional[int] = None,
208 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
209 |         """
210 |         Detect the watermarks from the audio signal
211 |         Args:
212 |             x: Audio signal, size batch x frames
213 |             sample_rate: The sample rate of the input audio
214 |         """
215 |         if sample_rate is None:
216 |             logger.warning(COMPATIBLE_WARNING)
217 |             sample_rate = 16_000
218 |         assert sample_rate
219 |         if sample_rate != 16000:
220 |             x = julius.resample_frac(x, old_sr=sample_rate, new_sr=16000)
221 |         result = self.detector(x)  # b x 2+nbits
222 |         # hardcode softmax on the 2 first units used for detection
223 |         result[:, :2, :] = torch.softmax(result[:, :2, :], dim=1)
224 |         message = self.decode_message(result[:, 2:, :])
225 |         return result[:, :2, :], message
--------------------------------------------------------------------------------
/examples/attacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import typing as tp
8 | 
9 | import julius
10 | import torch
11 | 
12 | 
13 | def generate_pink_noise(length: int) -> torch.Tensor:
14 |     """
15 |     Generate pink noise using the Voss-McCartney algorithm with PyTorch.
16 |     """
17 |     num_rows = 16
18 |     array = torch.randn(num_rows, length // num_rows + 1)
19 |     reshaped_array = torch.cumsum(array, dim=1)
20 |     reshaped_array = reshaped_array.reshape(-1)
21 |     reshaped_array = reshaped_array[:length]
22 |     # Normalize
23 |     pink_noise = reshaped_array / torch.max(torch.abs(reshaped_array))
24 |     return pink_noise
25 | 
26 | 
27 | def audio_effect_return(
28 |     tensor: torch.Tensor, mask: tp.Optional[torch.Tensor]
29 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
30 |     """Return the mask if it was in the input, otherwise only the output tensor"""
31 |     if mask is None:
32 |         return tensor
33 |     else:
34 |         return tensor, mask
35 | 
36 | 
37 | class AudioEffects:
38 |     @staticmethod
39 |     def speed(
40 |         tensor: torch.Tensor,
41 |         speed_range: tuple = (0.5, 1.5),
42 |         sample_rate: int = 16000,
43 |         mask: tp.Optional[torch.Tensor] = None,
44 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
45 |         """
46 |         Function to change the speed of a batch of audio data.
47 |         The output will have a different length!
48 | 
49 |         Parameters:
50 |             tensor (torch.Tensor): The batch of audio data as a torch tensor.
51 |             speed_range (tuple): The range from which the speed factor is sampled.
52 | 
53 |         Returns:
54 |             torch.Tensor: The batch of audio data with the speed changed.
55 |         """
56 |         speed = torch.FloatTensor(1).uniform_(*speed_range)
57 |         new_sr = int(sample_rate * 1 / speed)
58 |         resampled_tensor = julius.resample_frac(tensor, sample_rate, new_sr)
59 |         if mask is None:
60 |             return resampled_tensor
61 |         else:
62 |             return resampled_tensor, torch.nn.functional.interpolate(
63 |                 mask, size=resampled_tensor.size(-1), mode="nearest-exact"
64 |             )
65 | 
66 |     @staticmethod
67 |     def updownresample(
68 |         tensor: torch.Tensor,
69 |         sample_rate: int = 16000,
70 |         intermediate_freq: int = 32000,
71 |         mask: tp.Optional[torch.Tensor] = None,
72 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
73 | 
74 |         orig_shape = tensor.shape
75 |         # upsample
76 |         tensor = julius.resample_frac(tensor, sample_rate, intermediate_freq)
77 |         # downsample
78 |         tensor = julius.resample_frac(tensor, intermediate_freq, sample_rate)
79 | 
80 |         assert tensor.shape == orig_shape
81 |         return audio_effect_return(tensor=tensor, mask=mask)
82 | 
83 |     @staticmethod
84 |     def echo(
85 |         tensor: torch.Tensor,
86 |         volume_range: tuple = (0.1, 0.5),
87 |         duration_range: tuple = (0.1, 0.5),
88 |         sample_rate: int = 16000,
89 |         mask: tp.Optional[torch.Tensor] = None,
90 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
91 |         """
92 |         Attenuate the audio volume by a random factor, delay it by a random duration,
93 |         and then overlay it with the original.
94 | 
95 |         :param tensor: 3D Tensor representing the audio signal [bsz, channels, frames]
96 |         :param volume_range, duration_range: ranges for the echo volume and delay (in seconds)
97 |         :param sample_rate: Sample rate of the audio signal.
98 |         :return: Audio signal with reverb.
99 | """ 100 | 101 | # Create a simple impulse response 102 | # Duration of the impulse response in seconds 103 | duration = torch.FloatTensor(1).uniform_(*duration_range) 104 | volume = torch.FloatTensor(1).uniform_(*volume_range) 105 | 106 | n_samples = int(sample_rate * duration) 107 | impulse_response = torch.zeros(n_samples).type(tensor.type()).to(tensor.device) 108 | 109 | # Define a few reflections with decreasing amplitude 110 | impulse_response[0] = 1.0 # Direct sound 111 | 112 | impulse_response[ 113 | int(sample_rate * duration) - 1 114 | ] = volume # First reflection after 100ms 115 | 116 | # Add batch and channel dimensions to the impulse response 117 | impulse_response = impulse_response.unsqueeze(0).unsqueeze(0) 118 | 119 | # Convolve the audio signal with the impulse response 120 | reverbed_signal = julius.fft_conv1d(tensor, impulse_response) 121 | 122 | # Normalize to the original amplitude range for stability 123 | reverbed_signal = ( 124 | reverbed_signal 125 | / torch.max(torch.abs(reverbed_signal)) 126 | * torch.max(torch.abs(tensor)) 127 | ) 128 | 129 | # Ensure tensor size is not changed 130 | tmp = torch.zeros_like(tensor) 131 | tmp[..., : reverbed_signal.shape[-1]] = reverbed_signal 132 | reverbed_signal = tmp 133 | 134 | return audio_effect_return(tensor=reverbed_signal, mask=mask) 135 | 136 | @staticmethod 137 | def random_noise( 138 | waveform: torch.Tensor, 139 | noise_std: float = 0.001, 140 | mask: tp.Optional[torch.Tensor] = None, 141 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 142 | """Add Gaussian noise to the waveform.""" 143 | noise = torch.randn_like(waveform) * noise_std 144 | noisy_waveform = waveform + noise 145 | return audio_effect_return(tensor=noisy_waveform, mask=mask) 146 | 147 | @staticmethod 148 | def pink_noise( 149 | waveform: torch.Tensor, 150 | noise_std: float = 0.01, 151 | mask: tp.Optional[torch.Tensor] = None, 152 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 153 | """Add pink background noise to the waveform.""" 154 | noise = generate_pink_noise(waveform.shape[-1]) * noise_std 155 | noise = noise.to(waveform.device) 156 | # Assuming waveform is of shape (bsz, channels, length) 157 | noisy_waveform = waveform + noise.unsqueeze(0).unsqueeze(0).to(waveform.device) 158 | return audio_effect_return(tensor=noisy_waveform, mask=mask) 159 | 160 | @staticmethod 161 | def lowpass_filter( 162 | waveform: torch.Tensor, 163 | cutoff_freq: float = 5000, 164 | sample_rate: int = 16000, 165 | mask: tp.Optional[torch.Tensor] = None, 166 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 167 | 168 | return audio_effect_return( 169 | tensor=julius.lowpass_filter(waveform, cutoff=cutoff_freq / sample_rate), 170 | mask=mask, 171 | ) 172 | 173 | @staticmethod 174 | def highpass_filter( 175 | waveform: torch.Tensor, 176 | cutoff_freq: float = 500, 177 | sample_rate: int = 16000, 178 | mask: tp.Optional[torch.Tensor] = None, 179 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 180 | 181 | return audio_effect_return( 182 | tensor=julius.highpass_filter(waveform, cutoff=cutoff_freq / sample_rate), 183 | mask=mask, 184 | ) 185 | 186 | @staticmethod 187 | def bandpass_filter( 188 | waveform: torch.Tensor, 189 | cutoff_freq_low: float = 300, 190 | cutoff_freq_high: float = 8000, 191 | sample_rate: int = 16000, 192 | mask: tp.Optional[torch.Tensor] = None, 193 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 194 | """ 195 | Apply a bandpass filter to the 
waveform by cascading 196 | a high-pass filter followed by a low-pass filter. 197 | 198 | Parameters: 199 | - waveform (torch.Tensor): Input audio waveform. 200 | - low_cutoff (float): Lower cutoff frequency. 201 | - high_cutoff (float): Higher cutoff frequency. 202 | - sample_rate (int): The sample rate of the waveform. 203 | 204 | Returns: 205 | - torch.Tensor: Filtered audio waveform. 206 | """ 207 | 208 | return audio_effect_return( 209 | tensor=julius.bandpass_filter( 210 | waveform, 211 | cutoff_low=cutoff_freq_low / sample_rate, 212 | cutoff_high=cutoff_freq_high / sample_rate, 213 | ), 214 | mask=mask, 215 | ) 216 | 217 | @staticmethod 218 | def smooth( 219 | tensor: torch.Tensor, 220 | window_size_range: tuple = (2, 10), 221 | mask: tp.Optional[torch.Tensor] = None, 222 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 223 | """ 224 | Smooths the input tensor (audio signal) using a moving average filter with the given window size. 225 | 226 | Parameters: 227 | - tensor (torch.Tensor): Input audio tensor. Assumes tensor shape is (batch_size, channels, time). 228 | - window_size (int): Size of the moving average window. 229 | 230 | Returns: 231 | - torch.Tensor: Smoothed audio tensor. 232 | """ 233 | 234 | window_size = int(torch.FloatTensor(1).uniform_(*window_size_range)) 235 | # Create a uniform smoothing kernel 236 | kernel = torch.ones(1, 1, window_size).type(tensor.type()) / window_size 237 | kernel = kernel.to(tensor.device) 238 | 239 | smoothed = julius.fft_conv1d(tensor, kernel) 240 | # Ensure tensor size is not changed 241 | tmp = torch.zeros_like(tensor) 242 | tmp[..., : smoothed.shape[-1]] = smoothed 243 | smoothed = tmp 244 | 245 | return audio_effect_return(tensor=smoothed, mask=mask) 246 | 247 | @staticmethod 248 | def boost_audio( 249 | tensor: torch.Tensor, 250 | amount: float = 20, 251 | mask: tp.Optional[torch.Tensor] = None, 252 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 253 | return audio_effect_return(tensor=tensor * (1 + amount / 100), mask=mask) 254 | 255 | @staticmethod 256 | def duck_audio( 257 | tensor: torch.Tensor, 258 | amount: float = 20, 259 | mask: tp.Optional[torch.Tensor] = None, 260 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 261 | return audio_effect_return(tensor=tensor * (1 - amount / 100), mask=mask) 262 | 263 | @staticmethod 264 | def identity( 265 | tensor: torch.Tensor, mask: tp.Optional[torch.Tensor] = None 266 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 267 | return audio_effect_return(tensor=tensor, mask=mask) 268 | 269 | @staticmethod 270 | def shush( 271 | tensor: torch.Tensor, 272 | fraction: float = 0.001, 273 | mask: tp.Optional[torch.Tensor] = None 274 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 275 | """ 276 | Sets a specified chronological fraction of indices of the input tensor (audio signal) to 0. 277 | 278 | Parameters: 279 | - tensor (torch.Tensor): Input audio tensor. Assumes tensor shape is (batch_size, channels, time). 280 | - fraction (float): Fraction of indices to be set to 0 (from the start of the tensor) (default: 0.001, i.e, 0.1%) 281 | 282 | Returns: 283 | - torch.Tensor: Transformed audio tensor. 
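
        A worked example: with fraction=0.1 on a one-second 16 kHz input
        (16000 samples), the first int(0.1 * 16000) = 1600 samples are zeroed.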
284 | """ 285 | time = tensor.size(-1) 286 | shush_tensor = tensor.detach().clone() 287 | 288 | # Set the first `fraction*time` indices of the waveform to 0 289 | shush_tensor[:, :, :int(fraction*time)] = 0.0 290 | 291 | return audio_effect_return(tensor=shush_tensor, mask=mask) 292 | -------------------------------------------------------------------------------- /src/audioseal/libs/audiocraft/modules/conv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | # Vendor from https://github.com/facebookresearch/audiocraft 8 | 9 | import math 10 | import typing as tp 11 | import warnings 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import functional as F 16 | from torch.nn.utils import spectral_norm 17 | 18 | try: 19 | from torch.nn.utils.parametrizations import weight_norm 20 | except ImportError: 21 | # Old Pytorch 22 | from torch.nn.utils import weight_norm 23 | 24 | 25 | CONV_NORMALIZATIONS = frozenset( 26 | ["none", "weight_norm", "spectral_norm", "time_group_norm"] 27 | ) 28 | 29 | 30 | def apply_parametrization_norm(module: nn.Module, norm: str = "none"): 31 | assert norm in CONV_NORMALIZATIONS 32 | if norm == "weight_norm": 33 | return weight_norm(module) 34 | elif norm == "spectral_norm": 35 | return spectral_norm(module) 36 | else: 37 | # We already check was in CONV_NORMALIZATION, so any other choice 38 | # doesn't need reparametrization. 39 | return module 40 | 41 | 42 | def get_norm_module( 43 | module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs 44 | ): 45 | """Return the proper normalization module. If causal is True, this will ensure the returned 46 | module is causal, or return an error if the normalization doesn't support causal evaluation. 47 | """ 48 | assert norm in CONV_NORMALIZATIONS 49 | if norm == "time_group_norm": 50 | if causal: 51 | raise ValueError("GroupNorm doesn't support causal evaluation.") 52 | assert isinstance(module, nn.modules.conv._ConvNd) 53 | return nn.GroupNorm(1, module.out_channels, **norm_kwargs) 54 | else: 55 | return nn.Identity() 56 | 57 | 58 | def get_extra_padding_for_conv1d( 59 | x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 60 | ) -> int: 61 | """See `pad_for_conv1d`.""" 62 | length = x.shape[-1] 63 | n_frames = (length - kernel_size + padding_total) / stride + 1 64 | ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) 65 | return ideal_length - length 66 | 67 | 68 | def pad_for_conv1d( 69 | x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 70 | ): 71 | """Pad for a convolution to make sure that the last window is full. 72 | Extra padding is added at the end. This is required to ensure that we can rebuild 73 | an output of the same length, as otherwise, even with padding, some time steps 74 | might get removed. 75 | For instance, with total padding = 4, kernel size = 4, stride = 2: 76 | 0 0 1 2 3 4 5 0 0 # (0s are padding) 77 | 1 2 3 # (output frames of a convolution, last 0 is never used) 78 | 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding) 79 | 1 2 3 4 # once you removed padding, we are missing one time step ! 
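    In this example, `get_extra_padding_for_conv1d` returns 1: the ideal padded
    length is 6 instead of 5, which produces a 4th output frame and lets the
    transposed convolution rebuild all 5 time steps.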
80 | """ 81 | extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total) 82 | return F.pad(x, (0, extra_padding)) 83 | 84 | 85 | def pad1d( 86 | x: torch.Tensor, 87 | paddings: tp.Tuple[int, int], 88 | mode: str = "constant", 89 | value: float = 0.0, 90 | ): 91 | """Tiny wrapper around F.pad, just to allow for reflect padding on small input. 92 | If this is the case, we insert extra 0 padding to the right before the reflection happen. 93 | """ 94 | length = x.shape[-1] 95 | padding_left, padding_right = paddings 96 | assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) 97 | if mode == "reflect": 98 | max_pad = max(padding_left, padding_right) 99 | extra_pad = 0 100 | if length <= max_pad: 101 | extra_pad = max_pad - length + 1 102 | x = F.pad(x, (0, extra_pad)) 103 | padded = F.pad(x, paddings, mode, value) 104 | end = padded.shape[-1] - extra_pad 105 | return padded[..., :end] 106 | else: 107 | return F.pad(x, paddings, mode, value) 108 | 109 | 110 | def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]): 111 | """Remove padding from x, handling properly zero padding. Only for 1d!""" 112 | padding_left, padding_right = paddings 113 | assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) 114 | assert (padding_left + padding_right) <= x.shape[-1] 115 | end = x.shape[-1] - padding_right 116 | return x[..., padding_left:end] 117 | 118 | 119 | class NormConv1d(nn.Module): 120 | """Wrapper around Conv1d and normalization applied to this conv 121 | to provide a uniform interface across normalization approaches. 122 | """ 123 | 124 | def __init__( 125 | self, 126 | *args, 127 | causal: bool = False, 128 | norm: str = "none", 129 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 130 | **kwargs, 131 | ): 132 | super().__init__() 133 | self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm) 134 | self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs) 135 | self.norm_type = norm 136 | 137 | def forward(self, x): 138 | x = self.conv(x) 139 | x = self.norm(x) 140 | return x 141 | 142 | 143 | class NormConv2d(nn.Module): 144 | """Wrapper around Conv2d and normalization applied to this conv 145 | to provide a uniform interface across normalization approaches. 146 | """ 147 | 148 | def __init__( 149 | self, 150 | *args, 151 | norm: str = "none", 152 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 153 | **kwargs, 154 | ): 155 | super().__init__() 156 | self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm) 157 | self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs) 158 | self.norm_type = norm 159 | 160 | def forward(self, x): 161 | x = self.conv(x) 162 | x = self.norm(x) 163 | return x 164 | 165 | 166 | class NormConvTranspose1d(nn.Module): 167 | """Wrapper around ConvTranspose1d and normalization applied to this conv 168 | to provide a uniform interface across normalization approaches. 
169 | """ 170 | 171 | def __init__( 172 | self, 173 | *args, 174 | causal: bool = False, 175 | norm: str = "none", 176 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 177 | **kwargs, 178 | ): 179 | super().__init__() 180 | self.convtr = apply_parametrization_norm( 181 | nn.ConvTranspose1d(*args, **kwargs), norm 182 | ) 183 | self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs) 184 | self.norm_type = norm 185 | 186 | def forward(self, x): 187 | x = self.convtr(x) 188 | x = self.norm(x) 189 | return x 190 | 191 | 192 | class NormConvTranspose2d(nn.Module): 193 | """Wrapper around ConvTranspose2d and normalization applied to this conv 194 | to provide a uniform interface across normalization approaches. 195 | """ 196 | 197 | def __init__( 198 | self, 199 | *args, 200 | norm: str = "none", 201 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 202 | **kwargs, 203 | ): 204 | super().__init__() 205 | self.convtr = apply_parametrization_norm( 206 | nn.ConvTranspose2d(*args, **kwargs), norm 207 | ) 208 | self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs) 209 | 210 | def forward(self, x): 211 | x = self.convtr(x) 212 | x = self.norm(x) 213 | return x 214 | 215 | 216 | class StreamableConv1d(nn.Module): 217 | """Conv1d with some builtin handling of asymmetric or causal padding 218 | and normalization. 219 | """ 220 | 221 | def __init__( 222 | self, 223 | in_channels: int, 224 | out_channels: int, 225 | kernel_size: int, 226 | stride: int = 1, 227 | dilation: int = 1, 228 | groups: int = 1, 229 | bias: bool = True, 230 | causal: bool = False, 231 | norm: str = "none", 232 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 233 | pad_mode: str = "reflect", 234 | ): 235 | super().__init__() 236 | # warn user on unusual setup between dilation and stride 237 | if stride > 1 and dilation > 1: 238 | warnings.warn( 239 | "StreamableConv1d has been initialized with stride > 1 and dilation > 1" 240 | f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})." 241 | ) 242 | self.conv = NormConv1d( 243 | in_channels, 244 | out_channels, 245 | kernel_size, 246 | stride, 247 | dilation=dilation, 248 | groups=groups, 249 | bias=bias, 250 | causal=causal, 251 | norm=norm, 252 | norm_kwargs=norm_kwargs, 253 | ) 254 | self.causal = causal 255 | self.pad_mode = pad_mode 256 | 257 | def forward(self, x): 258 | B, C, T = x.shape 259 | kernel_size = self.conv.conv.kernel_size[0] 260 | stride = self.conv.conv.stride[0] 261 | dilation = self.conv.conv.dilation[0] 262 | kernel_size = ( 263 | kernel_size - 1 264 | ) * dilation + 1 # effective kernel size with dilations 265 | padding_total = kernel_size - stride 266 | extra_padding = get_extra_padding_for_conv1d( 267 | x, kernel_size, stride, padding_total 268 | ) 269 | if self.causal: 270 | # Left padding for causal 271 | x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode) 272 | else: 273 | # Asymmetric padding required for odd strides 274 | padding_right = padding_total // 2 275 | padding_left = padding_total - padding_right 276 | x = pad1d( 277 | x, (padding_left, padding_right + extra_padding), mode=self.pad_mode 278 | ) 279 | return self.conv(x) 280 | 281 | 282 | class StreamableConvTranspose1d(nn.Module): 283 | """ConvTranspose1d with some builtin handling of asymmetric or causal padding 284 | and normalization. 
285 |     """
286 | 
287 |     def __init__(
288 |         self,
289 |         in_channels: int,
290 |         out_channels: int,
291 |         kernel_size: int,
292 |         stride: int = 1,
293 |         causal: bool = False,
294 |         norm: str = "none",
295 |         trim_right_ratio: float = 1.0,
296 |         norm_kwargs: tp.Dict[str, tp.Any] = {},
297 |     ):
298 |         super().__init__()
299 |         self.convtr = NormConvTranspose1d(
300 |             in_channels,
301 |             out_channels,
302 |             kernel_size,
303 |             stride,
304 |             causal=causal,
305 |             norm=norm,
306 |             norm_kwargs=norm_kwargs,
307 |         )
308 |         self.causal = causal
309 |         self.trim_right_ratio = trim_right_ratio
310 |         assert (
311 |             self.causal or self.trim_right_ratio == 1.0
312 |         ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
313 |         assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
314 | 
315 |     def forward(self, x):
316 |         kernel_size = self.convtr.convtr.kernel_size[0]
317 |         stride = self.convtr.convtr.stride[0]
318 |         padding_total = kernel_size - stride
319 | 
320 |         y = self.convtr(x)
321 | 
322 |         # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
323 |         # removed at the very end, when keeping only the right length for the output,
324 |         # as removing it here would require also passing the length at the matching layer
325 |         # in the encoder.
326 |         if self.causal:
327 |             # Trim the padding on the right according to the specified ratio
328 |             # if trim_right_ratio = 1.0, trim everything from the right
329 |             padding_right = math.ceil(padding_total * self.trim_right_ratio)
330 |             padding_left = padding_total - padding_right
331 |             y = unpad1d(y, (padding_left, padding_right))
332 |         else:
333 |             # Asymmetric padding required for odd strides
334 |             padding_right = padding_total // 2
335 |             padding_left = padding_total - padding_right
336 |             y = unpad1d(y, (padding_left, padding_right))
337 |         return y
338 | 
--------------------------------------------------------------------------------
/examples/colab.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "a73a40df",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Watermarking custom audio\n",
9 |     "\n",
10 |     "[[`arXiv`](https://arxiv.org/abs/2401.17264)]\n",
11 |     "[[`GitHub`](https://github.com/facebookresearch/audioseal)]\n",
12 |     "\n",
13 |     "This notebook shows a minimal example of how to watermark a custom audio file, for example your own recorded voice. This notebook aims to run in Google Colab.
Make sure you get familiar with the APIs of AudioSeal, for example using [Getting Started notebook](./Getting_started.ipynb)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "4c2562ce", 19 | "metadata": {}, 20 | "source": [ 21 | "## Installation" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "1fbb4b36", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#@title Install requirements\n", 32 | "!pip install torchaudio\n", 33 | "!pip install matplotlib\n", 34 | "!pip install audioseal # Ensure this matches the actual package name for AudioSeal\n", 35 | "!pip install ffmpeg-python" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "1325f7d7", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import io\n", 46 | "import ffmpeg\n", 47 | "import IPython.display as ipd\n", 48 | "from google.colab.output import eval_js\n", 49 | "\n", 50 | "from base64 import b64decode\n", 51 | "from scipy.io.wavfile import read as wav_read\n", 52 | "import numpy as np\n", 53 | "import matplotlib.pyplot as plt\n", 54 | "\n", 55 | "import torch\n", 56 | "import torchaudio\n", 57 | "\n", 58 | "from audioseal import AudioSeal" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "e7f95544", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "def plot_waveform_and_specgram(waveform, sample_rate, title):\n", 69 | " waveform = waveform.squeeze().detach().cpu().numpy()\n", 70 | "\n", 71 | " num_frames = waveform.shape[-1]\n", 72 | " time_axis = torch.arange(0, num_frames) / sample_rate\n", 73 | "\n", 74 | " figure, (ax1, ax2) = plt.subplots(1, 2)\n", 75 | "\n", 76 | " ax1.plot(time_axis, waveform, linewidth=1)\n", 77 | " ax1.grid(True)\n", 78 | " ax2.specgram(waveform, Fs=sample_rate)\n", 79 | "\n", 80 | " figure.suptitle(f\"{title} - Waveform and specgram\")\n", 81 | " plt.show()\n", 82 | "\n", 83 | "\n", 84 | "def play_audio(waveform, sample_rate):\n", 85 | " if waveform.dim() > 2:\n", 86 | " waveform = waveform.squeeze(0)\n", 87 | " waveform = waveform.detach().cpu().numpy()\n", 88 | "\n", 89 | " num_channels, *_ = waveform.shape\n", 90 | " if num_channels == 1:\n", 91 | " ipd.display(ipd.Audio(waveform[0], rate=sample_rate))\n", 92 | " elif num_channels == 2:\n", 93 | " ipd.display(ipd.Audio((waveform[0], waveform[1]), rate=sample_rate))\n", 94 | " else:\n", 95 | " raise ValueError(\"Waveform with more than 2 channels are not supported.\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "daf14d39", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "AUDIO_HTML = \"\"\"\n", 106 | "\n", 183 | "\"\"\"\n", 184 | "\n", 185 | "def get_audio():\n", 186 | " display(ipd.HTML(AUDIO_HTML))\n", 187 | " data = eval_js(\"data\")\n", 188 | " binary = b64decode(data.split(',')[1])\n", 189 | "\n", 190 | " process = (ffmpeg\n", 191 | " .input('pipe:0')\n", 192 | " .output('pipe:1', format='wav')\n", 193 | " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", 194 | " )\n", 195 | " output, err = process.communicate(input=binary)\n", 196 | "\n", 197 | " riff_chunk_size = len(output) - 8\n", 198 | " # Break up the chunk size into four bytes, held in b.\n", 199 | " q = riff_chunk_size\n", 200 | " b = []\n", 201 | " for i in range(4):\n", 202 | " q, r = divmod(q, 256)\n", 203 | " b.append(r)\n", 204 | "\n", 205 | " # Replace bytes 4:8 in proc.stdout with the actual 
size of the RIFF chunk.\n", 206 | " riff = output[:4] + bytes(b) + output[8:]\n", 207 | "\n", 208 | " sr, audio = wav_read(io.BytesIO(riff))\n", 209 | "\n", 210 | " return audio, sr" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "53826104", 216 | "metadata": {}, 217 | "source": [ 218 | "## Record your audio" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "3216ff1a", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "recorded, sr = get_audio()\n", 229 | "\n", 230 | "# Convert audio from list of int16 to a normalized tensor\n", 231 | "audio = torch.tensor(recorded).float() / 32768.0\n", 232 | "print(audio.shape)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "a2110143", 238 | "metadata": {}, 239 | "source": [ 240 | "## Generator\n", 241 | "\n", 242 | "To watermark an audio, we simply load the watermarking generator from the hub:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "007c48cb", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "model = AudioSeal.load_generator(\"audioseal_wm_16bits\")\n", 253 | "\n", 254 | "# We add the batch dimension to the single audio to mimic the batch watermarking\n", 255 | "audios = audio.unsqueeze(0).unsqueeze(0) # b=1 c=1 t\n", 256 | "\n", 257 | "watermark = model.get_watermark(audios, sample_rate=sr)\n", 258 | "watermarked_audio = audios + watermark\n", 259 | "\n", 260 | "# Alternatively, you can also call forward() function directly with different tune-down / tune-up rate\n", 261 | "watermarked_audio = model(audios, sample_rate=sr, alpha=1)\n", 262 | "\n", 263 | "# You can also watermark with a secret message\n", 264 | "# secret_mesage = torch.randint(0, 2, (1, 16), dtype=torch.int32)\n", 265 | "# watermarked_audio = model(audios, sample_rate=sr, message=secret_mesage, alpha=1)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "ac5aac4f", 271 | "metadata": {}, 272 | "source": [ 273 | "We can see that the watermarked audio has preserved almost the same spectrogram and contents as the original one" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "0200bc22", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "plot_waveform_and_specgram(watermarked_audio.squeeze(), sr, title=\"Watermarked audio\")" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "d70a0d73", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "play_audio(watermarked_audio, sr)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "cd7fa4fe", 299 | "metadata": {}, 300 | "source": [ 301 | "## Detector\n", 302 | "\n", 303 | "To detect the watermarks from an audio, we load the separate detector model and can do one of the following:\n", 304 | "\n", 305 | "### Basic usage: Call `detect_watermark()`\n", 306 | "\n", 307 | "This results in a tuple of form `Tuple(float, Tensor)`, where the first value indicates the probability of the audio being watermarked (the higher, the more likely), and the second value is the decoded message that is embeded by the generator. 
If the audio is unwatermarked (low first value), the decoded message will be just some random bits.\n",
308 |     "\n",
309 |     "Note that due to the stochastic nature of the detector, the decoded message and the secret message might differ by 1 bit, so depending on the user's needs, the detection can be called multiple times to get an averaged decoded message."
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "id": "0b1a3a9e",
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": [
319 |     "detector = AudioSeal.load_detector(\"audioseal_detector_16bits\")\n",
320 |     "\n",
321 |     "result, message = detector.detect_watermark(watermarked_audio, sample_rate=sr, message_threshold=0.5)\n",
322 |     "\n",
323 |     "print(f\"\\nThis is likely a watermarked audio. WM probability: {result}\")\n",
324 |     "\n",
325 |     "# Run on an unwatermarked audio\n",
326 |     "result2, message2 = detector.detect_watermark(audios, sample_rate=sr, message_threshold=0.5)\n",
327 |     "print(f\"This is likely an unwatermarked audio. WM probability: {result2}\")\n"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": null,
333 |    "id": "7730364d",
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "message"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "id": "8dc67150",
343 |    "metadata": {},
344 |    "source": [
345 |     "`message_threshold` indicates the threshold at which the detector converts the stochastic messages (with probabilities between 0 and 1) into the n-bit binary format. In most cases, the generator generates an unbiased message from the secret, so `0.5` is a reasonable choice (so in the above example, a value > 0.5 means 1 and a value < 0.5 means 0). \n",
346 |     "\n",
347 |     "\n",
348 |     "### Advanced usage: Call `forward()`\n",
349 |     "\n",
350 |     "The detector can also be called directly as a Torch module. This will return 2 tensors: \n",
351 |     "- The first tensor of size `batch x 2 x frames` indicates the probability of each frame being watermarked (positive or negative). So t[:, 0, :] corresponds to the negative probability and t[:, 1, :] corresponds to the positive probability\n",
352 |     "- The second tensor of size `batch x n_bits` corresponds to the message detected from the audio. It indicates the probability of each bit being 1. For unwatermarked audio, this tensor is random"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "id": "fadf26a1",
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "pred_prob, message_prob = detector(watermarked_audio, sample_rate=sr)\n",
363 |     "pred_prob[:, 1, :]"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "id": "899de6b3",
370 |    "metadata": {},
371 |    "outputs": [],
372 |    "source": [
373 |     "message_prob"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "markdown",
378 |    "id": "a78766fd",
379 |    "metadata": {},
380 |    "source": [
381 |     "### Robustness against attacks\n",
382 |     "\n",
383 |     "We can evaluate the robustness of the detector against some attacks. For this purpose, we perform two simple attacks below: pink noise and a lowpass filter. For the full list of attacks, please refer to our paper.
\n", 384 | "\n", 385 | "\n", 386 | "#### Pink noise attack" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "4cc0efde", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "from attacks import AudioEffects as af\n", 397 | "\n", 398 | "pink_noised_audio = af.pink_noise(watermarked_audio, noise_std=0.1)\n", 399 | "plot_waveform_and_specgram(pink_noised_audio, sample_rate=sr, title=\"Audio with pink noise\")\n", 400 | "result, message = detector.detect_watermark(pink_noised_audio, sample_rate=sr)\n", 401 | "print(result)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "id": "e2b96eb1", 407 | "metadata": {}, 408 | "source": [ 409 | "#### Lowpass filter" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "254e6012", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "lowpass_filtered = af.lowpass_filter(watermarked_audio, cutoff_freq=5000, sample_rate=sr)\n", 420 | "plot_waveform_and_specgram(lowpass_filtered, sample_rate=sr, title=\"Audio with low pass filtered\")\n", 421 | "result, message = detector.detect_watermark(lowpass_filtered, sample_rate=sr)\n", 422 | "print(result)" 423 | ] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "awm-oss", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.9.12" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 5 447 | } 448 | -------------------------------------------------------------------------------- /src/audioseal/libs/audiocraft/modules/seanet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | # Vendor from https://github.com/facebookresearch/audiocraft 8 | 9 | import math 10 | import typing as tp 11 | 12 | import numpy as np 13 | import torch.nn as nn 14 | 15 | from audioseal.libs.audiocraft.modules.conv import ( 16 | StreamableConv1d, 17 | StreamableConvTranspose1d, 18 | ) 19 | from audioseal.libs.audiocraft.modules.lstm import StreamableLSTM 20 | 21 | 22 | class SEANetResnetBlock(nn.Module): 23 | """Residual block from SEANet model. 24 | 25 | Args: 26 | dim (int): Dimension of the input/output. 27 | kernel_sizes (list): List of kernel sizes for the convolutions. 28 | dilations (list): List of dilations for the convolutions. 29 | activation (str): Activation function. 30 | activation_params (dict): Parameters to provide to the activation function. 31 | norm (str): Normalization method. 32 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 33 | causal (bool): Whether to use fully causal convolution. 34 | pad_mode (str): Padding mode for the convolutions. 35 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 36 | true_skip (bool): Whether to use true skip connection or a simple 37 | (streamable) convolution as the skip connection. 
38 | """ 39 | 40 | def __init__( 41 | self, 42 | dim: int, 43 | kernel_sizes: tp.List[int] = [3, 1], 44 | dilations: tp.List[int] = [1, 1], 45 | activation: str = "ELU", 46 | activation_params: dict = {"alpha": 1.0}, 47 | norm: str = "none", 48 | norm_params: tp.Dict[str, tp.Any] = {}, 49 | causal: bool = False, 50 | pad_mode: str = "reflect", 51 | compress: int = 2, 52 | true_skip: bool = True, 53 | ): 54 | super().__init__() 55 | assert len(kernel_sizes) == len( 56 | dilations 57 | ), "Number of kernel sizes should match number of dilations" 58 | act = getattr(nn, activation) 59 | hidden = dim // compress 60 | block = [] 61 | for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)): 62 | in_chs = dim if i == 0 else hidden 63 | out_chs = dim if i == len(kernel_sizes) - 1 else hidden 64 | block += [ 65 | act(**activation_params), 66 | StreamableConv1d( 67 | in_chs, 68 | out_chs, 69 | kernel_size=kernel_size, 70 | dilation=dilation, 71 | norm=norm, 72 | norm_kwargs=norm_params, 73 | causal=causal, 74 | pad_mode=pad_mode, 75 | ), 76 | ] 77 | self.block = nn.Sequential(*block) 78 | self.shortcut: nn.Module 79 | if true_skip: 80 | self.shortcut = nn.Identity() 81 | else: 82 | self.shortcut = StreamableConv1d( 83 | dim, 84 | dim, 85 | kernel_size=1, 86 | norm=norm, 87 | norm_kwargs=norm_params, 88 | causal=causal, 89 | pad_mode=pad_mode, 90 | ) 91 | 92 | def forward(self, x): 93 | return self.shortcut(x) + self.block(x) 94 | 95 | 96 | class SEANetEncoder(nn.Module): 97 | """SEANet encoder. 98 | 99 | Args: 100 | channels (int): Audio channels. 101 | dimension (int): Intermediate representation dimension. 102 | n_filters (int): Base width for the model. 103 | n_residual_layers (int): nb of residual layers. 104 | ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of 105 | upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here 106 | that must match the decoder order. We use the decoder order as some models may only employ the decoder. 107 | activation (str): Activation function. 108 | activation_params (dict): Parameters to provide to the activation function. 109 | norm (str): Normalization method. 110 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 111 | kernel_size (int): Kernel size for the initial convolution. 112 | last_kernel_size (int): Kernel size for the initial convolution. 113 | residual_kernel_size (int): Kernel size for the residual layers. 114 | dilation_base (int): How much to increase the dilation with each layer. 115 | causal (bool): Whether to use fully causal convolution. 116 | pad_mode (str): Padding mode for the convolutions. 117 | true_skip (bool): Whether to use true skip connection or a simple 118 | (streamable) convolution as the skip connection in the residual network blocks. 119 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 120 | lstm (int): Number of LSTM layers at the end of the encoder. 121 | disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm. 122 | For the encoder, it corresponds to the N first blocks. 
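
    For example, with the default ratios [8, 5, 4, 2] the encoder downsamples
    the waveform by a total hop length of 8 * 5 * 4 * 2 = 320 samples per
    output step.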
123 | """ 124 | 125 | def __init__( 126 | self, 127 | channels: int = 1, 128 | dimension: int = 128, 129 | n_filters: int = 32, 130 | n_residual_layers: int = 3, 131 | ratios: tp.List[int] = [8, 5, 4, 2], 132 | activation: str = "ELU", 133 | activation_params: dict = {"alpha": 1.0}, 134 | norm: str = "none", 135 | norm_params: tp.Dict[str, tp.Any] = {}, 136 | kernel_size: int = 7, 137 | last_kernel_size: int = 7, 138 | residual_kernel_size: int = 3, 139 | dilation_base: int = 2, 140 | causal: bool = False, 141 | pad_mode: str = "reflect", 142 | true_skip: bool = True, 143 | compress: int = 2, 144 | lstm: int = 0, 145 | disable_norm_outer_blocks: int = 0, 146 | ): 147 | super().__init__() 148 | self.channels = channels 149 | self.dimension = dimension 150 | self.n_filters = n_filters 151 | self.ratios = list(reversed(ratios)) 152 | del ratios 153 | self.n_residual_layers = n_residual_layers 154 | self.hop_length = np.prod(self.ratios) 155 | self.n_blocks = len(self.ratios) + 2 # first and last conv + residual blocks 156 | self.disable_norm_outer_blocks = disable_norm_outer_blocks 157 | assert ( 158 | self.disable_norm_outer_blocks >= 0 159 | and self.disable_norm_outer_blocks <= self.n_blocks 160 | ), ( 161 | "Number of blocks for which to disable norm is invalid." 162 | "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0." 163 | ) 164 | 165 | act = getattr(nn, activation) 166 | mult = 1 167 | model: tp.List[nn.Module] = [ 168 | StreamableConv1d( 169 | channels, 170 | mult * n_filters, 171 | kernel_size, 172 | norm="none" if self.disable_norm_outer_blocks >= 1 else norm, 173 | norm_kwargs=norm_params, 174 | causal=causal, 175 | pad_mode=pad_mode, 176 | ) 177 | ] 178 | # Downsample to raw audio scale 179 | for i, ratio in enumerate(self.ratios): 180 | block_norm = "none" if self.disable_norm_outer_blocks >= i + 2 else norm 181 | # Add residual layers 182 | for j in range(n_residual_layers): 183 | model += [ 184 | SEANetResnetBlock( 185 | mult * n_filters, 186 | kernel_sizes=[residual_kernel_size, 1], 187 | dilations=[dilation_base**j, 1], 188 | norm=block_norm, 189 | norm_params=norm_params, 190 | activation=activation, 191 | activation_params=activation_params, 192 | causal=causal, 193 | pad_mode=pad_mode, 194 | compress=compress, 195 | true_skip=true_skip, 196 | ) 197 | ] 198 | 199 | # Add downsampling layers 200 | model += [ 201 | act(**activation_params), 202 | StreamableConv1d( 203 | mult * n_filters, 204 | mult * n_filters * 2, 205 | kernel_size=ratio * 2, 206 | stride=ratio, 207 | norm=block_norm, 208 | norm_kwargs=norm_params, 209 | causal=causal, 210 | pad_mode=pad_mode, 211 | ), 212 | ] 213 | mult *= 2 214 | 215 | if lstm: 216 | model += [StreamableLSTM(mult * n_filters, num_layers=lstm)] 217 | 218 | model += [ 219 | act(**activation_params), 220 | StreamableConv1d( 221 | mult * n_filters, 222 | dimension, 223 | last_kernel_size, 224 | norm=( 225 | "none" if self.disable_norm_outer_blocks == self.n_blocks else norm 226 | ), 227 | norm_kwargs=norm_params, 228 | causal=causal, 229 | pad_mode=pad_mode, 230 | ), 231 | ] 232 | 233 | self.model = nn.Sequential(*model) 234 | 235 | def forward(self, x): 236 | return self.model(x) 237 | 238 | 239 | class SEANetEncoderKeepDimension(SEANetEncoder): 240 | """ 241 | similar architecture to the SEANet encoder but with an extra step that 242 | projects the output dimension to the same input dimension by repeating 243 | the sequential 244 | 245 | Args: 246 | SEANetEncoder (_type_): _description_ 247 
| """ 248 | 249 | def __init__(self, *args, **kwargs): 250 | 251 | self.output_dim = kwargs.pop("output_dim") 252 | super().__init__(*args, **kwargs) 253 | # Adding a reverse convolution layer 254 | self.reverse_convolution = nn.ConvTranspose1d( 255 | in_channels=self.dimension, 256 | out_channels=self.output_dim, 257 | kernel_size=math.prod(self.ratios), 258 | stride=math.prod(self.ratios), 259 | padding=0, 260 | ) 261 | 262 | def forward(self, x): 263 | orig_nframes = x.shape[-1] 264 | x = self.model(x) 265 | x = self.reverse_convolution(x) 266 | # make sure dim didn't change 267 | return x[:, :, :orig_nframes] 268 | 269 | class SEANetDecoder(nn.Module): 270 | """SEANet decoder. 271 | 272 | Args: 273 | channels (int): Audio channels. 274 | dimension (int): Intermediate representation dimension. 275 | n_filters (int): Base width for the model. 276 | n_residual_layers (int): nb of residual layers. 277 | ratios (Sequence[int]): kernel size and stride ratios. 278 | activation (str): Activation function. 279 | activation_params (dict): Parameters to provide to the activation function. 280 | final_activation (str): Final activation function after all convolutions. 281 | final_activation_params (dict): Parameters to provide to the activation function. 282 | norm (str): Normalization method. 283 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 284 | kernel_size (int): Kernel size for the initial convolution. 285 | last_kernel_size (int): Kernel size for the initial convolution. 286 | residual_kernel_size (int): Kernel size for the residual layers. 287 | dilation_base (int): How much to increase the dilation with each layer. 288 | causal (bool): Whether to use fully causal convolution. 289 | pad_mode (str): Padding mode for the convolutions. 290 | true_skip (bool): Whether to use true skip connection or a simple. 291 | (streamable) convolution as the skip connection in the residual network blocks. 292 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 293 | lstm (int): Number of LSTM layers at the end of the encoder. 294 | disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm. 295 | For the decoder, it corresponds to the N last blocks. 296 | trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup. 297 | If equal to 1.0, it means that all the trimming is done at the right. 
298 | """ 299 | 300 | def __init__( 301 | self, 302 | channels: int = 1, 303 | dimension: int = 128, 304 | n_filters: int = 32, 305 | n_residual_layers: int = 3, 306 | ratios: tp.List[int] = [8, 5, 4, 2], 307 | activation: str = "ELU", 308 | activation_params: dict = {"alpha": 1.0}, 309 | final_activation: tp.Optional[str] = None, 310 | final_activation_params: tp.Optional[dict] = None, 311 | norm: str = "none", 312 | norm_params: tp.Dict[str, tp.Any] = {}, 313 | kernel_size: int = 7, 314 | last_kernel_size: int = 7, 315 | residual_kernel_size: int = 3, 316 | dilation_base: int = 2, 317 | causal: bool = False, 318 | pad_mode: str = "reflect", 319 | true_skip: bool = True, 320 | compress: int = 2, 321 | lstm: int = 0, 322 | disable_norm_outer_blocks: int = 0, 323 | trim_right_ratio: float = 1.0, 324 | ): 325 | super().__init__() 326 | self.dimension = dimension 327 | self.channels = channels 328 | self.n_filters = n_filters 329 | self.ratios = ratios 330 | del ratios 331 | self.n_residual_layers = n_residual_layers 332 | self.hop_length = np.prod(self.ratios) 333 | self.n_blocks = len(self.ratios) + 2 # first and last conv + residual blocks 334 | self.disable_norm_outer_blocks = disable_norm_outer_blocks 335 | assert ( 336 | self.disable_norm_outer_blocks >= 0 337 | and self.disable_norm_outer_blocks <= self.n_blocks 338 | ), ( 339 | "Number of blocks for which to disable norm is invalid." 340 | "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0." 341 | ) 342 | 343 | act = getattr(nn, activation) 344 | mult = int(2 ** len(self.ratios)) 345 | model: tp.List[nn.Module] = [ 346 | StreamableConv1d( 347 | dimension, 348 | mult * n_filters, 349 | kernel_size, 350 | norm=( 351 | "none" if self.disable_norm_outer_blocks == self.n_blocks else norm 352 | ), 353 | norm_kwargs=norm_params, 354 | causal=causal, 355 | pad_mode=pad_mode, 356 | ) 357 | ] 358 | 359 | if lstm: 360 | model += [StreamableLSTM(mult * n_filters, num_layers=lstm)] 361 | 362 | # Upsample to raw audio scale 363 | for i, ratio in enumerate(self.ratios): 364 | block_norm = ( 365 | "none" 366 | if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) 367 | else norm 368 | ) 369 | # Add upsampling layers 370 | model += [ 371 | act(**activation_params), 372 | StreamableConvTranspose1d( 373 | mult * n_filters, 374 | mult * n_filters // 2, 375 | kernel_size=ratio * 2, 376 | stride=ratio, 377 | norm=block_norm, 378 | norm_kwargs=norm_params, 379 | causal=causal, 380 | trim_right_ratio=trim_right_ratio, 381 | ), 382 | ] 383 | # Add residual layers 384 | for j in range(n_residual_layers): 385 | model += [ 386 | SEANetResnetBlock( 387 | mult * n_filters // 2, 388 | kernel_sizes=[residual_kernel_size, 1], 389 | dilations=[dilation_base**j, 1], 390 | activation=activation, 391 | activation_params=activation_params, 392 | norm=block_norm, 393 | norm_params=norm_params, 394 | causal=causal, 395 | pad_mode=pad_mode, 396 | compress=compress, 397 | true_skip=true_skip, 398 | ) 399 | ] 400 | 401 | mult //= 2 402 | 403 | # Add final layers 404 | model += [ 405 | act(**activation_params), 406 | StreamableConv1d( 407 | n_filters, 408 | channels, 409 | last_kernel_size, 410 | norm="none" if self.disable_norm_outer_blocks >= 1 else norm, 411 | norm_kwargs=norm_params, 412 | causal=causal, 413 | pad_mode=pad_mode, 414 | ), 415 | ] 416 | # Add optional final activation to decoder (eg. 
tanh) 417 | if final_activation is not None: 418 | final_act = getattr(nn, final_activation) 419 | final_activation_params = final_activation_params or {} 420 | model += [final_act(**final_activation_params)] 421 | self.model = nn.Sequential(*model) 422 | 423 | def forward(self, z): 424 | y = self.model(z) 425 | return y 426 | -------------------------------------------------------------------------------- /examples/attack_benchmarking_example.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["# Benchmarking Audioseal on the SHUSH attack applied on RAVDESS Dataset\n","\n","In this notebook, we outline the steps taken to benchmark the Audioseal architecture against different attacks on a dataset of audio files. \n","In particular, we follow these steps:\n","- Load audio files from a dataset \n","- Watermark each audio file using Audioseal\n","- Perform perturbations/attacks to the audio files\n","- Detect the watermarks on these attacked files and keep track of the confidence of Audioseal in its predictions that the files are watermarked.\n","\n","\n","For a better understanding of Audioseal and its functionalities, it is highly recommended to go through the [Getting started notebook](https://github.com/facebookresearch/audioseal/blob/main/examples/Getting_started.ipynb)."]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## Dataset\n","\n","We use the [RAVDESS Emotional Speech audio](https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio) dataset for this experiment. \n","When added to a Kaggle notebook environment, all input datasets are stored in the read-only `/kaggle/input` path. If you are not using Kaggle, or have stored your files elsewhere, you can load nested audio files by modifying `PARENT_FILES_DIR` in the cell below."]},{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-03-15T06:51:03.909949Z","iopub.status.busy":"2024-03-15T06:51:03.909220Z","iopub.status.idle":"2024-03-15T06:51:04.354260Z","shell.execute_reply":"2024-03-15T06:51:04.353256Z","shell.execute_reply.started":"2024-03-15T06:51:03.909904Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of input files: 2880\n"]}],"source":["import numpy as np \n","import pandas as pd\n","import os\n","\n","all_input_files = []\n","PARENT_FILES_DIR = '/kaggle/input'\n","\n","for dirname, _, filenames in os.walk(PARENT_FILES_DIR):\n"," for filename in filenames:\n"," if \"wav\" in filename:\n"," all_input_files.append(os.path.join(dirname, filename))\n"," \n","print(f\"Number of input files: {len(all_input_files)}\")"]},{"cell_type":"markdown","metadata":{},"source":["### Installations and Imports "]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:05.910354Z","iopub.status.busy":"2024-03-15T06:51:05.909237Z","iopub.status.idle":"2024-03-15T06:51:20.351281Z","shell.execute_reply":"2024-03-15T06:51:20.350239Z","shell.execute_reply.started":"2024-03-15T06:51:05.910319Z"},"trusted":true},"outputs":[],"source":["import sys\n","!{sys.executable} -m pip install -q torchaudio soundfile matplotlib audioseal\n","\n","import typing as tp\n","import julius\n","import torch\n","import torchaudio\n","import 
urllib"]},{"cell_type":"markdown","metadata":{},"source":["### Load Audioseal models"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.354221Z","iopub.status.busy":"2024-03-15T06:51:20.353436Z","iopub.status.idle":"2024-03-15T06:51:20.378701Z","shell.execute_reply":"2024-03-15T06:51:20.377805Z","shell.execute_reply.started":"2024-03-15T06:51:20.354185Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Using device: cuda\n"]}],"source":["device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n","print(f\"Using device: {device}\")"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.380975Z","iopub.status.busy":"2024-03-15T06:51:20.379901Z","iopub.status.idle":"2024-03-15T06:51:20.919397Z","shell.execute_reply":"2024-03-15T06:51:20.918564Z","shell.execute_reply.started":"2024-03-15T06:51:20.380948Z"},"trusted":true},"outputs":[],"source":["from audioseal import AudioSeal\n","\n","model = AudioSeal.load_generator(\"audioseal_wm_16bits\")\n","detector = AudioSeal.load_detector(\"audioseal_detector_16bits\")"]},{"cell_type":"markdown","metadata":{},"source":["### Helper functions to load audio data, watermark audio, and get prediction scores for audio"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.922026Z","iopub.status.busy":"2024-03-15T06:51:20.921593Z","iopub.status.idle":"2024-03-15T06:51:21.091167Z","shell.execute_reply":"2024-03-15T06:51:21.090157Z","shell.execute_reply.started":"2024-03-15T06:51:20.921992Z"},"trusted":true},"outputs":[],"source":["model = model.to(device)\n","detector = detector.to(device)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:21.092660Z","iopub.status.busy":"2024-03-15T06:51:21.092352Z","iopub.status.idle":"2024-03-15T06:51:21.105098Z","shell.execute_reply":"2024-03-15T06:51:21.104097Z","shell.execute_reply.started":"2024-03-15T06:51:21.092635Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Secret message: tensor([[1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1]], device='cuda:0',\n"," dtype=torch.int32)\n"]}],"source":["secret_message = torch.randint(0, 2, (1, 16), dtype=torch.int32)\n","secret_message = secret_message.to(device)\n","print(f\"Secret message: {secret_message}\")\n","\n","# Function to load an audio file from its file path\n","def load_audio_file(\n"," file_path: str\n",") -> tp.Optional[tp.Tuple[torch.Tensor, int]]:\n"," try:\n"," wav, sample_rate = torchaudio.load(file_path)\n"," return wav, sample_rate\n"," except Exception as e:\n"," print(f\"Error while loading audio: {e}\")\n"," return None\n"," \n","# Function to generate a watermark for the audio and embed it into a new audio tensor\n","def generate_watermark_audio(\n"," tensor: torch.Tensor,\n"," sample_rate: int\n",") -> tp.Optional[torch.Tensor]:\n"," try:\n"," global model, device, secret_message\n"," audios = tensor.unsqueeze(0).to(device)\n"," watermarked_audio = model(audios, sample_rate=sample_rate, message=secret_message.to(device), alpha=1)\n"," return watermarked_audio\n","\n"," \n"," except Exception as e:\n"," print(f\"Error while watermarking audio: {e}\")\n"," return None\n"," \n","# Function to get the confidence score that an audio tensor was watermarked by Audioseal\n","def detect_watermark_audio(\n"," tensor: torch.Tensor,\n"," sample_rate: int,\n"," 
message_threshold: float = 0.50\n",") -> tp.Optional[float]:\n"," try:\n"," global detector, device\n"," # In our analysis we are not concerned with the hidden/embedded message as of now\n"," result, _ = detector.detect_watermark(tensor, sample_rate=sample_rate, message_threshold=message_threshold)\n"," return float(result)\n"," except Exception as e:\n"," print(f\"Error while detecting watermark: {e}\")\n"," return None"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## Audio attacks\n","\n","- In this notebook, we use the `SHUSH` attack.\n","- For more attacks and their descriptions, please refer to the [source](https://github.com/facebookresearch/audioseal/blob/main/examples/attacks.py)."]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:33.919489Z","iopub.status.busy":"2024-03-15T06:51:33.919084Z","iopub.status.idle":"2024-03-15T06:51:33.962401Z","shell.execute_reply":"2024-03-15T06:51:33.961261Z","shell.execute_reply.started":"2024-03-15T06:51:33.919460Z"},"trusted":true},"outputs":[],"source":["from attacks import AudioEffects as af"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Experimental setup\n","- `fraction` values: \\{0.1\\%, 1\\%, 10\\%, 30\\%\\}\n","- `nomenclature` : n, s, m, l\n","\n","In this notebook, we set the above parameters for the SHUSH attack and note the average confidence scores of Audioseal in predicting the presence of watermarks for these attacked audio files."]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:39.665411Z","iopub.status.busy":"2024-03-15T06:51:39.665025Z","iopub.status.idle":"2024-03-15T06:51:39.673355Z","shell.execute_reply":"2024-03-15T06:51:39.672473Z","shell.execute_reply.started":"2024-03-15T06:51:39.665382Z"},"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["import random\n","random.seed(42)\n","torch.backends.cudnn.benchmark = True\n","np.random.seed(42)\n","torch.manual_seed(42)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:40.533674Z","iopub.status.busy":"2024-03-15T06:51:40.532744Z","iopub.status.idle":"2024-03-15T06:59:10.240026Z","shell.execute_reply":"2024-03-15T06:59:10.239069Z","shell.execute_reply.started":"2024-03-15T06:51:40.533640Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" 5%|▌ | 148/2880 [01:38<09:22, 4.86it/s] "]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 67807] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_05/03-01-02-01-02-02-05.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 12%|█▏ | 335/2880 [02:27<04:57, 8.56it/s] "]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 57663] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_01/03-01-02-01-01-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 12%|█▏ | 339/2880 [02:27<04:13, 10.02it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, 
weight of size [32, 1, 7], expected input[1, 2, 52324] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_01/03-01-08-01-02-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 15%|█▍ | 425/2880 [02:45<03:49, 10.68it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 69942] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_20/03-01-06-01-01-02-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 15%|█▍ | 431/2880 [02:45<03:42, 11.01it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 55528] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_20/03-01-03-01-02-01-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 45%|████▍ | 1289/2880 [04:43<02:16, 11.62it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 67807] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_05/03-01-02-01-02-02-05.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 51%|█████▏ | 1476/2880 [05:02<02:07, 11.02it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 57663] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_01/03-01-02-01-01-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 51%|█████▏ | 1478/2880 [05:02<01:55, 12.10it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 52324] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_01/03-01-08-01-02-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 54%|█████▍ | 1564/2880 [05:10<01:57, 11.20it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 69942] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_20/03-01-06-01-01-02-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 55%|█████▍ | 1570/2880 [05:11<01:52, 11.61it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 55528] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_20/03-01-03-01-02-01-20.wav due to 'NoneType' object has no attribute 
'shape'\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 2880/2880 [07:29<00:00, 6.40it/s]\n"]}],"source":["from tqdm import tqdm\n","\n","all_scores_n = []\n","all_scores_s = []\n","all_scores_m = []\n","all_scores_l = []\n","all_saved_files = []\n","\n","for input_file in tqdm(all_input_files):\n"," try:\n"," # Load audio\n"," audio, sample_rate = load_audio_file(input_file)\n","\n"," # Generate watermarked audio\n"," watermarked_audio = generate_watermark_audio(audio, sample_rate)\n","\n"," # Perform SHUSH attacks\n"," shush_attack_audio_n = af.shush(watermarked_audio, fraction=0.001)\n"," shush_attack_audio_s = af.shush(watermarked_audio, fraction=0.01)\n"," shush_attack_audio_m = af.shush(watermarked_audio, fraction=0.1)\n"," shush_attack_audio_l = af.shush(watermarked_audio, fraction=0.3)\n","\n"," # Compute scores\n"," shush_score_n = detect_watermark_audio(shush_attack_audio_n, sample_rate)\n"," shush_score_s = detect_watermark_audio(shush_attack_audio_s, sample_rate)\n"," shush_score_m = detect_watermark_audio(shush_attack_audio_m, sample_rate)\n"," shush_score_l = detect_watermark_audio(shush_attack_audio_l, sample_rate)\n","\n"," # Store scores\n"," all_scores_n.append(float(shush_score_n))\n"," all_scores_s.append(float(shush_score_s))\n"," all_scores_m.append(float(shush_score_m))\n"," all_scores_l.append(float(shush_score_l))\n"," all_saved_files.append(input_file)\n"," except Exception as e:\n"," print(f\"Skipping file {input_file} due to {e}\")\n"," pass"]},{"cell_type":"markdown","metadata":{},"source":["## Store results and calculate metrics"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:59:10.242250Z","iopub.status.busy":"2024-03-15T06:59:10.241976Z","iopub.status.idle":"2024-03-15T06:59:10.249995Z","shell.execute_reply":"2024-03-15T06:59:10.248957Z","shell.execute_reply.started":"2024-03-15T06:59:10.242224Z"},"trusted":true},"outputs":[],"source":["df = pd.DataFrame({\n"," \"input_file\" : all_saved_files,\n"," \"watermark_confidence_n\" : all_scores_n,\n"," \"watermark_confidence_s\" : all_scores_s,\n"," \"watermark_confidence_m\" : all_scores_m,\n"," \"watermark_confidence_l\" : all_scores_l,\n","})"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:59:10.251570Z","iopub.status.busy":"2024-03-15T06:59:10.251203Z","iopub.status.idle":"2024-03-15T06:59:10.278172Z","shell.execute_reply":"2024-03-15T06:59:10.277277Z","shell.execute_reply.started":"2024-03-15T06:59:10.251528Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
watermark_confidence_nwatermark_confidence_swatermark_confidence_mwatermark_confidence_l
count2870.0000002870.0000002870.0000002870.000000
mean0.9988490.9901380.9002090.699678
std0.0007760.0007380.0007630.000516
min0.9763020.9673760.8761460.694676
25%0.9988770.9900220.9000830.699631
50%0.9989220.9902020.9002600.699777
75%0.9989630.9903340.9003520.699923
max0.9991770.9905500.9009580.700464
\n","
"],"text/plain":[" watermark_confidence_n watermark_confidence_s watermark_confidence_m \\\n","count 2870.000000 2870.000000 2870.000000 \n","mean 0.998849 0.990138 0.900209 \n","std 0.000776 0.000738 0.000763 \n","min 0.976302 0.967376 0.876146 \n","25% 0.998877 0.990022 0.900083 \n","50% 0.998922 0.990202 0.900260 \n","75% 0.998963 0.990334 0.900352 \n","max 0.999177 0.990550 0.900958 \n","\n"," watermark_confidence_l \n","count 2870.000000 \n","mean 0.699678 \n","std 0.000516 \n","min 0.694676 \n","25% 0.699631 \n","50% 0.699777 \n","75% 0.699923 \n","max 0.700464 "]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["df.describe()"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## We note that Audioseal performs very well in recalling the watermarks - even in extreme conditions of masking the first 30\\% of the audio, the average confidence is $0.699678$. "]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[{"datasetId":107620,"sourceId":256618,"sourceType":"datasetVersion"}],"dockerImageVersionId":30665,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]"},"vscode":{"interpreter":{"hash":"916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"}}},"nbformat":4,"nbformat_minor":4} 2 | --------------------------------------------------------------------------------