├── src
└── audioseal
│ ├── py.typed
│ ├── libs
│ ├── __init__.py
│ └── audiocraft
│ │ ├── __init__.py
│ │ └── modules
│ │ ├── __init__.py
│ │ ├── lstm.py
│ │ ├── conv.py
│ │ └── seanet.py
│ ├── __init__.py
│ ├── cards
│ ├── audioseal_detector_16bits.yaml
│ └── audioseal_wm_16bits.yaml
│ ├── builder.py
│ ├── loader.py
│ └── models.py
├── requirements.txt
├── requirements-dev.txt
├── .github
├── pull_request_template.md
└── workflows
│ └── lint_and_test.yaml
├── CHANGELOG.md
├── .pre-commit-config.yaml
├── LICENSE
├── examples
├── notebook.py
├── attacks.py
├── colab.ipynb
└── attack_benchmarking_example.ipynb
├── tests
└── test_models.py
├── pyproject.toml
├── CONTRIBUTING.md
├── .gitignore
├── CODE_OF_CONDUCT.md
└── README.md

/src/audioseal/py.typed:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | omegaconf
3 | julius
4 | torch>=1.13.0
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # For developers wanting to contribute to AudioSeal
2 | func_argparse
3 | torchaudio
4 | soundfile
5 | pytest
6 | mypy
7 | black
8 | isort
9 | flake8
10 | pre-commit
--------------------------------------------------------------------------------
/src/audioseal/libs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | 
8 | from .seanet import SEANetDecoder, SEANetEncoder, SEANetEncoderKeepDimension
9 | 
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Why?
2 | 
3 | Why do we need to implement this feature? What is the use case?
4 | 
5 | ## How?
6 | 
7 | Document the technical decisions you made.
8 | If some parts are WIP, please make that explicit here.
9 | 
10 | ## Test plan
11 | 
12 | How did you test your changes?
13 | Include the full command line to help other people reproduce your results if needed.
14 | 
--------------------------------------------------------------------------------
/src/audioseal/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | """
8 | Watermarking and detection for speech audio.
9 | 
10 | A PyTorch-based localized algorithm for proactive detection
11 | of watermarks in AI-generated audio, with a very fast
12 | detector.
13 | 
14 | """
15 | 
16 | __version__ = "0.1.2"
17 | 
18 | 
19 | from audioseal import builder
20 | from audioseal.loader import AudioSeal
21 | from audioseal.models import AudioSealDetector, AudioSealWM, MsgProcessor
--------------------------------------------------------------------------------
/src/audioseal/cards/audioseal_detector_16bits.yaml:
--------------------------------------------------------------------------------
1 | # @package __global__
2 | 
3 | name: audioseal_detector_16bits
4 | model_type: seanet
5 | checkpoint: "https://dl.fbaipublicfiles.com/audioseal/6edcf62f/detector.pth"
6 | nbits: 16
7 | seanet:
8 |   activation: ELU
9 |   activation_params:
10 |     alpha: 1.0
11 |   causal: false
12 |   channels: 1
13 |   compress: 2
14 |   dilation_base: 2
15 |   dimension: 128
16 |   disable_norm_outer_blocks: 0
17 |   kernel_size: 7
18 |   last_kernel_size: 7
19 |   lstm: 2
20 |   n_filters: 32
21 |   n_residual_layers: 1
22 |   norm: weight_norm
23 |   norm_params: {}
24 |   pad_mode: constant
25 |   ratios:
26 |     - 8
27 |     - 5
28 |     - 4
29 |     - 2
30 |   residual_kernel_size: 3
31 |   true_skip: true
32 | detector:
33 |   output_dim: 32
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | All notable changes to AudioSeal are documented in this file.
4 | 
5 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
6 | 
7 | ## [0.1.2] - 2024-02-29
8 | - Add py.typed to make audioseal mypy-friendly
9 | - Add the option to resample the input audio's sample rate to the expected sample rate of the model (https://github.com/facebookresearch/audioseal/pull/18)
10 | - Move `attacks.py` out of the core audioseal code base
11 | - Remove the duplicate `SEANetEncoderKeepDimension` module shared between `audioseal.libs.audiocraft.modules.seanet` and `audioseal.models`
12 | 
13 | ## [0.1.1] - 2024-02-04
14 | 
15 | - Fix an [issue](https://github.com/facebookresearch/audioseal/issues/7) installing audioseal from PyPI due to a conflict with the audiocraft package
16 | - Fix typos in example notebooks
17 | - Update checkpoint to be Windows-compatible
18 | 
19 | ## [0.1.0] - 2024-02-01
20 | 
21 | - Initial release
--------------------------------------------------------------------------------
/src/audioseal/libs/audiocraft/modules/lstm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 | # Vendored from https://github.com/facebookresearch/audiocraft
8 | 
9 | from torch import nn
10 | 
11 | 
12 | class StreamableLSTM(nn.Module):
13 |     """LSTM without worrying about the hidden state, nor the layout of the data.
14 |     Expects input in convolutional layout.
15 |     """
16 | 
17 |     def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
18 |         super().__init__()
19 |         self.skip = skip
20 |         self.lstm = nn.LSTM(dimension, dimension, num_layers)
21 | 
22 |     def forward(self, x):
23 |         x = x.permute(2, 0, 1)
24 |         y, _ = self.lstm(x)
25 |         if self.skip:
26 |             y = y + x
27 |         y = y.permute(1, 2, 0)
28 |         return y
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 |   python: python3
3 | 
4 | repos:
5 |   - repo: https://github.com/pre-commit/pre-commit-hooks
6 |     rev: v4.1.0
7 |     hooks:
8 |       - id: trailing-whitespace
9 |       - id: check-ast
10 |       - id: check-merge-conflict
11 |       - id: check-added-large-files
12 |         args: ["--maxkb=2000"]
13 |       - id: end-of-file-fixer
14 | 
15 |   - repo: https://github.com/psf/black
16 |     rev: 24.1.1
17 |     hooks:
18 |       - id: black
19 |         language_version: python3.8
20 | 
21 |   - repo: https://github.com/pycqa/isort
22 |     rev: 5.12.0
23 |     hooks:
24 |       - id: isort
25 |         exclude: README.md
26 | 
27 |   - repo: https://github.com/pre-commit/mirrors-prettier
28 |     rev: v2.7.1
29 |     hooks:
30 |       - id: prettier
31 | 
32 |   - repo: https://github.com/pre-commit/mirrors-mypy
33 |     rev: v1.8.0
34 |     hooks:
35 |       - id: mypy
36 |         args: [--ignore-missing-imports]
--------------------------------------------------------------------------------
/src/audioseal/cards/audioseal_wm_16bits.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | name: audioseal_wm_16bits
8 | model_type: seanet
9 | checkpoint: "https://dl.fbaipublicfiles.com/audioseal/6edcf62f/generator.pth"
10 | nbits: 16
11 | seanet:
12 |   activation: ELU
13 |   activation_params:
14 |     alpha: 1.0
15 |   causal: false
16 |   channels: 1
17 |   compress: 2
18 |   dilation_base: 2
19 |   dimension: 128
20 |   disable_norm_outer_blocks: 0
21 |   kernel_size: 7
22 |   last_kernel_size: 7
23 |   lstm: 2
24 |   n_filters: 32
25 |   n_residual_layers: 1
26 |   norm: weight_norm
27 |   norm_params: {}
28 |   pad_mode: constant
29 |   ratios:
30 |     - 8
31 |     - 5
32 |     - 4
33 |     - 2
34 |   residual_kernel_size: 3
35 |   true_skip: true
36 | decoder:
37 |   final_activation: null
38 |   final_activation_params: null
39 |   trim_right_ratio: 1.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Meta Platforms, Inc. and affiliates.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/examples/notebook.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 | # Accompanying code for the notebook.
8 | # Requires matplotlib and Jupyter Notebook to be installed beforehand.
9 | 
10 | import IPython.display as ipd
11 | import matplotlib.pyplot as plt
12 | import torch
13 | 
14 | 
15 | def plot_waveform_and_specgram(waveform, sample_rate, title):
16 |     waveform = waveform.squeeze().detach().cpu().numpy()
17 | 
18 |     num_frames = waveform.shape[-1]
19 |     time_axis = torch.arange(0, num_frames) / sample_rate
20 | 
21 |     figure, (ax1, ax2) = plt.subplots(1, 2)
22 | 
23 |     ax1.plot(time_axis, waveform, linewidth=1)
24 |     ax1.grid(True)
25 |     ax2.specgram(waveform, Fs=sample_rate)
26 | 
27 |     figure.suptitle(f"{title} - Waveform and specgram")
28 |     plt.show()
29 | 
30 | 
31 | def play_audio(waveform, sample_rate):
32 |     if waveform.dim() > 2:
33 |         waveform = waveform.squeeze(0)
34 |     waveform = waveform.detach().cpu().numpy()
35 | 
36 |     num_channels, *_ = waveform.shape
37 |     if num_channels == 1:
38 |         ipd.display(ipd.Audio(waveform[0], rate=sample_rate))
39 |     elif num_channels == 2:
40 |         ipd.display(ipd.Audio((waveform[0], waveform[1]), rate=sample_rate))
41 |     else:
42 |         raise ValueError("Waveforms with more than 2 channels are not supported.")
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | 
8 | import urllib.request
9 | 
10 | import pytest
11 | import torch
12 | import torchaudio
13 | 
14 | from audioseal import AudioSeal
15 | 
16 | 
17 | @pytest.fixture
18 | def example_audio(tmp_path):
19 |     url = "https://keithito.com/LJ-Speech-Dataset/LJ037-0171.wav"
20 |     with open(tmp_path / "test.wav", "wb") as f:
21 |         resp = urllib.request.urlopen(url)
22 |         f.write(resp.read())
23 | 
24 |     wav, sr = torchaudio.load(tmp_path / "test.wav")
25 | 
26 |     # Add batch dimension
27 |     yield wav.unsqueeze(0), sr
28 | 
29 | 
30 | def test_detector(example_audio):
31 |     audio, sr = example_audio
32 |     model = AudioSeal.load_generator("audioseal_wm_16bits")
33 | 
34 |     secret_message = torch.randint(0, 2, (1, 16), dtype=torch.int32)
35 |     # The forward call already returns the watermarked audio (input + alpha * watermark)
36 |     watermarked_audio = model(audio, sample_rate=sr, message=secret_message, alpha=0.8)
37 | 
38 |     detector = AudioSeal.load_detector("audioseal_detector_16bits")
39 |     result, message = detector.detect_watermark(watermarked_audio, sample_rate=sr)  # noqa
40 | 
41 |     # Due to non-deterministic decoding, the decoded message may differ from the original
42 |     print(f"\nOriginal message: {secret_message}")
43 |     print(f"Decoded message: {message}")
44 |     print(
45 |         "Matching bits in decoded and original messages: "
46 |         f"{torch.count_nonzero(torch.eq(message, secret_message)).item()}\n"
47 |     )
48 |     assert result > 0.5
49 | 
50 |     # Try to detect the unwatermarked audio
51 |     result, _ = detector.detect_watermark(audio, sample_rate=sr)  # noqa
52 |     assert result < 0.5
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["flit_core >=3.2,<4", "packaging~=23.1", "setuptools~=67.8", "wheel~=0.40"]
3 | build-backend = "flit_core.buildapi"
4 | 
5 | [project]
6 | name = "audioseal"
7 | readme = "README.md"
8 | authors = [{name = "Facebook AI Research"}]
9 | requires-python = ">=3.8"
10 | dynamic = ["version", "description"]
11 | 
12 | 
13 | # zip_safe = false
14 | classifiers = [
15 |     "License :: OSI Approved :: MIT License",
16 |     "Topic :: Scientific/Engineering",
17 |     "Development Status :: 4 - Beta",
18 | ]
19 | 
20 | dependencies = [
21 |     "numpy",
22 |     "omegaconf",
23 |     "julius",
24 |     "torch>=1.13.0",
25 | ]
26 | 
27 | [project.urls]
28 | Source = "https://github.com/facebookresearch/audioseal"
29 | Tracker = "https://github.com/facebookresearch/audioseal/issues"
30 | 
31 | [project.optional-dependencies]
32 | dev = [
33 |     "func_argparse",
34 |     "torchaudio",
35 |     "soundfile",
36 |     "pytest",
37 |     "mypy",
38 |     "black",
39 |     "isort",
40 |     "flake8",
41 |     "pre-commit",
42 | ]
43 | 
44 | [tool.setuptools.package-data]
45 | "audioseal" = ["py.typed", "cards/*.yaml"]
46 | 
47 | [tool.flake8]
48 | extend_ignore = ["E", "Y"]  # Black
49 | per-file-ignores = [
50 |     "__init__.py:F401",
51 | ]
52 | 
53 | [tool.isort]
54 | profile = "black"
55 | 
56 | [tool.mypy]
57 | disable_error_code = "type-abstract,typeddict-unknown-key"
58 | disallow_untyped_calls = false
59 | disallow_untyped_defs = false
60 | disallow_untyped_decorators = false
61 | ignore_missing_imports = true
62 | python_version = "3.8"
63 | show_error_codes = true
64 | show_error_context = true
65 | strict = false
66 | warn_unused_configs = false
67 | warn_unused_ignores = false
68 | exclude = ["src/audiocraft", "examples"]
69 | 
70 | [tool.pytest.ini_options]
71 | minversion = "7.1"
72 | testpaths = ["tests"]
73 | filterwarnings = [
74 |     "ignore:Deprecated call to `pkg_resources",
75 |     "ignore:Please use `line_search_wolfe",
76 |     "ignore:Please use `spmatrix",
77 |     "ignore:TypedStorage is deprecated",
78 |     "ignore:distutils Version classes are deprecated",
79 |     "ignore:pkg_resources is deprecated",
80 |     "ignore:torch.nn.utils.weight_norm is deprecated in favor of",
81 | ]
82 | norecursedirs = [
83 |     "examples/*",
84 | ]
85 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to AudioSeal
2 | 
3 | We want to make contributing to AudioSeal as easy as possible. Please make sure
4 | to read this guideline carefully.
5 | 
6 | ## Setting up Development Environment
7 | 
8 | AudioSeal is a lightweight Python library that only relies on PyTorch, Numpy and OmegaConf (for
9 | model card loading). The current minimal PyTorch requirement is 1.13.0, and it is advisable to
10 | keep the constraints on PyTorch as lenient as possible. Please keep both the text file
11 | `requirements.txt` and the project file `pyproject.toml` up-to-date should you change the
12 | third-party library requirements.
13 | 
14 | ```sh
15 | git clone https://github.com/facebookresearch/audioseal.git
16 | ```
17 | 
18 | Then install the package in editable mode with development tools before contributing:
19 | 
20 | ```sh
21 | cd audioseal
22 | pip install -e ".[dev]"
23 | ```
24 | 
25 | Alternatively, you can also install the package and its development tools separately:
26 | 
27 | ```sh
28 | cd audioseal
29 | pip install -e .
30 | pip install -r requirements-dev.txt
31 | ```
32 | 
33 | It is advisable to keep your commits linted and syntax-correct. In AudioSeal we provide a few
34 | [pre-commit](https://pre-commit.com) hooks to support that. Simply install pre-commit:
35 | 
36 | ```sh
37 | pre-commit install
38 | ```
39 | 
40 | ## Pull Requests
41 | 
42 | We actively welcome your pull requests.
43 | 
44 | 1. Fork the repo and create your branch from `main`.
45 | 2. If you've added code that should be tested, add tests.
46 | 3. If you've changed APIs, update the documentation.
47 | 4. Ensure the test suite passes.
48 | 5. Make sure your code lints.
49 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
50 | 
51 | ## Contributor License Agreement ("CLA")
52 | 
53 | In order to accept your pull request, we need you to submit a CLA. You only need
54 | to do this once to work on any of Meta's open source projects.
55 | 
56 | Complete your CLA here: <https://code.facebook.com/cla>
57 | 
58 | ## Issues
59 | 
60 | We use GitHub issues to track public bugs. Please ensure your description is
61 | clear and has sufficient instructions to be able to reproduce the issue.
62 | 
63 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
64 | disclosure of security bugs. In those cases, please go through the process
65 | outlined on that page and do not file a public issue.
66 | 
67 | ## License
68 | 
69 | By contributing to AudioSeal, you agree that your contributions will be licensed
70 | under the LICENSE file in the root directory of this source tree.
71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Sphinx documentation 40 | docs/_build/ 41 | 42 | # PyBuilder 43 | .pybuilder/ 44 | target/ 45 | 46 | # Jupyter Notebook 47 | .ipynb_checkpoints 48 | 49 | # IPython 50 | profile_default/ 51 | ipython_config.py 52 | 53 | # pyenv 54 | # For a library or package, you might want to ignore these files since the code is 55 | # intended to run in multiple environments; otherwise, check them in: 56 | .python-version 57 | 58 | # pipenv 59 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 60 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 61 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 62 | # install all needed dependencies. 63 | Pipfile.lock 64 | 65 | # poetry 66 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 67 | # This is especially recommended for binary packages to ensure reproducibility, and is more 68 | # commonly ignored for libraries. 69 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 70 | poetry.lock 71 | 72 | # pdm 73 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 74 | #pdm.lock 75 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 76 | # in version control. 77 | # https://pdm.fming.dev/#use-with-ide 78 | .pdm.toml 79 | 80 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 81 | __pypackages__/ 82 | 83 | 84 | # mypy 85 | .mypy_cache/ 86 | .dmypy.json 87 | dmypy.json 88 | 89 | # Pyre type checker 90 | .pyre/ 91 | 92 | # pytype static type analyzer 93 | .pytype/ 94 | 95 | # Cython debug symbols 96 | cython_debug/ 97 | 98 | # PyCharm 99 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 100 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 101 | # and can be added to the global gitignore or merged into this file. For a more nuclear 102 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
103 | .idea/ 104 | 105 | # local training outputs 106 | outputs/* 107 | -------------------------------------------------------------------------------- /.github/workflows/lint_and_test.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: ["main"] 9 | pull_request: 10 | branches: ["main"] 11 | 12 | jobs: 13 | linter: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.8"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | cache: "pip" 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt 31 | pip install -r requirements-dev.txt 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: isort 39 | run: isort --check --diff . 40 | - name: mypy 41 | run: mypy --install-types --non-interactive ./ --cache-dir=.mypy_cache/ 42 | 43 | unit_test: 44 | runs-on: ubuntu-latest 45 | timeout-minutes: 20 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | python-version: ["3.8", "3.9", "3.10"] 50 | steps: 51 | - uses: actions/checkout@v3 52 | - name: Set up Python ${{ matrix.python-version }} 53 | uses: actions/setup-python@v3 54 | with: 55 | python-version: ${{ matrix.python-version }} 56 | cache: "pip" 57 | - name: Install dependencies 58 | run: | 59 | sudo apt-get install libsndfile1 60 | python -m pip install --upgrade pip 61 | # We also test that pyproject.toml and requirements*.txt are synced 62 | pip install -r requirements-dev.txt 63 | pip install -e . 64 | - name: pytest_unit 65 | run: pytest -s -v tests/test_models.py 66 | 67 | unit_test_old_torch: 68 | runs-on: ubuntu-latest 69 | timeout-minutes: 20 70 | strategy: 71 | fail-fast: false 72 | matrix: 73 | python-version: ["3.8"] 74 | steps: 75 | - uses: actions/checkout@v3 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v3 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | cache: "pip" 81 | - name: Install dependencies 82 | run: | 83 | sudo apt-get install libsndfile1 84 | python -m pip install --upgrade pip 85 | pip install torch==1.13.0 torchaudio==0.13.0 func_argparse soundfile pytest omegaconf numpy julius 86 | pip install --no-deps -e . 87 | - name: pytest_unit 88 | run: pytest -s -v tests/test_models.py 89 | -------------------------------------------------------------------------------- /src/audioseal/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from dataclasses import dataclass 8 | from typing import Any, Dict, List, Optional 9 | 10 | from torch import device, dtype 11 | from typing_extensions import TypeAlias 12 | 13 | from audioseal.libs import audiocraft 14 | from audioseal.models import AudioSealDetector, AudioSealWM, MsgProcessor 15 | 16 | Device: TypeAlias = device 17 | 18 | DataType: TypeAlias = dtype 19 | 20 | 21 | @dataclass 22 | class SEANetConfig: 23 | """ 24 | Map common hparams of SEANet encoder and decoder. 25 | """ 26 | 27 | channels: int 28 | dimension: int 29 | n_filters: int 30 | n_residual_layers: int 31 | ratios: List[int] 32 | activation: str 33 | activation_params: Dict[str, float] 34 | norm: str 35 | norm_params: Dict[str, Any] 36 | kernel_size: int 37 | last_kernel_size: int 38 | residual_kernel_size: int 39 | dilation_base: int 40 | causal: bool 41 | pad_mode: str 42 | true_skip: bool 43 | compress: int 44 | lstm: int 45 | disable_norm_outer_blocks: int 46 | 47 | 48 | @dataclass 49 | class DecoderConfig: 50 | final_activation: Optional[str] 51 | final_activation_params: Optional[dict] 52 | trim_right_ratio: float 53 | 54 | 55 | @dataclass 56 | class DetectorConfig: 57 | output_dim: int 58 | 59 | 60 | @dataclass 61 | class AudioSealWMConfig: 62 | nbits: int 63 | seanet: SEANetConfig 64 | decoder: DecoderConfig 65 | 66 | 67 | @dataclass 68 | class AudioSealDetectorConfig: 69 | nbits: int 70 | seanet: SEANetConfig 71 | detector: DetectorConfig 72 | 73 | 74 | def create_generator( 75 | config: AudioSealWMConfig, 76 | *, 77 | device: Optional[Device] = None, 78 | dtype: Optional[DataType] = None, 79 | ) -> AudioSealWM: 80 | """Create a generator from hparams""" 81 | 82 | # Currently the encoder hparams are the same as 83 | # SEANet, but this can be changed in the future. 84 | encoder = audiocraft.modules.SEANetEncoder(**config.seanet) # type: ignore[arg-type] 85 | encoder = encoder.to(device=device, dtype=dtype) 86 | 87 | decoder_config = {**config.seanet, **config.decoder} # type: ignore 88 | decoder = audiocraft.modules.SEANetDecoder(**decoder_config) # type: ignore[arg-type] 89 | decoder = decoder.to(device=device, dtype=dtype) 90 | 91 | msgprocessor = MsgProcessor(nbits=config.nbits, hidden_size=config.seanet.dimension) 92 | msgprocessor = msgprocessor.to(device=device, dtype=dtype) 93 | 94 | return AudioSealWM(encoder=encoder, decoder=decoder, msg_processor=msgprocessor) 95 | 96 | 97 | def create_detector( 98 | config: AudioSealDetectorConfig, 99 | *, 100 | device: Optional[Device] = None, 101 | dtype: Optional[DataType] = None, 102 | ) -> AudioSealDetector: 103 | detector_config = {**config.seanet, **config.detector} # type: ignore 104 | detector = AudioSealDetector(nbits=config.nbits, **detector_config) 105 | detector = detector.to(device=device, dtype=dtype) 106 | return detector 107 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /src/audioseal/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | from dataclasses import fields 10 | from hashlib import sha1 11 | from pathlib import Path 12 | from typing import ( # type: ignore[attr-defined] 13 | Any, 14 | Dict, 15 | List, 16 | Optional, 17 | Tuple, 18 | Type, 19 | TypeVar, 20 | Union, 21 | cast, 22 | ) 23 | from urllib.parse import urlparse # noqa: F401 24 | 25 | import torch 26 | from omegaconf import DictConfig, OmegaConf 27 | 28 | from audioseal.builder import ( 29 | AudioSealDetectorConfig, 30 | AudioSealWMConfig, 31 | create_detector, 32 | create_generator, 33 | ) 34 | from audioseal.models import AudioSealDetector, AudioSealWM 35 | 36 | AudioSealT = TypeVar("AudioSealT", AudioSealWMConfig, AudioSealDetectorConfig) 37 | 38 | 39 | class ModelLoadError(RuntimeError): 40 | """Raised when the model loading fails""" 41 | 42 | 43 | def _get_path_from_env(var_name: str) -> Optional[Path]: 44 | pathname = os.getenv(var_name) 45 | if not pathname: 46 | return None 47 | 48 | try: 49 | return Path(pathname) 50 | except ValueError as ex: 51 | raise RuntimeError(f"Expect valid pathname, get '{pathname}'.") from ex 52 | 53 | 54 | def _get_cache_dir(env_names: List[str]): 55 | """Re-use cache dir from a list of existing caches""" 56 | for env in env_names: 57 | cache_dir = _get_path_from_env(env) 58 | if cache_dir: 59 | break 60 | else: 61 | cache_dir = Path("~/.cache").expanduser().resolve() 62 | 63 | # Create a sub-dir to not mess up with existing caches 64 | cache_dir = cache_dir / "audioseal" 65 | cache_dir.mkdir(exist_ok=True, parents=True) 66 | 67 | return cache_dir 68 | 69 | 70 | def load_model_checkpoint( 71 | model_path: Union[Path, str], 72 | device: Union[str, torch.device] = "cpu", 73 | ): 74 | if Path(model_path).is_file(): 75 | return torch.load(model_path, map_location=device) 76 | 77 | cache_dir = _get_cache_dir( 78 | ["AUDIOSEAL_CACHE_DIR", "AUDIOCRAFT_CACHE_DIR", "XDG_CACHE_HOME"] 79 | ) 80 | parts = urlparse(str(model_path)) 81 | if parts.scheme == "https": 82 | 83 | # TODO: Add HF Hub 84 | hash_ = sha1(parts.path.encode()).hexdigest()[:24] 85 | return torch.hub.load_state_dict_from_url( 86 | str(model_path), model_dir=cache_dir, map_location=device, file_name=hash_ 87 | ) 88 | else: 89 | raise ModelLoadError(f"Path or uri {model_path} is unknown or does not exist") 90 | 91 | 92 | def load_local_model_config(model_card: str) -> Optional[DictConfig]: 93 | config_file = Path(__file__).parent / "cards" / (model_card + ".yaml") 94 | if Path(config_file).is_file(): 95 | return cast(DictConfig, OmegaConf.load(config_file.resolve())) 96 | else: 97 | return None 98 | 99 | 100 | class AudioSeal: 101 | 102 | @staticmethod 103 | def _parse_model( 104 | model_card_or_path: str, 105 | model_type: Type[AudioSealT], 106 | nbits: 
Optional[int] = None,
107 |     ) -> Tuple[Dict[str, Any], AudioSealT]:
108 |         """
109 |         Parse the information from the model card or checkpoint path, using
110 |         the schema `model_type` that defines the model type
111 |         """
112 |         # Get the raw checkpoint and config from the local model cards
113 |         config = load_local_model_config(model_card_or_path)
114 | 
115 |         if config:
116 |             assert "checkpoint" in config, f"Checkpoint missing in {model_card_or_path}"
117 |             config_dict = OmegaConf.to_container(config)
118 |             assert isinstance(
119 |                 config_dict, dict
120 |             ), f"Cannot parse config from {model_card_or_path}"
121 |             checkpoint = config_dict.pop("checkpoint")
122 |             checkpoint = load_model_checkpoint(checkpoint)
123 | 
124 |         # Get the raw checkpoint and config from the checkpoint path
125 |         else:
126 |             config_dict = {}
127 |             checkpoint = load_model_checkpoint(model_card_or_path)
128 | 
129 |         # If the checkpoint has a config in it, use it, but give precedence
130 |         # to the info in the model card
131 |         assert isinstance(
132 |             checkpoint, dict
133 |         ), f"Expect loaded checkpoint to be a dictionary, got {type(checkpoint)}"
134 |         assert isinstance(
135 |             config_dict, dict
136 |         ), f"Expect loaded config to be a dictionary, got {type(config_dict)}"
137 |         if "xp.cfg" in checkpoint:
138 |             config = {**checkpoint["xp.cfg"], **config_dict}  # type: ignore
139 |         assert config is not None
140 |         assert (
141 |             "seanet" in config
142 |         ), f"missing seanet backbone config in {model_card_or_path}"
143 | 
144 |         # Patch 1: Resolve the variables in the checkpoint
145 |         config = OmegaConf.create(config)
146 |         OmegaConf.resolve(config)
147 |         config = OmegaConf.to_container(config)  # type: ignore
148 | 
149 |         # Patch 2: Put decoder, encoder and detector outside seanet
150 |         seanet_config = config["seanet"]
151 |         for key_to_patch in ["encoder", "decoder", "detector"]:
152 |             if key_to_patch in seanet_config:
153 |                 config_to_patch = config.get(key_to_patch) or {}
154 |                 config[key_to_patch] = {
155 |                     **config_to_patch,
156 |                     **seanet_config.pop(key_to_patch),
157 |                 }
158 | 
159 |         config["seanet"] = seanet_config
160 | 
161 |         # Patch 3: Put nbits into config if specified
162 |         if nbits and "nbits" not in config:
163 |             config["nbits"] = nbits
164 | 
165 |         if "model" in checkpoint:
166 |             checkpoint = checkpoint["model"]
167 | 
168 |         # remove attributes not related to the model_type
169 |         result_config = {}
170 |         assert config, f"Empty config in {model_card_or_path}"
171 |         for field in fields(model_type):
172 |             if field.name in config:
173 |                 result_config[field.name] = config[field.name]
174 | 
175 |         schema = OmegaConf.structured(model_type)
176 |         schema.merge_with(result_config)
177 |         return checkpoint, schema
178 | 
179 |     @staticmethod
180 |     def load_generator(
181 |         model_card_or_path: str,
182 |         nbits: Optional[int] = None,
183 |     ) -> AudioSealWM:
184 |         """Load the AudioSeal generator from a model card or checkpoint path"""
185 |         checkpoint, config = AudioSeal._parse_model(
186 |             model_card_or_path, AudioSealWMConfig, nbits=nbits,
187 |         )
188 | 
189 |         model = create_generator(config)
190 |         model.load_state_dict(checkpoint)
191 |         return model
192 | 
193 |     @staticmethod
194 |     def load_detector(
195 |         model_card_or_path: str,
196 |         nbits: Optional[int] = None,
197 |     ) -> AudioSealDetector:
198 |         """Load the AudioSeal detector from a model card or checkpoint path"""
199 |         checkpoint, config = AudioSeal._parse_model(
200 |             model_card_or_path, AudioSealDetectorConfig, nbits=nbits,
201 |         )
202 |         model = create_detector(config)
203 |         model.load_state_dict(checkpoint)
204 |         return model
205 | 
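As a quick illustration of the two loading paths handled above, here is a minimal usage sketch (the model card name comes from `src/audioseal/cards`; the local checkpoint path is hypothetical):

```python
from audioseal import AudioSeal

# 1) Load via a bundled model card: the YAML in src/audioseal/cards is parsed and
#    the checkpoint it points to is downloaded and cached (under AUDIOSEAL_CACHE_DIR,
#    AUDIOCRAFT_CACHE_DIR, or XDG_CACHE_HOME, falling back to ~/.cache/audioseal).
generator = AudioSeal.load_generator("audioseal_wm_16bits")

# 2) Load from a local checkpoint file (hypothetical path); nbits supplies the
#    message size if the checkpoint config does not already carry it.
# generator = AudioSeal.load_generator("/path/to/generator.pth", nbits=16)
```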
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # :loud_sound: AudioSeal: Proactive Localized Watermarking
2 | 
3 | Python
4 | Code style: black
5 | 
6 | Inference code for AudioSeal, a method for speech localized watermarking, with state-of-the-art robustness and detector speed (training code coming soon).
7 | More details can be found in the [paper](https://arxiv.org/abs/2401.17264).
8 | 
9 | [[`arXiv`](https://arxiv.org/abs/2401.17264)]
10 | [[`Colab notebook`](https://colab.research.google.com/github/facebookresearch/audioseal/blob/master/examples/colab.ipynb)][[🤗`Hugging Face`](https://huggingface.co/facebook/audioseal)]
11 | 
12 | ![fig](https://github.com/facebookresearch/audioseal/assets/1453243/5d8cd96f-47b5-4c34-a3fa-7af386ed59f2)
13 | 
14 | # Updates:
15 | 
16 | - 2024-04-02: We have updated our license to the full MIT license (including the license for the model weights)! You can now use AudioSeal in commercial applications too!
17 | - 2024-02-29: AudioSeal 0.1.2 is out, with more bug fixes for resampled audios and updated notebooks
18 | 
19 | # Abstract
20 | 
21 | We introduce AudioSeal, a method for speech localized watermarking, with state-of-the-art robustness and detector speed. It jointly trains a generator that embeds a watermark in the audio, and a detector that detects the watermarked fragments in longer audios, even in the presence of editing.
22 | AudioSeal achieves state-of-the-art detection performance of both natural and synthetic speech at the sample level (1/16k second resolution), generates limited alteration of signal quality, and is robust to many types of audio editing.
23 | AudioSeal is designed with a fast, single-pass detector that significantly surpasses existing models in speed, achieving detection up to two orders of magnitude faster and making it ideal for large-scale and real-time applications.
24 | 
25 | # :mate: Installation
26 | 
27 | AudioSeal requires Python >= 3.8, PyTorch >= 1.13.0, [omegaconf](https://omegaconf.readthedocs.io/), [julius](https://pypi.org/project/julius/), and numpy. To install from PyPI:
28 | 
29 | ```
30 | pip install audioseal
31 | ```
32 | 
33 | To install from source: clone this repo and install in editable mode:
34 | 
35 | ```
36 | git clone https://github.com/facebookresearch/audioseal
37 | cd audioseal
38 | pip install -e .
39 | ```
40 | 
41 | # :gear: Models
42 | 
43 | You can find all the model checkpoints on the [Hugging Face Hub](https://huggingface.co/facebook/audioseal). We provide the checkpoints for the following models:
44 | 
45 | - [AudioSeal Generator](src/audioseal/cards/audioseal_wm_16bits.yaml).
46 |   It takes as input an audio signal (as a waveform), and outputs a watermark of the same size as the input, that can be added to the input to watermark it.
47 |   Optionally, it can also take as input a 16-bit secret message that will be encoded in the watermark.
48 | - [AudioSeal Detector](src/audioseal/cards/audioseal_detector_16bits.yaml).
49 |   It takes as input an audio signal (as a waveform), and outputs a probability that the input contains a watermark at each sample of the audio (every 1/16k s).
50 |   Optionally, it may also output the secret message encoded in the watermark.
51 | 
52 | Note that the message is optional and has no influence on the detection output. It may be used to identify a model version for instance (up to $2^{16}=65536$ possible choices).
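For example, a version identifier can be packed into the 16-bit message before watermarking (an illustrative sketch, not a library helper; the version number and the little-endian bit layout are choices made here):

```python
import torch

version = 3  # hypothetical model-version identifier; must fit in 16 bits
bits = [(version >> i) & 1 for i in range(16)]  # little-endian bit layout
msg = torch.tensor(bits, dtype=torch.int32).unsqueeze(0)  # shape: (batch=1, nbits=16)
# msg can then be passed to the generator, e.g. model.get_watermark(wav, sr, message=msg)
```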
53 | 
54 | **Note**: We are working to release the training code for anyone who wants to build their own watermarker. Stay tuned!
55 | 
56 | # :abacus: Usage
57 | 
58 | AudioSeal provides a simple API to watermark audio samples and detect the watermarks in them. Example usage:
59 | 
60 | ```python
61 | 
62 | from audioseal import AudioSeal
63 | 
64 | # model name corresponds to the YAML card file name found in audioseal/cards
65 | model = AudioSeal.load_generator("audioseal_wm_16bits")
66 | 
67 | # Another way is to load directly from a checkpoint path
68 | # model = AudioSeal.load_generator(checkpoint_path, nbits=16)
69 | 
70 | # a torch tensor of shape (batch, channels, samples) and a sample rate
71 | # It is important to process the audio at the same sample rate as the model
72 | # expects. In our case, we support 16 kHz audio
73 | wav, sr = ..., 16000
74 | 
75 | watermark = model.get_watermark(wav, sr)
76 | 
77 | # Optional: you can add a 16-bit message to embed in the watermark
78 | # msg = torch.randint(0, 2, (wav.shape[0], model.msg_processor.nbits), device=wav.device)
79 | # watermark = model.get_watermark(wav, sr, message=msg)
80 | 
81 | watermarked_audio = wav + watermark
82 | 
83 | detector = AudioSeal.load_detector("audioseal_detector_16bits")
84 | 
85 | # To detect the message at the high level:
86 | result, message = detector.detect_watermark(watermarked_audio, sr)
87 | 
88 | print(result)  # result is a float indicating the probability of the audio being watermarked
89 | print(message)  # message is a binary vector of 16 bits
90 | 
91 | 
92 | # To detect the message at the low level:
93 | result, message = detector(watermarked_audio, sr)
94 | 
95 | # result is a tensor of size batch x 2 x frames, indicating the probability (positive and negative) of watermarking for each frame
96 | # A watermarked audio should have result[:, 1, :] > 0.5
97 | print(result[:, 1, :])
98 | 
99 | # message is a tensor of size batch x 16, indicating the probability of each bit being 1.
100 | # message will be a random tensor if the detector detects no watermark in the audio
101 | print(message)
102 | ```
103 | 
104 | # Want to contribute?
105 | 
106 | We welcome Pull Requests with improvements or suggestions.
107 | If you want to flag an issue or propose an improvement, but don't know how to realize it, create a GitHub Issue.
108 | 
109 | # Troubleshooting
110 | 
111 | - If you encounter the error `ValueError: not enough values to unpack (expected 3, got 2)`, this is because we expect a batch of audio tensors as inputs. Add one
112 |   dummy batch dimension to your input (e.g. `wav.unsqueeze(0)`, see the [example notebook for getting started](examples/colab.ipynb)).
113 | 
114 | - On Windows machines, if you encounter the error `KeyError raised while resolving interpolation: "Environment variable 'USER' not found"`: this is due to an old checkpoint
115 |   uploaded to the model hub, which is not compatible with Windows. Try to invalidate the cache by removing the files in `C:\Users\<username>\.cache\audioseal`
116 |   and re-run again.
117 | 
118 | - If you use torchaudio to handle your audios and encounter the error `Couldn't find appropriate backend to handle uri ...`, this is because newer versions of
119 |   torchaudio do not handle the default backend well. Either downgrade your torchaudio to `2.0.1` or earlier, or install `soundfile` as your audio backend.
120 | 
121 | # License
122 | 
123 | - The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
124 | 
125 | # Maintainers:
126 | - [Tuan Tran](https://github.com/antoine-tran)
127 | - [Hady Elsahar](https://github.com/hadyelsahar)
128 | - [Pierre Fernandez](https://github.com/pierrefdz)
129 | - [Robin San Roman](https://github.com/robinsrm)
130 | 
131 | # Citation
132 | 
133 | If you find this repository useful, please consider giving a star :star: and please cite as:
134 | 
135 | ```
136 | @article{sanroman2024proactive,
137 |   title={Proactive Detection of Voice Cloning with Localized Watermarking},
138 |   author={San Roman, Robin and Fernandez, Pierre and Elsahar, Hady and Défossez, Alexandre and Furon, Teddy and Tran, Tuan},
139 |   journal={arXiv preprint},
140 |   year={2024}
141 | }
142 | ```
--------------------------------------------------------------------------------
/src/audioseal/models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import logging
8 | from typing import Optional, Tuple
9 | 
10 | import julius
11 | import torch
12 | 
13 | from audioseal.libs.audiocraft.modules.seanet import SEANetEncoderKeepDimension
14 | 
15 | logger = logging.getLogger("Audioseal")
16 | 
17 | COMPATIBLE_WARNING = """
18 | AudioSeal is designed to work at a sample rate of 16 kHz.
19 | Implicit sampling rate usage is deprecated and will be removed in a future version.
20 | To remove this warning, please add this argument to the function call:
21 | sample_rate = your_sample_rate
22 | """
23 | 
24 | 
25 | class MsgProcessor(torch.nn.Module):
26 |     """
27 |     Apply the secret message to the encoder output.
28 |     Args:
29 |         nbits: Number of bits used to generate the message. Must be non-zero
30 |         hidden_size: Dimension of the encoder output
31 |     """
32 | 
33 |     def __init__(self, nbits: int, hidden_size: int):
34 |         super().__init__()
35 |         assert nbits > 0, "MsgProcessor should not be built in 0bit watermarking"
36 |         self.nbits = nbits
37 |         self.hidden_size = hidden_size
38 |         self.msg_processor = torch.nn.Embedding(2 * nbits, hidden_size)
39 | 
40 |     def forward(self, hidden: torch.Tensor, msg: torch.Tensor) -> torch.Tensor:
41 |         """
42 |         Gather k embeddings from a 2k x h table using the message bits, then sum over the k dimension
43 |         Args:
44 |             hidden: The encoder output, size: batch x hidden x frames
45 |             msg: The secret message, size: batch x k
46 |         """
47 |         # create indices to take from embedding layer
48 |         indices = 2 * torch.arange(msg.shape[-1]).to(msg.device)  # k: 0 2 4 ... 2k
49 |         indices = indices.repeat(msg.shape[0], 1)  # b x k
50 |         indices = (indices + msg).long()
51 |         msg_aux = self.msg_processor(indices)  # b x k -> b x k x h
52 |         msg_aux = msg_aux.sum(dim=-2)  # b x k x h -> b x h
53 |         msg_aux = msg_aux.unsqueeze(-1).repeat(
54 |             1, 1, hidden.shape[2]
55 |         )  # b x h -> b x h x t/f
56 |         hidden = hidden + msg_aux  # -> b x h x t/f
57 |         return hidden
58 | 
59 | 
60 | class AudioSealWM(torch.nn.Module):
61 |     """
62 |     Generate a watermark for a given audio signal
63 |     """
64 | 
65 |     def __init__(
66 |         self,
67 |         encoder: torch.nn.Module,
68 |         decoder: torch.nn.Module,
69 |         msg_processor: Optional[torch.nn.Module] = None,
70 |     ):
71 |         super().__init__()
72 |         self.encoder = encoder
73 |         self.decoder = decoder
74 |         # The builder should take care of validating the dimensions between components
75 |         self.msg_processor = msg_processor
76 |         self._message: Optional[torch.Tensor] = None
77 | 
78 |     @property
79 |     def message(self) -> Optional[torch.Tensor]:
80 |         return self._message
81 | 
82 |     @message.setter
83 |     def message(self, message: torch.Tensor) -> None:
84 |         self._message = message
85 | 
86 |     def get_watermark(
87 |         self,
88 |         x: torch.Tensor,
89 |         sample_rate: Optional[int] = None,
90 |         message: Optional[torch.Tensor] = None,
91 |     ) -> torch.Tensor:
92 |         """
93 |         Get the watermark from an audio tensor and a message.
94 |         If the input message is None, a random message of
95 |         n bits {0,1} will be generated.
96 |         Args:
97 |             x: Audio signal, size: batch x frames
98 |             sample_rate: The sample rate of the input audio (default 16khz as
99 |                 currently supported by the main AudioSeal model)
100 |             message: An optional binary message, size: batch x k
101 |         """
102 |         length = x.size(-1)
103 |         if sample_rate is None:
104 |             logger.warning(COMPATIBLE_WARNING)
105 |             sample_rate = 16_000
106 |         assert sample_rate
107 |         if sample_rate != 16000:
108 |             x = julius.resample_frac(x, old_sr=sample_rate, new_sr=16000)
109 |         hidden = self.encoder(x)
110 | 
111 |         if self.msg_processor is not None:
112 |             if message is None:
113 |                 if self.message is None:
114 |                     self.message = torch.randint(
115 |                         0, 2, (x.shape[0], self.msg_processor.nbits), device=x.device
116 |                     )
117 |                 message = self.message
118 | 
119 |             hidden = self.msg_processor(hidden, message)
120 | 
121 |         watermark = self.decoder(hidden)
122 | 
123 |         if sample_rate != 16000:
124 |             watermark = julius.resample_frac(watermark, old_sr=16000, new_sr=sample_rate)
125 | 
126 |         return watermark[
127 |             ..., :length
128 |         ]  # trim output, cf. encodec codebase
129 | 
130 |     def forward(
131 |         self,
132 |         x: torch.Tensor,
133 |         sample_rate: Optional[int] = None,
134 |         message: Optional[torch.Tensor] = None,
135 |         alpha: float = 1.0,
136 |     ) -> torch.Tensor:
137 |         """Apply the watermark to the audio signal x, scaled by a tune-down ratio alpha (default 1.0)"""
138 |         if sample_rate is None:
139 |             logger.warning(COMPATIBLE_WARNING)
140 |             sample_rate = 16_000
141 |         wm = self.get_watermark(x, sample_rate=sample_rate, message=message)
142 |         return x + alpha * wm
143 | 
144 | 
145 | class AudioSealDetector(torch.nn.Module):
146 |     """
147 |     Detect the watermark in an audio signal
148 |     Args:
149 |         *args, **kwargs: Arguments passed to the SEANetEncoderKeepDimension backbone
150 |         nbits (int): The number of bits in the secret message. The result will have size
151 |             of 2 + nbits, where the first two items indicate the possibilities of the
152 |             audio being watermarked (positive / negative scores), the rest is used to decode
153 |             the secret message. In 0bit watermarking (no secret message), the detector just
154 |             returns 2 values.
155 |     """
156 | 
157 |     def __init__(self, *args, nbits: int = 0, **kwargs):
158 |         super().__init__()
159 |         encoder = SEANetEncoderKeepDimension(*args, **kwargs)
160 |         last_layer = torch.nn.Conv1d(encoder.output_dim, 2 + nbits, 1)
161 |         self.detector = torch.nn.Sequential(encoder, last_layer)
162 |         self.nbits = nbits
163 | 
164 |     def detect_watermark(
165 |         self,
166 |         x: torch.Tensor,
167 |         sample_rate: Optional[int] = None,
168 |         message_threshold: float = 0.5
169 |     ) -> Tuple[float, torch.Tensor]:
170 |         """
171 |         A convenience function that returns the probability of an audio being watermarked,
172 |         together with its message in n-bit (binary) format. If the audio is not watermarked,
173 |         the message will be random.
174 |         Args:
175 |             x: Audio signal, size: batch x frames
176 |             sample_rate: The sample rate of the input audio
177 |             message_threshold: threshold used to convert the watermark output (probability
178 |                 of each bit being 0 or 1) into the binary n-bit message.
179 |         """
180 |         if sample_rate is None:
181 |             logger.warning(COMPATIBLE_WARNING)
182 |             sample_rate = 16_000
183 |         result, message = self.forward(x, sample_rate=sample_rate)  # b x 2+nbits
184 |         detected = torch.count_nonzero(torch.gt(result[:, 1, :], 0.5)) / result.shape[-1]
185 |         detect_prob = detected.cpu().item()  # type: ignore
186 |         message = torch.gt(message, message_threshold).int()
187 |         return detect_prob, message
188 | 
189 |     def decode_message(self, result: torch.Tensor) -> torch.Tensor:
190 |         """
191 |         Decode the message from the watermark result (batch x nbits x frames)
192 |         Args:
193 |             result: watermark result (batch x nbits x frames)
194 |         Returns:
195 |             The message of size batch x nbits, indicating the probability of 1 for each bit
196 |         """
197 |         assert (
198 |             (result.dim() > 2 and result.shape[1] == self.nbits) or
199 |             (result.dim() == 2 and result.shape[0] == self.nbits)
200 |         ), f"Expect message of size [, {self.nbits}, frames] (got {result.size()})"
201 |         decoded_message = result.mean(dim=-1)
202 |         return torch.sigmoid(decoded_message)
203 | 
204 |     def forward(
205 |         self,
206 |         x: torch.Tensor,
207 |         sample_rate: Optional[int] = None,
208 |     ) -> Tuple[torch.Tensor, torch.Tensor]:
209 |         """
210 |         Detect the watermarks from the audio signal
211 |         Args:
212 |             x: Audio signal, size batch x frames
213 |             sample_rate: The sample rate of the input audio
214 |         """
215 |         if sample_rate is None:
216 |             logger.warning(COMPATIBLE_WARNING)
217 |             sample_rate = 16_000
218 |         assert sample_rate
219 |         if sample_rate != 16000:
220 |             x = julius.resample_frac(x, old_sr=sample_rate, new_sr=16000)
221 |         result = self.detector(x)  # b x 2+nbits
222 |         # hardcode softmax on the 2 first units used for detection
223 |         result[:, :2, :] = torch.softmax(result[:, :2, :], dim=1)
224 |         message = self.decode_message(result[:, 2:, :])
225 |         return result[:, :2, :], message
--------------------------------------------------------------------------------
/examples/attacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import typing as tp
8 | 
9 | import julius
10 | import torch
11 | 
12 | 
13 | def generate_pink_noise(length: int) -> torch.Tensor:
14 |     """
15 |     Generate pink noise using the Voss-McCartney algorithm with PyTorch.
16 |     """
17 |     num_rows = 16
18 |     array = torch.randn(num_rows, length // num_rows + 1)
19 |     reshaped_array = torch.cumsum(array, dim=1)
20 |     reshaped_array = reshaped_array.reshape(-1)
21 |     reshaped_array = reshaped_array[:length]
22 |     # Normalize
23 |     pink_noise = reshaped_array / torch.max(torch.abs(reshaped_array))
24 |     return pink_noise
25 | 
26 | 
27 | def audio_effect_return(
28 |     tensor: torch.Tensor, mask: tp.Optional[torch.Tensor]
29 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
30 |     """Return the mask if it was in the input, otherwise only the output tensor"""
31 |     if mask is None:
32 |         return tensor
33 |     else:
34 |         return tensor, mask
35 | 
36 | 
37 | class AudioEffects:
38 |     @staticmethod
39 |     def speed(
40 |         tensor: torch.Tensor,
41 |         speed_range: tuple = (0.5, 1.5),
42 |         sample_rate: int = 16000,
43 |         mask: tp.Optional[torch.Tensor] = None,
44 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
45 |         """
46 |         Function to change the speed of a batch of audio data.
47 |         The output will have a different length!
48 | 
49 |         Parameters:
50 |             tensor (torch.Tensor): The batch of audio data as a torch tensor.
51 |             speed_range (tuple): The range from which the speed factor is sampled.
52 | 
53 |         Returns:
54 |             torch.Tensor: The batch of audio data with the speed changed.
55 |         """
56 |         speed = torch.FloatTensor(1).uniform_(*speed_range)
57 |         new_sr = int(sample_rate * 1 / speed)
58 |         resampled_tensor = julius.resample_frac(tensor, sample_rate, new_sr)
59 |         if mask is None:
60 |             return resampled_tensor
61 |         else:
62 |             return resampled_tensor, torch.nn.functional.interpolate(
63 |                 mask, size=resampled_tensor.size(-1), mode="nearest-exact"
64 |             )
65 | 
66 |     @staticmethod
67 |     def updownresample(
68 |         tensor: torch.Tensor,
69 |         sample_rate: int = 16000,
70 |         intermediate_freq: int = 32000,
71 |         mask: tp.Optional[torch.Tensor] = None,
72 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
73 | 
74 |         orig_shape = tensor.shape
75 |         # upsample
76 |         tensor = julius.resample_frac(tensor, sample_rate, intermediate_freq)
77 |         # downsample
78 |         tensor = julius.resample_frac(tensor, intermediate_freq, sample_rate)
79 | 
80 |         assert tensor.shape == orig_shape
81 |         return audio_effect_return(tensor=tensor, mask=mask)
82 | 
83 |     @staticmethod
84 |     def echo(
85 |         tensor: torch.Tensor,
86 |         volume_range: tuple = (0.1, 0.5),
87 |         duration_range: tuple = (0.1, 0.5),
88 |         sample_rate: int = 16000,
89 |         mask: tp.Optional[torch.Tensor] = None,
90 |     ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
91 |         """
92 |         Attenuate the audio volume by a random factor, delay it by a random duration,
93 |         and then overlay it with the original.
94 | 
95 |         :param tensor: 3D Tensor representing the audio signal [bsz, channels, frames]
96 |         :param volume_range, duration_range: ranges for the echo volume and delay (in seconds)
97 |         :param sample_rate: Sample rate of the audio signal.
98 |         :return: Audio signal with reverb.
99 | """ 100 | 101 | # Create a simple impulse response 102 | # Duration of the impulse response in seconds 103 | duration = torch.FloatTensor(1).uniform_(*duration_range) 104 | volume = torch.FloatTensor(1).uniform_(*volume_range) 105 | 106 | n_samples = int(sample_rate * duration) 107 | impulse_response = torch.zeros(n_samples).type(tensor.type()).to(tensor.device) 108 | 109 | # Define a few reflections with decreasing amplitude 110 | impulse_response[0] = 1.0 # Direct sound 111 | 112 | impulse_response[ 113 | int(sample_rate * duration) - 1 114 | ] = volume # First reflection after 100ms 115 | 116 | # Add batch and channel dimensions to the impulse response 117 | impulse_response = impulse_response.unsqueeze(0).unsqueeze(0) 118 | 119 | # Convolve the audio signal with the impulse response 120 | reverbed_signal = julius.fft_conv1d(tensor, impulse_response) 121 | 122 | # Normalize to the original amplitude range for stability 123 | reverbed_signal = ( 124 | reverbed_signal 125 | / torch.max(torch.abs(reverbed_signal)) 126 | * torch.max(torch.abs(tensor)) 127 | ) 128 | 129 | # Ensure tensor size is not changed 130 | tmp = torch.zeros_like(tensor) 131 | tmp[..., : reverbed_signal.shape[-1]] = reverbed_signal 132 | reverbed_signal = tmp 133 | 134 | return audio_effect_return(tensor=reverbed_signal, mask=mask) 135 | 136 | @staticmethod 137 | def random_noise( 138 | waveform: torch.Tensor, 139 | noise_std: float = 0.001, 140 | mask: tp.Optional[torch.Tensor] = None, 141 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 142 | """Add Gaussian noise to the waveform.""" 143 | noise = torch.randn_like(waveform) * noise_std 144 | noisy_waveform = waveform + noise 145 | return audio_effect_return(tensor=noisy_waveform, mask=mask) 146 | 147 | @staticmethod 148 | def pink_noise( 149 | waveform: torch.Tensor, 150 | noise_std: float = 0.01, 151 | mask: tp.Optional[torch.Tensor] = None, 152 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 153 | """Add pink background noise to the waveform.""" 154 | noise = generate_pink_noise(waveform.shape[-1]) * noise_std 155 | noise = noise.to(waveform.device) 156 | # Assuming waveform is of shape (bsz, channels, length) 157 | noisy_waveform = waveform + noise.unsqueeze(0).unsqueeze(0).to(waveform.device) 158 | return audio_effect_return(tensor=noisy_waveform, mask=mask) 159 | 160 | @staticmethod 161 | def lowpass_filter( 162 | waveform: torch.Tensor, 163 | cutoff_freq: float = 5000, 164 | sample_rate: int = 16000, 165 | mask: tp.Optional[torch.Tensor] = None, 166 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 167 | 168 | return audio_effect_return( 169 | tensor=julius.lowpass_filter(waveform, cutoff=cutoff_freq / sample_rate), 170 | mask=mask, 171 | ) 172 | 173 | @staticmethod 174 | def highpass_filter( 175 | waveform: torch.Tensor, 176 | cutoff_freq: float = 500, 177 | sample_rate: int = 16000, 178 | mask: tp.Optional[torch.Tensor] = None, 179 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 180 | 181 | return audio_effect_return( 182 | tensor=julius.highpass_filter(waveform, cutoff=cutoff_freq / sample_rate), 183 | mask=mask, 184 | ) 185 | 186 | @staticmethod 187 | def bandpass_filter( 188 | waveform: torch.Tensor, 189 | cutoff_freq_low: float = 300, 190 | cutoff_freq_high: float = 8000, 191 | sample_rate: int = 16000, 192 | mask: tp.Optional[torch.Tensor] = None, 193 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 194 | """ 195 | Apply a bandpass filter to the 
waveform by cascading 196 | a high-pass filter followed by a low-pass filter. 197 | 198 | Parameters: 199 | - waveform (torch.Tensor): Input audio waveform. 200 | - low_cutoff (float): Lower cutoff frequency. 201 | - high_cutoff (float): Higher cutoff frequency. 202 | - sample_rate (int): The sample rate of the waveform. 203 | 204 | Returns: 205 | - torch.Tensor: Filtered audio waveform. 206 | """ 207 | 208 | return audio_effect_return( 209 | tensor=julius.bandpass_filter( 210 | waveform, 211 | cutoff_low=cutoff_freq_low / sample_rate, 212 | cutoff_high=cutoff_freq_high / sample_rate, 213 | ), 214 | mask=mask, 215 | ) 216 | 217 | @staticmethod 218 | def smooth( 219 | tensor: torch.Tensor, 220 | window_size_range: tuple = (2, 10), 221 | mask: tp.Optional[torch.Tensor] = None, 222 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 223 | """ 224 | Smooths the input tensor (audio signal) using a moving average filter with the given window size. 225 | 226 | Parameters: 227 | - tensor (torch.Tensor): Input audio tensor. Assumes tensor shape is (batch_size, channels, time). 228 | - window_size (int): Size of the moving average window. 229 | 230 | Returns: 231 | - torch.Tensor: Smoothed audio tensor. 232 | """ 233 | 234 | window_size = int(torch.FloatTensor(1).uniform_(*window_size_range)) 235 | # Create a uniform smoothing kernel 236 | kernel = torch.ones(1, 1, window_size).type(tensor.type()) / window_size 237 | kernel = kernel.to(tensor.device) 238 | 239 | smoothed = julius.fft_conv1d(tensor, kernel) 240 | # Ensure tensor size is not changed 241 | tmp = torch.zeros_like(tensor) 242 | tmp[..., : smoothed.shape[-1]] = smoothed 243 | smoothed = tmp 244 | 245 | return audio_effect_return(tensor=smoothed, mask=mask) 246 | 247 | @staticmethod 248 | def boost_audio( 249 | tensor: torch.Tensor, 250 | amount: float = 20, 251 | mask: tp.Optional[torch.Tensor] = None, 252 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 253 | return audio_effect_return(tensor=tensor * (1 + amount / 100), mask=mask) 254 | 255 | @staticmethod 256 | def duck_audio( 257 | tensor: torch.Tensor, 258 | amount: float = 20, 259 | mask: tp.Optional[torch.Tensor] = None, 260 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 261 | return audio_effect_return(tensor=tensor * (1 - amount / 100), mask=mask) 262 | 263 | @staticmethod 264 | def identity( 265 | tensor: torch.Tensor, mask: tp.Optional[torch.Tensor] = None 266 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 267 | return audio_effect_return(tensor=tensor, mask=mask) 268 | 269 | @staticmethod 270 | def shush( 271 | tensor: torch.Tensor, 272 | fraction: float = 0.001, 273 | mask: tp.Optional[torch.Tensor] = None 274 | ) -> tp.Union[tp.Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: 275 | """ 276 | Sets a specified chronological fraction of indices of the input tensor (audio signal) to 0. 277 | 278 | Parameters: 279 | - tensor (torch.Tensor): Input audio tensor. Assumes tensor shape is (batch_size, channels, time). 280 | - fraction (float): Fraction of indices to be set to 0 (from the start of the tensor) (default: 0.001, i.e, 0.1%) 281 | 282 | Returns: 283 | - torch.Tensor: Transformed audio tensor. 
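
        A worked example: with fraction=0.1 on a one-second 16 kHz input
        (16000 samples), the first int(0.1 * 16000) = 1600 samples are zeroed.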
284 | """ 285 | time = tensor.size(-1) 286 | shush_tensor = tensor.detach().clone() 287 | 288 | # Set the first `fraction*time` indices of the waveform to 0 289 | shush_tensor[:, :, :int(fraction*time)] = 0.0 290 | 291 | return audio_effect_return(tensor=shush_tensor, mask=mask) 292 | -------------------------------------------------------------------------------- /src/audioseal/libs/audiocraft/modules/conv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | # Vendor from https://github.com/facebookresearch/audiocraft 8 | 9 | import math 10 | import typing as tp 11 | import warnings 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import functional as F 16 | from torch.nn.utils import spectral_norm 17 | 18 | try: 19 | from torch.nn.utils.parametrizations import weight_norm 20 | except ImportError: 21 | # Old Pytorch 22 | from torch.nn.utils import weight_norm 23 | 24 | 25 | CONV_NORMALIZATIONS = frozenset( 26 | ["none", "weight_norm", "spectral_norm", "time_group_norm"] 27 | ) 28 | 29 | 30 | def apply_parametrization_norm(module: nn.Module, norm: str = "none"): 31 | assert norm in CONV_NORMALIZATIONS 32 | if norm == "weight_norm": 33 | return weight_norm(module) 34 | elif norm == "spectral_norm": 35 | return spectral_norm(module) 36 | else: 37 | # We already check was in CONV_NORMALIZATION, so any other choice 38 | # doesn't need reparametrization. 39 | return module 40 | 41 | 42 | def get_norm_module( 43 | module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs 44 | ): 45 | """Return the proper normalization module. If causal is True, this will ensure the returned 46 | module is causal, or return an error if the normalization doesn't support causal evaluation. 47 | """ 48 | assert norm in CONV_NORMALIZATIONS 49 | if norm == "time_group_norm": 50 | if causal: 51 | raise ValueError("GroupNorm doesn't support causal evaluation.") 52 | assert isinstance(module, nn.modules.conv._ConvNd) 53 | return nn.GroupNorm(1, module.out_channels, **norm_kwargs) 54 | else: 55 | return nn.Identity() 56 | 57 | 58 | def get_extra_padding_for_conv1d( 59 | x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 60 | ) -> int: 61 | """See `pad_for_conv1d`.""" 62 | length = x.shape[-1] 63 | n_frames = (length - kernel_size + padding_total) / stride + 1 64 | ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) 65 | return ideal_length - length 66 | 67 | 68 | def pad_for_conv1d( 69 | x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0 70 | ): 71 | """Pad for a convolution to make sure that the last window is full. 72 | Extra padding is added at the end. This is required to ensure that we can rebuild 73 | an output of the same length, as otherwise, even with padding, some time steps 74 | might get removed. 75 | For instance, with total padding = 4, kernel size = 4, stride = 2: 76 | 0 0 1 2 3 4 5 0 0 # (0s are padding) 77 | 1 2 3 # (output frames of a convolution, last 0 is never used) 78 | 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding) 79 | 1 2 3 4 # once you removed padding, we are missing one time step ! 
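    In this example, `get_extra_padding_for_conv1d` returns 1: the ideal padded
    length is 6 instead of 5, which produces a 4th output frame and lets the
    transposed convolution rebuild all 5 time steps.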
80 | """ 81 | extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total) 82 | return F.pad(x, (0, extra_padding)) 83 | 84 | 85 | def pad1d( 86 | x: torch.Tensor, 87 | paddings: tp.Tuple[int, int], 88 | mode: str = "constant", 89 | value: float = 0.0, 90 | ): 91 | """Tiny wrapper around F.pad, just to allow for reflect padding on small input. 92 | If this is the case, we insert extra 0 padding to the right before the reflection happen. 93 | """ 94 | length = x.shape[-1] 95 | padding_left, padding_right = paddings 96 | assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) 97 | if mode == "reflect": 98 | max_pad = max(padding_left, padding_right) 99 | extra_pad = 0 100 | if length <= max_pad: 101 | extra_pad = max_pad - length + 1 102 | x = F.pad(x, (0, extra_pad)) 103 | padded = F.pad(x, paddings, mode, value) 104 | end = padded.shape[-1] - extra_pad 105 | return padded[..., :end] 106 | else: 107 | return F.pad(x, paddings, mode, value) 108 | 109 | 110 | def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]): 111 | """Remove padding from x, handling properly zero padding. Only for 1d!""" 112 | padding_left, padding_right = paddings 113 | assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) 114 | assert (padding_left + padding_right) <= x.shape[-1] 115 | end = x.shape[-1] - padding_right 116 | return x[..., padding_left:end] 117 | 118 | 119 | class NormConv1d(nn.Module): 120 | """Wrapper around Conv1d and normalization applied to this conv 121 | to provide a uniform interface across normalization approaches. 122 | """ 123 | 124 | def __init__( 125 | self, 126 | *args, 127 | causal: bool = False, 128 | norm: str = "none", 129 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 130 | **kwargs, 131 | ): 132 | super().__init__() 133 | self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm) 134 | self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs) 135 | self.norm_type = norm 136 | 137 | def forward(self, x): 138 | x = self.conv(x) 139 | x = self.norm(x) 140 | return x 141 | 142 | 143 | class NormConv2d(nn.Module): 144 | """Wrapper around Conv2d and normalization applied to this conv 145 | to provide a uniform interface across normalization approaches. 146 | """ 147 | 148 | def __init__( 149 | self, 150 | *args, 151 | norm: str = "none", 152 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 153 | **kwargs, 154 | ): 155 | super().__init__() 156 | self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm) 157 | self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs) 158 | self.norm_type = norm 159 | 160 | def forward(self, x): 161 | x = self.conv(x) 162 | x = self.norm(x) 163 | return x 164 | 165 | 166 | class NormConvTranspose1d(nn.Module): 167 | """Wrapper around ConvTranspose1d and normalization applied to this conv 168 | to provide a uniform interface across normalization approaches. 
169 | """ 170 | 171 | def __init__( 172 | self, 173 | *args, 174 | causal: bool = False, 175 | norm: str = "none", 176 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 177 | **kwargs, 178 | ): 179 | super().__init__() 180 | self.convtr = apply_parametrization_norm( 181 | nn.ConvTranspose1d(*args, **kwargs), norm 182 | ) 183 | self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs) 184 | self.norm_type = norm 185 | 186 | def forward(self, x): 187 | x = self.convtr(x) 188 | x = self.norm(x) 189 | return x 190 | 191 | 192 | class NormConvTranspose2d(nn.Module): 193 | """Wrapper around ConvTranspose2d and normalization applied to this conv 194 | to provide a uniform interface across normalization approaches. 195 | """ 196 | 197 | def __init__( 198 | self, 199 | *args, 200 | norm: str = "none", 201 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 202 | **kwargs, 203 | ): 204 | super().__init__() 205 | self.convtr = apply_parametrization_norm( 206 | nn.ConvTranspose2d(*args, **kwargs), norm 207 | ) 208 | self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs) 209 | 210 | def forward(self, x): 211 | x = self.convtr(x) 212 | x = self.norm(x) 213 | return x 214 | 215 | 216 | class StreamableConv1d(nn.Module): 217 | """Conv1d with some builtin handling of asymmetric or causal padding 218 | and normalization. 219 | """ 220 | 221 | def __init__( 222 | self, 223 | in_channels: int, 224 | out_channels: int, 225 | kernel_size: int, 226 | stride: int = 1, 227 | dilation: int = 1, 228 | groups: int = 1, 229 | bias: bool = True, 230 | causal: bool = False, 231 | norm: str = "none", 232 | norm_kwargs: tp.Dict[str, tp.Any] = {}, 233 | pad_mode: str = "reflect", 234 | ): 235 | super().__init__() 236 | # warn user on unusual setup between dilation and stride 237 | if stride > 1 and dilation > 1: 238 | warnings.warn( 239 | "StreamableConv1d has been initialized with stride > 1 and dilation > 1" 240 | f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})." 241 | ) 242 | self.conv = NormConv1d( 243 | in_channels, 244 | out_channels, 245 | kernel_size, 246 | stride, 247 | dilation=dilation, 248 | groups=groups, 249 | bias=bias, 250 | causal=causal, 251 | norm=norm, 252 | norm_kwargs=norm_kwargs, 253 | ) 254 | self.causal = causal 255 | self.pad_mode = pad_mode 256 | 257 | def forward(self, x): 258 | B, C, T = x.shape 259 | kernel_size = self.conv.conv.kernel_size[0] 260 | stride = self.conv.conv.stride[0] 261 | dilation = self.conv.conv.dilation[0] 262 | kernel_size = ( 263 | kernel_size - 1 264 | ) * dilation + 1 # effective kernel size with dilations 265 | padding_total = kernel_size - stride 266 | extra_padding = get_extra_padding_for_conv1d( 267 | x, kernel_size, stride, padding_total 268 | ) 269 | if self.causal: 270 | # Left padding for causal 271 | x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode) 272 | else: 273 | # Asymmetric padding required for odd strides 274 | padding_right = padding_total // 2 275 | padding_left = padding_total - padding_right 276 | x = pad1d( 277 | x, (padding_left, padding_right + extra_padding), mode=self.pad_mode 278 | ) 279 | return self.conv(x) 280 | 281 | 282 | class StreamableConvTranspose1d(nn.Module): 283 | """ConvTranspose1d with some builtin handling of asymmetric or causal padding 284 | and normalization. 
285 |     """
286 | 
287 |     def __init__(
288 |         self,
289 |         in_channels: int,
290 |         out_channels: int,
291 |         kernel_size: int,
292 |         stride: int = 1,
293 |         causal: bool = False,
294 |         norm: str = "none",
295 |         trim_right_ratio: float = 1.0,
296 |         norm_kwargs: tp.Dict[str, tp.Any] = {},
297 |     ):
298 |         super().__init__()
299 |         self.convtr = NormConvTranspose1d(
300 |             in_channels,
301 |             out_channels,
302 |             kernel_size,
303 |             stride,
304 |             causal=causal,
305 |             norm=norm,
306 |             norm_kwargs=norm_kwargs,
307 |         )
308 |         self.causal = causal
309 |         self.trim_right_ratio = trim_right_ratio
310 |         assert (
311 |             self.causal or self.trim_right_ratio == 1.0
312 |         ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
313 |         assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
314 | 
315 |     def forward(self, x):
316 |         kernel_size = self.convtr.convtr.kernel_size[0]
317 |         stride = self.convtr.convtr.stride[0]
318 |         padding_total = kernel_size - stride
319 | 
320 |         y = self.convtr(x)
321 | 
322 |         # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
323 |         # removed at the very end, when keeping only the right length for the output,
324 |         # as removing it here would require also passing the length at the matching layer
325 |         # in the encoder.
326 |         if self.causal:
327 |             # Trim the padding on the right according to the specified ratio
328 |             # if trim_right_ratio = 1.0, trim everything from the right
329 |             padding_right = math.ceil(padding_total * self.trim_right_ratio)
330 |             padding_left = padding_total - padding_right
331 |             y = unpad1d(y, (padding_left, padding_right))
332 |         else:
333 |             # Asymmetric padding required for odd strides
334 |             padding_right = padding_total // 2
335 |             padding_left = padding_total - padding_right
336 |             y = unpad1d(y, (padding_left, padding_right))
337 |         return y
338 | 
--------------------------------------------------------------------------------
/examples/colab.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "a73a40df",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Watermarking custom audio\n",
9 |     "\n",
10 |     "[[`arXiv`](https://arxiv.org/abs/2401.17264)]\n",
11 |     "[[`GitHub`](https://github.com/facebookresearch/audioseal)]\n",
12 |     "\n",
13 |     "This notebook shows a minimal example of how to watermark a custom audio file, for example your own recorded voice. This notebook aims to run in Google Colab.
Make sure you get familiar with the APIs of AudioSeal, for example using [Getting Started notebook](./Getting_started.ipynb)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "4c2562ce", 19 | "metadata": {}, 20 | "source": [ 21 | "## Installation" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "1fbb4b36", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#@title Install requirements\n", 32 | "!pip install torchaudio\n", 33 | "!pip install matplotlib\n", 34 | "!pip install audioseal # Ensure this matches the actual package name for AudioSeal\n", 35 | "!pip install ffmpeg-python" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "1325f7d7", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import io\n", 46 | "import ffmpeg\n", 47 | "import IPython.display as ipd\n", 48 | "from google.colab.output import eval_js\n", 49 | "\n", 50 | "from base64 import b64decode\n", 51 | "from scipy.io.wavfile import read as wav_read\n", 52 | "import numpy as np\n", 53 | "import matplotlib.pyplot as plt\n", 54 | "\n", 55 | "import torch\n", 56 | "import torchaudio\n", 57 | "\n", 58 | "from audioseal import AudioSeal" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "e7f95544", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "def plot_waveform_and_specgram(waveform, sample_rate, title):\n", 69 | " waveform = waveform.squeeze().detach().cpu().numpy()\n", 70 | "\n", 71 | " num_frames = waveform.shape[-1]\n", 72 | " time_axis = torch.arange(0, num_frames) / sample_rate\n", 73 | "\n", 74 | " figure, (ax1, ax2) = plt.subplots(1, 2)\n", 75 | "\n", 76 | " ax1.plot(time_axis, waveform, linewidth=1)\n", 77 | " ax1.grid(True)\n", 78 | " ax2.specgram(waveform, Fs=sample_rate)\n", 79 | "\n", 80 | " figure.suptitle(f\"{title} - Waveform and specgram\")\n", 81 | " plt.show()\n", 82 | "\n", 83 | "\n", 84 | "def play_audio(waveform, sample_rate):\n", 85 | " if waveform.dim() > 2:\n", 86 | " waveform = waveform.squeeze(0)\n", 87 | " waveform = waveform.detach().cpu().numpy()\n", 88 | "\n", 89 | " num_channels, *_ = waveform.shape\n", 90 | " if num_channels == 1:\n", 91 | " ipd.display(ipd.Audio(waveform[0], rate=sample_rate))\n", 92 | " elif num_channels == 2:\n", 93 | " ipd.display(ipd.Audio((waveform[0], waveform[1]), rate=sample_rate))\n", 94 | " else:\n", 95 | " raise ValueError(\"Waveform with more than 2 channels are not supported.\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "daf14d39", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "AUDIO_HTML = \"\"\"\n", 106 | "\n", 183 | "\"\"\"\n", 184 | "\n", 185 | "def get_audio():\n", 186 | " display(ipd.HTML(AUDIO_HTML))\n", 187 | " data = eval_js(\"data\")\n", 188 | " binary = b64decode(data.split(',')[1])\n", 189 | "\n", 190 | " process = (ffmpeg\n", 191 | " .input('pipe:0')\n", 192 | " .output('pipe:1', format='wav')\n", 193 | " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", 194 | " )\n", 195 | " output, err = process.communicate(input=binary)\n", 196 | "\n", 197 | " riff_chunk_size = len(output) - 8\n", 198 | " # Break up the chunk size into four bytes, held in b.\n", 199 | " q = riff_chunk_size\n", 200 | " b = []\n", 201 | " for i in range(4):\n", 202 | " q, r = divmod(q, 256)\n", 203 | " b.append(r)\n", 204 | "\n", 205 | " # Replace bytes 4:8 in proc.stdout with the actual 
size of the RIFF chunk.\n", 206 | " riff = output[:4] + bytes(b) + output[8:]\n", 207 | "\n", 208 | " sr, audio = wav_read(io.BytesIO(riff))\n", 209 | "\n", 210 | " return audio, sr" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "53826104", 216 | "metadata": {}, 217 | "source": [ 218 | "## Record your audio" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "3216ff1a", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "recorded, sr = get_audio()\n", 229 | "\n", 230 | "# Convert audio from list of int16 to a normalized tensor\n", 231 | "audio = torch.tensor(recorded).float() / 32768.0\n", 232 | "print(audio.shape)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "a2110143", 238 | "metadata": {}, 239 | "source": [ 240 | "## Generator\n", 241 | "\n", 242 | "To watermark an audio, we simply load the watermarking generator from the hub:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "007c48cb", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "model = AudioSeal.load_generator(\"audioseal_wm_16bits\")\n", 253 | "\n", 254 | "# We add the batch dimension to the single audio to mimic the batch watermarking\n", 255 | "audios = audio.unsqueeze(0).unsqueeze(0) # b=1 c=1 t\n", 256 | "\n", 257 | "watermark = model.get_watermark(audios, sample_rate=sr)\n", 258 | "watermarked_audio = audios + watermark\n", 259 | "\n", 260 | "# Alternatively, you can also call forward() function directly with different tune-down / tune-up rate\n", 261 | "watermarked_audio = model(audios, sample_rate=sr, alpha=1)\n", 262 | "\n", 263 | "# You can also watermark with a secret message\n", 264 | "# secret_mesage = torch.randint(0, 2, (1, 16), dtype=torch.int32)\n", 265 | "# watermarked_audio = model(audios, sample_rate=sr, message=secret_mesage, alpha=1)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "ac5aac4f", 271 | "metadata": {}, 272 | "source": [ 273 | "We can see that the watermarked audio has preserved almost the same spectrogram and contents as the original one" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "0200bc22", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "plot_waveform_and_specgram(watermarked_audio.squeeze(), sr, title=\"Watermarked audio\")" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "d70a0d73", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "play_audio(watermarked_audio, sr)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "cd7fa4fe", 299 | "metadata": {}, 300 | "source": [ 301 | "## Detector\n", 302 | "\n", 303 | "To detect the watermarks from an audio, we load the separate detector model and can do one of the following:\n", 304 | "\n", 305 | "### Basic usage: Call `detect_watermark()`\n", 306 | "\n", 307 | "This results in a tuple of form `Tuple(float, Tensor)`, where the first value indicates the probability of the audio being watermarked (the higher, the more likely), and the second value is the decoded message that is embeded by the generator. 
If the audio is unwatermarked (low first value), the decoded message will be just some random bits.\n",
308 |     "\n",
309 |     "Note that due to the stochastic nature of the detector, the decoded message and the secret message might differ by 1 bit, so depending on the user's needs, the detection can be called multiple times to get an averaged decoded message."
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "id": "0b1a3a9e",
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": [
319 |     "detector = AudioSeal.load_detector(\"audioseal_detector_16bits\")\n",
320 |     "\n",
321 |     "result, message = detector.detect_watermark(watermarked_audio, sample_rate=sr, message_threshold=0.5)\n",
322 |     "\n",
323 |     "print(f\"\\nThis is likely a watermarked audio. WM probability: {result}\")\n",
324 |     "\n",
325 |     "# Run on an unwatermarked audio\n",
326 |     "result2, message2 = detector.detect_watermark(audios, sample_rate=sr, message_threshold=0.5)\n",
327 |     "print(f\"This is likely an unwatermarked audio. WM probability: {result2}\")\n"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": null,
333 |    "id": "7730364d",
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "message"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "id": "8dc67150",
343 |    "metadata": {},
344 |    "source": [
345 |     "`message_threshold` indicates the threshold at which the detector converts the stochastic messages (with probabilities between 0 and 1) into the n-bit binary format. In most cases, the generator generates an unbiased message from the secret, so `0.5` is a reasonable choice (so in the above example, a value > 0.5 means 1 and a value < 0.5 means 0). \n",
346 |     "\n",
347 |     "\n",
348 |     "### Advanced usage: Call `forward()`\n",
349 |     "\n",
350 |     "The detector can also be called directly as a Torch module. This will return 2 tensors: \n",
351 |     "- The first tensor of size `batch x 2 x frames` indicates the probability of each frame being watermarked (positive or negative). So t[:, 0, :] corresponds to the negative probability and t[:, 1, :] corresponds to the positive probability\n",
352 |     "- The second tensor of size `batch x n_bits` corresponds to the message detected from the audio. It indicates the probability of each bit being 1. For unwatermarked audio, this tensor is random"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "id": "fadf26a1",
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "pred_prob, message_prob = detector(watermarked_audio, sample_rate=sr)\n",
363 |     "pred_prob[:, 1, :]"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "id": "899de6b3",
370 |    "metadata": {},
371 |    "outputs": [],
372 |    "source": [
373 |     "message_prob"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "markdown",
378 |    "id": "a78766fd",
379 |    "metadata": {},
380 |    "source": [
381 |     "### Robustness against attacks\n",
382 |     "\n",
383 |     "We can evaluate the robustness of the detector against some attacks. For this purpose, we perform two simple attacks below: pink noise and a lowpass filter. For the full list of attacks, please refer to our paper.
\n", 384 | "\n", 385 | "\n", 386 | "#### Pink noise attack" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "4cc0efde", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "from attacks import AudioEffects as af\n", 397 | "\n", 398 | "pink_noised_audio = af.pink_noise(watermarked_audio, noise_std=0.1)\n", 399 | "plot_waveform_and_specgram(pink_noised_audio, sample_rate=sr, title=\"Audio with pink noise\")\n", 400 | "result, message = detector.detect_watermark(pink_noised_audio, sample_rate=sr)\n", 401 | "print(result)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "id": "e2b96eb1", 407 | "metadata": {}, 408 | "source": [ 409 | "#### Lowpass filter" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "254e6012", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "lowpass_filtered = af.lowpass_filter(watermarked_audio, cutoff_freq=5000, sample_rate=sr)\n", 420 | "plot_waveform_and_specgram(lowpass_filtered, sample_rate=sr, title=\"Audio with low pass filtered\")\n", 421 | "result, message = detector.detect_watermark(lowpass_filtered, sample_rate=sr)\n", 422 | "print(result)" 423 | ] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "awm-oss", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.9.12" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 5 447 | } 448 | -------------------------------------------------------------------------------- /src/audioseal/libs/audiocraft/modules/seanet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | # Vendor from https://github.com/facebookresearch/audiocraft 8 | 9 | import math 10 | import typing as tp 11 | 12 | import numpy as np 13 | import torch.nn as nn 14 | 15 | from audioseal.libs.audiocraft.modules.conv import ( 16 | StreamableConv1d, 17 | StreamableConvTranspose1d, 18 | ) 19 | from audioseal.libs.audiocraft.modules.lstm import StreamableLSTM 20 | 21 | 22 | class SEANetResnetBlock(nn.Module): 23 | """Residual block from SEANet model. 24 | 25 | Args: 26 | dim (int): Dimension of the input/output. 27 | kernel_sizes (list): List of kernel sizes for the convolutions. 28 | dilations (list): List of dilations for the convolutions. 29 | activation (str): Activation function. 30 | activation_params (dict): Parameters to provide to the activation function. 31 | norm (str): Normalization method. 32 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 33 | causal (bool): Whether to use fully causal convolution. 34 | pad_mode (str): Padding mode for the convolutions. 35 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 36 | true_skip (bool): Whether to use true skip connection or a simple 37 | (streamable) convolution as the skip connection. 
38 | """ 39 | 40 | def __init__( 41 | self, 42 | dim: int, 43 | kernel_sizes: tp.List[int] = [3, 1], 44 | dilations: tp.List[int] = [1, 1], 45 | activation: str = "ELU", 46 | activation_params: dict = {"alpha": 1.0}, 47 | norm: str = "none", 48 | norm_params: tp.Dict[str, tp.Any] = {}, 49 | causal: bool = False, 50 | pad_mode: str = "reflect", 51 | compress: int = 2, 52 | true_skip: bool = True, 53 | ): 54 | super().__init__() 55 | assert len(kernel_sizes) == len( 56 | dilations 57 | ), "Number of kernel sizes should match number of dilations" 58 | act = getattr(nn, activation) 59 | hidden = dim // compress 60 | block = [] 61 | for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)): 62 | in_chs = dim if i == 0 else hidden 63 | out_chs = dim if i == len(kernel_sizes) - 1 else hidden 64 | block += [ 65 | act(**activation_params), 66 | StreamableConv1d( 67 | in_chs, 68 | out_chs, 69 | kernel_size=kernel_size, 70 | dilation=dilation, 71 | norm=norm, 72 | norm_kwargs=norm_params, 73 | causal=causal, 74 | pad_mode=pad_mode, 75 | ), 76 | ] 77 | self.block = nn.Sequential(*block) 78 | self.shortcut: nn.Module 79 | if true_skip: 80 | self.shortcut = nn.Identity() 81 | else: 82 | self.shortcut = StreamableConv1d( 83 | dim, 84 | dim, 85 | kernel_size=1, 86 | norm=norm, 87 | norm_kwargs=norm_params, 88 | causal=causal, 89 | pad_mode=pad_mode, 90 | ) 91 | 92 | def forward(self, x): 93 | return self.shortcut(x) + self.block(x) 94 | 95 | 96 | class SEANetEncoder(nn.Module): 97 | """SEANet encoder. 98 | 99 | Args: 100 | channels (int): Audio channels. 101 | dimension (int): Intermediate representation dimension. 102 | n_filters (int): Base width for the model. 103 | n_residual_layers (int): nb of residual layers. 104 | ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of 105 | upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here 106 | that must match the decoder order. We use the decoder order as some models may only employ the decoder. 107 | activation (str): Activation function. 108 | activation_params (dict): Parameters to provide to the activation function. 109 | norm (str): Normalization method. 110 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 111 | kernel_size (int): Kernel size for the initial convolution. 112 | last_kernel_size (int): Kernel size for the initial convolution. 113 | residual_kernel_size (int): Kernel size for the residual layers. 114 | dilation_base (int): How much to increase the dilation with each layer. 115 | causal (bool): Whether to use fully causal convolution. 116 | pad_mode (str): Padding mode for the convolutions. 117 | true_skip (bool): Whether to use true skip connection or a simple 118 | (streamable) convolution as the skip connection in the residual network blocks. 119 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 120 | lstm (int): Number of LSTM layers at the end of the encoder. 121 | disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm. 122 | For the encoder, it corresponds to the N first blocks. 
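
    For example, with the default ratios [8, 5, 4, 2] the encoder downsamples
    the waveform by a total hop length of 8 * 5 * 4 * 2 = 320 samples per
    output step.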
123 | """ 124 | 125 | def __init__( 126 | self, 127 | channels: int = 1, 128 | dimension: int = 128, 129 | n_filters: int = 32, 130 | n_residual_layers: int = 3, 131 | ratios: tp.List[int] = [8, 5, 4, 2], 132 | activation: str = "ELU", 133 | activation_params: dict = {"alpha": 1.0}, 134 | norm: str = "none", 135 | norm_params: tp.Dict[str, tp.Any] = {}, 136 | kernel_size: int = 7, 137 | last_kernel_size: int = 7, 138 | residual_kernel_size: int = 3, 139 | dilation_base: int = 2, 140 | causal: bool = False, 141 | pad_mode: str = "reflect", 142 | true_skip: bool = True, 143 | compress: int = 2, 144 | lstm: int = 0, 145 | disable_norm_outer_blocks: int = 0, 146 | ): 147 | super().__init__() 148 | self.channels = channels 149 | self.dimension = dimension 150 | self.n_filters = n_filters 151 | self.ratios = list(reversed(ratios)) 152 | del ratios 153 | self.n_residual_layers = n_residual_layers 154 | self.hop_length = np.prod(self.ratios) 155 | self.n_blocks = len(self.ratios) + 2 # first and last conv + residual blocks 156 | self.disable_norm_outer_blocks = disable_norm_outer_blocks 157 | assert ( 158 | self.disable_norm_outer_blocks >= 0 159 | and self.disable_norm_outer_blocks <= self.n_blocks 160 | ), ( 161 | "Number of blocks for which to disable norm is invalid." 162 | "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0." 163 | ) 164 | 165 | act = getattr(nn, activation) 166 | mult = 1 167 | model: tp.List[nn.Module] = [ 168 | StreamableConv1d( 169 | channels, 170 | mult * n_filters, 171 | kernel_size, 172 | norm="none" if self.disable_norm_outer_blocks >= 1 else norm, 173 | norm_kwargs=norm_params, 174 | causal=causal, 175 | pad_mode=pad_mode, 176 | ) 177 | ] 178 | # Downsample to raw audio scale 179 | for i, ratio in enumerate(self.ratios): 180 | block_norm = "none" if self.disable_norm_outer_blocks >= i + 2 else norm 181 | # Add residual layers 182 | for j in range(n_residual_layers): 183 | model += [ 184 | SEANetResnetBlock( 185 | mult * n_filters, 186 | kernel_sizes=[residual_kernel_size, 1], 187 | dilations=[dilation_base**j, 1], 188 | norm=block_norm, 189 | norm_params=norm_params, 190 | activation=activation, 191 | activation_params=activation_params, 192 | causal=causal, 193 | pad_mode=pad_mode, 194 | compress=compress, 195 | true_skip=true_skip, 196 | ) 197 | ] 198 | 199 | # Add downsampling layers 200 | model += [ 201 | act(**activation_params), 202 | StreamableConv1d( 203 | mult * n_filters, 204 | mult * n_filters * 2, 205 | kernel_size=ratio * 2, 206 | stride=ratio, 207 | norm=block_norm, 208 | norm_kwargs=norm_params, 209 | causal=causal, 210 | pad_mode=pad_mode, 211 | ), 212 | ] 213 | mult *= 2 214 | 215 | if lstm: 216 | model += [StreamableLSTM(mult * n_filters, num_layers=lstm)] 217 | 218 | model += [ 219 | act(**activation_params), 220 | StreamableConv1d( 221 | mult * n_filters, 222 | dimension, 223 | last_kernel_size, 224 | norm=( 225 | "none" if self.disable_norm_outer_blocks == self.n_blocks else norm 226 | ), 227 | norm_kwargs=norm_params, 228 | causal=causal, 229 | pad_mode=pad_mode, 230 | ), 231 | ] 232 | 233 | self.model = nn.Sequential(*model) 234 | 235 | def forward(self, x): 236 | return self.model(x) 237 | 238 | 239 | class SEANetEncoderKeepDimension(SEANetEncoder): 240 | """ 241 | similar architecture to the SEANet encoder but with an extra step that 242 | projects the output dimension to the same input dimension by repeating 243 | the sequential 244 | 245 | Args: 246 | SEANetEncoder (_type_): _description_ 247 
| """ 248 | 249 | def __init__(self, *args, **kwargs): 250 | 251 | self.output_dim = kwargs.pop("output_dim") 252 | super().__init__(*args, **kwargs) 253 | # Adding a reverse convolution layer 254 | self.reverse_convolution = nn.ConvTranspose1d( 255 | in_channels=self.dimension, 256 | out_channels=self.output_dim, 257 | kernel_size=math.prod(self.ratios), 258 | stride=math.prod(self.ratios), 259 | padding=0, 260 | ) 261 | 262 | def forward(self, x): 263 | orig_nframes = x.shape[-1] 264 | x = self.model(x) 265 | x = self.reverse_convolution(x) 266 | # make sure dim didn't change 267 | return x[:, :, :orig_nframes] 268 | 269 | class SEANetDecoder(nn.Module): 270 | """SEANet decoder. 271 | 272 | Args: 273 | channels (int): Audio channels. 274 | dimension (int): Intermediate representation dimension. 275 | n_filters (int): Base width for the model. 276 | n_residual_layers (int): nb of residual layers. 277 | ratios (Sequence[int]): kernel size and stride ratios. 278 | activation (str): Activation function. 279 | activation_params (dict): Parameters to provide to the activation function. 280 | final_activation (str): Final activation function after all convolutions. 281 | final_activation_params (dict): Parameters to provide to the activation function. 282 | norm (str): Normalization method. 283 | norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution. 284 | kernel_size (int): Kernel size for the initial convolution. 285 | last_kernel_size (int): Kernel size for the initial convolution. 286 | residual_kernel_size (int): Kernel size for the residual layers. 287 | dilation_base (int): How much to increase the dilation with each layer. 288 | causal (bool): Whether to use fully causal convolution. 289 | pad_mode (str): Padding mode for the convolutions. 290 | true_skip (bool): Whether to use true skip connection or a simple. 291 | (streamable) convolution as the skip connection in the residual network blocks. 292 | compress (int): Reduced dimensionality in residual branches (from Demucs v3). 293 | lstm (int): Number of LSTM layers at the end of the encoder. 294 | disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm. 295 | For the decoder, it corresponds to the N last blocks. 296 | trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup. 297 | If equal to 1.0, it means that all the trimming is done at the right. 
298 | """ 299 | 300 | def __init__( 301 | self, 302 | channels: int = 1, 303 | dimension: int = 128, 304 | n_filters: int = 32, 305 | n_residual_layers: int = 3, 306 | ratios: tp.List[int] = [8, 5, 4, 2], 307 | activation: str = "ELU", 308 | activation_params: dict = {"alpha": 1.0}, 309 | final_activation: tp.Optional[str] = None, 310 | final_activation_params: tp.Optional[dict] = None, 311 | norm: str = "none", 312 | norm_params: tp.Dict[str, tp.Any] = {}, 313 | kernel_size: int = 7, 314 | last_kernel_size: int = 7, 315 | residual_kernel_size: int = 3, 316 | dilation_base: int = 2, 317 | causal: bool = False, 318 | pad_mode: str = "reflect", 319 | true_skip: bool = True, 320 | compress: int = 2, 321 | lstm: int = 0, 322 | disable_norm_outer_blocks: int = 0, 323 | trim_right_ratio: float = 1.0, 324 | ): 325 | super().__init__() 326 | self.dimension = dimension 327 | self.channels = channels 328 | self.n_filters = n_filters 329 | self.ratios = ratios 330 | del ratios 331 | self.n_residual_layers = n_residual_layers 332 | self.hop_length = np.prod(self.ratios) 333 | self.n_blocks = len(self.ratios) + 2 # first and last conv + residual blocks 334 | self.disable_norm_outer_blocks = disable_norm_outer_blocks 335 | assert ( 336 | self.disable_norm_outer_blocks >= 0 337 | and self.disable_norm_outer_blocks <= self.n_blocks 338 | ), ( 339 | "Number of blocks for which to disable norm is invalid." 340 | "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0." 341 | ) 342 | 343 | act = getattr(nn, activation) 344 | mult = int(2 ** len(self.ratios)) 345 | model: tp.List[nn.Module] = [ 346 | StreamableConv1d( 347 | dimension, 348 | mult * n_filters, 349 | kernel_size, 350 | norm=( 351 | "none" if self.disable_norm_outer_blocks == self.n_blocks else norm 352 | ), 353 | norm_kwargs=norm_params, 354 | causal=causal, 355 | pad_mode=pad_mode, 356 | ) 357 | ] 358 | 359 | if lstm: 360 | model += [StreamableLSTM(mult * n_filters, num_layers=lstm)] 361 | 362 | # Upsample to raw audio scale 363 | for i, ratio in enumerate(self.ratios): 364 | block_norm = ( 365 | "none" 366 | if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) 367 | else norm 368 | ) 369 | # Add upsampling layers 370 | model += [ 371 | act(**activation_params), 372 | StreamableConvTranspose1d( 373 | mult * n_filters, 374 | mult * n_filters // 2, 375 | kernel_size=ratio * 2, 376 | stride=ratio, 377 | norm=block_norm, 378 | norm_kwargs=norm_params, 379 | causal=causal, 380 | trim_right_ratio=trim_right_ratio, 381 | ), 382 | ] 383 | # Add residual layers 384 | for j in range(n_residual_layers): 385 | model += [ 386 | SEANetResnetBlock( 387 | mult * n_filters // 2, 388 | kernel_sizes=[residual_kernel_size, 1], 389 | dilations=[dilation_base**j, 1], 390 | activation=activation, 391 | activation_params=activation_params, 392 | norm=block_norm, 393 | norm_params=norm_params, 394 | causal=causal, 395 | pad_mode=pad_mode, 396 | compress=compress, 397 | true_skip=true_skip, 398 | ) 399 | ] 400 | 401 | mult //= 2 402 | 403 | # Add final layers 404 | model += [ 405 | act(**activation_params), 406 | StreamableConv1d( 407 | n_filters, 408 | channels, 409 | last_kernel_size, 410 | norm="none" if self.disable_norm_outer_blocks >= 1 else norm, 411 | norm_kwargs=norm_params, 412 | causal=causal, 413 | pad_mode=pad_mode, 414 | ), 415 | ] 416 | # Add optional final activation to decoder (eg. 
tanh) 417 | if final_activation is not None: 418 | final_act = getattr(nn, final_activation) 419 | final_activation_params = final_activation_params or {} 420 | model += [final_act(**final_activation_params)] 421 | self.model = nn.Sequential(*model) 422 | 423 | def forward(self, z): 424 | y = self.model(z) 425 | return y 426 | -------------------------------------------------------------------------------- /examples/attack_benchmarking_example.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["# Benchmarking Audioseal on the SHUSH attack applied on RAVDESS Dataset\n","\n","In this notebook, we outline the steps taken to benchmark the Audioseal architecture against different attacks on a dataset of audio files. \n","In particular, we follow these steps:\n","- Load audio files from a dataset \n","- Watermark each audio file using Audioseal\n","- Perform perturbations/attacks to the audio files\n","- Detect the watermarks on these attacked files and keep track of the confidence of Audioseal in its predictions that the files are watermarked.\n","\n","\n","For a better understanding of Audioseal and its functionalities, it is highly recommended to go through the [Getting started notebook](https://github.com/facebookresearch/audioseal/blob/main/examples/Getting_started.ipynb)."]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## Dataset\n","\n","We use the [RAVDESS Emotional Speech audio](https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio) dataset for this experiment. \n","When added to a Kaggle notebook environment, all input datasets are stored in the read-only `/kaggle/input` path. If you are not using Kaggle, or have stored your files elsewhere, you can load nested audio files by modifying `PARENT_FILES_DIR` in the cell below."]},{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-03-15T06:51:03.909949Z","iopub.status.busy":"2024-03-15T06:51:03.909220Z","iopub.status.idle":"2024-03-15T06:51:04.354260Z","shell.execute_reply":"2024-03-15T06:51:04.353256Z","shell.execute_reply.started":"2024-03-15T06:51:03.909904Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of input files: 2880\n"]}],"source":["import numpy as np \n","import pandas as pd\n","import os\n","\n","all_input_files = []\n","PARENT_FILES_DIR = '/kaggle/input'\n","\n","for dirname, _, filenames in os.walk(PARENT_FILES_DIR):\n"," for filename in filenames:\n"," if \"wav\" in filename:\n"," all_input_files.append(os.path.join(dirname, filename))\n"," \n","print(f\"Number of input files: {len(all_input_files)}\")"]},{"cell_type":"markdown","metadata":{},"source":["### Installations and Imports "]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:05.910354Z","iopub.status.busy":"2024-03-15T06:51:05.909237Z","iopub.status.idle":"2024-03-15T06:51:20.351281Z","shell.execute_reply":"2024-03-15T06:51:20.350239Z","shell.execute_reply.started":"2024-03-15T06:51:05.910319Z"},"trusted":true},"outputs":[],"source":["import sys\n","!{sys.executable} -m pip install -q torchaudio soundfile matplotlib audioseal\n","\n","import typing as tp\n","import julius\n","import torch\n","import torchaudio\n","import 
urllib"]},{"cell_type":"markdown","metadata":{},"source":["### Load Audioseal models"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.354221Z","iopub.status.busy":"2024-03-15T06:51:20.353436Z","iopub.status.idle":"2024-03-15T06:51:20.378701Z","shell.execute_reply":"2024-03-15T06:51:20.377805Z","shell.execute_reply.started":"2024-03-15T06:51:20.354185Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Using device: cuda\n"]}],"source":["device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n","print(f\"Using device: {device}\")"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.380975Z","iopub.status.busy":"2024-03-15T06:51:20.379901Z","iopub.status.idle":"2024-03-15T06:51:20.919397Z","shell.execute_reply":"2024-03-15T06:51:20.918564Z","shell.execute_reply.started":"2024-03-15T06:51:20.380948Z"},"trusted":true},"outputs":[],"source":["from audioseal import AudioSeal\n","\n","model = AudioSeal.load_generator(\"audioseal_wm_16bits\")\n","detector = AudioSeal.load_detector(\"audioseal_detector_16bits\")"]},{"cell_type":"markdown","metadata":{},"source":["### Helper functions to load audio data, watermark audio, and get prediction scores for audio"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:20.922026Z","iopub.status.busy":"2024-03-15T06:51:20.921593Z","iopub.status.idle":"2024-03-15T06:51:21.091167Z","shell.execute_reply":"2024-03-15T06:51:21.090157Z","shell.execute_reply.started":"2024-03-15T06:51:20.921992Z"},"trusted":true},"outputs":[],"source":["model = model.to(device)\n","detector = detector.to(device)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:21.092660Z","iopub.status.busy":"2024-03-15T06:51:21.092352Z","iopub.status.idle":"2024-03-15T06:51:21.105098Z","shell.execute_reply":"2024-03-15T06:51:21.104097Z","shell.execute_reply.started":"2024-03-15T06:51:21.092635Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Secret message: tensor([[1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1]], device='cuda:0',\n"," dtype=torch.int32)\n"]}],"source":["secret_message = torch.randint(0, 2, (1, 16), dtype=torch.int32)\n","secret_message = secret_message.to(device)\n","print(f\"Secret message: {secret_message}\")\n","\n","# Function to load an audio file from its file path\n","def load_audio_file(\n"," file_path: str\n",") -> tp.Optional[tp.Tuple[torch.Tensor, int]]:\n"," try:\n"," wav, sample_rate = torchaudio.load(file_path)\n"," return wav, sample_rate\n"," except Exception as e:\n"," print(f\"Error while loading audio: {e}\")\n"," return None\n"," \n","# Function to generate a watermark for the audio and embed it into a new audio tensor\n","def generate_watermark_audio(\n"," tensor: torch.Tensor,\n"," sample_rate: int\n",") -> tp.Optional[torch.Tensor]:\n"," try:\n"," global model, device, secret_message\n"," audios = tensor.unsqueeze(0).to(device)\n"," watermarked_audio = model(audios, sample_rate=sample_rate, message=secret_message.to(device), alpha=1)\n"," return watermarked_audio\n","\n"," \n"," except Exception as e:\n"," print(f\"Error while watermarking audio: {e}\")\n"," return None\n"," \n","# Function to get the confidence score that an audio tensor was watermarked by Audioseal\n","def detect_watermark_audio(\n"," tensor: torch.Tensor,\n"," sample_rate: int,\n"," 
message_threshold: float = 0.50\n",") -> tp.Optional[float]:\n"," try:\n"," global detector, device\n"," # In our analysis we are not concerned with the hidden/embedded message as of now\n"," result, _ = detector.detect_watermark(tensor, sample_rate=sample_rate, message_threshold=message_threshold)\n"," return float(result)\n"," except Exception as e:\n"," print(f\"Error while detecting watermark: {e}\")\n"," return None"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## Audio attacks\n","\n","- In this notebook, we use the `SHUSH` attack.\n","- For more attacks and their descriptions, please refer to the [source](https://github.com/facebookresearch/audioseal/blob/main/examples/attacks.py)."]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:33.919489Z","iopub.status.busy":"2024-03-15T06:51:33.919084Z","iopub.status.idle":"2024-03-15T06:51:33.962401Z","shell.execute_reply":"2024-03-15T06:51:33.961261Z","shell.execute_reply.started":"2024-03-15T06:51:33.919460Z"},"trusted":true},"outputs":[],"source":["from attacks import AudioEffects as af"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Experimental setup\n","- `fraction` values: \\{0.1\\%, 1\\%, 10\\%, 30\\%\\}\n","- `nomenclature` : n, s, m, l\n","\n","In this notebook, we set the above parameters for the SHUSH attack and note the average confidence scores of Audioseal in predicting the presence of watermarks for these attacked audio files."]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:39.665411Z","iopub.status.busy":"2024-03-15T06:51:39.665025Z","iopub.status.idle":"2024-03-15T06:51:39.673355Z","shell.execute_reply":"2024-03-15T06:51:39.672473Z","shell.execute_reply.started":"2024-03-15T06:51:39.665382Z"},"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["import random\n","random.seed(42)\n","torch.backends.cudnn.benchmark = True\n","np.random.seed(42)\n","torch.manual_seed(42)"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:51:40.533674Z","iopub.status.busy":"2024-03-15T06:51:40.532744Z","iopub.status.idle":"2024-03-15T06:59:10.240026Z","shell.execute_reply":"2024-03-15T06:59:10.239069Z","shell.execute_reply.started":"2024-03-15T06:51:40.533640Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":[" 5%|▌ | 148/2880 [01:38<09:22, 4.86it/s] "]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 67807] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_05/03-01-02-01-02-02-05.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 12%|█▏ | 335/2880 [02:27<04:57, 8.56it/s] "]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 57663] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_01/03-01-02-01-01-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 12%|█▏ | 339/2880 [02:27<04:13, 10.02it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, 
weight of size [32, 1, 7], expected input[1, 2, 52324] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_01/03-01-08-01-02-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 15%|█▍ | 425/2880 [02:45<03:49, 10.68it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 69942] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_20/03-01-06-01-01-02-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 15%|█▍ | 431/2880 [02:45<03:42, 11.01it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 55528] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/Actor_20/03-01-03-01-02-01-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 45%|████▍ | 1289/2880 [04:43<02:16, 11.62it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 67807] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_05/03-01-02-01-02-02-05.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 51%|█████▏ | 1476/2880 [05:02<02:07, 11.02it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 57663] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_01/03-01-02-01-01-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 51%|█████▏ | 1478/2880 [05:02<01:55, 12.10it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 52324] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_01/03-01-08-01-02-02-01.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 54%|█████▍ | 1564/2880 [05:10<01:57, 11.20it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 69942] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_20/03-01-06-01-01-02-20.wav due to 'NoneType' object has no attribute 'shape'\n"]},{"name":"stderr","output_type":"stream","text":[" 55%|█████▍ | 1570/2880 [05:11<01:52, 11.61it/s]"]},{"name":"stdout","output_type":"stream","text":["Error while watermarking audio: Given groups=1, weight of size [32, 1, 7], expected input[1, 2, 55528] to have 1 channels, but got 2 channels instead\n","Skipping file /kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_20/03-01-03-01-02-01-20.wav due to 'NoneType' object has no attribute 
'shape'\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 2880/2880 [07:29<00:00, 6.40it/s]\n"]}],"source":["from tqdm import tqdm\n","\n","all_scores_n = []\n","all_scores_s = []\n","all_scores_m = []\n","all_scores_l = []\n","all_saved_files = []\n","\n","for input_file in tqdm(all_input_files):\n"," try:\n"," # Load audio\n"," audio, sample_rate = load_audio_file(input_file)\n","\n"," # Generate watermarked audio\n"," watermarked_audio = generate_watermark_audio(audio, sample_rate)\n","\n"," # Perform SHUSH attacks\n"," shush_attack_audio_n = af.shush(watermarked_audio, fraction=0.001)\n"," shush_attack_audio_s = af.shush(watermarked_audio, fraction=0.01)\n"," shush_attack_audio_m = af.shush(watermarked_audio, fraction=0.1)\n"," shush_attack_audio_l = af.shush(watermarked_audio, fraction=0.3)\n","\n"," # Compute scores\n"," shush_score_n = detect_watermark_audio(shush_attack_audio_n, sample_rate)\n"," shush_score_s = detect_watermark_audio(shush_attack_audio_s, sample_rate)\n"," shush_score_m = detect_watermark_audio(shush_attack_audio_m, sample_rate)\n"," shush_score_l = detect_watermark_audio(shush_attack_audio_l, sample_rate)\n","\n"," # Store scores\n"," all_scores_n.append(float(shush_score_n))\n"," all_scores_s.append(float(shush_score_s))\n"," all_scores_m.append(float(shush_score_m))\n"," all_scores_l.append(float(shush_score_l))\n"," all_saved_files.append(input_file)\n"," except Exception as e:\n"," print(f\"Skipping file {input_file} due to {e}\")\n"," pass"]},{"cell_type":"markdown","metadata":{},"source":["## Store results and calculate metrics"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:59:10.242250Z","iopub.status.busy":"2024-03-15T06:59:10.241976Z","iopub.status.idle":"2024-03-15T06:59:10.249995Z","shell.execute_reply":"2024-03-15T06:59:10.248957Z","shell.execute_reply.started":"2024-03-15T06:59:10.242224Z"},"trusted":true},"outputs":[],"source":["df = pd.DataFrame({\n"," \"input_file\" : all_saved_files,\n"," \"watermark_confidence_n\" : all_scores_n,\n"," \"watermark_confidence_s\" : all_scores_s,\n"," \"watermark_confidence_m\" : all_scores_m,\n"," \"watermark_confidence_l\" : all_scores_l,\n","})"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-03-15T06:59:10.251570Z","iopub.status.busy":"2024-03-15T06:59:10.251203Z","iopub.status.idle":"2024-03-15T06:59:10.278172Z","shell.execute_reply":"2024-03-15T06:59:10.277277Z","shell.execute_reply.started":"2024-03-15T06:59:10.251528Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
watermark_confidence_nwatermark_confidence_swatermark_confidence_mwatermark_confidence_l
count2870.0000002870.0000002870.0000002870.000000
mean0.9988490.9901380.9002090.699678
std0.0007760.0007380.0007630.000516
min0.9763020.9673760.8761460.694676
25%0.9988770.9900220.9000830.699631
50%0.9989220.9902020.9002600.699777
75%0.9989630.9903340.9003520.699923
max0.9991770.9905500.9009580.700464
\n","
"],"text/plain":[" watermark_confidence_n watermark_confidence_s watermark_confidence_m \\\n","count 2870.000000 2870.000000 2870.000000 \n","mean 0.998849 0.990138 0.900209 \n","std 0.000776 0.000738 0.000763 \n","min 0.976302 0.967376 0.876146 \n","25% 0.998877 0.990022 0.900083 \n","50% 0.998922 0.990202 0.900260 \n","75% 0.998963 0.990334 0.900352 \n","max 0.999177 0.990550 0.900958 \n","\n"," watermark_confidence_l \n","count 2870.000000 \n","mean 0.699678 \n","std 0.000516 \n","min 0.694676 \n","25% 0.699631 \n","50% 0.699777 \n","75% 0.699923 \n","max 0.700464 "]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["df.describe()"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["## We note that Audioseal performs very well in recalling the watermarks - even in extreme conditions of masking the first 30\\% of the audio, the average confidence is $0.699678$. "]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[{"datasetId":107620,"sourceId":256618,"sourceType":"datasetVersion"}],"dockerImageVersionId":30665,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]"},"vscode":{"interpreter":{"hash":"916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"}}},"nbformat":4,"nbformat_minor":4} 2 | --------------------------------------------------------------------------------