├── .gitignore
├── archisound
│   ├── __init__.py
│   └── archisound.py
├── setup.py
├── LICENSE
├── .pre-commit-config.yaml
├── .github
│   └── workflows
│       └── python-publish.yml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
.mypy_cache
--------------------------------------------------------------------------------
/archisound/__init__.py:
--------------------------------------------------------------------------------
from .archisound import ArchiSound
--------------------------------------------------------------------------------
/archisound/archisound.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from transformers import AutoModel

# Pinned Hugging Face Hub commit revisions for each published checkpoint, so that
# from_pretrained always loads a known-good version of the remote model code.
REVISION = {
    "autoencoder1d-AT-v1": "57b6cde1969208d10fdd3e813708c1abe49f25c1",
    "dmae1d-ATC64-v1": "07885065867977af43b460bb9c1422bdc90c29a0",
    "dmae1d-ATC64-v2": "3ffeea68d4c069777055fce9ac77bbb67eec1d68",
    "dmae1d-ATC32-v3": "3d43b811b83fa395d5ccd6cf58b796b85fddd1d2",
    "adapter-A-v1": "2ee66467450389917eab027526ea31c66d4b7edb",
}


class ArchiSound:
    @staticmethod
    def from_pretrained(name: str = "", **kwargs) -> nn.Module:
        # `name` must be one of the keys of REVISION above.
        default_kwargs = dict(revision=REVISION[name])
        # trust_remote_code is needed because the model classes are defined in the
        # archinetai/* Hub repositories; caller kwargs are merged last and win.
        return AutoModel.from_pretrained(
            f"archinetai/{name}",
            trust_remote_code=True,
            **{**default_kwargs, **kwargs},
        )
--------------------------------------------------------------------------------
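The merge `**{**default_kwargs, **kwargs}` above places caller-supplied keyword arguments after the pinned defaults, so they take precedence. A minimal sketch of what that allows, assuming the `archinetai/*` repositories referenced in the code are reachable on the Hugging Face Hub (the `revision="main"` override is purely illustrative):

```py
from archisound import ArchiSound

# Default behaviour: load the checkpoint at the commit pinned in REVISION.
autoencoder = ArchiSound.from_pretrained("dmae1d-ATC32-v3")

# Because caller kwargs are merged last, they can override the pinned revision
# or pass any other AutoModel.from_pretrained argument (e.g. cache_dir).
autoencoder_latest = ArchiSound.from_pretrained("dmae1d-ATC32-v3", revision="main")
```
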
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

setup(
    name="archisound",
    packages=find_packages(exclude=[]),
    version="0.0.6",
    license="MIT",
    description="ArchiSound",
    long_description_content_type="text/markdown",
    author="Flavio Schneider",
    author_email="archinetai@protonmail.com",
    url="https://github.com/archinetai/archisound",
    keywords=["artificial intelligence", "deep learning"],
    install_requires=[
        "torch>=1.6",
        "data-science-types>=0.2",
        "transformers",
        "audio-diffusion-pytorch",
        "audio-encoders-pytorch",
    ],
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.6",
    ],
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 archinet.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v2.3.0
  hooks:
  - id: end-of-file-fixer
  - id: trailing-whitespace

# Formats code
- repo: https://github.com/psf/black
  rev: 22.3.0
  hooks:
  - id: black
    args: ['--experimental-string-processing']

# Sorts imports
- repo: https://github.com/pycqa/isort
  rev: 5.10.1
  hooks:
  - id: isort
    name: isort (python)
    args: ["--profile", "black"]

# Checks for unused imports, line lengths, etc.
- repo: https://gitlab.com/pycqa/flake8
  rev: 4.0.0
  hooks:
  - id: flake8
    args: [
      '--per-file-ignores=__init__.py:F401',
      '--max-line-length=88',
      '--ignore=E1,W1,E2,W2,E4,W4,E5,W5'  # Handled by black
    ]

# Checks types
- repo: https://github.com/pre-commit/mirrors-mypy
  rev: 'v0.971'
  hooks:
  - id: mypy
    additional_dependencies: [data-science-types>=0.2, torch>=1.6]
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build package
        run: python -m build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ArchiSound

A collection of pre-trained audio models in PyTorch from [`audio-encoders-pytorch`](https://github.com/archinetai/audio-encoders-pytorch) and [`audio-diffusion-pytorch`](https://github.com/archinetai/audio-diffusion-pytorch).

## Install
```bash
pip install archisound
```

[PyPI](https://pypi.org/project/archisound/)

## Autoencoders

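The model names in the list below are exactly the keys of the `REVISION` table in `archisound/archisound.py`, so the available checkpoints can also be listed programmatically; a minimal sketch:

```py
from archisound.archisound import REVISION

print(list(REVISION.keys()))
# ['autoencoder1d-AT-v1', 'dmae1d-ATC64-v1', 'dmae1d-ATC64-v2', 'dmae1d-ATC32-v3', 'adapter-A-v1']
```
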
* [`dmae1d-ATC32-v3`](https://huggingface.co/archinetai/dmae1d-ATC32-v3/tree/main)
Usage and Info

```py
import torch
from archisound import ArchiSound

autoencoder = ArchiSound.from_pretrained("dmae1d-ATC32-v3")

x = torch.randn(1, 2, 2**18)
z = autoencoder.encode(x) # [1, 32, 512]
y = autoencoder.decode(z, num_steps=20) # [1, 2, 262144]
```

| Info | |
| ------------- | ------------- |
| Input type | Audio (stereo @ 48kHz) |
| Number of parameters | 86M |
| Compression Factor | 32x |
| Downsampling Factor | 512x |
| Bottleneck Type | Tanh |

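The factors in the table can be sanity-checked against the tensor shapes in the example above; this is plain arithmetic on the documented shapes, not an API call:

```py
# dmae1d-ATC32-v3: input [1, 2, 2**18] -> latent [1, 32, 512]
in_channels, in_length = 2, 2**18
latent_channels, latent_length = 32, 512

downsampling_factor = in_length // latent_length  # 262144 // 512 = 512 (512x)
compression_factor = (in_channels * in_length) // (latent_channels * latent_length)  # 32 (32x)
print(downsampling_factor, compression_factor)  # 512 32
```
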
* [`dmae1d-ATC64-v2`](https://huggingface.co/archinetai/dmae1d-ATC64-v2/tree/main)
Usage and Info

```py
import torch
from archisound import ArchiSound

autoencoder = ArchiSound.from_pretrained("dmae1d-ATC64-v2")

x = torch.randn(1, 2, 2**18)
z = autoencoder.encode(x) # [1, 32, 256]
y = autoencoder.decode(z, num_steps=20) # [1, 2, 262144]
```

| Info | |
| ------------- | ------------- |
| Input type | Audio (stereo @ 48kHz) |
| Number of parameters | 185M |
| Compression Factor | 64x |
| Downsampling Factor | 1024x |
| Bottleneck Type | Tanh |

* [`autoencoder1d-AT-v1`](https://huggingface.co/archinetai/autoencoder1d-AT-v1/tree/main)
Usage and Info

```py
import torch
from archisound import ArchiSound

autoencoder = ArchiSound.from_pretrained("autoencoder1d-AT-v1")

x = torch.randn(1, 2, 2**18) # [1, 2, 262144]
z = autoencoder.encode(x) # [1, 32, 8192]
y = autoencoder.decode(z) # [1, 2, 262144]
```

| Info | |
| ------------- | ------------- |
| Input type | Audio (stereo @ 48kHz) |
| Number of parameters | 20.7M |
| Compression Factor | 2x |
| Downsampling Factor | 32x |
| Bottleneck Type | Tanh |
| Known Limitations | Slight blurriness in high frequency spectrogram reconstruction |

* [`dmae1d-ATC64-v1`](https://huggingface.co/archinetai/dmae1d-ATC64-v1/tree/main)
Usage and Info

A diffusion-based autoencoder with a high compression ratio. Requires `audio_diffusion_pytorch==0.0.92`.

```py
import torch
from archisound import ArchiSound

autoencoder = ArchiSound.from_pretrained("dmae1d-ATC64-v1")

x = torch.randn(1, 2, 2**18)
z = autoencoder.encode(x) # [1, 32, 256]
y = autoencoder.decode(z, num_steps=20) # [1, 2, 262144]
```

| Info | |
| ------------- | ------------- |
| Input type | Audio (stereo @ 48kHz) |
| Number of parameters | 234.2M |
| Compression Factor | 64x |
| Downsampling Factor | 1024x |
| Bottleneck Type | Tanh |

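Since `from_pretrained` returns a regular `nn.Module`, the usual PyTorch inference conventions apply to any of the models above. A minimal sketch, assuming the same `encode`/`decode` signatures as the examples (device selection and `eval()` are generic PyTorch usage, not requirements of this package):

```py
import torch
from archisound import ArchiSound

device = "cuda" if torch.cuda.is_available() else "cpu"
autoencoder = ArchiSound.from_pretrained("dmae1d-ATC32-v3").to(device).eval()

with torch.no_grad():  # inference only; no gradient tracking needed
    x = torch.randn(1, 2, 2**18, device=device)  # stereo input, as in the examples above
    z = autoencoder.encode(x)
    y = autoencoder.decode(z, num_steps=20)
```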
--------------------------------------------------------------------------------