├── conf ├── salad_bowl.yml ├── generated │ ├── cat │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── cat10 │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── saxophone │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── ivo │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── march-31 │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── lazaro-ros │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── le-poisson-steve │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ ├── sax-new │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml │ └── lazaro-ros-sep │ │ ├── coarse.yml │ │ ├── interface.yml │ │ └── c2f.yml ├── c2f.yml ├── interface.yml ├── lora │ ├── lora.yml │ └── lora-s2s.yml └── vampnet.yml ├── DEFAULT_MODEL ├── unloop ├── .gitignore ├── requirements.txt ├── max │ ├── choose_from_list.js │ ├── randint.maxpat │ ├── randrange.maxpat │ ├── paths.js │ ├── unloop.maxpat │ ├── two-gate.maxpat │ ├── pan~.maxpat │ ├── dry-wet.maxpat │ ├── panner-cleat.maxpat │ └── click.maxpat ├── _.md └── client.py ├── DEFAULT_HF_MODEL_REPO ├── TODOS ├── scratch ├── convert_to_wav.sh ├── separate_folder.sh └── rms_mask.txt ├── assets └── example.wav ├── vampnet ├── modules │ ├── __init__.py │ ├── activations.py │ └── layers.py ├── scheduler.py ├── util.py ├── __init__.py ├── mask.py ├── beats.py ├── control.py └── newmask.py ├── requirements.txt ├── update-repos.sh ├── scripts ├── utils │ ├── stage.py │ ├── remove_quiet_files.py │ ├── split_long_audio_file.py │ ├── README.md │ ├── huggingface │ │ └── push_to_repos.sh │ ├── plots.py │ ├── split.py │ ├── xeno-canto-dl.py │ ├── visualize_embeddings.py │ └── gtzan_embeddings.py └── exp │ ├── export.py │ ├── fine_tune.py │ ├── eval.py │ └── experiment.py ├── LICENSE ├── hello.py ├── setup.py ├── .gitattributes ├── token_telephone ├── ttutil.py └── vamp_helper.py ├── .gitignore └── README.md /conf/salad_bowl.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DEFAULT_MODEL: -------------------------------------------------------------------------------- 1 | default 2 | -------------------------------------------------------------------------------- /unloop/.gitignore: -------------------------------------------------------------------------------- 1 | .gradio -------------------------------------------------------------------------------- /DEFAULT_HF_MODEL_REPO: -------------------------------------------------------------------------------- 1 | hugggof/vampnet -------------------------------------------------------------------------------- /TODOS: -------------------------------------------------------------------------------- 1 | [ ] add sketch2sound finetuning -------------------------------------------------------------------------------- /scratch/convert_to_wav.sh: -------------------------------------------------------------------------------- 1 | for f in *.mp3; do ffmpeg -i "$f" "${f%.mp3}.wav"; done -------------------------------------------------------------------------------- /scratch/separate_folder.sh: -------------------------------------------------------------------------------- 1 | for f in *.mp3; do demucs "$f" --two-stems=vocals; done 2 | -------------------------------------------------------------------------------- /assets/example.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hugofloresgarcia/vampnet/HEAD/assets/example.wav -------------------------------------------------------------------------------- /unloop/requirements.txt: -------------------------------------------------------------------------------- 1 | python-osc 2 | descript-audiotools 3 | tqdm 4 | argbind 5 | gradio-client -------------------------------------------------------------------------------- /vampnet/modules/__init__.py: -------------------------------------------------------------------------------- 1 | import audiotools 2 | 3 | audiotools.ml.BaseModel.INTERN += ["vampnet.modules.**"] 4 | audiotools.ml.BaseModel.EXTERN += ["einops", "flash_attn.flash_attention", "loralib"] 5 | 6 | from .transformer import VampNet -------------------------------------------------------------------------------- /unloop/max/choose_from_list.js: -------------------------------------------------------------------------------- 1 | subdivs = [0.125, 0.25, 0.5, 1, 2, 4]; 2 | subdivs = subdivs.map(function(x) { return x; }); 3 | 4 | function bang() { 5 | var i = Math.floor(Math.random() * subdivs.length); 6 | outlet(0, subdivs[i]); 7 | } -------------------------------------------------------------------------------- /conf/generated/cat/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/cat/coarse 6 | train/AudioLoader.sources: &id001 7 | - scratch/cat-audio 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/cat10/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/cat10/coarse 6 | train/AudioLoader.sources: &id001 7 | - scratch/cat-audio-10s 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/saxophone/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/saxophone/coarse 6 | train/AudioLoader.sources: &id001 7 | - scratch/sounds 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/ivo/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/ivo/coarse 6 | train/AudioLoader.sources: &id001 7 | - ./scratch/miguel/ivo/separated 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/march-31/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/march-31/coarse 6 | train/AudioLoader.sources: &id001 7 | - sound-journal-march-31 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/lazaro-ros/coarse.yml: 
-------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/lazaro-ros/coarse 6 | train/AudioLoader.sources: &id001 7 | - ./scratch/miguel/lazaro-ros 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/le-poisson-steve/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/le-poisson-steve/coarse 6 | train/AudioLoader.sources: &id001 7 | - scratch/steve 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/sax-new/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/sax-new/coarse 6 | train/AudioLoader.sources: &id001 7 | - ./scratch/miguel/saxophone-new/ 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/vampnet.yml 3 | 4 | VampNet.n_codebooks: 14 5 | VampNet.n_conditioning_codebooks: 4 6 | 7 | VampNet.embedding_dim: 1280 8 | VampNet.n_layers: 16 9 | VampNet.n_heads: 20 10 | 11 | AudioDataset.duration: 3.0 12 | 13 | 14 | AudioDataset.loudness_cutoff: -40.0 15 | -------------------------------------------------------------------------------- /conf/generated/lazaro-ros-sep/coarse.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | fine_tune: true 4 | fine_tune_checkpoint: ./models/vampnet/coarse.pth 5 | save_path: ./runs/lazaro-ros-sep/coarse 6 | train/AudioLoader.sources: &id001 7 | - ./scratch/miguel/lazaro-ros/separated 8 | val/AudioLoader.sources: *id001 9 | -------------------------------------------------------------------------------- /conf/generated/cat/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - scratch/cat-audio 3 | Interface.coarse2fine_ckpt: ./runs/cat/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/cat/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/cat10/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - scratch/cat-audio-10s 3 | Interface.coarse2fine_ckpt: ./runs/cat10/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/cat10/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/ivo/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - ./scratch/miguel/ivo/separated 3 | Interface.coarse2fine_ckpt: 
./runs/ivo/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/ivo/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/saxophone/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - scratch/sounds 3 | Interface.coarse2fine_ckpt: ./runs/saxophone/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/saxophone/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/march-31/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - sound-journal-march-31 3 | Interface.coarse2fine_ckpt: ./runs/march-31/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/march-31/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/sax-new/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - ./scratch/miguel/saxophone-new/ 3 | Interface.coarse2fine_ckpt: ./runs/sax-new/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/sax-new/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/lazaro-ros/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - ./scratch/miguel/lazaro-ros 3 | Interface.coarse2fine_ckpt: ./runs/lazaro-ros/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/lazaro-ros/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/le-poisson-steve/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - scratch/steve 3 | Interface.coarse2fine_ckpt: ./runs/le-poisson-steve/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/le-poisson-steve/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/generated/lazaro-ros-sep/interface.yml: -------------------------------------------------------------------------------- 1 | AudioLoader.sources: 2 | - - ./scratch/miguel/lazaro-ros/separated 3 | Interface.coarse2fine_ckpt: ./runs/lazaro-ros-sep/c2f/latest/vampnet/weights.pth 4 | Interface.coarse_ckpt: ./runs/lazaro-ros-sep/coarse/latest/vampnet/weights.pth 5 | Interface.codec_ckpt: ./models/vampnet/codec.pth 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | -------------------------------------------------------------------------------- /conf/interface.yml: 
-------------------------------------------------------------------------------- 1 | Interface.coarse_ckpt: ./models/vampnet/coarse.pth 2 | Interface.coarse2fine_ckpt: ./models/vampnet/c2f.pth 3 | Interface.codec_ckpt: ./models/vampnet/codec.pth 4 | Interface.coarse_chunk_size_s: 10 5 | Interface.coarse2fine_chunk_size_s: 3 6 | Interface.wavebeat_ckpt: ./models/wavebeat.pth 7 | 8 | # AudioLoader.sources: 9 | # - /media/CHONK/null 10 | 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | argbind>=0.3.2 3 | numpy==1.23 4 | loralib 5 | wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat 6 | lac @ git+https://github.com/hugofloresgarcia/lac.git 7 | descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git 8 | -e git+https://github.com/audacitorch/pyharp.git@develop#egg=pyharp 9 | torch_pitch_shift 10 | gradio 11 | pydantic==2.10.6 -------------------------------------------------------------------------------- /update-repos.sh: -------------------------------------------------------------------------------- 1 | # 2 | repos=( "vampnet-music" "vampnet-percussion" "vampnet-n64" "vampnet-birds" "vampnet-choir" "vampnet-machines" "nesquik" "vampnet-opera") 3 | for repo in "${repos[@]}" 4 | do 5 | echo "Updating $repo" 6 | git remote add --fetch $repo https://huggingface.co/spaces/hugggof/$repo 7 | git push --force $repo main 8 | done 9 | 10 | # https://huggingface.co/spaces/hugggof/vampnet-music 11 | # git push --space-percussion main -------------------------------------------------------------------------------- /conf/generated/cat/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/cat/c2f 13 | train/AudioLoader.sources: &id001 14 | - scratch/cat-audio 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/cat10/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/cat10/c2f 13 | train/AudioLoader.sources: &id001 14 | - scratch/cat-audio-10s 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/ivo/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/ivo/c2f 13 | train/AudioLoader.sources: &id001 14 | - 
./scratch/miguel/ivo/separated 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/saxophone/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/saxophone/c2f 13 | train/AudioLoader.sources: &id001 14 | - scratch/sounds 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/march-31/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/march-31/c2f 13 | train/AudioLoader.sources: &id001 14 | - sound-journal-march-31 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/le-poisson-steve/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/le-poisson-steve/c2f 13 | train/AudioLoader.sources: &id001 14 | - scratch/steve 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/sax-new/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/sax-new/c2f 13 | train/AudioLoader.sources: &id001 14 | - ./scratch/miguel/saxophone-new/ 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/generated/lazaro-ros/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/lazaro-ros/c2f 13 | train/AudioLoader.sources: &id001 14 | - ./scratch/miguel/lazaro-ros 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- 
/conf/generated/lazaro-ros-sep/c2f.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/lora/lora.yml 3 | AudioDataset.duration: 3.0 4 | AudioDataset.loudness_cutoff: -40.0 5 | VampNet.embedding_dim: 1280 6 | VampNet.n_codebooks: 14 7 | VampNet.n_conditioning_codebooks: 4 8 | VampNet.n_heads: 20 9 | VampNet.n_layers: 16 10 | fine_tune: true 11 | fine_tune_checkpoint: ./models/vampnet/c2f.pth 12 | save_path: ./runs/lazaro-ros-sep/c2f 13 | train/AudioLoader.sources: &id001 14 | - ./scratch/miguel/lazaro-ros/separated 15 | val/AudioLoader.sources: *id001 16 | -------------------------------------------------------------------------------- /conf/lora/lora.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/vampnet.yml 3 | 4 | fine_tune: True 5 | 6 | train/AudioDataset.n_examples: 100000000 7 | val/AudioDataset.n_examples: 500 8 | 9 | 10 | NoamScheduler.warmup: 500 11 | 12 | batch_size: 7 13 | num_workers: 7 14 | save_iters: [2000, 4000, 10000, 20000, 40000, 100000] 15 | sample_freq: 2000 16 | val_freq: 1000 17 | 18 | AdamW.lr: 0.0001 19 | 20 | # lets us organize sound classes into folders and choose from those sound classes uniformly 21 | AudioDataset.without_replacement: False 22 | num_iters: 500000 23 | -------------------------------------------------------------------------------- /conf/lora/lora-s2s.yml: -------------------------------------------------------------------------------- 1 | $include: 2 | - conf/vampnet.yml 3 | 4 | fine_tune: True 5 | 6 | train/AudioDataset.n_examples: 100000000 7 | val/AudioDataset.n_examples: 500 8 | 9 | 10 | NoamScheduler.warmup: 500 11 | 12 | batch_size: 7 13 | num_workers: 7 14 | save_iters: [2000, 4000, 10000, 20000, 40000, 100000] 15 | sample_freq: 2000 16 | val_freq: 1000 17 | 18 | AdamW.lr: 0.0001 19 | 20 | # lets us organize sound classes into folders and choose from those sound classes uniformly 21 | AudioDataset.without_replacement: False 22 | num_iters: 500000 23 | 24 | 25 | # control signals to use as conditioning.
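# a hedged sketch of conditioning on more than one control signal (the 'centroid' key name below is an assumption, not confirmed by this repo): # Sketch2SoundController.ctrl_keys: ['rmsq16', 'centroid']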
26 | Sketch2SoundController.ctrl_keys: ['rmsq16',] 27 | 28 | -------------------------------------------------------------------------------- /scripts/utils/stage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | import argbind 6 | import rich 7 | from audiotools.ml import Experiment 8 | 9 | 10 | @argbind.bind(without_prefix=True) 11 | def run( 12 | run_dir: str = os.getenv("PATH_TO_RUNS", "runs"), 13 | name: str = None, 14 | recent: bool = False, 15 | ): 16 | if recent: 17 | paths = sorted(Path(run_dir).iterdir(), key=os.path.getmtime) 18 | paths = [p.name for p in paths if p.is_dir()] 19 | if paths: 20 | name = paths[-1] 21 | 22 | with Experiment(run_dir, name) as exp: 23 | exp.snapshot() 24 | rich.print(f"Created a snapshot of {exp.parent_directory} at {exp.exp_dir}") 25 | 26 | 27 | if __name__ == "__main__": 28 | args = argbind.parse_args() 29 | with argbind.scope(args): 30 | run() 31 | -------------------------------------------------------------------------------- /scripts/utils/remove_quiet_files.py: -------------------------------------------------------------------------------- 1 | # removes files with loudness below a threshold (default: -30 dB) 2 | 3 | from pathlib import Path 4 | import shutil 5 | import audiotools as at 6 | import argbind 7 | 8 | @argbind.bind(without_prefix=True) 9 | def remove_quiet_files( 10 | src_dir: Path = None, 11 | dest_dir: Path = None, 12 | min_loudness: float = -30, 13 | ): 14 | # copy src to dest 15 | dest_dir.mkdir(parents=True, exist_ok=True) 16 | shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True) 17 | 18 | audio_files = at.util.find_audio(dest_dir) 19 | for audio_file in audio_files: 20 | sig = at.AudioSignal(audio_file) 21 | if sig.loudness() < min_loudness: 22 | audio_file.unlink() 23 | print(f"removed {audio_file}") 24 | 25 | if __name__ == "__main__": 26 | args = argbind.parse_args() 27 | 28 | with argbind.scope(args): 29 | remove_quiet_files() -------------------------------------------------------------------------------- /scripts/utils/split_long_audio_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import argbind 3 | 4 | import audiotools as at 5 | import tqdm 6 | 7 | 8 | @argbind.bind(without_prefix=True) 9 | def split_long_audio_file( 10 | file: str = None, 11 | max_chunk_size_s: int = 60*10 12 | ): 13 | file = Path(file) 14 | output_dir = file.parent / file.stem 15 | output_dir.mkdir() 16 | 17 | sig = at.AudioSignal(file) 18 | 19 | # split into chunks 20 | for i, chunk in tqdm.tqdm(enumerate(sig.windows( 21 | window_duration=max_chunk_size_s, hop_duration=max_chunk_size_s/2, 22 | preprocess=True)) 23 | ): 24 | chunk.write(output_dir / f"{i}.wav") 25 | 26 | print(f"wrote {len(list(output_dir.glob('*.wav')))} files to {output_dir}") 27 | 28 | return output_dir 29 | 30 | if __name__ == "__main__": 31 | args = argbind.parse_args() 32 | 33 | with argbind.scope(args): 34 | split_long_audio_file() -------------------------------------------------------------------------------- /scripts/utils/README.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | ## process_zip.py 4 | 5 | Some requirements that may not be installed in the docker image: 6 | * argbind 7 | * wav2wav (pip install git+https://github.com/descriptinc/lyrebird-wav2wav.git or `pip install git+https://github.com/descriptinc/lyrebird-wav2wav.git@`) 8 | 9 | ### zip folder
structure 10 | 11 | The zip folder should have the following internal structure: 12 | 13 | ``` 14 | base_folder/ 15 | test_case_1/ 16 | before.wav 17 | test_case_2/ 18 | before.wav 19 | ... 20 | test_case_n/ 21 | before.wav 22 | ``` 23 | 24 | Note: There can be issues with the output zip if the input zip folder structure is too deep or too shallow. If you want/need to use a zip file with a different folder structure, adjust this: 25 | https://github.com/descriptinc/lyrebird-wav2wav/blob/136c923ce19df03876a515ca0ed83854710cfa30/scripts/utils/process_zip.py#L28 26 | 27 | ### Execution 28 | `python process_zip.py -tag ` 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Hugo Flores García and Prem Seetharaman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
-------------------------------------------------------------------------------- /conf/vampnet.yml: -------------------------------------------------------------------------------- 1 | 2 | codec_ckpt: ./models/vampnet/codec.pth 3 | save_path: ckpt 4 | 5 | num_iters: 1000000000 6 | save_iters: [10000, 50000, 100000, 300000, 500000] 7 | val_idx: [0,1,2,3,4,5,6,7,8,9] 8 | sample_freq: 10000 9 | val_freq: 1000 10 | 11 | batch_size: 8 12 | num_workers: 10 13 | 14 | # Optimization 15 | amp: false 16 | 17 | CrossEntropyLoss.label_smoothing: 0.1 18 | 19 | AdamW.lr: 0.001 20 | 21 | NoamScheduler.factor: 2.0 22 | NoamScheduler.warmup: 10000 23 | 24 | VampNet.vocab_size: 1024 25 | VampNet.n_codebooks: 4 26 | VampNet.n_conditioning_codebooks: 0 27 | VampNet.r_cond_dim: 0 28 | VampNet.noise_mode: mask 29 | VampNet.embedding_dim: 1280 30 | VampNet.n_layers: 20 31 | VampNet.n_heads: 20 32 | VampNet.flash_attn: false 33 | VampNet.dropout: 0.1 34 | 35 | AudioLoader.relative_path: "" 36 | AudioDataset.loudness_cutoff: -30.0 37 | AudioDataset.without_replacement: true 38 | AudioLoader.shuffle: true 39 | 40 | AudioDataset.duration: 10.0 41 | 42 | train/AudioDataset.n_examples: 10000000 43 | train/AudioLoader.sources: 44 | - /media/CHONK/hugo/spotdl/audio-train 45 | 46 | val/AudioDataset.n_examples: 2000 47 | val/AudioLoader.sources: 48 | - /media/CHONK/hugo/spotdl/audio-val 49 | 50 | -------------------------------------------------------------------------------- /scripts/utils/huggingface/push_to_repos.sh: -------------------------------------------------------------------------------- 1 | # the (remote repo, model_name) pairs are: 2 | # vampnet-music (default) 3 | # vampnet-percussion (percussion) 4 | # vampnet-choir ('choir') 5 | # etc. for: 6 | # 'machines' 7 | # 'n64' 8 | # 'opera' 9 | # 'percussion' 10 | 11 | # iterate through remote, model_name pairs: 12 | # and edit the DEFAULT_MODEL file in the repo 13 | # add, commit, and push to the right remote 14 | # each remote starts with https://huggingface.co/spaces/hugggof/{repo_name} 15 | 16 | for repo in vampnet-music vampnet-percussion vampnet-choir vampnet-machines vampnet-n64 vampnet-opera 17 | do 18 | echo "repo: $repo" 19 | # get the model name from the repo 20 | model_name=$(echo $repo | cut -d'-' -f2) 21 | # if the model_name is music, set it to default 22 | if [ $model_name == "music" ]; then 23 | model_name="default" 24 | fi 25 | echo "model_name: $model_name" 26 | # remove the DEFAULT_MODEL file 27 | rm DEFAULT_MODEL 28 | # create a new DEFAULT_MODEL file with the model name 29 | echo $model_name > DEFAULT_MODEL 30 | 31 | # commit and push to the right remote 32 | git add DEFAULT_MODEL 33 | git commit -m "update DEFAULT_MODEL to $model_name" 34 | git remote remove $repo 35 | git remote add $repo https://huggingface.co/spaces/hugggof/$repo 36 | git push $repo main 37 | done -------------------------------------------------------------------------------- /hello.py: -------------------------------------------------------------------------------- 1 | import random 2 | import vampnet 3 | import audiotools as at 4 | 5 | # load the default vampnet model 6 | interface = vampnet.interface.Interface.default() 7 | 8 | # list available finetuned models 9 | finetuned_model_choices = interface.available_models() 10 | print(f"available finetuned models: {finetuned_model_choices}") 11 | 12 | # pick a random finetuned model 13 | model_choice = random.choice(finetuned_model_choices) 14 | print(f"choosing model: {model_choice}") 15 | 16 | # or pick a specific
finetuned model 17 | print(f"actually, forcing model: default") 18 | model_choice = "default" 19 | 20 | # load a finetuned model 21 | interface.load_finetuned(model_choice) 22 | 23 | # load an example audio file 24 | signal = at.AudioSignal("assets/example.wav") 25 | 26 | # get the tokens for the audio 27 | codes = interface.encode(signal) 28 | 29 | # build a mask for the audio 30 | mask = interface.build_mask( 31 | codes, signal, 32 | periodic_prompt=13, 33 | upper_codebook_mask=3, 34 | ) 35 | 36 | # generate the output tokens 37 | output_tokens = interface.vamp( 38 | codes, mask, return_mask=False, 39 | temperature=1.0, 40 | typical_filtering=False, 41 | debug=True 42 | ) 43 | 44 | # convert them to a signal 45 | output_signal = interface.decode(output_tokens) 46 | 47 | # save the output signal 48 | output_signal.write("scratch/output.wav") -------------------------------------------------------------------------------- /vampnet/scheduler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import List 3 | 4 | import torch 5 | 6 | class NoamScheduler: 7 | """OG scheduler from transformer paper: https://arxiv.org/pdf/1706.03762.pdf 8 | Implementation from Annotated Transformer: https://nlp.seas.harvard.edu/2018/04/03/attention.html 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer: torch.optim.Optimizer, 14 | d_model: int = 512, 15 | factor: float = 1.0, 16 | warmup: int = 4000, 17 | ): 18 | # Store hparams 19 | self.warmup = warmup 20 | self.factor = factor 21 | self.d_model = d_model 22 | 23 | # Initialize variables `lr` and `steps` 24 | self.lr = None 25 | self.steps = 0 26 | 27 | # Store the optimizer 28 | self.optimizer = optimizer 29 | 30 | def state_dict(self): 31 | return { 32 | key: value for key, value in self.__dict__.items() if key != "optimizer" 33 | } 34 | 35 | def load_state_dict(self, state_dict): 36 | self.__dict__.update(state_dict) 37 | 38 | def step(self): 39 | self.steps += 1 40 | self.lr = self.factor * ( 41 | self.d_model ** (-0.5) 42 | * min(self.steps ** (-0.5), self.steps * self.warmup ** (-1.5)) 43 | ) 44 | 45 | for p in self.optimizer.param_groups: 46 | p["lr"] = self.lr 47 | 48 | -------------------------------------------------------------------------------- /vampnet/util.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | 3 | import torch 4 | from einops import rearrange 5 | 6 | def scalar_to_batch_tensor(x, batch_size): 7 | return torch.tensor(x).repeat(batch_size) 8 | 9 | 10 | def parallelize( 11 | fn, 12 | *iterables, 13 | parallel: str = "thread_map", 14 | **kwargs 15 | ): 16 | if parallel == "thread_map": 17 | from tqdm.contrib.concurrent import thread_map 18 | return thread_map( 19 | fn, 20 | *iterables, 21 | **kwargs 22 | ) 23 | elif parallel == "process_map": 24 | from tqdm.contrib.concurrent import process_map 25 | return process_map( 26 | fn, 27 | *iterables, 28 | **kwargs 29 | ) 30 | elif parallel == "single": 31 | return [fn(x) for x in tqdm.tqdm(*iterables)] 32 | else: 33 | raise ValueError(f"parallel must be one of 'thread_map', 'process_map', 'single', but got {parallel}") 34 | 35 | def codebook_flatten(tokens: torch.Tensor): 36 | """ 37 | flatten a sequence of tokens from (batch, codebook, time) to (batch, codebook * time) 38 | """ 39 | return rearrange(tokens, "b c t -> b (t c)") 40 | 41 | def codebook_unflatten(flat_tokens: torch.Tensor, n_c: int = None): 42 | """ 43 | unflatten a sequence of tokens from (batch, 
codebook * time) to (batch, codebook, time) 44 | """ 45 | tokens = rearrange(flat_tokens, "b (t c) -> b c t", c=n_c) 46 | return tokens 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | with open("README.md") as f: 5 | long_description = f.read() 6 | 7 | setup( 8 | name="vampnet", 9 | version="0.0.1", 10 | classifiers=[ 11 | "Intended Audience :: Developers", 12 | "Natural Language :: English", 13 | "Programming Language :: Python :: 3.7", 14 | "Topic :: Artistic Software", 15 | "Topic :: Multimedia", 16 | "Topic :: Multimedia :: Sound/Audio", 17 | "Topic :: Multimedia :: Sound/Audio :: Editors", 18 | "Topic :: Software Development :: Libraries", 19 | ], 20 | description="Generative Music Modeling.", 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | author="Hugo Flores García, Prem Seetharaman", 24 | author_email="hugggofloresgarcia@gmail.com", 25 | url="https://github.com/hugofloresgarcia/vampnet", 26 | license="MIT", 27 | packages=find_packages(), 28 | install_requires=[ 29 | "torch==2.4.1", 30 | "argbind>=0.3.2", 31 | "numpy==1.23", 32 | "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat", 33 | "lac @ git+https://github.com/hugofloresgarcia/lac.git", 34 | "descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git", 35 | "gradio", 36 | "loralib", 37 | "torch_pitch_shift", 38 | "plotly", 39 | "pydantic==2.10.6", 40 | "spaces", 41 | ], 42 | ) 43 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.bin filter=lfs diff=lfs merge=lfs -text 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text 6 | *.ftz filter=lfs diff=lfs merge=lfs -text 7 | *.gz filter=lfs diff=lfs merge=lfs -text 8 | *.h5 filter=lfs diff=lfs merge=lfs -text 9 | *.joblib filter=lfs diff=lfs merge=lfs -text 10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text 11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text 12 | *.model filter=lfs diff=lfs merge=lfs -text 13 | *.msgpack filter=lfs diff=lfs merge=lfs -text 14 | *.npy filter=lfs diff=lfs merge=lfs -text 15 | *.npz filter=lfs diff=lfs merge=lfs -text 16 | *.onnx filter=lfs diff=lfs merge=lfs -text 17 | *.ot filter=lfs diff=lfs merge=lfs -text 18 | *.parquet filter=lfs diff=lfs merge=lfs -text 19 | *.pb filter=lfs diff=lfs merge=lfs -text 20 | *.pickle filter=lfs diff=lfs merge=lfs -text 21 | *.pkl filter=lfs diff=lfs merge=lfs -text 22 | *.pt filter=lfs diff=lfs merge=lfs -text 23 | *.rar filter=lfs diff=lfs merge=lfs -text 24 | *.safetensors filter=lfs diff=lfs merge=lfs -text 25 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text 26 | *.tar.* filter=lfs diff=lfs merge=lfs -text 27 | *.tar filter=lfs diff=lfs merge=lfs -text 28 | *.tflite filter=lfs diff=lfs merge=lfs -text 29 | *.tgz filter=lfs diff=lfs merge=lfs -text 30 | *.wasm filter=lfs diff=lfs merge=lfs -text 31 | *.xz filter=lfs diff=lfs merge=lfs -text 32 | *.zip filter=lfs diff=lfs merge=lfs -text 33 | *.zst filter=lfs diff=lfs merge=lfs -text 34 | *tfevents* filter=lfs diff=lfs merge=lfs -text 35 | 
-------------------------------------------------------------------------------- /token_telephone/ttutil.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | ROOT = Path(__file__).parent 4 | 5 | import numpy as np 6 | from queue import Queue 7 | 8 | # make a log file!! 9 | logfile= ROOT / "log.txt" 10 | if logfile.exists(): 11 | logfile.unlink() 12 | logging.basicConfig(filename=logfile, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S", format="%(asctime)s | %(levelname)s | %(message)s") 13 | 14 | 15 | def hsv_to_rgb(h, s, v): 16 | # from https://en.wikipedia.org/wiki/HSL_and_HSV#From_HSV 17 | c = v * s 18 | h_ = h / 60 19 | x = c * (1 - abs(h_ % 2 - 1)) 20 | m = v - c 21 | 22 | if h_ < 1: 23 | r, g, b = c, x, 0 24 | elif h_ < 2: 25 | r, g, b = x, c, 0 26 | elif h_ < 3: 27 | r, g, b = 0, c, x 28 | elif h_ < 4: 29 | r, g, b = 0, x, c 30 | elif h_ < 5: 31 | r, g, b = x, 0, c 32 | else: 33 | r, g, b = c, 0, x 34 | 35 | return r + m, g + m, b + m 36 | 37 | 38 | def dbg(*args): 39 | print(" ".join(map(str, args))) 40 | 41 | 42 | # we'll want to log on a separate thread 43 | # so that we can log without blocking the main thread 44 | 45 | # make a queue for logging 46 | log_queue = Queue() 47 | 48 | # log to a file instead of the console 49 | def log(msg): 50 | # log_queue.put(msg) 51 | logging.info(msg) 52 | pass 53 | 54 | def set_debug(debug): 55 | if debug: 56 | # print log to console 57 | logging.getLogger().addHandler(logging.StreamHandler()) 58 | 59 | 60 | def pow2db(x): 61 | return 10 * np.log10(x + 1e-6) 62 | 63 | 64 | def db2pow(x): 65 | return 10 ** (x / 10) 66 | -------------------------------------------------------------------------------- /vampnet/modules/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from einops import rearrange 7 | 8 | 9 | class NewGELU(nn.Module): 10 | """ 11 | Implementation of the GELU activation function currently in Google BERT repo 12 | (identical to OpenAI GPT). 
Also see the Gaussian Error Linear Units 13 | paper: https://arxiv.org/abs/1606.08415 14 | """ 15 | 16 | def forward(self, x): 17 | return ( 18 | 0.5 19 | * x 20 | * ( 21 | 1.0 22 | + torch.tanh( 23 | math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)) 24 | ) 25 | ) 26 | ) 27 | 28 | class GatedGELU(nn.Module): 29 | def __init__(self): 30 | super().__init__() 31 | self.gelu = NewGELU() 32 | 33 | def forward(self, x, dim: int = -1): 34 | p1, p2 = x.chunk(2, dim=dim) 35 | return p1 * self.gelu(p2) 36 | 37 | class Snake1d(nn.Module): 38 | def __init__(self, channels): 39 | super().__init__() 40 | self.alpha = nn.Parameter(torch.ones(channels)) 41 | 42 | def forward(self, x): 43 | return x + (self.alpha + 1e-9).reciprocal() * torch.sin(self.alpha * x).pow(2) 44 | 45 | def get_activation(name: str = "relu"): 46 | if name == "relu": 47 | return nn.ReLU 48 | elif name == "gelu": 49 | return NewGELU 50 | elif name == "geglu": 51 | return GatedGELU 52 | elif name == "snake": 53 | return Snake1d 54 | else: 55 | raise ValueError(f"Unrecognized activation {name}") -------------------------------------------------------------------------------- /scripts/utils/plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | from pandas.api.types import CategoricalDtype 4 | 5 | def plot_metrics(metrics, condition_to_latex, title, color_palette): 6 | # Add a new column to your dataframe with the latex representation 7 | metrics['condition_latex'] = metrics['condition'].map(condition_to_latex) 8 | 9 | # Order condition_latex as per the condition_to_latex dictionary 10 | cat_type = CategoricalDtype(categories=condition_to_latex.values(), ordered=True) 11 | metrics['condition_latex'] = metrics['condition_latex'].astype(cat_type) 12 | 13 | # Compute mean and std for each condition for each metric 14 | grouped = metrics.groupby('condition_latex')[['mel', 'frechet']].agg(['mean', 'std']) 15 | 16 | fig, axs = plt.subplots(2, 1, figsize=(7, 5.25)) 17 | 18 | # Set the main title for the figure 19 | fig.suptitle(title, fontsize=16) 20 | 21 | # Get color for each bar in the plot 22 | bar_colors = [color_palette[condition] for condition in grouped.index] 23 | 24 | # Plot mel 25 | sns.boxplot(x='condition_latex', y='mel', data=metrics, ax=axs[0], palette=color_palette, showfliers=False) 26 | axs[0].set_ylabel('Mel Spectrogram Loss \u2190') 27 | axs[0].set_xlabel('') # Remove x-axis label 28 | axs[0].set_xticklabels(grouped.index, rotation=0, ha='center') 29 | 30 | # Plot frechet 31 | axs[1].bar(grouped.index, grouped['frechet']['mean'], yerr=grouped['frechet']['std'], color=bar_colors) 32 | axs[1].set_ylabel('FAD \u2190') 33 | axs[1].set_xlabel('') # Remove x-axis label 34 | axs[1].set_xticklabels(grouped.index, rotation=0, ha='center') 35 | 36 | # Adjust the space between plots 37 | plt.subplots_adjust(hspace=0.1) 38 | 39 | # Remove any unnecessary space around the plot 40 | plt.tight_layout(rect=[0, 0, 1, 0.96]) 41 | 42 | # Reduce the space between suptitle and the plot 43 | plt.subplots_adjust(top=0.92) -------------------------------------------------------------------------------- /scripts/utils/split.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import random 3 | import shutil 4 | import os 5 | import json 6 | 7 | import argbind 8 | from tqdm import tqdm 9 | from tqdm.contrib.concurrent import thread_map 10 | 11 | from audiotools.core import 
util 12 | 13 | 14 | @argbind.bind(without_prefix=True) 15 | def train_test_split( 16 | audio_folder: str = ".", 17 | test_size: float = 0.2, 18 | seed: int = 42, 19 | ): 20 | print(f"finding audio") 21 | 22 | audio_folder = Path(audio_folder) 23 | audio_files = util.find_audio(audio_folder) 24 | print(f"found {len(audio_files)} audio files") 25 | 26 | # split according to test_size 27 | n_test = int(len(audio_files) * test_size) 28 | n_train = len(audio_files) - n_test 29 | 30 | # shuffle 31 | random.seed(seed) 32 | random.shuffle(audio_files) 33 | 34 | train_files = audio_files[:n_train] 35 | test_files = audio_files[n_train:] 36 | 37 | 38 | print(f"Train files: {len(train_files)}") 39 | print(f"Test files: {len(test_files)}") 40 | continue_ = input("Continue [yn]? ") or "n" 41 | 42 | if continue_ != "y": 43 | return 44 | 45 | for split, files in ( 46 | ("train", train_files), ("test", test_files) 47 | ): 48 | for file in tqdm(files): 49 | out_file = audio_folder.parent / f"{audio_folder.name}-{split}" / Path(file).name 50 | out_file.parent.mkdir(exist_ok=True, parents=True) 51 | try: 52 | os.symlink(file, out_file) 53 | except FileExistsError: 54 | print(f"File {out_file} already exists, skipping") 55 | 56 | # save split as json 57 | with open(Path(audio_folder) / f"{split}.json", "w") as f: 58 | json.dump([str(f) for f in files], f) 59 | 60 | 61 | 62 | if __name__ == "__main__": 63 | args = argbind.parse_args() 64 | 65 | with argbind.scope(args): 66 | train_test_split() -------------------------------------------------------------------------------- /unloop/_.md: -------------------------------------------------------------------------------- 1 | ## client side setup 2 | clone 3 | ``` 4 | git clone https://github.com/hugofloresgarcia/unsound-objects.git && cd unsound-objects 5 | git checkout unloop 6 | ``` 7 | 8 | install 9 | ``` 10 | conda create -n unsound python=3.10 11 | conda activate unsound 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## server side setup 16 | ssh into malleus 17 | ``` 18 | ssh bryan@malleus.cs.northwestern.edu -L 7860:localhost:7860 19 | ``` 20 | 21 | then leave the malleus window open and start up a new local window 22 | 23 | (kindly ask hugo to launch the gradio on port 7860) 24 | 25 | you can verify that the gradio is running by opening `http://localhost:7860` in your browser 26 | 27 | ## launch the gradio server (vampnet) 28 | you have to run the gradio server running the vampnet model. 29 | (on the remote machine) 30 | ```bash 31 | conda create -n vampnet python=3.10 32 | git clone https://huggingface.co/spaces/hugggof/vampnet-music && cd vampnet-music 33 | pip install -e . 34 | CUDA_VISIBLE_DEVICES=0 python app.py 35 | ``` 36 | 37 | ### launch the gradio server (s2s) 38 | you have to run the gradio server running the audit model. 39 | 40 | (on the remote machine) 41 | ```bash 42 | conda create -n audit python=3.10 43 | cd audit 44 | pip install -r requirements.txt 45 | CUDA_VISIBLE_DEVICES=0 python scripts/text2sfx/demo.py ckpts/adobe-soda/checkpoints/seethara/text2sfx/25-02-18-256ch-8s/ --model latest_ema.pth 46 | ``` 47 | 48 | or for audit-old 49 | ``` 50 | CUDA_VISIBLE_DEVICES=0 python scripts/cdit/demos/voice2sfx.py ckpts/rms-centroid-ppg/latest.pth 51 | ``` 52 | 53 | ## launch the client (laptop) 54 | then launch the client from your local terminal 55 | ``` 56 | python client.py --vampnet_url --s2s_url http://localhost:7860 57 | ``` 58 | 59 | ## max setup 60 | Then...make sure you have installed (in Max) 61 | ``` 62 | flucoma 63 | ``` 64 | 65 | MAKE SURE YOU ARE RUNNING MAX 8.
It is not compatible with Max 9. 66 | 67 | Now open up the right max patch `./max/sound-objects.maxpat`. 68 | 69 | ### text prompts 70 | NOTE: text prompts are from the list here 71 | https://universalcategorysystem.com/ 72 | https://www.dropbox.com/scl/fo/lw1i20cgsm4edsvj3awn1/AP_ZhzG3LlpfFLbX309FbOU?dl=0&e=1&preview=UCS+v8.2.1+Full+List.xlsx&rlkey=wa2onzo0difpew1nze6odztlp 73 | *** HUGO make an empty 'audio' directory in the repo! *** 74 | -------------------------------------------------------------------------------- /scripts/exp/export.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import shutil 4 | import argparse 5 | from vampnet import DEFAULT_HF_MODEL_REPO 6 | from huggingface_hub import create_repo, repo_exists, HfApi 7 | 8 | 9 | 10 | parser = argparse.ArgumentParser(description="Export the fine-tuned model to the repo") 11 | parser.add_argument( 12 | "--name", type=str, default="lazaro-ros-sep", 13 | help="name of the fine-tuned model to export" 14 | ) 15 | parser.add_argument( 16 | "--model", type=str, default="latest", 17 | help="model version to export. check runs/ for available versions" 18 | ) 19 | parser.add_argument( 20 | "--repo", type=str, default=DEFAULT_HF_MODEL_REPO, 21 | help="name of the repo to export to" 22 | ) 23 | 24 | args = parser.parse_args() 25 | name = args.name 26 | version = args.model 27 | 28 | ## 29 | print(f"~~~~~~~~~~~ vampnet export! ~~~~~~~~~~~~") 30 | print(f"exporting {name} version {version} to {args.repo}\n") 31 | 32 | run_dir = Path(f"runs/{name}") 33 | repo_dir = Path("models/vampnet") 34 | 35 | # create our repo 36 | new_repo = False 37 | if not repo_exists(args.repo): 38 | print(f"repo {args.repo} does not exist, creating it") 39 | print(f"creating a repo at {args.repo}") 40 | create_repo(args.repo) 41 | new_repo = True 42 | 43 | paths = [] 44 | for part in ("coarse", "c2f"): 45 | outdir = repo_dir / "loras" / name 46 | outdir.mkdir(parents=True, exist_ok=True) 47 | outpath = outdir / f"{part}.pth" 48 | path = run_dir / part / version / "vampnet" / "weights.pth" 49 | # path.rename(outpath) 50 | shutil.copy(path, outpath) 51 | paths.append(outpath) 52 | print(f"copied {path} to {outpath}") 53 | 54 | print(f"uploading files to {args.repo}") 55 | # upload files to the repo 56 | 57 | # if it's a new repo, let's add the default models too 58 | if new_repo: 59 | paths.extend([repo_dir / "c2f.pth", repo_dir / "coarse.pth", repo_dir / "codec.pth", repo_dir / "wavebeat.pth"]) 60 | 61 | api = HfApi() 62 | 63 | for path in paths: 64 | path_in_repo = str(path.relative_to(repo_dir)) 65 | print(f"uploading {path} to {args.repo}/{path_in_repo}") 66 | api.upload_file( 67 | path_or_fileobj=path, 68 | path_in_repo=path_in_repo, 69 | repo_id=args.repo, 70 | token=True, 71 | commit_message=f"uploading {path_in_repo}", 72 | ) 73 | 74 | 75 | print("done!!! >::0") -------------------------------------------------------------------------------- /vampnet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import modules 3 | from pathlib import Path 4 | from . 
import scheduler 5 | from .interface import Interface 6 | from .modules.transformer import VampNet 7 | 8 | 9 | __version__ = "0.0.1" 10 | 11 | ROOT = Path(__file__).parent.parent 12 | MODELS_DIR = ROOT / "models" / "vampnet" 13 | 14 | from huggingface_hub import hf_hub_download, HfFileSystem 15 | DEFAULT_HF_MODEL_REPO_DIR = ROOT / "DEFAULT_HF_MODEL_REPO" 16 | DEFAULT_HF_MODEL_REPO = DEFAULT_HF_MODEL_REPO_DIR.read_text().strip() 17 | # DEFAULT_HF_MODEL_REPO = "hugggof/vampnet" 18 | FS = HfFileSystem() 19 | 20 | def download_codec(): 21 | # from dac.model.dac import DAC 22 | from lac.model.lac import LAC as DAC 23 | repo_id = DEFAULT_HF_MODEL_REPO 24 | filename = "codec.pth" 25 | codec_path = hf_hub_download( 26 | repo_id=repo_id, 27 | filename=filename, 28 | subfolder=None, 29 | local_dir=MODELS_DIR 30 | ) 31 | return codec_path 32 | 33 | 34 | def download_default(): 35 | filenames = ["coarse.pth", "c2f.pth", "wavebeat.pth"] 36 | repo_id = DEFAULT_HF_MODEL_REPO 37 | paths = [] 38 | for filename in filenames: 39 | path = f"{MODELS_DIR}/{filename}" 40 | if not Path(path).exists(): 41 | print(f"{path} does not exist, downloading") 42 | FS.download(f"{repo_id}/{filename}", path) 43 | paths.append(path) 44 | 45 | # load the models 46 | return paths[0], paths[1] 47 | 48 | 49 | def download_finetuned(name, repo_id=DEFAULT_HF_MODEL_REPO): 50 | filenames = ["coarse.pth", "c2f.pth"] 51 | paths = [] 52 | for filename in filenames: 53 | path = f"{MODELS_DIR}/loras/{name}/{filename}" 54 | if not Path(path).exists(): 55 | print(f"{path} does not exist, downloading") 56 | FS.download(f"{repo_id}/loras/{name}/{filename}", path) 57 | paths.append(path) 58 | 59 | # load the models 60 | return paths[0], paths[1] 61 | 62 | def list_finetuned(repo_id=DEFAULT_HF_MODEL_REPO): 63 | diritems = FS.listdir(f"{repo_id}/loras") 64 | # iterate through all the names 65 | valid_diritems = [] 66 | for item in diritems: 67 | model_file_items = FS.listdir(item["name"]) 68 | item_names = [item["name"].split("/")[-1] for item in model_file_items] 69 | # check that theres a "c2f.pth" and "coarse.pth" in the items 70 | c2f_exists = "c2f.pth" in item_names 71 | coarse_exists = "coarse.pth" in item_names 72 | if c2f_exists and coarse_exists: 73 | valid_diritems.append(item) 74 | 75 | # get the names of the valid items 76 | names = [item["name"].split("/")[-1] for item in valid_diritems] 77 | return names 78 | 79 | 80 | -------------------------------------------------------------------------------- /scratch/rms_mask.txt: -------------------------------------------------------------------------------- 1 | 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 2 | 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 3 | 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 4 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 11 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 12 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 13 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 14 | 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 15 | -------------------------------------------------------------------------------- /unloop/max/randint.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 59.0, 106.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | "toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "comment" : "", 44 | "id" : "obj-4", 45 | "index" : 1, 46 | "maxclass" : "outlet", 47 | "numinlets" : 1, 48 | "numoutlets" : 0, 49 | "patching_rect" : [ 56.0, 199.0, 30.0, 30.0 ] 50 | } 51 | 52 | } 53 | , { 54 | "box" : { 55 | "id" : "obj-3", 56 | "maxclass" : "newobj", 57 | "numinlets" : 2, 58 | "numoutlets" : 1, 59 | "outlettype" : [ "" ], 60 | "patching_rect" : [ 56.0, 127.0, 63.0, 
22.0 ], 61 | "text" : "random 1." 62 | } 63 | 64 | } 65 | , { 66 | "box" : { 67 | "comment" : "bang for a random number", 68 | "id" : "obj-2", 69 | "index" : 1, 70 | "maxclass" : "inlet", 71 | "numinlets" : 0, 72 | "numoutlets" : 1, 73 | "outlettype" : [ "bang" ], 74 | "patching_rect" : [ 56.0, 77.0, 30.0, 30.0 ] 75 | } 76 | 77 | } 78 | , { 79 | "box" : { 80 | "id" : "obj-1", 81 | "maxclass" : "newobj", 82 | "numinlets" : 6, 83 | "numoutlets" : 1, 84 | "outlettype" : [ "" ], 85 | "patching_rect" : [ 56.0, 159.0, 130.0, 22.0 ], 86 | "text" : "scale 0. 1. #1 #2 #3" 87 | } 88 | 89 | } 90 | ], 91 | "lines" : [ { 92 | "patchline" : { 93 | "destination" : [ "obj-4", 0 ], 94 | "source" : [ "obj-1", 0 ] 95 | } 96 | 97 | } 98 | , { 99 | "patchline" : { 100 | "destination" : [ "obj-3", 0 ], 101 | "source" : [ "obj-2", 0 ] 102 | } 103 | 104 | } 105 | , { 106 | "patchline" : { 107 | "destination" : [ "obj-1", 0 ], 108 | "source" : [ "obj-3", 0 ] 109 | } 110 | 111 | } 112 | ] 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /unloop/max/randrange.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 59.0, 106.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | "toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "comment" : "", 44 | "id" : "obj-4", 45 | "index" : 0, 46 | "maxclass" : "outlet", 47 | "numinlets" : 1, 48 | "numoutlets" : 0, 49 | "patching_rect" : [ 77.0, 218.0, 30.0, 30.0 ] 50 | } 51 | 52 | } 53 | , { 54 | "box" : { 55 | "id" : "obj-3", 56 | "maxclass" : "newobj", 57 | "numinlets" : 6, 58 | "numoutlets" : 1, 59 | "outlettype" : [ "" ], 60 | "patching_rect" : [ 77.0, 177.0, 97.0, 22.0 ], 61 | "text" : "scale 0. 1. #1 #2" 62 | } 63 | 64 | } 65 | , { 66 | "box" : { 67 | "id" : "obj-2", 68 | "maxclass" : "newobj", 69 | "numinlets" : 2, 70 | "numoutlets" : 1, 71 | "outlettype" : [ "" ], 72 | "patching_rect" : [ 77.0, 137.0, 63.0, 22.0 ], 73 | "text" : "random 1." 
74 | } 75 | 76 | } 77 | , { 78 | "box" : { 79 | "comment" : "", 80 | "id" : "obj-1", 81 | "index" : 0, 82 | "maxclass" : "inlet", 83 | "numinlets" : 0, 84 | "numoutlets" : 1, 85 | "outlettype" : [ "" ], 86 | "patching_rect" : [ 77.0, 88.0, 30.0, 30.0 ] 87 | } 88 | 89 | } 90 | ], 91 | "lines" : [ { 92 | "patchline" : { 93 | "destination" : [ "obj-2", 0 ], 94 | "source" : [ "obj-1", 0 ] 95 | } 96 | 97 | } 98 | , { 99 | "patchline" : { 100 | "destination" : [ "obj-3", 0 ], 101 | "source" : [ "obj-2", 0 ] 102 | } 103 | 104 | } 105 | , { 106 | "patchline" : { 107 | "destination" : [ "obj-4", 0 ], 108 | "source" : [ "obj-3", 0 ] 109 | } 110 | 111 | } 112 | ], 113 | "dependency_cache" : [ ], 114 | "autosave" : 0 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /scripts/exp/fine_tune.py: -------------------------------------------------------------------------------- 1 | import argbind 2 | from pathlib import Path 3 | import yaml 4 | from typing import List 5 | 6 | 7 | 8 | 9 | """example output: (yaml) 10 | 11 | """ 12 | 13 | @argbind.bind(without_prefix=True, positional=True) 14 | def fine_tune(audio_files_or_folders: List[str], name: str): 15 | 16 | conf_dir = Path("conf") 17 | assert conf_dir.exists(), "conf directory not found. are you in the vampnet directory?" 18 | 19 | conf_dir = conf_dir / "generated" 20 | conf_dir.mkdir(exist_ok=True) 21 | 22 | finetune_dir = conf_dir / name 23 | finetune_dir.mkdir(exist_ok=True) 24 | 25 | finetune_c2f_conf = { 26 | "$include": ["conf/lora/lora.yml"], 27 | "fine_tune": True, 28 | "train/AudioLoader.sources": audio_files_or_folders, 29 | "val/AudioLoader.sources": audio_files_or_folders, 30 | "VampNet.n_codebooks": 14, 31 | "VampNet.n_conditioning_codebooks": 4, 32 | "VampNet.embedding_dim": 1280, 33 | "VampNet.n_layers": 16, 34 | "VampNet.n_heads": 20, 35 | "AudioDataset.duration": 3.0, 36 | "AudioDataset.loudness_cutoff": -40.0, 37 | "save_path": f"./runs/{name}/c2f", 38 | "fine_tune_checkpoint": "./models/vampnet/c2f.pth" 39 | } 40 | 41 | finetune_coarse_conf = { 42 | "$include": ["conf/lora/lora.yml"], 43 | "fine_tune": True, 44 | "train/AudioLoader.sources": audio_files_or_folders, 45 | "val/AudioLoader.sources": audio_files_or_folders, 46 | "save_path": f"./runs/{name}/coarse", 47 | "fine_tune_checkpoint": "./models/vampnet/coarse.pth" 48 | } 49 | 50 | interface_conf = { 51 | "Interface.coarse_ckpt": f"./runs/{name}/coarse/latest/vampnet/weights.pth", 52 | 53 | "Interface.coarse2fine_ckpt": f"./runs/{name}/c2f/latest/vampnet/weights.pth", 54 | "Interface.wavebeat_ckpt": "./models/wavebeat.pth", 55 | 56 | "Interface.codec_ckpt": "./models/vampnet/codec.pth", 57 | "AudioLoader.sources": [audio_files_or_folders], 58 | } 59 | 60 | # save the confs 61 | with open(finetune_dir / "c2f.yml", "w") as f: 62 | yaml.dump(finetune_c2f_conf, f) 63 | 64 | with open(finetune_dir / "coarse.yml", "w") as f: 65 | yaml.dump(finetune_coarse_conf, f) 66 | 67 | with open(finetune_dir / "interface.yml", "w") as f: 68 | yaml.dump(interface_conf, f) 69 | 70 | 71 | # print(f"generated confs in {finetune_dir}. 
72 | # run training jobs with `python scripts/exp/train.py --args.load {finetune_dir}/.yml` ") 73 | 74 | print(f"generated confs in {finetune_dir}.") 75 | print() 76 | print(f"you'll need to run two training jobs, though they can run in parallel on separate GPUs.") 77 | print(f"run the coarse job with \n\tpython scripts/exp/train.py --args.load {finetune_dir}/coarse.yml\n") 78 | print(f"run the c2f job with \n\tpython scripts/exp/train.py --args.load {finetune_dir}/c2f.yml\n") 79 | if __name__ == "__main__": 80 | args = argbind.parse_args() 81 | 82 | with argbind.scope(args): 83 | fine_tune() 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/env.sh
108 | venv/
109 | env.bak/
110 | venv.bak/
111 | 
112 | # Spyder project settings
113 | .spyderproject
114 | .spyproject
115 | 
116 | # Rope project settings
117 | .ropeproject
118 | 
119 | # mkdocs documentation
120 | /site
121 | 
122 | # mypy
123 | .mypy_cache/
124 | .dmypy.json
125 | dmypy.json
126 | 
127 | # Pyre type checker
128 | .pyre/
129 | 
130 | # Files created by experiments
131 | output/
132 | snapshot/
133 | *.m4a
134 | notebooks/scratch.ipynb
135 | notebooks/inspect.ipynb
136 | notebooks/effects.ipynb
137 | notebooks/*.ipynb
138 | notebooks/*.gif
139 | notebooks/*.wav
140 | notebooks/*.mp4
141 | *runs/
142 | boards/
143 | samples/
144 | *.ipynb
145 | 
146 | results.json
147 | metrics.csv
148 | mprofile_*
149 | mem.png
150 | 
151 | results/
152 | mprofile*
153 | *.png
154 | # do not ignore the test wav file
155 | !tests/audio/short_test_audio.wav
156 | !tests/audio/output.wav
157 | */.DS_Store
158 | .DS_Store
159 | env.sh
160 | _codebraid/
161 | **/*.html
162 | **/*.exec.md
163 | flagged/
164 | log.txt
165 | ckpt/
166 | .syncthing*
167 | tests/assets/
168 | archived/
169 | 
170 | # scratch/
171 | scratch/miguel
172 | scratch/saxophone
173 | scratch/*.wav
174 | 
175 | runs-archive
176 | lyrebird-audiotools
177 | lyrebird-audio-codec
178 | samples-*/**
179 | 
180 | gradio-outputs/
181 | samples*/
182 | models-all/
183 | models.zip
184 | .git-old
185 | 
186 | 
187 | 
188 | gtzan.zip
189 | .gtzan_emb_cache
190 | 
191 | 
192 | data/
193 | data
194 | pyharp
195 | 
196 | models/vampnet/*
197 | models/*
198 | 
199 | lib/
200 | 
201 | _outputs/
202 | debug.txt
203 | 
204 | scratch/*
205 | 
206 | .gradio
--------------------------------------------------------------------------------
/unloop/max/paths.js:
--------------------------------------------------------------------------------
1 | var pathModes = ["off", "wander", "circle", "bounce", "random"];
2 | 
3 | // Define state object
4 | var state = {
5 |     coords: [],
6 |     coordidx: 0,
7 |     mode: "off"
8 | };
9 | 
10 | // init with a random path
11 | setPath("random");
12 | 
13 | // the space ranges from -1 to 1 in x and y, z must always be 0
14 | function bang() {
15 |     // TODO: emit next xyz coordinate in path
16 |     if (state.coords.length > 0) {
17 |         outlet(0, state.coords[state.coordidx]);
18 |         state.coordidx = (state.coordidx + 1) % state.coords.length;
19 |     }
20 |     else {
21 |         post("no path to follow\n");
22 |     }
23 | }
24 | 
25 | function setPath(mode) {
26 |     if (pathModes.indexOf(mode) < 0) {
27 |         post("unknown path mode: " + mode + "\n"); return;
28 |     }
29 |     state.mode = mode;
30 | 
31 |     // generate points for the path
32 |     if (state.mode == "circle") {
33 |         // circle around in a random direction
34 |         state.coords = [];
35 |         var numPoints = Math.round(Math.random() * 100);
36 |         var angle = Math.random() * 2 * Math.PI;
37 | 
38 |         var direction = Math.random() < 0.5 ?
1 : -1; 39 | for (var i = 0; i < numPoints; i++) { 40 | state.coords.push([Math.cos(angle), Math.sin(angle), 0]); 41 | angle += 2 * Math.PI / numPoints * direction; 42 | } 43 | } 44 | else if (state.mode == "wander") { 45 | // wander around in brownian motion 46 | state.coords = []; 47 | var numPoints = Math.round(Math.random() * 100); 48 | // var x = 0; 49 | // var y = 0; 50 | // pick a random starting point within -1 and 1 51 | var x = Math.random() * 2 - 1; 52 | var y = Math.random() * 2 - 1; 53 | for (var i = 0; i < numPoints; i++) { 54 | x += Math.random() * 0.2 - 0.1; // TODO: this 0.1 controls wander amt 55 | y += Math.random() * 0.2 - 0.1; 56 | 57 | // clamp to -1 to 1 58 | x = Math.min(1, Math.max(-1, x)); 59 | y = Math.min(1, Math.max(-1, y)); 60 | state.coords.push([x, y, 0]); 61 | } 62 | } 63 | else if (state.mode == "bounce") { 64 | // bounce around two points 65 | state.coords = []; 66 | var numPoints = 2; 67 | var x = 0; 68 | var y = 0; 69 | 70 | // pick two random quadrants to place the point in 71 | quads = { 72 | 1: [1, 1], 73 | 2: [-1, 1], 74 | 3: [-1, -1], 75 | 4: [1, -1] 76 | } 77 | var quadindices = [1, 2, 3, 4]; 78 | // scramble quadindices 79 | quadindices.sort(function(a, b) { return Math.random() - 0.5; }); 80 | var quadidx1 = quadindices.pop(); 81 | var quadidx2 = quadindices.pop(); 82 | var quad1 = quads[quadidx1]; 83 | var quad2 = quads[quadidx2]; 84 | // post("quad1: " + quad1 + " quad2: " + quad2); 85 | 86 | // pick point 1, a random point in the range (0, 1), then scale by the quad 87 | var x1 = Math.random() * quad1[0]; 88 | var y1 = Math.random() * quad1[1]; 89 | 90 | // pick point 2, a random point in the range (0, 1), then scale by the quad 91 | var x2 = Math.random() * quad2[0]; 92 | var y2 = Math.random() * quad2[1]; 93 | 94 | // generate the path 95 | state.coords.push([x1, y1, 0]); 96 | state.coords.push([x2, y2, 0]); 97 | } 98 | else if (state.mode == "random") { 99 | state.coords = []; 100 | var numPoints = Math.round(Math.random() * 100) + 4; 101 | for (var i = 0; i < numPoints; i++) { 102 | state.coords.push([Math.random() * 2 - 1, Math.random() * 2 - 1, 0]); 103 | } 104 | // post("random now has " + state.coords.length + " points\n"); 105 | } 106 | else { 107 | post("unknown path mode"); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /unloop/max/unloop.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 9, 6 | "minor" : 0, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 84.0, 131.0, 1000.0, 780.0 ], 14 | "gridsize" : [ 15.0, 15.0 ], 15 | "boxes" : [ { 16 | "box" : { 17 | "bgmode" : 0, 18 | "border" : 0, 19 | "clickthrough" : 0, 20 | "enablehscroll" : 0, 21 | "enablevscroll" : 0, 22 | "id" : "obj-2", 23 | "lockeddragscroll" : 0, 24 | "lockedsize" : 0, 25 | "maxclass" : "bpatcher", 26 | "name" : "unloop-bpatcher.maxpat", 27 | "numinlets" : 2, 28 | "numoutlets" : 2, 29 | "offset" : [ 0.0, 0.0 ], 30 | "outlettype" : [ "signal", "" ], 31 | "patching_rect" : [ 100.0, 68.0, 307.0, 469.0 ], 32 | "viewvisibility" : 1 33 | } 34 | 35 | } 36 | , { 37 | "box" : { 38 | "id" : "obj-1", 39 | "maxclass" : "newobj", 40 | "numinlets" : 2, 41 | "numoutlets" : 0, 42 | "patching_rect" : [ 143.0, 616.0, 55.0, 22.0 ], 43 | "text" : "dac~ 1 2" 44 | } 45 | 46 | } 47 | ], 48 | "lines" : [ { 49 | "patchline" : { 50 | 
"destination" : [ "obj-1", 1 ], 51 | "order" : 0, 52 | "source" : [ "obj-2", 0 ] 53 | } 54 | 55 | } 56 | , { 57 | "patchline" : { 58 | "destination" : [ "obj-1", 0 ], 59 | "order" : 1, 60 | "source" : [ "obj-2", 0 ] 61 | } 62 | 63 | } 64 | ], 65 | "originid" : "pat-142", 66 | "parameters" : { 67 | "obj-2::obj-1124" : [ "morph", "dry/wet", 0 ], 68 | "obj-2::obj-1125" : [ "level[8]", "level", 0 ], 69 | "obj-2::obj-1128" : [ "gain[4]", "gain", 0 ], 70 | "obj-2::obj-1140" : [ "overdub", "overdub", 0 ], 71 | "obj-2::obj-117" : [ "live.drop", "live.drop", 0 ], 72 | "obj-2::obj-1230" : [ "speed[2]", "speed+", 0 ], 73 | "obj-2::obj-171" : [ "toggle[2]", "toggle[30]", 0 ], 74 | "obj-2::obj-295" : [ "button[1]", "button[1]", 0 ], 75 | "obj-2::obj-316" : [ "toggle[3]", "toggle[3]", 0 ], 76 | "obj-2::obj-424::obj-12" : [ "number[8]", "number[2]", 0 ], 77 | "obj-2::obj-424::obj-13" : [ "number[9]", "number[3]", 0 ], 78 | "obj-2::obj-424::obj-15" : [ "number[2]", "number[2]", 0 ], 79 | "obj-2::obj-424::obj-19" : [ "number[3]", "number[3]", 0 ], 80 | "obj-2::obj-424::obj-20" : [ "number", "number", 0 ], 81 | "obj-2::obj-424::obj-23" : [ "number[4]", "number[3]", 0 ], 82 | "obj-2::obj-424::obj-26" : [ "number[5]", "number[3]", 0 ], 83 | "obj-2::obj-424::obj-28" : [ "number[6]", "number[2]", 0 ], 84 | "obj-2::obj-424::obj-30" : [ "number[7]", "number[2]", 0 ], 85 | "obj-2::obj-424::obj-347" : [ "periodic", "periodic", 0 ], 86 | "obj-2::obj-424::obj-349" : [ "drop", "drop", 0 ], 87 | "obj-2::obj-424::obj-8" : [ "toggle", "toggle", 0 ], 88 | "obj-2::obj-54" : [ "lpf", "lpf", 0 ], 89 | "obj-2::obj-55" : [ "tapelength", "length", 0 ], 90 | "obj-2::obj-76" : [ "hpf", "hpf", 0 ], 91 | "obj-2::obj-91::obj-156" : [ "live.gain~[26]", "live.gain~", 0 ], 92 | "obj-2::obj-91::obj-162" : [ "live.gain~[25]", "live.gain~", 0 ], 93 | "parameterbanks" : { 94 | "0" : { 95 | "index" : 0, 96 | "name" : "", 97 | "parameters" : [ "-", "-", "-", "-", "-", "-", "-", "-" ] 98 | } 99 | 100 | } 101 | , 102 | "inherited_shortname" : 1 103 | } 104 | , 105 | "dependency_cache" : [ { 106 | "name" : "dry-wet.maxpat", 107 | "bootpath" : "~/projects/research/unloop-2025/vampnet/unloop/max", 108 | "patcherrelativepath" : ".", 109 | "type" : "JSON", 110 | "implicit" : 1 111 | } 112 | , { 113 | "name" : "unloop-bpatcher.maxpat", 114 | "bootpath" : "~/projects/research/unloop-2025/vampnet/unloop/max", 115 | "patcherrelativepath" : ".", 116 | "type" : "JSON", 117 | "implicit" : 1 118 | } 119 | , { 120 | "name" : "vampnet-ui.maxpat", 121 | "bootpath" : "~/projects/research/unloop-2025/vampnet/unloop/max", 122 | "patcherrelativepath" : ".", 123 | "type" : "JSON", 124 | "implicit" : 1 125 | } 126 | ], 127 | "autosave" : 0 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /unloop/max/two-gate.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 59.0, 106.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | 
"toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "comment" : "", 44 | "id" : "obj-4", 45 | "index" : 2, 46 | "maxclass" : "outlet", 47 | "numinlets" : 1, 48 | "numoutlets" : 0, 49 | "patching_rect" : [ 70.491801261901855, 137.704914093017578, 30.0, 30.0 ] 50 | } 51 | 52 | } 53 | , { 54 | "box" : { 55 | "comment" : "", 56 | "id" : "obj-3", 57 | "index" : 1, 58 | "maxclass" : "outlet", 59 | "numinlets" : 1, 60 | "numoutlets" : 0, 61 | "patching_rect" : [ 26.229507446289062, 137.704914093017578, 30.0, 30.0 ] 62 | } 63 | 64 | } 65 | , { 66 | "box" : { 67 | "comment" : "", 68 | "id" : "obj-1", 69 | "index" : 2, 70 | "maxclass" : "inlet", 71 | "numinlets" : 0, 72 | "numoutlets" : 1, 73 | "outlettype" : [ "" ], 74 | "patching_rect" : [ 70.491801261901855, 28.688523769378662, 30.0, 30.0 ] 75 | } 76 | 77 | } 78 | , { 79 | "box" : { 80 | "comment" : "", 81 | "id" : "obj-518", 82 | "index" : 1, 83 | "maxclass" : "inlet", 84 | "numinlets" : 0, 85 | "numoutlets" : 1, 86 | "outlettype" : [ "" ], 87 | "patching_rect" : [ 26.229507446289062, 28.688523769378662, 30.0, 30.0 ] 88 | } 89 | 90 | } 91 | , { 92 | "box" : { 93 | "id" : "obj-517", 94 | "maxclass" : "newobj", 95 | "numinlets" : 2, 96 | "numoutlets" : 1, 97 | "outlettype" : [ "int" ], 98 | "patching_rect" : [ 69.672129154205322, 74.590161800384521, 33.0, 22.0 ], 99 | "text" : "== 0" 100 | } 101 | 102 | } 103 | , { 104 | "box" : { 105 | "id" : "obj-516", 106 | "maxclass" : "newobj", 107 | "numinlets" : 2, 108 | "numoutlets" : 1, 109 | "outlettype" : [ "" ], 110 | "patching_rect" : [ 69.672129154205322, 106.557374000549316, 32.0, 22.0 ], 111 | "text" : "gate" 112 | } 113 | 114 | } 115 | , { 116 | "box" : { 117 | "id" : "obj-515", 118 | "maxclass" : "newobj", 119 | "numinlets" : 2, 120 | "numoutlets" : 1, 121 | "outlettype" : [ "" ], 122 | "patching_rect" : [ 26.229507446289062, 106.557374000549316, 32.0, 22.0 ], 123 | "text" : "gate" 124 | } 125 | 126 | } 127 | ], 128 | "lines" : [ { 129 | "patchline" : { 130 | "destination" : [ "obj-515", 1 ], 131 | "order" : 1, 132 | "source" : [ "obj-1", 0 ] 133 | } 134 | 135 | } 136 | , { 137 | "patchline" : { 138 | "destination" : [ "obj-516", 1 ], 139 | "order" : 0, 140 | "source" : [ "obj-1", 0 ] 141 | } 142 | 143 | } 144 | , { 145 | "patchline" : { 146 | "destination" : [ "obj-3", 0 ], 147 | "source" : [ "obj-515", 0 ] 148 | } 149 | 150 | } 151 | , { 152 | "patchline" : { 153 | "destination" : [ "obj-4", 0 ], 154 | "source" : [ "obj-516", 0 ] 155 | } 156 | 157 | } 158 | , { 159 | "patchline" : { 160 | "destination" : [ "obj-516", 0 ], 161 | "source" : [ "obj-517", 0 ] 162 | } 163 | 164 | } 165 | , { 166 | "patchline" : { 167 | "destination" : [ "obj-515", 0 ], 168 | "order" : 1, 169 | "source" : [ "obj-518", 0 ] 170 | } 171 | 172 | } 173 | , { 174 | "patchline" : { 175 | "destination" : [ "obj-517", 0 ], 176 | "order" : 0, 177 | "source" : [ "obj-518", 0 ] 178 | } 179 | 180 | } 181 | ] 182 | } 183 | 184 | } 185 | -------------------------------------------------------------------------------- /scripts/exp/eval.py: -------------------------------------------------------------------------------- 1 | from pathlib import 
Path 2 | import os 3 | from functools import partial 4 | 5 | from frechet_audio_distance import FrechetAudioDistance 6 | import pandas 7 | import argbind 8 | import torch 9 | from tqdm import tqdm 10 | 11 | import audiotools 12 | from audiotools import AudioSignal 13 | 14 | @argbind.bind(without_prefix=True) 15 | def eval( 16 | exp_dir: str = None, 17 | baseline_key: str = "baseline", 18 | audio_ext: str = ".wav", 19 | ): 20 | assert exp_dir is not None 21 | exp_dir = Path(exp_dir) 22 | assert exp_dir.exists(), f"exp_dir {exp_dir} does not exist" 23 | 24 | # set up our metrics 25 | # sisdr_loss = audiotools.metrics.distance.SISDRLoss() 26 | # stft_loss = audiotools.metrics.spectral.MultiScaleSTFTLoss() 27 | mel_loss = audiotools.metrics.spectral.MelSpectrogramLoss() 28 | frechet = FrechetAudioDistance( 29 | use_pca=False, 30 | use_activation=False, 31 | verbose=True, 32 | audio_load_worker=4, 33 | ) 34 | frechet.model.to("cuda" if torch.cuda.is_available() else "cpu") 35 | 36 | # figure out what conditions we have 37 | conditions = [d.name for d in exp_dir.iterdir() if d.is_dir()] 38 | 39 | assert baseline_key in conditions, f"baseline_key {baseline_key} not found in {exp_dir}" 40 | conditions.remove(baseline_key) 41 | 42 | print(f"Found {len(conditions)} conditions in {exp_dir}") 43 | print(f"conditions: {conditions}") 44 | 45 | baseline_dir = exp_dir / baseline_key 46 | baseline_files = sorted(list(baseline_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem)) 47 | 48 | metrics = [] 49 | for condition in tqdm(conditions): 50 | cond_dir = exp_dir / condition 51 | cond_files = sorted(list(cond_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem)) 52 | 53 | print(f"computing fad for {baseline_dir} and {cond_dir}") 54 | frechet_score = frechet.score(baseline_dir, cond_dir) 55 | 56 | # make sure we have the same number of files 57 | num_files = min(len(baseline_files), len(cond_files)) 58 | baseline_files = baseline_files[:num_files] 59 | cond_files = cond_files[:num_files] 60 | assert len(list(baseline_files)) == len(list(cond_files)), f"number of files in {baseline_dir} and {cond_dir} do not match. {len(list(baseline_files))} vs {len(list(cond_files))}" 61 | 62 | def process(baseline_file, cond_file): 63 | # make sure the files match (same name) 64 | assert baseline_file.stem == cond_file.stem, f"baseline file {baseline_file} and cond file {cond_file} do not match" 65 | 66 | # load the files 67 | baseline_sig = AudioSignal(str(baseline_file)) 68 | cond_sig = AudioSignal(str(cond_file)) 69 | 70 | cond_sig.resample(baseline_sig.sample_rate) 71 | cond_sig.truncate_samples(baseline_sig.length) 72 | 73 | # if our condition is inpainting, we need to trim the conditioning off 74 | if "inpaint" in condition: 75 | ctx_amt = float(condition.split("_")[-1]) 76 | ctx_samples = int(ctx_amt * baseline_sig.sample_rate) 77 | print(f"found inpainting condition. 
trimming off {ctx_samples} samples from {cond_file} and {baseline_file}") 78 | cond_sig.trim(ctx_samples, ctx_samples) 79 | baseline_sig.trim(ctx_samples, ctx_samples) 80 | 81 | return { 82 | # "sisdr": -sisdr_loss(baseline_sig, cond_sig).item(), 83 | # "stft": stft_loss(baseline_sig, cond_sig).item(), 84 | "mel": mel_loss(baseline_sig, cond_sig).item(), 85 | "frechet": frechet_score, 86 | # "visqol": vsq, 87 | "condition": condition, 88 | "file": baseline_file.stem, 89 | } 90 | 91 | print(f"processing {len(baseline_files)} files in {baseline_dir} and {cond_dir}") 92 | metrics.extend(tqdm(map(process, baseline_files, cond_files), total=len(baseline_files))) 93 | 94 | metric_keys = [k for k in metrics[0].keys() if k not in ("condition", "file")] 95 | 96 | 97 | for mk in metric_keys: 98 | stat = pandas.DataFrame(metrics) 99 | stat = stat.groupby(['condition'])[mk].agg(['mean', 'count', 'std']) 100 | stat.to_csv(exp_dir / f"stats-{mk}.csv") 101 | 102 | df = pandas.DataFrame(metrics) 103 | df.to_csv(exp_dir / "metrics-all.csv", index=False) 104 | 105 | 106 | if __name__ == "__main__": 107 | args = argbind.parse_args() 108 | 109 | with argbind.scope(args): 110 | eval() -------------------------------------------------------------------------------- /vampnet/modules/layers.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Optional 3 | from typing import Tuple 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from einops import rearrange 9 | from torch.nn.utils import weight_norm 10 | 11 | # Scripting this brings model speed up 1.4x 12 | @torch.jit.script 13 | def snake(x, alpha): 14 | shape = x.shape 15 | x = x.reshape(shape[0], shape[1], -1) 16 | x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2) 17 | x = x.reshape(shape) 18 | return x 19 | 20 | 21 | class Snake1d(nn.Module): 22 | def __init__(self, channels): 23 | super().__init__() 24 | self.alpha = nn.Parameter(torch.ones(1, channels, 1)) 25 | 26 | def forward(self, x): 27 | return snake(x, self.alpha) 28 | 29 | 30 | def num_params(model): 31 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 32 | 33 | 34 | def recurse_children(module, fn): 35 | for child in module.children(): 36 | if isinstance(child, nn.ModuleList): 37 | for c in child: 38 | yield recurse_children(c, fn) 39 | if isinstance(child, nn.ModuleDict): 40 | for c in child.values(): 41 | yield recurse_children(c, fn) 42 | 43 | yield recurse_children(child, fn) 44 | yield fn(child) 45 | 46 | 47 | def WNConv1d(*args, **kwargs): 48 | return weight_norm(nn.Conv1d(*args, **kwargs)) 49 | 50 | 51 | def WNConvTranspose1d(*args, **kwargs): 52 | return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) 53 | 54 | 55 | class SequentialWithFiLM(nn.Module): 56 | """ 57 | handy wrapper for nn.Sequential that allows FiLM layers to be 58 | inserted in between other layers. 
59 |     """
60 | 
61 |     def __init__(self, *layers):
62 |         super().__init__()
63 |         self.layers = nn.ModuleList(layers)
64 | 
65 |     @staticmethod
66 |     def has_film(module):
67 |         mod_has_film = any(
68 |             [res for res in recurse_children(module, lambda c: isinstance(c, FiLM))]
69 |         )
70 |         return mod_has_film
71 | 
72 |     def forward(self, x, cond):
73 |         for layer in self.layers:
74 |             if self.has_film(layer):
75 |                 x = layer(x, cond)
76 |             else:
77 |                 x = layer(x)
78 |         return x
79 | 
80 | 
81 | class FiLM(nn.Module):
82 |     def __init__(self, input_dim: int, output_dim: int):
83 |         super().__init__()
84 | 
85 |         self.input_dim = input_dim
86 |         self.output_dim = output_dim
87 | 
88 |         if input_dim > 0:
89 |             self.beta = nn.Linear(input_dim, output_dim)
90 |             self.gamma = nn.Linear(input_dim, output_dim)
91 | 
92 |     def forward(self, x, r):
93 |         if self.input_dim == 0:
94 |             return x
95 |         else:
96 |             beta, gamma = self.beta(r), self.gamma(r)
97 |             beta, gamma = (
98 |                 beta.view(x.size(0), self.output_dim, 1),
99 |                 gamma.view(x.size(0), self.output_dim, 1),
100 |             )
101 |             x = x * (gamma + 1) + beta
102 |         return x
103 | 
104 | 
105 | class CodebookEmbedding(nn.Module):
106 |     def __init__(
107 |         self,
108 |         vocab_size: int,
109 |         latent_dim: int,
110 |         n_codebooks: int,
111 |         emb_dim: int,
112 |         special_tokens: Optional[Tuple[str]] = None,
113 |     ):
114 |         super().__init__()
115 |         self.n_codebooks = n_codebooks
116 |         self.emb_dim = emb_dim
117 |         self.latent_dim = latent_dim
118 |         self.vocab_size = vocab_size
119 | 
120 |         if special_tokens is not None:
121 |             # one learned embedding per special token, per codebook
122 |             self.special = nn.ParameterDict(
123 |                 {
124 |                     tkn: nn.Parameter(torch.randn(n_codebooks, self.latent_dim))
125 |                     for tkn in special_tokens
126 |                 }
127 |             )
128 |             self.special_idxs = {
129 |                 tkn: i + vocab_size for i, tkn in enumerate(special_tokens)
130 |             }
131 | 
132 |         self.out_proj = nn.Conv1d(n_codebooks * self.latent_dim, self.emb_dim, 1)
133 | 
134 |     def from_codes(self, codes: torch.Tensor, codec):
135 |         """
136 |         get a sequence of continuous embeddings from a sequence of discrete codes.
137 |         unlike its counterpart in the original VQ-VAE, this function also adds embeddings
138 |         for any special tokens necessary for the language model, like the MASK token.
139 | """ 140 | n_codebooks = codes.shape[1] 141 | latent = [] 142 | for i in range(n_codebooks): 143 | c = codes[:, i, :] 144 | 145 | lookup_table = codec.quantizer.quantizers[i].codebook.weight 146 | if hasattr(self, "special"): 147 | special_lookup = torch.cat( 148 | [self.special[tkn][i : i + 1] for tkn in self.special], dim=0 149 | ) 150 | lookup_table = torch.cat([lookup_table, special_lookup], dim=0) 151 | 152 | l = F.embedding(c, lookup_table).transpose(1, 2) 153 | latent.append(l) 154 | 155 | latent = torch.cat(latent, dim=1) 156 | return latent 157 | 158 | def forward(self, latents: torch.Tensor): 159 | """ 160 | project a sequence of latents to a sequence of embeddings 161 | """ 162 | x = self.out_proj(latents) 163 | return x 164 | 165 | -------------------------------------------------------------------------------- /token_telephone/vamp_helper.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import time 3 | import os 4 | from contextlib import contextmanager 5 | import random 6 | 7 | import numpy as np 8 | import audiotools as at 9 | from audiotools import AudioSignal 10 | import argbind 11 | import shutil 12 | import torch 13 | import yaml 14 | 15 | 16 | from vampnet.interface import Interface, signal_concat 17 | from vampnet import mask as pmask 18 | 19 | from ttutil import log 20 | 21 | # TODO: incorporate discord bot (if mem allows) 22 | # in a separate thread, send audio samples for listening 23 | # and send back the results 24 | # as well as the params for sampling 25 | # also a command that lets you clear the current signal 26 | # if you want to start over 27 | 28 | 29 | device = "cuda" if torch.cuda.is_available() else "cpu" 30 | 31 | VAMPNET_DIR = Path(".").resolve() 32 | 33 | @contextmanager 34 | def chdir(path): 35 | old_dir = os.getcwd() 36 | os.chdir(path) 37 | try: 38 | yield 39 | finally: 40 | os.chdir(old_dir) 41 | 42 | def load_interface(model_choice="default") -> Interface: 43 | with chdir(VAMPNET_DIR): 44 | 45 | 46 | # populate the model choices with any interface.yml files in the generated confs 47 | MODEL_CHOICES = { 48 | "default": { 49 | "Interface.coarse_ckpt": "models/vampnet/coarse.pth", 50 | "Interface.coarse2fine_ckpt": "models/vampnet/c2f.pth", 51 | "Interface.codec_ckpt": "models/vampnet/codec.pth", 52 | } 53 | } 54 | generated_confs = Path("conf/generated") 55 | for conf_file in generated_confs.glob("*/interface.yml"): 56 | with open(conf_file) as f: 57 | _conf = yaml.safe_load(f) 58 | 59 | # check if the coarse, c2f, and codec ckpts exist 60 | # otherwise, dont' add this model choice 61 | if not ( 62 | Path(_conf["Interface.coarse_ckpt"]).exists() and 63 | Path(_conf["Interface.coarse2fine_ckpt"]).exists() and 64 | Path(_conf["Interface.codec_ckpt"]).exists() 65 | ): 66 | continue 67 | 68 | MODEL_CHOICES[conf_file.parent.name] = _conf 69 | 70 | interface = Interface( 71 | device=device, 72 | coarse_ckpt=MODEL_CHOICES[model_choice]["Interface.coarse_ckpt"], 73 | coarse2fine_ckpt=MODEL_CHOICES[model_choice]["Interface.coarse2fine_ckpt"], 74 | codec_ckpt=MODEL_CHOICES[model_choice]["Interface.codec_ckpt"], 75 | ) 76 | 77 | interface.model_choices = MODEL_CHOICES 78 | interface.to("cuda" if torch.cuda.is_available() else "cpu") 79 | return interface 80 | 81 | def load_model(interface: Interface, model_choice: str): 82 | interface.reload( 83 | interface.model_choices[model_choice]["Interface.coarse_ckpt"], 84 | interface.model_choices[model_choice]["Interface.coarse2fine_ckpt"], 85 
| ) 86 | 87 | def ez_variation( 88 | interface, 89 | sig: AudioSignal, 90 | seed: int = None, 91 | model_choice: str = None, 92 | ): 93 | t0 = time.time() 94 | 95 | if seed is None: 96 | seed = int(torch.randint(0, 2**32, (1,)).item()) 97 | at.util.seed(seed) 98 | 99 | # reload the model if necessary 100 | if model_choice is not None: 101 | load_model(interface, model_choice) 102 | 103 | # SAMPLING MASK PARAMS, hard code for now, we'll prob want a more preset-ey thing for the actual thin 104 | # we probably honestly just want to oscillate between the same 4 presets 105 | # in a predictable order such that they have a predictable outcome 106 | periodic_p = random.choice([3]) 107 | n_mask_codebooks = 3 108 | sampletemp = random.choice([1.0,]) 109 | dropout = random.choice([0.0, 0.0]) 110 | 111 | top_p = None # NOTE: top p may be the culprit behind the collapse into single pitches. 112 | 113 | # parameters for the build_mask function 114 | build_mask_kwargs = dict( 115 | rand_mask_intensity=1.0, 116 | prefix_s=0.0, 117 | suffix_s=0.0, 118 | periodic_prompt=int(periodic_p), 119 | periodic_prompt2=int(periodic_p), 120 | periodic_prompt_width=1, 121 | _dropout=dropout, 122 | upper_codebook_mask=int(n_mask_codebooks), 123 | upper_codebook_mask_2=int(n_mask_codebooks), 124 | ) 125 | 126 | # parameters for the vamp function 127 | vamp_kwargs = dict( 128 | temperature=sampletemp, 129 | typical_filtering=True, 130 | typical_mass=0.15, 131 | typical_min_tokens=64, 132 | top_p=top_p, 133 | seed=seed, 134 | sample_cutoff=1.0, 135 | ) 136 | 137 | # save the mask as a txt file 138 | interface.set_chunk_size(10.0) 139 | sig, mask, codes = interface.vamp( 140 | sig, 141 | batch_size=1, 142 | feedback_steps=1, 143 | time_stretch_factor=1, 144 | build_mask_kwargs=build_mask_kwargs, 145 | vamp_kwargs=vamp_kwargs, 146 | return_mask=True, 147 | ) 148 | 149 | log(f"vamp took {time.time() - t0} seconds") 150 | return sig 151 | 152 | 153 | 154 | def main(): 155 | import tqdm 156 | 157 | interface = load_interface() 158 | sig = AudioSignal.excerpt("assets/example.wav", duration=7.0) 159 | sig = interface.preprocess(sig) 160 | sig.write('ttout/in.wav') 161 | insig = sig.clone() 162 | 163 | fdbk_every = 4 164 | fdbk = 0.5 165 | 166 | for i in tqdm.tqdm(range(1000)): 167 | sig = ez_variation(interface, sig, model_choice="orchestral") 168 | sig.write(f'ttout/out{i}.wav') 169 | 170 | 171 | if __name__ == "__main__": 172 | main() -------------------------------------------------------------------------------- /scripts/utils/xeno-canto-dl.py: -------------------------------------------------------------------------------- 1 | from xenopy import Query 2 | 3 | 4 | SPECIES = [ 5 | "American Robin", 6 | "Northern Cardinal", 7 | "Mourning Dove", 8 | "American Crow", 9 | "Baltimore Oriole", 10 | "Blue Jay", 11 | "Eastern Bluebird", 12 | "House Finch", 13 | "American Goldfinch", 14 | "House Sparrow", 15 | "Song Sparrow", 16 | "Tufted Titmouse", 17 | "White-breasted Nuthatch", 18 | "European Starling", 19 | "American Redstart", 20 | "Red-winged Blackbird", 21 | "Brown-headed Cowbird", 22 | "Common Grackle", 23 | "Boat-tailed Grackle", 24 | "Common Yellowthroat", 25 | "Northern Mockingbird", 26 | "Carolina Wren", 27 | "Eastern Meadowlark", 28 | "Chipping Sparrow", 29 | "Tree Swallow", 30 | "Barn Swallow", 31 | "Cliff Swallow", 32 | "Pine Siskin", 33 | "Indigo Bunting", 34 | "Eastern Towhee", 35 | "Carolina Chickadee", 36 | "Great Crested Flycatcher", 37 | "Eastern Wood-Pewee", 38 | "Ovenbird", 39 | "Northern Flicker", 40 | 
"Red-eyed Vireo", 41 | "American Woodcock", 42 | "Eastern Phoebe", 43 | "Downy Woodpecker", 44 | "Scarlet Tanager", 45 | "Yellow Warbler", 46 | "White-eyed Vireo", 47 | "Common Loon", 48 | "White-throated Sparrow", 49 | "Yellow-throated Vireo", 50 | "Great Blue Heron", 51 | "Belted Kingfisher", 52 | "Pied-billed Grebe", 53 | "Wild Turkey", 54 | "Wood Thrush", 55 | "Rose-breasted Grosbeak", 56 | "Field Sparrow", 57 | "Hooded Warbler", 58 | "Northern Parula", 59 | "Chestnut-sided Warbler", 60 | "Blue-winged Warbler", 61 | "Red-bellied Woodpecker", 62 | "Yellow-billed Cuckoo", 63 | "Gray Catbird", 64 | "Northern Saw-whet Owl", 65 | "Osprey", 66 | "Common Nighthawk", 67 | "Broad-winged Hawk", 68 | "Black-throated Green Warbler", 69 | "Great Horned Owl", 70 | "Common Raven", 71 | "Barred Owl", 72 | "Canada Warbler", 73 | "Magnolia Warbler", 74 | "Black-and-white Warbler", 75 | "Eastern Kingbird", 76 | "Swainson's Thrush", 77 | "Worm-eating Warbler", 78 | "Prairie Warbler", 79 | "Baltimore Oriole", 80 | "Black-throated Blue Warbler", 81 | "Louisiana Waterthrush", 82 | "Blackburnian Warbler", 83 | "Black-capped Chickadee", 84 | "Cerulean Warbler", 85 | "Red-shouldered Hawk", 86 | "Cooper's Hawk", 87 | "Yellow-throated Warbler", 88 | "Blue-headed Vireo", 89 | "Blackpoll Warbler", 90 | "Ruffed Grouse", 91 | "Kentucky Warbler", 92 | "Hermit Thrush", 93 | "Cedar Waxwing", 94 | "Eastern Screech-Owl", 95 | "Northern Goshawk", 96 | "Green Heron", 97 | "Red-tailed Hawk", 98 | "Black Vulture", 99 | "Hairy Woodpecker", 100 | "Golden-crowned Kinglet", 101 | "Ruby-crowned Kinglet", 102 | "Bicknell's Thrush", 103 | "Blue-gray Gnatcatcher", 104 | "Veery", 105 | "Pileated Woodpecker", 106 | "Purple Finch", 107 | "White-crowned Sparrow", 108 | "Snow Bunting", 109 | "Pine Grosbeak", 110 | "American Tree Sparrow", 111 | "Dark-eyed Junco", 112 | "Snowy Owl", 113 | "White-winged Crossbill", 114 | "Red Crossbill", 115 | "Common Redpoll", 116 | "Northern Shrike", 117 | "Northern Harrier", 118 | "Rough-legged Hawk", 119 | "Long-eared Owl", 120 | "Evening Grosbeak", 121 | "Northern Pintail", 122 | "American Black Duck", 123 | "Mallard", 124 | "Canvasback", 125 | "Redhead", 126 | "Ring-necked Duck", 127 | "Greater Scaup", 128 | "Lesser Scaup", 129 | "Bufflehead", 130 | "Common Goldeneye", 131 | "Hooded Merganser", 132 | "Common Merganser", 133 | "Red-breasted Merganser", 134 | "Ruddy Duck", 135 | "Wood Duck", 136 | "Gadwall", 137 | "American Wigeon", 138 | "Northern Shoveler", 139 | "Green-winged Teal", 140 | "Blue-winged Teal", 141 | "Cinnamon Teal", 142 | "Ringed Teal", 143 | "Cape Teal", 144 | "Northern Fulmar", 145 | "Yellow-billed Loon", 146 | "Red-throated Loon", 147 | "Arctic Loon", 148 | "Pacific Loon", 149 | "Horned Grebe", 150 | "Red-necked Grebe", 151 | "Eared Grebe", 152 | "Western Grebe", 153 | "Clark's Grebe", 154 | "Double-crested Cormorant", 155 | "Pelagic Cormorant", 156 | "Great Cormorant", 157 | "American White Pelican", 158 | "Brown Pelican", 159 | "Brandt's Cormorant", 160 | "Least Bittern", 161 | "Great Egret", 162 | "Snowy Egret", 163 | "Little Blue Heron", 164 | "Tricolored Heron", 165 | "Reddish Egret", 166 | "Black-crowned Night-Heron", 167 | "Yellow-crowned Night-Heron", 168 | "White Ibis", 169 | "Glossy Ibis", 170 | "Roseate Spoonbill", 171 | "Wood Stork", 172 | "Black-bellied Whistling-Duck", 173 | "Fulvous Whistling-Duck", 174 | "Greater White-fronted Goose", 175 | "Snow Goose", 176 | "Ross's Goose", 177 | "Canada Goose", 178 | "Brant", 179 | "Mute Swan", 180 | "Tundra Swan", 181 | "Whooper 
Swan", 182 | "Sandhill Crane", 183 | "Black-necked Stilt", 184 | "American Avocet", 185 | "Northern Jacana", 186 | "Greater Yellowlegs", 187 | "Lesser Yellowlegs", 188 | "Willet", 189 | "Spotted Sandpiper", 190 | "Upland Sandpiper", 191 | "Whimbrel", 192 | "Long-billed Curlew", 193 | "Marbled Godwit", 194 | "Ruddy Turnstone", 195 | "Red Knot", 196 | "Sanderling", 197 | "Semipalmated Sandpiper", 198 | "Western Sandpiper", 199 | "Least Sandpiper", 200 | "White-rumped Sandpiper", 201 | "Baird's Sandpiper", 202 | "Pectoral Sandpiper", 203 | "Dunlin", 204 | "Buff-breasted Sandpiper", 205 | "Short-billed Dowitcher", 206 | "Long-billed Dowitcher", 207 | "Common Snipe", 208 | "American Woodcock", 209 | "Wilson's Phalarope", 210 | "Red-necked Phalarope", 211 | "Red Phalarope" 212 | ] 213 | 214 | from pathlib import Path 215 | 216 | def remove_spaces(s): 217 | return s.replace(" ", "") 218 | 219 | for species in SPECIES: 220 | if Path("/media/CHONK/hugo/xeno-canto-full/" + remove_spaces(species)).exists(): 221 | continue 222 | try: 223 | q = Query( 224 | name=species, q="A", length="10-30", 225 | ) 226 | 227 | # retrieve metadata 228 | metafiles = q.retrieve_meta(verbose=True) 229 | # retrieve recordings 230 | q.retrieve_recordings(multiprocess=True, nproc=10, attempts=10, outdir="/media/CHONK/hugo/xeno-canto-full/") 231 | 232 | except: 233 | print("Failed to download " + species) 234 | continue -------------------------------------------------------------------------------- /unloop/max/pan~.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 664.0, 441.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | "toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "id" : "obj-2", 44 | "maxclass" : "newobj", 45 | "numinlets" : 2, 46 | "numoutlets" : 1, 47 | "outlettype" : [ "signal" ], 48 | "patching_rect" : [ 241.0, 273.0, 29.5, 22.0 ], 49 | "text" : "*~" 50 | } 51 | 52 | } 53 | , { 54 | "box" : { 55 | "id" : "obj-1", 56 | "maxclass" : "newobj", 57 | "numinlets" : 2, 58 | "numoutlets" : 1, 59 | "outlettype" : [ "signal" ], 60 | "patching_rect" : [ 160.0, 273.0, 29.5, 22.0 ], 61 | "text" : "*~" 62 | } 63 | 64 | } 65 | , { 66 | "box" : { 67 | "comment" : "right signal", 68 | "id" : "obj-28", 69 | "index" : 2, 70 | "maxclass" : "outlet", 71 | "numinlets" : 1, 72 | "numoutlets" : 0, 73 | "patching_rect" : [ 241.0, 316.0, 30.0, 30.0 ] 74 | } 75 | 76 | } 77 | , { 78 | "box" : { 79 | "comment" : "left signal", 80 | "id" : "obj-27", 81 | "index" : 1, 82 | "maxclass" : "outlet", 83 | "numinlets" : 1, 84 | "numoutlets" : 0, 85 | "patching_rect" : [ 160.0, 
316.0, 30.0, 30.0 ] 86 | } 87 | 88 | } 89 | , { 90 | "box" : { 91 | "id" : "obj-26", 92 | "maxclass" : "newobj", 93 | "numinlets" : 2, 94 | "numoutlets" : 1, 95 | "outlettype" : [ "signal" ], 96 | "patching_rect" : [ 160.0, 230.0, 43.0, 22.0 ], 97 | "text" : "cycle~" 98 | } 99 | 100 | } 101 | , { 102 | "box" : { 103 | "id" : "obj-25", 104 | "maxclass" : "newobj", 105 | "numinlets" : 2, 106 | "numoutlets" : 1, 107 | "outlettype" : [ "signal" ], 108 | "patching_rect" : [ 241.0, 230.0, 43.0, 22.0 ], 109 | "text" : "cycle~" 110 | } 111 | 112 | } 113 | , { 114 | "box" : { 115 | "id" : "obj-24", 116 | "maxclass" : "newobj", 117 | "numinlets" : 2, 118 | "numoutlets" : 1, 119 | "outlettype" : [ "signal" ], 120 | "patching_rect" : [ 265.0, 182.0, 49.0, 22.0 ], 121 | "text" : "+~ 0.75" 122 | } 123 | 124 | } 125 | , { 126 | "box" : { 127 | "id" : "obj-23", 128 | "maxclass" : "newobj", 129 | "numinlets" : 2, 130 | "numoutlets" : 1, 131 | "outlettype" : [ "signal" ], 132 | "patching_rect" : [ 184.0, 151.0, 37.0, 22.0 ], 133 | "text" : "*~ #1" 134 | } 135 | 136 | } 137 | , { 138 | "box" : { 139 | "id" : "obj-22", 140 | "maxclass" : "newobj", 141 | "numinlets" : 1, 142 | "numoutlets" : 1, 143 | "outlettype" : [ "signal" ], 144 | "patching_rect" : [ 184.0, 111.0, 58.0, 22.0 ], 145 | "text" : "sig~ 0.25" 146 | } 147 | 148 | } 149 | , { 150 | "box" : { 151 | "id" : "obj-11", 152 | "maxclass" : "comment", 153 | "numinlets" : 1, 154 | "numoutlets" : 0, 155 | "patching_rect" : [ 183.0, 8.0, 60.0, 20.0 ], 156 | "text" : "pan" 157 | } 158 | 159 | } 160 | , { 161 | "box" : { 162 | "id" : "obj-10", 163 | "maxclass" : "comment", 164 | "numinlets" : 1, 165 | "numoutlets" : 0, 166 | "patching_rect" : [ 124.0, 8.0, 60.0, 20.0 ], 167 | "text" : "audio" 168 | } 169 | 170 | } 171 | , { 172 | "box" : { 173 | "comment" : "pan value (0...1)", 174 | "id" : "obj-8", 175 | "index" : 2, 176 | "maxclass" : "inlet", 177 | "numinlets" : 0, 178 | "numoutlets" : 1, 179 | "outlettype" : [ "float" ], 180 | "patching_rect" : [ 198.0, 33.0, 30.0, 30.0 ] 181 | } 182 | 183 | } 184 | , { 185 | "box" : { 186 | "comment" : "mono audio in", 187 | "id" : "obj-7", 188 | "index" : 1, 189 | "maxclass" : "inlet", 190 | "numinlets" : 0, 191 | "numoutlets" : 1, 192 | "outlettype" : [ "signal" ], 193 | "patching_rect" : [ 134.0, 33.0, 30.0, 30.0 ] 194 | } 195 | 196 | } 197 | ], 198 | "lines" : [ { 199 | "patchline" : { 200 | "destination" : [ "obj-27", 0 ], 201 | "source" : [ "obj-1", 0 ] 202 | } 203 | 204 | } 205 | , { 206 | "patchline" : { 207 | "destination" : [ "obj-28", 0 ], 208 | "source" : [ "obj-2", 0 ] 209 | } 210 | 211 | } 212 | , { 213 | "patchline" : { 214 | "destination" : [ "obj-23", 0 ], 215 | "source" : [ "obj-22", 0 ] 216 | } 217 | 218 | } 219 | , { 220 | "patchline" : { 221 | "destination" : [ "obj-24", 0 ], 222 | "order" : 0, 223 | "source" : [ "obj-23", 0 ] 224 | } 225 | 226 | } 227 | , { 228 | "patchline" : { 229 | "destination" : [ "obj-26", 1 ], 230 | "order" : 1, 231 | "source" : [ "obj-23", 0 ] 232 | } 233 | 234 | } 235 | , { 236 | "patchline" : { 237 | "destination" : [ "obj-25", 1 ], 238 | "source" : [ "obj-24", 0 ] 239 | } 240 | 241 | } 242 | , { 243 | "patchline" : { 244 | "destination" : [ "obj-2", 0 ], 245 | "source" : [ "obj-25", 0 ] 246 | } 247 | 248 | } 249 | , { 250 | "patchline" : { 251 | "destination" : [ "obj-1", 0 ], 252 | "source" : [ "obj-26", 0 ] 253 | } 254 | 255 | } 256 | , { 257 | "patchline" : { 258 | "destination" : [ "obj-1", 1 ], 259 | "order" : 1, 260 | "source" : [ "obj-7", 0 ] 261 | } 262 | 263 | } 
264 | , { 265 | "patchline" : { 266 | "destination" : [ "obj-2", 1 ], 267 | "order" : 0, 268 | "source" : [ "obj-7", 0 ] 269 | } 270 | 271 | } 272 | , { 273 | "patchline" : { 274 | "destination" : [ "obj-23", 1 ], 275 | "midpoints" : [ 207.5, 96.0, 252.0, 96.0, 252.0, 147.0, 211.5, 147.0 ], 276 | "source" : [ "obj-8", 0 ] 277 | } 278 | 279 | } 280 | ] 281 | } 282 | 283 | } 284 | -------------------------------------------------------------------------------- /unloop/max/dry-wet.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 84.0, 131.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | "toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "id" : "obj-154", 44 | "maxclass" : "newobj", 45 | "numinlets" : 1, 46 | "numoutlets" : 1, 47 | "outlettype" : [ "" ], 48 | "patching_rect" : [ 163.354036390781403, 177.018632590770721, 39.0, 22.0 ], 49 | "presentation" : 1, 50 | "presentation_rect" : [ 1469.090271848838711, 2039.0, 39.0, 22.0 ], 51 | "text" : "atodb" 52 | } 53 | 54 | } 55 | , { 56 | "box" : { 57 | "id" : "obj-156", 58 | "lastchannelcount" : 0, 59 | "maxclass" : "live.gain~", 60 | "numinlets" : 2, 61 | "numoutlets" : 5, 62 | "outlettype" : [ "signal", "signal", "", "float", "list" ], 63 | "parameter_enable" : 1, 64 | "patching_rect" : [ 133.0, 223.0, 48.0, 136.0 ], 65 | "presentation" : 1, 66 | "presentation_rect" : [ 1469.090271848838711, 2101.0, 48.0, 136.0 ], 67 | "saved_attribute_attributes" : { 68 | "valueof" : { 69 | "parameter_longname" : "live.gain~[26]", 70 | "parameter_mmax" : 6.0, 71 | "parameter_mmin" : -70.0, 72 | "parameter_modmode" : 0, 73 | "parameter_shortname" : "live.gain~", 74 | "parameter_type" : 0, 75 | "parameter_unitstyle" : 4 76 | } 77 | 78 | } 79 | , 80 | "varname" : "live.gain~[1]" 81 | } 82 | 83 | } 84 | , { 85 | "box" : { 86 | "id" : "obj-157", 87 | "maxclass" : "newobj", 88 | "numinlets" : 2, 89 | "numoutlets" : 1, 90 | "outlettype" : [ "float" ], 91 | "patching_rect" : [ 163.354036390781403, 140.37267005443573, 29.5, 22.0 ], 92 | "presentation" : 1, 93 | "presentation_rect" : [ 1469.090271848838711, 2009.0, 29.5, 22.0 ], 94 | "text" : "!- 1." 95 | } 96 | 97 | } 98 | , { 99 | "box" : { 100 | "id" : "obj-158", 101 | "maxclass" : "newobj", 102 | "numinlets" : 6, 103 | "numoutlets" : 1, 104 | "outlettype" : [ "" ], 105 | "patching_rect" : [ 61.0, 100.0, 90.0, 22.0 ], 106 | "presentation" : 1, 107 | "presentation_rect" : [ 1397.090271848838711, 1978.0, 94.0, 22.0 ], 108 | "text" : "scale 0. 1. 1. 0." 
109 | } 110 | 111 | } 112 | , { 113 | "box" : { 114 | "id" : "obj-160", 115 | "maxclass" : "newobj", 116 | "numinlets" : 1, 117 | "numoutlets" : 1, 118 | "outlettype" : [ "" ], 119 | "patching_rect" : [ 50.0, 168.944098472595215, 39.0, 22.0 ], 120 | "presentation" : 1, 121 | "presentation_rect" : [ 1386.090271848838711, 2039.0, 39.0, 22.0 ], 122 | "text" : "atodb" 123 | } 124 | 125 | } 126 | , { 127 | "box" : { 128 | "id" : "obj-162", 129 | "lastchannelcount" : 0, 130 | "maxclass" : "live.gain~", 131 | "numinlets" : 2, 132 | "numoutlets" : 5, 133 | "outlettype" : [ "signal", "signal", "", "float", "list" ], 134 | "parameter_enable" : 1, 135 | "patching_rect" : [ 50.0, 223.0, 48.0, 136.0 ], 136 | "presentation" : 1, 137 | "presentation_rect" : [ 1386.090271848838711, 2101.0, 48.0, 136.0 ], 138 | "saved_attribute_attributes" : { 139 | "valueof" : { 140 | "parameter_longname" : "live.gain~[25]", 141 | "parameter_mmax" : 6.0, 142 | "parameter_mmin" : -70.0, 143 | "parameter_modmode" : 0, 144 | "parameter_shortname" : "live.gain~", 145 | "parameter_type" : 0, 146 | "parameter_unitstyle" : 4 147 | } 148 | 149 | } 150 | , 151 | "varname" : "live.gain~" 152 | } 153 | 154 | } 155 | , { 156 | "box" : { 157 | "comment" : "dry", 158 | "id" : "obj-216", 159 | "index" : 1, 160 | "maxclass" : "inlet", 161 | "numinlets" : 0, 162 | "numoutlets" : 1, 163 | "outlettype" : [ "signal" ], 164 | "patching_rect" : [ 25.000060151161279, 40.0, 30.0, 30.0 ] 165 | } 166 | 167 | } 168 | , { 169 | "box" : { 170 | "comment" : "mix", 171 | "id" : "obj-223", 172 | "index" : 3, 173 | "maxclass" : "inlet", 174 | "numinlets" : 0, 175 | "numoutlets" : 1, 176 | "outlettype" : [ "" ], 177 | "patching_rect" : [ 313.0, 40.0, 30.0, 30.0 ] 178 | } 179 | 180 | } 181 | , { 182 | "box" : { 183 | "comment" : "wet", 184 | "id" : "obj-227", 185 | "index" : 2, 186 | "maxclass" : "inlet", 187 | "numinlets" : 0, 188 | "numoutlets" : 1, 189 | "outlettype" : [ "signal" ], 190 | "patching_rect" : [ 133.0, 44.0, 30.0, 30.0 ] 191 | } 192 | 193 | } 194 | , { 195 | "box" : { 196 | "comment" : "", 197 | "id" : "obj-230", 198 | "index" : 1, 199 | "maxclass" : "outlet", 200 | "numinlets" : 1, 201 | "numoutlets" : 0, 202 | "patching_rect" : [ 50.0, 419.0, 30.0, 30.0 ] 203 | } 204 | 205 | } 206 | ], 207 | "lines" : [ { 208 | "patchline" : { 209 | "destination" : [ "obj-156", 0 ], 210 | "source" : [ "obj-154", 0 ] 211 | } 212 | 213 | } 214 | , { 215 | "patchline" : { 216 | "destination" : [ "obj-230", 0 ], 217 | "source" : [ "obj-156", 0 ] 218 | } 219 | 220 | } 221 | , { 222 | "patchline" : { 223 | "destination" : [ "obj-154", 0 ], 224 | "source" : [ "obj-157", 0 ] 225 | } 226 | 227 | } 228 | , { 229 | "patchline" : { 230 | "destination" : [ "obj-157", 0 ], 231 | "order" : 0, 232 | "source" : [ "obj-158", 0 ] 233 | } 234 | 235 | } 236 | , { 237 | "patchline" : { 238 | "destination" : [ "obj-160", 0 ], 239 | "order" : 1, 240 | "source" : [ "obj-158", 0 ] 241 | } 242 | 243 | } 244 | , { 245 | "patchline" : { 246 | "destination" : [ "obj-162", 0 ], 247 | "source" : [ "obj-160", 0 ] 248 | } 249 | 250 | } 251 | , { 252 | "patchline" : { 253 | "destination" : [ "obj-230", 0 ], 254 | "source" : [ "obj-162", 0 ] 255 | } 256 | 257 | } 258 | , { 259 | "patchline" : { 260 | "destination" : [ "obj-162", 0 ], 261 | "source" : [ "obj-216", 0 ] 262 | } 263 | 264 | } 265 | , { 266 | "patchline" : { 267 | "destination" : [ "obj-158", 0 ], 268 | "source" : [ "obj-223", 0 ] 269 | } 270 | 271 | } 272 | , { 273 | "patchline" : { 274 | "destination" : [ "obj-156", 0 ], 275 | 
"source" : [ "obj-227", 0 ] 276 | } 277 | 278 | } 279 | ] 280 | } 281 | 282 | } 283 | -------------------------------------------------------------------------------- /vampnet/mask.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from audiotools import AudioSignal 5 | 6 | from .util import scalar_to_batch_tensor 7 | 8 | def _gamma(r): 9 | return (r * torch.pi / 2).cos().clamp(1e-10, 1.0) 10 | 11 | def _invgamma(y): 12 | if not torch.is_tensor(y): 13 | y = torch.tensor(y)[None] 14 | return 2 * y.acos() / torch.pi 15 | 16 | def full_mask(x: torch.Tensor): 17 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 18 | return torch.ones_like(x).long() 19 | 20 | def empty_mask(x: torch.Tensor): 21 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 22 | return torch.zeros_like(x).long() 23 | 24 | def apply_mask( 25 | x: torch.Tensor, 26 | mask: torch.Tensor, 27 | mask_token: int 28 | ): 29 | assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq), but got {mask.ndim}" 30 | assert mask.shape == x.shape, f"mask must be same shape as x, but got {mask.shape} and {x.shape}" 31 | assert mask.dtype == torch.long, "mask must be long dtype, but got {mask.dtype}" 32 | assert ~torch.any(mask > 1), "mask must be binary" 33 | assert ~torch.any(mask < 0), "mask must be binary" 34 | 35 | fill_x = torch.full_like(x, mask_token) 36 | x = x * (1 - mask) + fill_x * mask 37 | 38 | return x, mask 39 | 40 | def random( 41 | x: torch.Tensor, 42 | r: torch.Tensor 43 | ): 44 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 45 | if not isinstance(r, torch.Tensor): 46 | r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device) 47 | 48 | r = _gamma(r)[:, None, None] 49 | probs = torch.ones_like(x) * r 50 | 51 | mask = torch.bernoulli(probs) 52 | mask = mask.round().long() 53 | 54 | return mask 55 | 56 | def linear_random( 57 | x: torch.Tensor, 58 | r: torch.Tensor, 59 | ): 60 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 61 | if not isinstance(r, torch.Tensor): 62 | r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float() 63 | r = r[:, None, None] 64 | 65 | probs = torch.ones_like(x).to(x.device).float() 66 | # expand to batch and codebook dims 67 | probs = probs.expand(x.shape[0], x.shape[1], -1) 68 | probs = probs * r 69 | 70 | mask = torch.bernoulli(probs) 71 | mask = mask.round().long() 72 | 73 | return mask 74 | 75 | def inpaint(x: torch.Tensor, 76 | n_prefix, 77 | n_suffix, 78 | ): 79 | assert n_prefix is not None 80 | assert n_suffix is not None 81 | 82 | mask = full_mask(x) 83 | 84 | # if we have a prefix or suffix, set their mask prob to 0 85 | if n_prefix > 0: 86 | if not isinstance(n_prefix, torch.Tensor): 87 | n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device) 88 | for i, n in enumerate(n_prefix): 89 | if n > 0: 90 | mask[i, :, :n] = 0.0 91 | if n_suffix > 0: 92 | if not isinstance(n_suffix, torch.Tensor): 93 | n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device) 94 | for i, n in enumerate(n_suffix): 95 | if n > 0: 96 | mask[i, :, -n:] = 0.0 97 | 98 | 99 | return mask 100 | 101 | def periodic_mask(x: torch.Tensor, 102 | period: int,width: int = 1, 103 | random_roll=False, 104 | ): 105 | mask = full_mask(x) 106 | if period == 0: 107 | return mask 108 | 109 | if not isinstance(period, torch.Tensor): 110 | period = scalar_to_batch_tensor(period, x.shape[0]) 111 | for i, factor in enumerate(period): 112 | if factor == 0: 113 | continue 114 | 
114 |         for j in range(mask.shape[-1]):
115 |             if j % factor == 0:
116 |                 # figure out how wide the mask should be
117 |                 j_start = max(0, j - width // 2)
118 |                 j_end = min(mask.shape[-1] - 1, j + width // 2) + 1
119 |                 # draw ones for each position in the window (bernoulli with p=1.0)
120 |                 j_mask = torch.bernoulli(torch.ones(j_end - j_start))
121 |                 assert torch.all(j_mask == 1)
122 |                 j_fill = torch.ones_like(j_mask) * (1 - j_mask)
123 |                 assert torch.all(j_fill == 0)
124 |                 # fill
125 |                 mask[i, :, j_start:j_end] = j_fill
126 |     if random_roll:
127 |         # add a random offset to the mask
128 |         offset = torch.randint(0, period[0], (1,))
129 |         mask = torch.roll(mask, offset.item(), dims=-1)
130 | 
131 |     return mask
132 | 
133 | def codebook_unmask(
134 |     mask: torch.Tensor,
135 |     n_conditioning_codebooks: int
136 | ):
137 |     if n_conditioning_codebooks is None:
138 |         return mask
139 |     # if we have any conditioning codebooks, set their mask to 0
140 |     mask = mask.clone()
141 |     mask[:, :n_conditioning_codebooks, :] = 0
142 |     return mask
143 | 
144 | def codebook_mask(mask: torch.Tensor, val1: int, val2: int = None):
145 |     mask = mask.clone()
146 |     mask[:, val1:, :] = 1
147 |     # val2 = val2 or val1
148 |     # vs = torch.linspace(val1, val2, mask.shape[1])
149 |     # for t, v in enumerate(vs):
150 |     #     v = int(v)
151 |     #     mask[:, v:, t] = 1
152 | 
153 |     return mask
154 | 
155 | def mask_and(
156 |     mask1: torch.Tensor,
157 |     mask2: torch.Tensor
158 | ):
159 |     assert mask1.shape == mask2.shape, "masks must be same shape"
160 |     return torch.min(mask1, mask2)
161 | 
162 | def dropout(
163 |     mask: torch.Tensor,
164 |     p: float,
165 | ):
166 |     # randomly pick a fraction p of timesteps along the last dimension and mark them masked
167 |     tsteps = mask.shape[-1]
168 |     tsteps_to_drop = int(tsteps * p)
169 |     tsteps_to_keep = tsteps - tsteps_to_drop
170 |     idxs_to_drop = torch.randint(0, tsteps, (tsteps_to_drop,))
171 |     mask = mask.clone()
172 |     mask[:, :, idxs_to_drop] = 1
173 |     return mask.long()
174 | 
175 | 
176 | 
177 | 
178 | def mask_or(
179 |     mask1: torch.Tensor,
180 |     mask2: torch.Tensor
181 | ):
182 |     assert mask1.shape == mask2.shape, f"masks must be same shape, but got {mask1.shape} and {mask2.shape}"
183 |     assert mask1.max() <= 1, "mask1 must be binary"
184 |     assert mask2.max() <= 1, "mask2 must be binary"
185 |     assert mask1.min() >= 0, "mask1 must be binary"
186 |     assert mask2.min() >= 0, "mask2 must be binary"
187 |     return (mask1 + mask2).clamp(0, 1)
188 | 
189 | def time_stretch_mask(
190 |     x: torch.Tensor,
191 |     stretch_factor: int,
192 | ):
193 |     assert stretch_factor >= 1, "stretch factor must be >= 1"
194 |     c_seq_len = x.shape[-1]
195 |     x = x.repeat_interleave(stretch_factor, dim=-1)
196 | 
197 |     # trim back to the original length
198 |     x = x[:, :, :c_seq_len]
199 | 
200 |     mask = periodic_mask(x, stretch_factor, width=1)
201 |     return mask
202 | 
203 | def onset_mask(
204 |     sig: AudioSignal,
205 |     z: torch.Tensor,
206 |     interface,
207 |     width: int = 1,
208 | ):
209 |     import librosa
210 | 
211 |     onset_frame_idxs = librosa.onset.onset_detect(
212 |         y=sig.samples[0][0].detach().cpu().numpy(), sr=sig.sample_rate,
213 |         hop_length=interface.codec.hop_length,
214 |         backtrack=True,
215 |     )
216 |     if len(onset_frame_idxs) == 0:
217 |         print("no onsets detected")
218 |     print("onset_frame_idxs", onset_frame_idxs)
219 |     print("mask shape", z.shape)
220 | 
221 |     mask = torch.ones_like(z)
222 |     for idx in onset_frame_idxs:
223 |         mask[:, :, idx-width:idx+width] = 0
224 | 
225 |     return mask
226 | 
227 | 
228 | 
229 | if __name__ == "__main__":
230 |     sig = AudioSignal("assets/example.wav")
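    # a quick illustrative sketch of composing the masks above
    # (the token grid, vocab size, and mask token below are arbitrary demo
    # values, not constants taken from the vampnet codebase):
    z = torch.randint(0, 1024, (1, 4, 100))      # fake (batch, n_codebooks, seq) codes
    mask = periodic_mask(z, 7, width=1)          # keep every 7th timestep as a prompt
    mask = codebook_unmask(mask, 1)              # keep codebook 0 as conditioning
    mask = mask_or(mask, linear_random(z, 0.1))  # union with a sparse random mask
    z_masked, mask = apply_mask(z, mask, mask_token=1024)
    print(z_masked.shape, mask.float().mean().item())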
-------------------------------------------------------------------------------- /unloop/max/panner-cleat.maxpat: -------------------------------------------------------------------------------- 1 | { 2 | "patcher" : { 3 | "fileversion" : 1, 4 | "appversion" : { 5 | "major" : 8, 6 | "minor" : 6, 7 | "revision" : 5, 8 | "architecture" : "x64", 9 | "modernui" : 1 10 | } 11 | , 12 | "classnamespace" : "box", 13 | "rect" : [ 59.0, 106.0, 640.0, 480.0 ], 14 | "bglocked" : 0, 15 | "openinpresentation" : 0, 16 | "default_fontsize" : 12.0, 17 | "default_fontface" : 0, 18 | "default_fontname" : "Arial", 19 | "gridonopen" : 1, 20 | "gridsize" : [ 15.0, 15.0 ], 21 | "gridsnaponopen" : 1, 22 | "objectsnaponopen" : 1, 23 | "statusbarvisible" : 2, 24 | "toolbarvisible" : 1, 25 | "lefttoolbarpinned" : 0, 26 | "toptoolbarpinned" : 0, 27 | "righttoolbarpinned" : 0, 28 | "bottomtoolbarpinned" : 0, 29 | "toolbars_unpinned_last_save" : 0, 30 | "tallnewobj" : 0, 31 | "boxanimatetime" : 200, 32 | "enablehscroll" : 1, 33 | "enablevscroll" : 1, 34 | "devicewidth" : 0.0, 35 | "description" : "", 36 | "digest" : "", 37 | "tags" : "", 38 | "style" : "", 39 | "subpatcher_template" : "", 40 | "assistshowspatchername" : 0, 41 | "boxes" : [ { 42 | "box" : { 43 | "id" : "obj-3", 44 | "maxclass" : "newobj", 45 | "numinlets" : 4, 46 | "numoutlets" : 0, 47 | "patching_rect" : [ 364.666666666666629, 337.068983197212219, 75.0, 22.0 ], 48 | "text" : "dac~ 1 2 3 4" 49 | } 50 | 51 | } 52 | , { 53 | "box" : { 54 | "id" : "obj-2", 55 | "maxclass" : "newobj", 56 | "numinlets" : 3, 57 | "numoutlets" : 4, 58 | "outlettype" : [ "signal", "signal", "signal", "signal" ], 59 | "patching_rect" : [ 369.0, 305.0, 62.0, 22.0 ], 60 | "text" : "quadpan~" 61 | } 62 | 63 | } 64 | , { 65 | "box" : { 66 | "id" : "obj-211", 67 | "maxclass" : "newobj", 68 | "numinlets" : 2, 69 | "numoutlets" : 0, 70 | "patching_rect" : [ 112.0, 336.89655339717865, 55.0, 22.0 ], 71 | "text" : "dac~ 1 2" 72 | } 73 | 74 | } 75 | , { 76 | "box" : { 77 | "id" : "obj-200", 78 | "maxclass" : "newobj", 79 | "numinlets" : 6, 80 | "numoutlets" : 1, 81 | "outlettype" : [ "" ], 82 | "patching_rect" : [ 229.5, 225.0, 114.0, 22.0 ], 83 | "text" : "scale 0. 1000. -1. 1." 84 | } 85 | 86 | } 87 | , { 88 | "box" : { 89 | "id" : "obj-198", 90 | "maxclass" : "newobj", 91 | "numinlets" : 4, 92 | "numoutlets" : 2, 93 | "outlettype" : [ "signal", "signal" ], 94 | "patching_rect" : [ 112.0, 305.0, 50.5, 22.0 ], 95 | "text" : "pan2" 96 | } 97 | 98 | } 99 | , { 100 | "box" : { 101 | "id" : "obj-194", 102 | "maxclass" : "newobj", 103 | "numinlets" : 1, 104 | "numoutlets" : 2, 105 | "outlettype" : [ "float", "float" ], 106 | "patching_rect" : [ 229.5, 174.0, 74.0, 22.0 ], 107 | "text" : "unpack 0. 0." 
108 | } 109 | 110 | } 111 | , { 112 | "box" : { 113 | "color" : [ 0.2, 0.0, 0.8, 1.0 ], 114 | "id" : "obj-193", 115 | "maxclass" : "newobj", 116 | "numinlets" : 0, 117 | "numoutlets" : 1, 118 | "outlettype" : [ "" ], 119 | "patching_rect" : [ 112.0, 186.0, 92.0, 22.0 ], 120 | "text" : "r panner-choice" 121 | } 122 | 123 | } 124 | , { 125 | "box" : { 126 | "id" : "obj-185", 127 | "maxclass" : "newobj", 128 | "numinlets" : 2, 129 | "numoutlets" : 3, 130 | "outlettype" : [ "signal", "signal", "signal" ], 131 | "patching_rect" : [ 112.0, 217.0, 49.0, 22.0 ], 132 | "text" : "gate~ 3" 133 | } 134 | 135 | } 136 | , { 137 | "box" : { 138 | "id" : "obj-184", 139 | "maxclass" : "newobj", 140 | "numinlets" : 3, 141 | "numoutlets" : 0, 142 | "patching_rect" : [ 221.5, 305.0, 59.0, 22.0 ], 143 | "text" : "16panner" 144 | } 145 | 146 | } 147 | , { 148 | "box" : { 149 | "id" : "obj-43", 150 | "maxclass" : "newobj", 151 | "numinlets" : 2, 152 | "numoutlets" : 2, 153 | "outlettype" : [ "", "" ], 154 | "patching_rect" : [ 229.5, 128.0, 51.0, 22.0 ], 155 | "text" : "route xy" 156 | } 157 | 158 | } 159 | , { 160 | "box" : { 161 | "comment" : "", 162 | "id" : "obj-213", 163 | "index" : 1, 164 | "maxclass" : "inlet", 165 | "numinlets" : 0, 166 | "numoutlets" : 1, 167 | "outlettype" : [ "signal" ], 168 | "patching_rect" : [ 142.0, 76.0, 30.0, 30.0 ] 169 | } 170 | 171 | } 172 | , { 173 | "box" : { 174 | "comment" : "", 175 | "id" : "obj-214", 176 | "index" : 2, 177 | "maxclass" : "inlet", 178 | "numinlets" : 0, 179 | "numoutlets" : 1, 180 | "outlettype" : [ "" ], 181 | "patching_rect" : [ 229.5, 76.0, 30.0, 30.0 ] 182 | } 183 | 184 | } 185 | ], 186 | "lines" : [ { 187 | "patchline" : { 188 | "destination" : [ "obj-184", 0 ], 189 | "source" : [ "obj-185", 1 ] 190 | } 191 | 192 | } 193 | , { 194 | "patchline" : { 195 | "destination" : [ "obj-198", 0 ], 196 | "source" : [ "obj-185", 0 ] 197 | } 198 | 199 | } 200 | , { 201 | "patchline" : { 202 | "destination" : [ "obj-2", 0 ], 203 | "source" : [ "obj-185", 2 ] 204 | } 205 | 206 | } 207 | , { 208 | "patchline" : { 209 | "destination" : [ "obj-185", 0 ], 210 | "source" : [ "obj-193", 0 ] 211 | } 212 | 213 | } 214 | , { 215 | "patchline" : { 216 | "destination" : [ "obj-184", 2 ], 217 | "order" : 1, 218 | "source" : [ "obj-194", 1 ] 219 | } 220 | 221 | } 222 | , { 223 | "patchline" : { 224 | "destination" : [ "obj-184", 1 ], 225 | "order" : 1, 226 | "source" : [ "obj-194", 0 ] 227 | } 228 | 229 | } 230 | , { 231 | "patchline" : { 232 | "destination" : [ "obj-2", 2 ], 233 | "order" : 0, 234 | "source" : [ "obj-194", 1 ] 235 | } 236 | 237 | } 238 | , { 239 | "patchline" : { 240 | "destination" : [ "obj-2", 1 ], 241 | "order" : 0, 242 | "source" : [ "obj-194", 0 ] 243 | } 244 | 245 | } 246 | , { 247 | "patchline" : { 248 | "destination" : [ "obj-200", 0 ], 249 | "order" : 2, 250 | "source" : [ "obj-194", 0 ] 251 | } 252 | 253 | } 254 | , { 255 | "patchline" : { 256 | "destination" : [ "obj-211", 1 ], 257 | "source" : [ "obj-198", 1 ] 258 | } 259 | 260 | } 261 | , { 262 | "patchline" : { 263 | "destination" : [ "obj-211", 0 ], 264 | "source" : [ "obj-198", 0 ] 265 | } 266 | 267 | } 268 | , { 269 | "patchline" : { 270 | "destination" : [ "obj-3", 3 ], 271 | "source" : [ "obj-2", 3 ] 272 | } 273 | 274 | } 275 | , { 276 | "patchline" : { 277 | "destination" : [ "obj-3", 2 ], 278 | "source" : [ "obj-2", 2 ] 279 | } 280 | 281 | } 282 | , { 283 | "patchline" : { 284 | "destination" : [ "obj-3", 1 ], 285 | "source" : [ "obj-2", 1 ] 286 | } 287 | 288 | } 289 | , { 290 | 
"patchline" : { 291 | "destination" : [ "obj-3", 0 ], 292 | "source" : [ "obj-2", 0 ] 293 | } 294 | 295 | } 296 | , { 297 | "patchline" : { 298 | "destination" : [ "obj-198", 1 ], 299 | "source" : [ "obj-200", 0 ] 300 | } 301 | 302 | } 303 | , { 304 | "patchline" : { 305 | "destination" : [ "obj-185", 1 ], 306 | "source" : [ "obj-213", 0 ] 307 | } 308 | 309 | } 310 | , { 311 | "patchline" : { 312 | "destination" : [ "obj-43", 0 ], 313 | "source" : [ "obj-214", 0 ] 314 | } 315 | 316 | } 317 | , { 318 | "patchline" : { 319 | "destination" : [ "obj-194", 0 ], 320 | "source" : [ "obj-43", 0 ] 321 | } 322 | 323 | } 324 | ] 325 | } 326 | 327 | } 328 | -------------------------------------------------------------------------------- /scripts/exp/experiment.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import random 3 | from typing import List 4 | import tempfile 5 | import subprocess 6 | 7 | import argbind 8 | from tqdm import tqdm 9 | import torch 10 | 11 | from vampnet.interface import Interface 12 | from vampnet import mask as pmask 13 | import audiotools as at 14 | 15 | Interface: Interface = argbind.bind(Interface) 16 | 17 | 18 | 19 | def calculate_bitrate( 20 | interface, num_codebooks, 21 | downsample_factor 22 | ): 23 | bit_width = 10 24 | sr = interface.codec.sample_rate 25 | hop = interface.codec.hop_size 26 | rate = (sr / hop) * ((bit_width * num_codebooks) / downsample_factor) 27 | return rate 28 | 29 | def baseline(sig, interface): 30 | return interface.preprocess(sig) 31 | 32 | def reconstructed(sig, interface): 33 | return interface.decode( 34 | interface.encode(sig) 35 | ) 36 | 37 | def coarse2fine(sig, interface): 38 | z = interface.encode(sig) 39 | z = z[:, :interface.c2f.n_conditioning_codebooks, :] 40 | 41 | z = interface.coarse_to_fine(z) 42 | return interface.decode(z) 43 | 44 | class CoarseCond: 45 | 46 | def __init__(self, num_conditioning_codebooks, downsample_factor): 47 | self.num_conditioning_codebooks = num_conditioning_codebooks 48 | self.downsample_factor = downsample_factor 49 | 50 | def __call__(self, sig, interface): 51 | z = interface.encode(sig) 52 | mask = pmask.full_mask(z) 53 | mask = pmask.codebook_unmask(mask, self.num_conditioning_codebooks) 54 | mask = pmask.periodic_mask(mask, self.downsample_factor) 55 | 56 | zv = interface.coarse_vamp(z, mask) 57 | zv = interface.coarse_to_fine(zv) 58 | return interface.decode(zv) 59 | 60 | def opus(sig, interface, bitrate=128): 61 | sig = interface.preprocess(sig) 62 | 63 | with tempfile.NamedTemporaryFile(suffix=".wav") as f: 64 | sig.write(f.name) 65 | 66 | opus_name = Path(f.name).with_suffix(".opus") 67 | # convert to opus 68 | cmd = [ 69 | "ffmpeg", "-y", "-i", f.name, 70 | "-c:a", "libopus", 71 | "-b:a", f"{bitrate}", 72 | opus_name 73 | ] 74 | subprocess.run(cmd, check=True) 75 | 76 | # convert back to wav 77 | output_name = Path(f"{f.name}-opus").with_suffix(".wav") 78 | cmd = [ 79 | "ffmpeg", "-y", "-i", opus_name, 80 | output_name 81 | ] 82 | 83 | subprocess.run(cmd, check=True) 84 | 85 | sig = at.AudioSignal( 86 | output_name, 87 | sample_rate=sig.sample_rate 88 | ) 89 | return sig 90 | 91 | def mask_ratio_1_step(ratio=1.0): 92 | def wrapper(sig, interface): 93 | z = interface.encode(sig) 94 | mask = pmask.linear_random(z, ratio) 95 | zv = interface.coarse_vamp( 96 | z, 97 | mask, 98 | sampling_steps=1, 99 | ) 100 | 101 | return interface.decode(zv) 102 | return wrapper 103 | 104 | def num_sampling_steps(num_steps=1): 105 | def 
wrapper(sig, interface: Interface):
106 |         z = interface.encode(sig)
107 |         mask = pmask.periodic_mask(z, 16)
108 |         zv = interface.coarse_vamp(
109 |             z,
110 |             mask,
111 |             sampling_steps=num_steps,
112 |         )
113 | 
114 |         zv = interface.coarse_to_fine(zv)
115 |         return interface.decode(zv)
116 |     return wrapper
117 | 
118 | def beat_mask(ctx_time):
119 |     def wrapper(sig, interface):
120 |         beat_mask = interface.make_beat_mask(
121 |             sig,
122 |             before_beat_s=ctx_time/2,
123 |             after_beat_s=ctx_time/2,
124 |             invert=True
125 |         )
126 | 
127 |         z = interface.encode(sig)
128 | 
129 |         zv = interface.coarse_vamp(
130 |             z, beat_mask
131 |         )
132 | 
133 |         zv = interface.coarse_to_fine(zv)
134 |         return interface.decode(zv)
135 |     return wrapper
136 | 
137 | def inpaint(ctx_time):
138 |     def wrapper(sig, interface: Interface):
139 |         z = interface.encode(sig)
140 |         mask = pmask.inpaint(z, interface.s2t(ctx_time), interface.s2t(ctx_time))
141 | 
142 |         zv = interface.coarse_vamp(z, mask)
143 |         zv = interface.coarse_to_fine(zv)
144 | 
145 |         return interface.decode(zv)
146 |     return wrapper
147 | 
148 | def token_noise(noise_amt):
149 |     def wrapper(sig, interface: Interface):
150 |         z = interface.encode(sig)
151 |         mask = pmask.random(z, noise_amt)
152 |         z = torch.where(
153 |             mask,
154 |             torch.randint_like(z, 0, interface.coarse.vocab_size),
155 |             z
156 |         )
157 |         return interface.decode(z)
158 |     return wrapper
159 | 
160 | EXP_REGISTRY = {}
161 | 
162 | EXP_REGISTRY["gen-compression"] = {
163 |     "baseline": baseline,
164 |     "reconstructed": reconstructed,
165 |     "coarse2fine": coarse2fine,
166 |     **{
167 |         f"{n}_codebooks_downsampled_{x}x": CoarseCond(num_conditioning_codebooks=n, downsample_factor=x)
168 |         for (n, x) in (
169 |             (1, 1),   # 1 codebook, no downsampling
170 |             (4, 4),   # 4 codebooks, downsampled 4x
171 |             (4, 16),  # 4 codebooks, downsampled 16x
172 |             (4, 32),  # 4 codebooks, downsampled 32x
173 |         )
174 |     },
175 |     **{
176 |         f"token_noise_{x}": mask_ratio_1_step(ratio=x)
177 |         for x in [0.25, 0.5, 0.75]
178 |     },
179 | 
180 | }
181 | 
182 | 
183 | EXP_REGISTRY["sampling-steps"] = {
184 |     # "codec": reconstructed,
185 |     **{f"steps_{n}": num_sampling_steps(n) for n in [1, 4, 12, 36, 64, 72]},
186 | }
187 | 
188 | 
189 | EXP_REGISTRY["musical-sampling"] = {
190 |     **{f"beat_mask_{t}": beat_mask(t) for t in [0.075]},
191 |     **{f"inpaint_{t}": inpaint(t) for t in [0.5, 1.0,]},  # multiply these by 2 (they go left and right)
192 | }
193 | 
194 | @argbind.bind(without_prefix=True)
195 | def main(
196 |     sources=[
197 |         "/media/CHONK/hugo/spotdl/val",
198 |     ],
199 |     output_dir: str = "./samples",
200 |     max_excerpts: int = 2000,
201 |     exp_type: str = "gen-compression",
202 |     seed: int = 0,
203 |     ext: List[str] = [".mp3"],
204 | ):
205 |     at.util.seed(seed)
206 |     interface = Interface()
207 | 
208 |     output_dir = Path(output_dir)
209 |     output_dir.mkdir(exist_ok=True, parents=True)
210 | 
211 |     from audiotools.data.datasets import AudioLoader, AudioDataset
212 | 
213 |     loader = AudioLoader(sources=sources, shuffle_state=seed, ext=ext)
214 |     dataset = AudioDataset(loader,
215 |                            sample_rate=interface.codec.sample_rate,
216 |                            duration=interface.coarse.chunk_size_s,
217 |                            n_examples=max_excerpts,
218 |                            without_replacement=True,
219 |                            )
220 | 
221 |     if exp_type in EXP_REGISTRY:
222 |         SAMPLE_CONDS = EXP_REGISTRY[exp_type]
223 |     else:
224 |         raise ValueError(f"Unknown exp_type {exp_type}")
225 | 
226 | 
227 |     indices = list(range(max_excerpts))
228 |     random.shuffle(indices)
229 |     for i in tqdm(indices):
230 |         # if all our files are already there, skip
231 |         done = []
232 | 
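        # resume logic: outputs land at <output_dir>/<condition_name>/<i>.wav,
        # so an excerpt is skipped only when every condition has already
        # written its file for this index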
for name in SAMPLE_CONDS: 233 | o_dir = Path(output_dir) / name 234 | done.append((o_dir / f"{i}.wav").exists()) 235 | if all(done): 236 | continue 237 | 238 | sig = dataset[i]["signal"] 239 | results = { 240 | name: cond(sig, interface).cpu() 241 | for name, cond in SAMPLE_CONDS.items() 242 | } 243 | 244 | for name, sig in results.items(): 245 | o_dir = Path(output_dir) / name 246 | o_dir.mkdir(exist_ok=True, parents=True) 247 | 248 | sig.write(o_dir / f"{i}.wav") 249 | 250 | if __name__ == "__main__": 251 | args = argbind.parse_args() 252 | 253 | with argbind.scope(args): 254 | main() 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: salad bowl (vampnet) 3 | emoji: 🥗 4 | colorFrom: yellow 5 | colorTo: green 6 | sdk: gradio 7 | sdk_version: 5.23.2 8 | python_version: 3.11 9 | app_file: app.py 10 | pinned: false 11 | license: cc-by-nc-4.0 12 | --- 13 | 14 | # VampNet 15 | 16 | # Table of contents 17 | 18 | - [setting up](#setting-up) 19 | - [programmatic usage](#programmatic-usage) 20 | - [launching the web app](#launching-the-web-app) 21 | - [training / fine-tuning](#training--fine-tuning) 22 | - [training a model](#training-a-model) 23 | - [debugging training](#debugging-training) 24 | - [fine-tuning](#fine-tuning) 25 | - [exporting your model](#exporting-your-model) 26 | - [unloop](#unloop) 27 | - [token telephone](#token-telephone) 28 | - [a note on argbind](#a-note-on-argbind) 29 | - [take a look at the pretrained models](#take-a-look-at-the-pretrained-models) 30 | - [licensing for pretrained models](#licensing-for-pretrained-models) 31 | 32 | ## setting up 33 | 34 | python 3.9-3.11 works well. (for example, using conda) 35 | ```bash 36 | conda create -n vampnet python=3.9 37 | conda activate vampnet 38 | ``` 39 | 40 | install VampNet 41 | 42 | ```bash 43 | git clone https://github.com/hugofloresgarcia/vampnet.git 44 | pip install -e ./vampnet 45 | ``` 46 | 47 | ## programmatic usage 48 | 49 | quick start! 50 | ```python 51 | import random 52 | import vampnet 53 | import audiotools as at 54 | 55 | # load the default vampnet model 56 | interface = vampnet.interface.Interface.default() 57 | 58 | # list available finetuned models 59 | finetuned_model_choices = interface.available_models() 60 | print(f"available finetuned models: {finetuned_model_choices}") 61 | 62 | # pick a random finetuned model 63 | model_choice = random.choice(finetuned_model_choices) 64 | print(f"choosing model: {model_choice}") 65 | 66 | # load a finetuned model 67 | interface.load_finetuned(model_choice) 68 | 69 | # load an example audio file 70 | signal = at.AudioSignal("assets/example.wav") 71 | 72 | # get the tokens for the audio 73 | codes = interface.encode(signal) 74 | 75 | # build a mask for the audio 76 | mask = interface.build_mask( 77 | codes, signal, 78 | periodic_prompt=7, 79 | upper_codebook_mask=3, 80 | ) 81 | 82 | # generate the output tokens 83 | output_tokens = interface.vamp( 84 | codes, mask, return_mask=False, 85 | temperature=1.0, 86 | typical_filtering=True, 87 | ) 88 | 89 | # convert them to a signal 90 | output_signal = interface.decode(output_tokens) 91 | 92 | # save the output signal 93 | output_signal.write("scratch/output.wav") 94 | ``` 95 | 96 | 97 | # Launching the Web app 98 | You can launch a gradio UI to play with vampnet. 
99 | 
100 | ```bash
101 | python app.py
102 | ```
103 | 
104 | # Training / Fine-tuning
105 | 
106 | ## Training a model
107 | 
108 | To train a model, run the following script:
109 | 
110 | ```bash
111 | python scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints
112 | ```
113 | 
114 | for multi-gpu training, use torchrun:
115 | 
116 | ```bash
117 | torchrun --nproc_per_node gpu scripts/exp/train.py --args.load conf/vampnet.yml --save_path path/to/ckpt
118 | ```
119 | 
120 | You can edit `conf/vampnet.yml` to change the dataset paths or any training hyperparameters.
121 | 
122 | For coarse2fine models, you can use `conf/c2f.yml` as a starting configuration.
123 | 
124 | See `python scripts/exp/train.py -h` for a list of options.
125 | 
126 | ## Debugging training
127 | 
128 | To debug training, it's easiest to run with 1 GPU and 0 workers:
129 | 
130 | ```bash
131 | CUDA_VISIBLE_DEVICES=0 python -m pdb scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints --num_workers 0
132 | ```
133 | 
134 | ## Fine-tuning
135 | 
136 | To fine-tune a model, use the script in `scripts/exp/fine_tune.py`.
137 | 
138 | for an audio folder:
139 | ```bash
140 | python scripts/exp/fine_tune.py /path/to/audio/folder
141 | ```
142 | 
143 | for multiple files:
144 | ```bash
145 | python scripts/exp/fine_tune.py "/path/to/audio1.mp3 /path/to/audio2/ /path/to/audio3.wav"
146 | ```
147 | 
148 | This creates the configuration files for a fine-tuning training job. The save paths will be set to `runs/<name>/coarse` and `runs/<name>/c2f`.
149 | 
150 | launch the coarse job:
151 | ```bash
152 | python scripts/exp/train.py --args.load conf/generated/<name>/coarse.yml
153 | ```
154 | 
155 | this will save the coarse model to `runs/<name>/coarse/ckpt/best/`.
156 | 
157 | launch the c2f job:
158 | ```bash
159 | python scripts/exp/train.py --args.load conf/generated/<name>/c2f.yml
160 | ```
161 | 
162 | ## Resuming a training/fine-tuning job from checkpoint
163 | 
164 | To resume from checkpoint, use the `--resume` flag and set `--save_path` to the checkpoint you want to resume from:
165 | ```bash
166 | python scripts/exp/train.py --args.load conf/generated/steve/coarse.yml --save_path runs/steve/coarse --resume
167 | ```
168 | 
169 | ## Exporting your model
170 | 
171 | Once your model has been fine-tuned, you can export it to a HuggingFace model.
172 | 
173 | In order to use your model in `app.py`, you will need to export it to HuggingFace.
174 | 
175 | **NOTE**: In order to export, you will need a [huggingface account](https://huggingface.co/).
176 | 
177 | Now, log in to huggingface using the command line:
178 | ```bash
179 | huggingface-cli login
180 | ```
181 | 
182 | replace the contents of the file named `./DEFAULT_HF_MODEL_REPO` with your `<hf-username>/vampnet`. A model repo will be automatically created for you with `export.py`. The default is `hugggof/vampnet`.
183 | 
184 | for example, if my username is `hugggof`, I would run the following command:
185 | ```bash
186 | echo 'hugggof/vampnet' > ./DEFAULT_HF_MODEL_REPO
187 | ```
188 | 
189 | Now, run the following command to export your model (replace `<name>` with the name of your model):
190 | 
191 | ```bash
192 | python scripts/exp/export.py --name <name> --model latest
193 | ```
194 | 
195 | Once that's done, your model should appear in the list of available models in the gradio interface.
196 | Simply run `python app.py` and select your model from the dropdown list.
197 | 
198 | 
199 | # Unloop
200 | 
201 | Make sure you have Max installed on your laptop!
202 | 203 | **NOTE**: To run unloop (with a GPU-powered server), you will need to install the vampnet repo in both your local machine and your GPU server. 204 | 205 | ## start a vampnet gradio server 206 | 207 | First, **on your GPU server**, run the gradio server: 208 | ```bash 209 | python app.py --args.load conf/interface.yml --Interface.device cuda 210 | ``` 211 | This will run a vampnet gradio API on your GPU server. Copy the address. It will be something like `https://127.0.0.1:7860/`. 212 | 213 | **IMPORTANT** Make sure that this gradio port (by default `7860`) is forwarded to your local machine, where you have Max installed. 214 | 215 | ## start the unloop gradio client 216 | Now, **on your local machine**, run the unloop gradio client. 217 | ``` 218 | cd unloop 219 | pip install -r requirements.txt 220 | python client.py --vampnet_url https://127.0.0.1:7860/ # replace with your gradio server address 221 | ``` 222 | This will start a gradio client that connects to the gradio server running on your GPU server. 223 | 224 | ## start the unloop Max patch 225 | Now, open the unloop Max patch. It's located at `unloop/max/unloop.maxpat`. 226 | 227 | In the tape controls, check the heartbeat (`<3`) to make sure the connection to the local gradio client is working. 228 | 229 | have fun! 230 | 231 | # Token Telephone 232 | 233 | Instructions forthcoming, but the sauce is in `token_telephone/tt.py` 234 | 235 | ## A note on argbind 236 | This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files. 237 | Config files are stored in the `conf/` folder. 238 | 239 | ### Take a look at the pretrained models 240 | All the pretrained models (trained by hugo) are stored here: https://huggingface.co/hugggof/vampnet 241 | 242 | ### Licensing for Pretrained Models: 243 | The weights for the models are licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml). Likewise, any VampNet models fine-tuned on the pretrained models are also licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml). 244 | 245 | Download the pretrained models from [this link](https://zenodo.org/record/8136629). Then, extract the models to the `models/` folder. 
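### A quick argbind example

As a companion to the note on argbind above: the scripts in this repo all follow the same pattern. Here's a minimal sketch of it (the function, flags, and values below are made up for illustration, not taken from the codebase):

```python
import argbind

@argbind.bind(without_prefix=True)
def main(lr: float = 1e-4, save_path: str = "runs/debug"):
    # every keyword argument doubles as a CLI flag and a config-file key
    print(lr, save_path)

if __name__ == "__main__":
    # flags mirror the signature: python my_script.py --lr 3e-4 --save_path runs/x
    # configs load the same way:  python my_script.py --args.load conf/my.yml
    args = argbind.parse_args()
    with argbind.scope(args):
        main()
```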
246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /vampnet/beats.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import warnings 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import Any 7 | from typing import List 8 | from typing import Tuple 9 | from typing import Union 10 | 11 | import librosa 12 | import torch 13 | import numpy as np 14 | from audiotools import AudioSignal 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | ################### 20 | # beat sync utils # 21 | ################### 22 | 23 | AGGREGATOR_REGISTRY = { 24 | "mean": np.mean, 25 | "median": np.median, 26 | "max": np.max, 27 | "min": np.min, 28 | } 29 | 30 | 31 | def list_aggregators() -> list: 32 | return list(AGGREGATOR_REGISTRY.keys()) 33 | 34 | 35 | @dataclass 36 | class TimeSegment: 37 | start: float 38 | end: float 39 | 40 | @property 41 | def duration(self): 42 | return self.end - self.start 43 | 44 | def __str__(self) -> str: 45 | return f"{self.start} - {self.end}" 46 | 47 | def find_overlapping_segment( 48 | self, segments: List["TimeSegment"] 49 | ) -> Union["TimeSegment", None]: 50 | """Find the first segment that overlaps with this segment, or None if no segment overlaps""" 51 | for s in segments: 52 | if s.start <= self.start and s.end >= self.end: 53 | return s 54 | return None 55 | 56 | 57 | def mkdir(path: Union[Path, str]) -> Path: 58 | p = Path(path) 59 | p.mkdir(parents=True, exist_ok=True) 60 | return p 61 | 62 | 63 | 64 | ################### 65 | # beat data # 66 | ################### 67 | @dataclass 68 | class BeatSegment(TimeSegment): 69 | downbeat: bool = False # if there's a downbeat on the start_time 70 | 71 | 72 | class Beats: 73 | def __init__(self, beat_times, downbeat_times): 74 | if isinstance(beat_times, np.ndarray): 75 | beat_times = beat_times.tolist() 76 | if isinstance(downbeat_times, np.ndarray): 77 | downbeat_times = downbeat_times.tolist() 78 | self._beat_times = beat_times 79 | self._downbeat_times = downbeat_times 80 | self._use_downbeats = False 81 | 82 | def use_downbeats(self, use_downbeats: bool = True): 83 | """use downbeats instead of beats when calling beat_times""" 84 | self._use_downbeats = use_downbeats 85 | 86 | def beat_segments(self, signal: AudioSignal) -> List[BeatSegment]: 87 | """ 88 | segments a song into time segments corresponding to beats. 89 | the first segment starts at 0 and ends at the first beat time. 90 | the last segment starts at the last beat time and ends at the end of the song. 
91 | """ 92 | beat_times = self._beat_times.copy() 93 | downbeat_times = self._downbeat_times 94 | beat_times.insert(0, 0) 95 | beat_times.append(signal.signal_duration) 96 | 97 | downbeat_ids = np.intersect1d(beat_times, downbeat_times, return_indices=True)[ 98 | 1 99 | ] 100 | is_downbeat = [ 101 | True if i in downbeat_ids else False for i in range(len(beat_times)) 102 | ] 103 | segments = [ 104 | BeatSegment(start_time, end_time, downbeat) 105 | for start_time, end_time, downbeat in zip( 106 | beat_times[:-1], beat_times[1:], is_downbeat 107 | ) 108 | ] 109 | return segments 110 | 111 | def get_beats(self) -> np.ndarray: 112 | """returns an array of beat times, in seconds 113 | if downbeats is True, returns an array of downbeat times, in seconds 114 | """ 115 | return np.array( 116 | self._downbeat_times if self._use_downbeats else self._beat_times 117 | ) 118 | 119 | @property 120 | def beat_times(self) -> np.ndarray: 121 | """return beat times""" 122 | return np.array(self._beat_times) 123 | 124 | @property 125 | def downbeat_times(self) -> np.ndarray: 126 | """return downbeat times""" 127 | return np.array(self._downbeat_times) 128 | 129 | def beat_times_to_feature_frames( 130 | self, signal: AudioSignal, features: np.ndarray 131 | ) -> np.ndarray: 132 | """convert beat times to frames, given an array of time-varying features""" 133 | beat_times = self.get_beats() 134 | beat_frames = ( 135 | beat_times * signal.sample_rate / signal.signal_length * features.shape[-1] 136 | ).astype(np.int64) 137 | return beat_frames 138 | 139 | def sync_features( 140 | self, feature_frames: np.ndarray, features: np.ndarray, aggregate="median" 141 | ) -> np.ndarray: 142 | """sync features to beats""" 143 | if aggregate not in AGGREGATOR_REGISTRY: 144 | raise ValueError(f"unknown aggregation method {aggregate}") 145 | 146 | return librosa.util.sync( 147 | features, feature_frames, aggregate=AGGREGATOR_REGISTRY[aggregate] 148 | ) 149 | 150 | def to_json(self) -> dict: 151 | """return beats and downbeats as json""" 152 | return { 153 | "beats": self._beat_times, 154 | "downbeats": self._downbeat_times, 155 | "use_downbeats": self._use_downbeats, 156 | } 157 | 158 | @classmethod 159 | def from_dict(cls, data: dict): 160 | """load beats and downbeats from json""" 161 | inst = cls(data["beats"], data["downbeats"]) 162 | inst.use_downbeats(data["use_downbeats"]) 163 | return inst 164 | 165 | def save(self, output_dir: Path): 166 | """save beats and downbeats to json""" 167 | mkdir(output_dir) 168 | with open(output_dir / "beats.json", "w") as f: 169 | json.dump(self.to_json(), f) 170 | 171 | @classmethod 172 | def load(cls, input_dir: Path): 173 | """load beats and downbeats from json""" 174 | beats_file = Path(input_dir) / "beats.json" 175 | with open(beats_file, "r") as f: 176 | data = json.load(f) 177 | return cls.from_dict(data) 178 | 179 | 180 | ################### 181 | # beat tracking # 182 | ################### 183 | 184 | 185 | class BeatTracker: 186 | def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]: 187 | """extract beats from an audio signal""" 188 | raise NotImplementedError 189 | 190 | def __call__(self, signal: AudioSignal) -> Beats: 191 | """extract beats from an audio signal 192 | NOTE: if the first beat (and/or downbeat) is detected within the first 100ms of the audio, 193 | it is discarded. This is to avoid empty bins with no beat synced features in the first beat. 
194 | Args: 195 | signal (AudioSignal): signal to beat track 196 | Returns: 197 | Tuple[np.ndarray, np.ndarray]: beats and downbeats 198 | """ 199 | beats, downbeats = self.extract_beats(signal) 200 | return Beats(beats, downbeats) 201 | 202 | 203 | class WaveBeat(BeatTracker): 204 | def __init__(self, ckpt_path: str = "checkpoints/wavebeat", device: str = "cpu"): 205 | from wavebeat.dstcn import dsTCNModel 206 | 207 | model = dsTCNModel.load_from_checkpoint(ckpt_path, map_location=torch.device(device)) 208 | model.eval() 209 | 210 | self.device = device 211 | self.model = model 212 | 213 | def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]: 214 | """returns beat and downbeat times, in seconds""" 215 | # extract beats 216 | self.model.to('cuda' if torch.cuda.is_available() else 'cpu') 217 | beats, downbeats = self.model.predict_beats_from_array( 218 | audio=signal.audio_data.squeeze(0), 219 | sr=signal.sample_rate, 220 | use_gpu=torch.cuda.is_available(), 221 | ) 222 | 223 | return beats, downbeats 224 | 225 | 226 | class MadmomBeats(BeatTracker): 227 | def __init__(self): 228 | raise NotImplementedError 229 | 230 | def extract_beats(self, signal: AudioSignal) -> Tuple[np.ndarray, np.ndarray]: 231 | """returns beat and downbeat times, in seconds""" 232 | pass 233 | 234 | 235 | BEAT_TRACKER_REGISTRY = { 236 | "wavebeat": WaveBeat, 237 | "madmom": MadmomBeats, 238 | } 239 | 240 | 241 | def list_beat_trackers() -> list: 242 | return list(BEAT_TRACKER_REGISTRY.keys()) 243 | 244 | 245 | def load_beat_tracker(beat_tracker: str, **kwargs) -> BeatTracker: 246 | if beat_tracker not in BEAT_TRACKER_REGISTRY: 247 | raise ValueError( 248 | f"Unknown beat tracker {beat_tracker}. Available: {list_beat_trackers()}" 249 | ) 250 | 251 | return BEAT_TRACKER_REGISTRY[beat_tracker](**kwargs) -------------------------------------------------------------------------------- /unloop/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | import shutil 4 | import json 5 | 6 | import argbind 7 | import audiotools as at 8 | from gradio_client import Client, handle_file 9 | from pythonosc.osc_server import ThreadingOSCUDPServer 10 | from pythonosc.udp_client import SimpleUDPClient 11 | from pythonosc.dispatcher import Dispatcher 12 | import torch 13 | 14 | class Timer: 15 | 16 | def __init__(self): 17 | self.times = {} 18 | 19 | def tick(self, name: str): 20 | self.times[name] = time.time() 21 | 22 | def tock(self, name: str): 23 | toc = time.time() - self.times[name] 24 | print(f"{name} took {toc} seconds") 25 | return toc 26 | 27 | def __str__(self): 28 | return str(self.times) 29 | 30 | timer = Timer() 31 | 32 | DOWNLOADS_DIR = ".gradio" 33 | 34 | def clear_file(file): 35 | file = Path(file) 36 | if file.exists(): 37 | file.unlink() 38 | 39 | 40 | class OSCManager: 41 | 42 | def __init__( 43 | self, 44 | ip: str, 45 | s_port: str, 46 | r_port: str, 47 | process_fn: callable, 48 | # param_change_callback: callable = None 49 | ): 50 | self.ip = ip 51 | self.s_port = s_port 52 | self.r_port = r_port 53 | 54 | # register the process_fn 55 | self.process_fn = process_fn 56 | 57 | print(f"will send to {ip}:{s_port}") 58 | self.client = SimpleUDPClient(ip, s_port) 59 | 60 | 61 | def start_server(self,): 62 | dispatcher = Dispatcher() 63 | dispatcher.map("/process", self.process_fn) 64 | 65 | def send_heartbeat(_, *args): 66 | # print("Received heartbeat") 67 | self.client.send_message("/heartbeat", "pong") 68 | 
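        # OSC routing used by the Max patch: /process runs the full gradio
        # round-trip (process_fn), /heartbeat answers "pong" so the patch can
        # show connectivity, /cleanup deletes a temp file it is done with, and
        # anything else falls through to the default print handler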
69 | dispatcher.map("/heartbeat", lambda a, *r: send_heartbeat(a, *r)) 70 | 71 | dispatcher.map("/cleanup", lambda a, *r: clear_file(r[0])) 72 | 73 | dispatcher.set_default_handler(lambda a, *r: print(a, r)) 74 | 75 | server = ThreadingOSCUDPServer((self.ip, self.r_port), dispatcher) 76 | print(f"Serving on {server.server_address}") 77 | server.serve_forever() 78 | 79 | def error(self, msg: str): 80 | self.client.send_message("/error", msg) 81 | 82 | def log(self, msg: str): 83 | self.client.send_message("/log", msg) 84 | 85 | 86 | class GradioOSCClient: 87 | 88 | def __init__(self, 89 | ip: str, 90 | s_port: int, r_port: int, 91 | vampnet_url: str = None, # url for vampnet 92 | ): 93 | self.osc_manager = OSCManager( 94 | ip=ip, s_port=s_port, r_port=r_port, 95 | process_fn=self.process, 96 | ) 97 | 98 | self.clients = {} 99 | if vampnet_url is not None: 100 | self.clients["vampnet"] = Client(src=vampnet_url, download_files=DOWNLOADS_DIR) 101 | 102 | assert len(self.clients) > 0, "At least one client must be specified!" 103 | 104 | self.batch_size = 2# TODO: automatically get batch size from client. 105 | 106 | self.osc_manager.log("hello from gradio client!") 107 | 108 | self.inf_idx = 0 109 | 110 | 111 | def param_changed(self, param_name, new_value): 112 | print(f"Parameter {param_name} changed to {new_value}") 113 | 114 | def vampnet_process(self, address: str, *args): 115 | client = self.clients["vampnet"] 116 | 117 | # query id --- audiofile ---- model_choice --- periodic --- drop --- seed 118 | query_id = args[0] 119 | client_type = args[1] 120 | audio_path = Path(args[2]) 121 | model_choice = args[3] 122 | periodic_p = args[4] 123 | dropout = args[5] 124 | seed = args[6] 125 | looplength_ms = args[7] 126 | typical_filter = args[8] 127 | typical_mass = args[9] 128 | typical_min_tokens = args[10] 129 | upper_codebook_mask = args[11] 130 | onset_mask_width = args[12] 131 | sampling_steps = args[13] 132 | temperature = args[14] 133 | top_p = args[15] 134 | beat_mask_ms = args[16] 135 | num_feedback_steps = args[17] 136 | 137 | if not audio_path.exists(): 138 | print(f"File {audio_path} does not exist") 139 | self.osc_manager.error(f"File {audio_path} does not exist") 140 | return 141 | 142 | sig = at.AudioSignal(audio_path) 143 | sig.to_mono() 144 | sig.sample_rate = 48000 # HOT PATCH (FIXME IN MAX: sample rate is being forced to 48k) 145 | 146 | # grab the looplength only 147 | # TODO: although I added this, 148 | # the max patch is still configured to crop anything past the looplength off 149 | # so we'll have to change that in order to make an effect. 
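        # ms -> samples: end_sample = looplength_ms * sample_rate / 1000
        # (e.g. 2000 ms at 48 kHz -> 96000 samples); the tail past the loop
        # point is held aside as cut_wav and re-appended to each output below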
150 |         end_sample = int((looplength_ms * sig.sample_rate) / 1000)
151 | 
152 |         # grab the remainder of the waveform
153 |         num_cut_samples = sig.samples.shape[-1] - end_sample
154 |         cut_wav = sig.samples[..., -num_cut_samples:]
155 | 
156 |         sig.samples = sig.samples[..., :end_sample]
157 |         # write the file back
158 |         sig.write(audio_path)
159 | 
160 |         timer.tick("predict")
161 |         print(f"Processing {address} with args {args}")
162 |         # breakpoint()
163 |         job = client.submit(
164 |             input_audio=handle_file(audio_path),
165 |             sampletemp=temperature,
166 |             top_p=top_p,
167 |             periodic_p=periodic_p,
168 |             dropout=dropout,
169 |             stretch_factor=1,
170 |             onset_mask_width=onset_mask_width,
171 |             typical_filtering=bool(typical_filter),
172 |             typical_mass=typical_mass,
173 |             typical_min_tokens=typical_min_tokens,
174 |             seed=seed,
175 |             model_choice=model_choice,
176 |             n_mask_codebooks=upper_codebook_mask,
177 |             pitch_shift_amt=0,
178 |             sample_cutoff=1.0,
179 |             sampling_steps=sampling_steps,
180 |             beat_mask_ms=int(beat_mask_ms),
181 |             num_feedback_steps=num_feedback_steps,
182 |             api_name="/vamp_1"
183 |         )
184 | 
185 |         while not job.done():
186 |             time.sleep(0.1)
187 |             self.osc_manager.client.send_message("/progress", [query_id, str(job.status().code)])
188 | 
189 |         result = job.result()
190 |         # audio_file = result
191 |         # audio_files = [audio_file] * self.batch_size
192 |         audio_files = list(result[:self.batch_size])
193 |         # if each file is missing a .wav at the end, add it
194 |         first_audio = audio_files[0]
195 |         if not first_audio.endswith(".wav"):
196 |             for audio_file in set(audio_files):
197 |                 if not audio_file.endswith(".wav"):
198 |                     shutil.move(audio_file, f"{audio_file}.wav")
199 |             # rename every entry to match the files moved above
200 |             audio_files = [f"{audio}.wav" if not audio.endswith(".wav") else audio for audio in audio_files]
201 | 
202 |         for audio_file in audio_files:
203 |             # load the file, add the cut samples back
204 |             sig = at.AudioSignal(audio_file)
205 |             sig.resample(48000)
206 |             sig.samples = torch.cat([sig.samples, cut_wav], dim=-1)
207 |             sig.write(audio_file)
208 |         seed = result[-1]
209 | 
210 |         timer.tock("predict")
211 | 
212 |         # send a message that the process is done
213 |         self.osc_manager.log(f"query {query_id} has been processed")
214 |         self.osc_manager.client.send_message("/process-result", [query_id] + audio_files)
215 | 
216 | 
217 |     def process(self, address: str, *args):
218 |         query_id = args[0]
219 |         client_type = args[1]
220 |         audio_path = Path(args[2])
221 | 
222 |         if client_type == "vampnet":
223 |             self.vampnet_process(address, *args)
224 |             return
225 |         elif client_type == "sketch2sound":
226 |             self.process_s2s(address, *args)
227 |             return
228 |         else:
229 |             raise ValueError(f"Unknown client type {client_type}")
230 | 
231 | def gradio_main(
232 |     vampnet_url: str = None
233 | ):
234 |     system = GradioOSCClient(
235 |         vampnet_url=vampnet_url,
236 |         ip="127.0.0.1", s_port=8003, r_port=8001,
237 |     )
238 | 
239 |     system.osc_manager.start_server()
240 | 
241 | 
242 | if __name__ == "__main__":
243 |     try:
244 |         gradio_main = argbind.bind(gradio_main, without_prefix=True)
245 | 
246 |         args = argbind.parse_args()
247 |         with argbind.scope(args):
248 |             gradio_main()
249 | 
250 |     except Exception as e:
251 |         import shutil
252 |         shutil.rmtree(DOWNLOADS_DIR, ignore_errors=True)
253 |         raise e
-------------------------------------------------------------------------------- /vampnet/control.py: --------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from functools
import partial 3 | from typing import Optional 4 | 5 | from torch import nn 6 | 7 | import vampnet.dsp.signal as sn 8 | from vampnet.dsp.signal import Signal 9 | from vampnet.mask import random_along_time 10 | from torch import Tensor 11 | import torch 12 | 13 | 14 | class MedianFilterAugment(nn.Module): 15 | 16 | def __init__(self, 17 | kernel_size: int, 18 | train_min: int = 1, 19 | train_max: int = 20, 20 | ): 21 | super().__init__() 22 | self.kernel_size = kernel_size 23 | self.train_min = train_min 24 | self.train_max = train_max 25 | 26 | def forward(self, x: Tensor) -> Tensor: 27 | if self.training: 28 | sizes = torch.randint( 29 | self.train_min, 30 | self.train_max, 31 | size=(x.shape[0],) 32 | ) 33 | else: 34 | sizes = self.kernel_size 35 | # print(f"median filter sizes: {sizes}") 36 | return sn.median_filter_1d(x, sizes) 37 | 38 | class RMS(nn.Module): 39 | 40 | def __init__(self, 41 | hop_length, 42 | window_length=2048, 43 | n_quantize=None, 44 | sample_rate=44100, 45 | median_filter_size: Optional[int] = None, 46 | train_median_filter_min=1, 47 | train_median_filter_max=15, 48 | ): 49 | super().__init__() 50 | 51 | self.hop_length = hop_length 52 | self.window_length = window_length 53 | self.n_quantize = n_quantize 54 | self.sample_rate = sample_rate 55 | 56 | self.mf = MedianFilterAugment( 57 | kernel_size=median_filter_size, 58 | train_min=train_median_filter_min, 59 | train_max=train_median_filter_max 60 | ) if median_filter_size is not None else None 61 | 62 | @property 63 | def dim(self): 64 | return 1 65 | 66 | def extract(self, sig: Signal) -> Tensor: 67 | rmsd = sn.rms(sig, 68 | window_length=self.window_length, 69 | hop_length=self.hop_length, 70 | )[:, :, :-1] # TODO: cutting the last frame to match DAC tokens but why :'( 71 | nb, _, _ = rmsd.shape 72 | 73 | if self.n_quantize is not None: 74 | # standardize to 0-1 75 | rmsd = (rmsd - rmsd.min()) / (rmsd.max() - rmsd.min()) 76 | 77 | # quantize to 128 steps 78 | rmsd = torch.round(rmsd * self.n_quantize) 79 | rmsd = rmsd / self.n_quantize 80 | 81 | if self.mf is not None: 82 | rmsd = self.mf(rmsd) 83 | 84 | return rmsd 85 | 86 | 87 | 88 | class HarmonicChroma(nn.Module): 89 | 90 | def __init__(self, 91 | hop_length: int, window_length: int = 4096, 92 | n_chroma: int = 48, sample_rate: int = 44100, 93 | top_n: int = 0 94 | ): 95 | super().__init__() 96 | from torchaudio.prototype.transforms import ChromaScale 97 | self.hop_length = hop_length 98 | self.window_length = window_length 99 | self.n_chroma = n_chroma 100 | self.sample_rate = sample_rate 101 | self.top_n = top_n 102 | 103 | # HUGO: this representation, as is, 104 | # encodes timbre information in the chroma 105 | # which is not what we want!!! 106 | # would a median filter help perhaps? 
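        # ChromaScale folds the (window_length // 2 + 1) STFT bins onto
        # n_chroma pitch-class bins; with the default n_chroma=48, each of the
        # 12 semitones gets 4 chroma bins (quarter-tone resolution)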
107 |         self.chroma = ChromaScale(
108 |             sample_rate=self.sample_rate,
109 |             n_freqs=self.window_length // 2 + 1,
110 |             n_chroma=self.n_chroma,
111 |             octwidth=5.0,
112 |         )
113 | 
114 |     @property
115 |     def dim(self):
116 |         return self.n_chroma
117 | 
118 |     def extract(self, sig: Signal) -> Tensor:
119 |         from vampnet.dsp.hpss import hpss
120 |         self.chroma.to(sig.wav.device)
121 | 
122 |         # spectrogram
123 |         spec = sn.stft(sig,
124 |             window_length=self.window_length,
125 |             hop_length=self.hop_length
126 |         )
127 |         # magnitude
128 |         spec = torch.abs(spec)
129 | 
130 |         # hpss
131 |         spec = hpss(spec, kernel_size=51, hard=True)[0]
132 | 
133 |         # chroma
134 |         chroma = self.chroma(spec)
135 | 
136 |         # get the rms of this spec
137 |         rms_d = sn.rms_from_spec(
138 |             spec, window_length=self.window_length
139 |         )
140 | 
141 |         # convert the rms to db
142 |         rms_d = 10 * torch.log10(rms_d + 1e-7)
143 | 
144 |         # make a mask based on the rms < -40
145 |         mask = torch.where(rms_d < -40, torch.zeros_like(rms_d), torch.ones_like(rms_d))
146 | 
147 |         # zero out chroma values below 100 (empirical threshold; provenance unclear)
148 |         chroma = torch.where(chroma < 100, torch.zeros_like(chroma), chroma)
149 | 
150 |         # Get the top_n values and indices along the chroma dimension (-2)
151 |         if self.top_n > 0:
152 |             _, topk_indices = torch.topk(chroma, self.top_n, dim=-2)
153 | 
154 |             # Create a mask for the top_n values
155 |             topk_mask = torch.zeros_like(chroma).scatter_(-2, topk_indices, 1.0)
156 | 
157 |             # Retain only the top_n values
158 |             chroma = chroma * topk_mask
159 | 
160 |         # apply the mask
161 |         chroma = chroma * mask.unsqueeze(-2)
162 | 
163 |         # Apply softmax along dim=-2
164 |         if self.top_n > 0:
165 |             chroma = torch.nn.functional.softmax(chroma, dim=-2)
166 | 
167 |         # mask out any timesteps whose chroma have all equal values (all 0s before softmax)
168 |         # TODO: i did this with chatgpt, there's gotta be a better way
169 |         chroma_mean = chroma.mean(dim=-2, keepdim=True)
170 |         chroma_diff = torch.abs(chroma - chroma_mean)
171 |         equal_mask = torch.all(chroma_diff < 1e-6, dim=-2, keepdim=True)
172 | 
173 |         # Set chroma values to zero for timesteps with all equal values
174 |         chroma = torch.where(equal_mask, torch.zeros_like(chroma), chroma)
175 | 
176 | 
177 |         return chroma[:, 0, :, :-1]  # mono only :( FIX ME!
178 | 
179 | 
180 | # TODO: try harmonic mel?
181 | 
182 | CONTROLLERS = {
183 |     "rms": RMS,
184 |     "rmsq128": partial(RMS, n_quantize=128),
185 |     "rmsq16": partial(RMS, n_quantize=16),
186 |     "rms-median": partial(RMS, median_filter_size=5),
187 |     "rmsq16-median": partial(RMS, n_quantize=16, median_filter_size=3),
188 |     "hchroma": HarmonicChroma,
189 |     "hchroma-12c-top2": partial(HarmonicChroma, n_chroma=12, top_n=2),  # TODO: refactor me. If this works, this should just be named hchroma.
190 |     "hchroma-36c-top3": partial(HarmonicChroma, n_chroma=36, top_n=3)   # TODO: refactor me. If this works, this should just be named hchroma.
191 | }
192 | 
193 | class Sketch2SoundController(nn.Module):
194 | 
195 |     def __init__(
196 |         self,
197 |         ctrl_keys: list[str],
198 |         hop_length: int,
199 |         sample_rate: int,
200 |     ):
201 |         super().__init__()
202 | 
203 |         assert all([k in CONTROLLERS for k in ctrl_keys]), f"got an unsupported control key in {ctrl_keys}!\n supported: {CONTROLLERS.keys()}"
204 | 
205 |         self.hop_length = hop_length
206 |         self.ctrl_keys = ctrl_keys
207 |         self.sample_rate = sample_rate
208 | 
209 |         self.controllers = {
210 |             k: CONTROLLERS[k](hop_length=hop_length, sample_rate=sample_rate)
211 |             for k in self.ctrl_keys
212 |         }
213 | 
214 |     @property
215 |     def ctrl_dims(self) -> dict[str, int]:
216 |         return {
217 |             k: controller.dim for k, controller in self.controllers.items()
218 |         }
219 | 
220 |     def extract(self, sig: Signal) -> dict[str, Tensor]:
221 |         ctrls = {
222 |             k: controller.extract(sig) for k, controller in self.controllers.items()
223 |         }
224 |         return ctrls
225 | 
226 |     def random_mask(self, ctrls: dict[str, Tensor], r: float):
227 |         masks = {}
228 |         for k, ctrl in ctrls.items():
229 |             masks[k] = 1 - random_along_time(ctrl, r)
230 |         return masks
231 | 
232 |     def empty_mask(self, ctrls: dict[str, Tensor]):
233 |         first_key = next(iter(ctrls))
234 |         mask = torch.zeros_like(ctrls[first_key])
235 |         return {k: mask for k in ctrls}
236 | 
237 | 
238 | def test_controller():
239 |     controller = Sketch2SoundController(
240 |         ctrl_keys=["rms-median", "rms", "rmsq128"],
241 |         hop_length=512,
242 |         sample_rate=44100
243 |     )
244 |     controller.train()
245 |     # sig = sn.read_from_file("assets/example.wav")
246 |     # sig = sn.read_from_file("/Users/hugo/Downloads/DCS_SE_FullChoir_ScaleUpDown06_A2_DYN.wav")
247 |     # sig = sn.excerpt('/Users/hugo/Downloads/(guitarra - hugo mix) bubararu - tambor negro.wav', offset=0, duration=10)
248 |     sig = sn.read_from_file("assets/voice-prompt.wav")
249 |     ctrls = controller.extract(sig)
250 |     print(f"given sig of shape {sig.wav.shape}, extracted controls: {ctrls}")
251 | 
252 |     # print the whole thing
253 |     # torch.set_printoptions(profile="full")
254 |     # print(ctrls["hchroma"][0][0][:, 200:210])
255 | 
256 |     # imshow the chroma
257 |     import matplotlib.pyplot as plt
258 | 
259 |     # lay out the four subplots (spectrogram on top, three rms variants below)
260 |     fig, (ax1, ax2, ax3, ax4) = plt.subplots(
261 |         4, 1,
262 |         sharex=True,
263 |     )
264 | 
265 |     # Display the spectrogram on the top
266 |     ax1.imshow(sn.stft(sig, hop_length=512, window_length=2048).abs()[0][0].cpu().log().numpy(), aspect='auto', origin='lower')
267 |     # display rms on the bottom
268 |     ax2.plot(ctrls["rms-median"][0][0])
269 |     ax3.plot(ctrls["rms"][0][0])
270 |     ax4.plot(ctrls["rmsq128"][0][0])
271 | 
272 |     plt.tight_layout()  # Ensure proper spacing
273 |     plt.savefig("img.png")
274 | 
275 | 
276 | if __name__ == "__main__":
277 |     test_controller()
-------------------------------------------------------------------------------- /scripts/utils/visualize_embeddings.py: --------------------------------------------------------------------------------
1 | """
2 | TODO: train a linear probe
3 | usage:
4 |     python visualize_embeddings.py --args.load conf/interface.yml --Interface.device cuda --path_to_audio /path/to/audio/labels --output_dir /path/to/output
5 | """
6 | from pathlib import Path
7 | from typing import List
8 | 
9 | import audiotools as at
10 | from audiotools import AudioSignal
11 | import argbind
12 | import torch
13 | import numpy as np
14 | import zipfile
15 | import json
16 | 
17 | from vampnet.interface import Interface
18 | import tqdm
19 | 
20 | # bind the Interface to argbind
21 | Interface = argbind.bind(Interface)
22 | 
23 | DEBUG = False
24 | 
25 | 
26 | def smart_plotly_export(fig, save_path: Path):
27 |     img_format = save_path.suffix[1:]
28 |     if img_format == "html":
29 |         fig.write_html(save_path)
30 |     elif img_format == 'bytes':
31 |         return fig.to_image(format='png')
32 |     # TODO: come back and make this prettier
33 |     elif img_format == 'numpy':
34 |         import io
35 |         from PIL import Image
36 | 
37 |         def plotly_fig2array(fig):
38 |             # convert Plotly fig to an array
39 |             fig_bytes = fig.to_image(format="png", width=1200, height=700)
40 |             buf = io.BytesIO(fig_bytes)
41 |             img = Image.open(buf)
42 |             return np.asarray(img)
43 | 
44 |         return plotly_fig2array(fig)
45 |     elif img_format in ('jpeg', 'png', 'webp'):
46 |         fig.write_image(save_path)
47 |     else:
48 |         raise ValueError("invalid image format")
49 | 
50 | 
51 | def dim_reduce(annotated_embeddings, layer, output_dir, n_components=3, method="tsne"):
52 |     """
53 |     dimensionality reduction for visualization!
54 |     saves an html plotly figure to output_dir
55 |     parameters:
56 |         annotated_embeddings (list): AnnotatedEmbedding objects to reduce; each embedding has shape (layers, features)
57 |         layer (int): which model layer's embeddings to project
58 |         output_dir (Path): directory where the figure gets saved
59 |         n_components (int): number of projection dimensions (2 or 3)
60 |         method (str): umap, tsne, or pca
61 |     returns:
62 |         the exported figure (see smart_plotly_export)
63 |     """
64 |     import pandas as pd
65 |     import plotly.express as px
66 | 
67 |     fig_name = f"vampnet-embeddings-layer={layer}"
68 |     fig_title = f"{fig_name}_{method}"
69 |     save_path = (output_dir / fig_name).with_suffix(".html")
70 | 
71 |     if method == "umap":
72 |         from umap import UMAP
73 |         reducer = UMAP(n_components=n_components)
74 |     elif method == "tsne":
75 |         from sklearn.manifold import TSNE
76 | 
77 |         reducer = TSNE(n_components=n_components)
78 |     elif method == "pca":
79 |         from sklearn.decomposition import PCA
80 | 
81 |         reducer = PCA(n_components=n_components)
82 |     else:
83 |         raise ValueError(f"invalid method: {method}")
84 | 
85 |     labels = [emb.label for emb in annotated_embeddings]
86 |     names = [emb.filename for emb in annotated_embeddings]
87 |     embs = [emb.embedding for emb in annotated_embeddings]
88 |     embs_at_layer = np.stack(embs)[:, layer, :]
89 |     projs = reducer.fit_transform(embs_at_layer)
90 | 
91 |     df = pd.DataFrame(
92 |         {
93 |             "label": labels,
94 |             "name": names,
95 |             "x": projs[:, 0],
96 |             "y": projs[:, 1],
97 |         }
98 |     )
99 |     if n_components == 2:
100 |         fig = px.scatter(
101 |             df, x="x", y="y", color="label", hover_name="name", title=fig_title,
102 |         )
103 | 
104 |     elif n_components == 3:
105 |         df['z'] = projs[:, 2]
106 |         fig = px.scatter_3d(
107 |             df, x="x", y="y", z="z", color="label", hover_name="name", title=fig_title
108 |         )
109 |     else:
110 |         raise ValueError(f"can't plot {n_components} components")
111 | 
112 |     fig.update_traces(
113 |         marker=dict(size=6, line=dict(width=1, color="DarkSlateGrey")),
114 |         selector=dict(mode="markers"),
115 |     )
116 | 
117 |     return smart_plotly_export(fig, save_path)
118 | 
119 | 
120 | 
121 | # per JukeMIR, we want the embeddings from the middle layer?
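# note on shapes: vampnet(..., return_activations=True) returns per-layer
# activations shaped [n_layers, batch, time, dim]; vampnet_embed below
# mean-pools over time to leave one [n_layers, dim] matrix per clip, and
# dim_reduce() above slices out a single layer before projecting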
122 | def vampnet_embed(sig: AudioSignal, interface: Interface, layer=10): 123 | with torch.inference_mode(): 124 | # preprocess the signal 125 | sig = interface.preprocess(sig) 126 | 127 | # get the coarse vampnet model 128 | vampnet = interface.coarse 129 | 130 | # get the tokens 131 | z = interface.encode(sig)[:, :vampnet.n_codebooks, :] 132 | z_latents = vampnet.embedding.from_codes(z, interface.codec) 133 | 134 | # do a forward pass through the model, get the embeddings 135 | _z, embeddings = vampnet(z_latents, return_activations=True) 136 | # print(f"got embeddings with shape {embeddings.shape}") 137 | # [layer, batch, time, n_dims] 138 | # [20, 1, 600ish, 768] 139 | 140 | 141 | # squeeze batch dim (1 bc layer should be dim 0) 142 | assert embeddings.shape[1] == 1, f"expected batch dim to be 1, got {embeddings.shape[0]}" 143 | embeddings = embeddings.squeeze(1) 144 | 145 | num_layers = embeddings.shape[0] 146 | assert layer < num_layers, f"layer {layer} is out of bounds for model with {num_layers} layers" 147 | 148 | # do meanpooling over the time dimension 149 | embeddings = embeddings.mean(dim=-2) 150 | # [20, 768] 151 | 152 | # return the embeddings 153 | return embeddings 154 | 155 | from dataclasses import dataclass, fields 156 | @dataclass 157 | class AnnotatedEmbedding: 158 | label: str 159 | filename: str 160 | embedding: np.ndarray 161 | 162 | def save(self, path): 163 | """Save the Embedding object to a given path as a zip file.""" 164 | with zipfile.ZipFile(path, 'w') as archive: 165 | 166 | # Save numpy array 167 | with archive.open('embedding.npy', 'w') as f: 168 | np.save(f, self.embedding) 169 | 170 | # Save non-numpy data as json 171 | non_numpy_data = {f.name: getattr(self, f.name) for f in fields(self) if f.name != 'embedding'} 172 | with archive.open('data.json', 'w') as f: 173 | f.write(json.dumps(non_numpy_data).encode('utf-8')) 174 | 175 | @classmethod 176 | def load(cls, path): 177 | """Load the Embedding object from a given zip path.""" 178 | with zipfile.ZipFile(path, 'r') as archive: 179 | 180 | # Load numpy array 181 | with archive.open('embedding.npy') as f: 182 | embedding = np.load(f) 183 | 184 | # Load non-numpy data from json 185 | with archive.open('data.json') as f: 186 | data = json.loads(f.read().decode('utf-8')) 187 | 188 | return cls(embedding=embedding, **data) 189 | 190 | 191 | @argbind.bind(without_prefix=True) 192 | def main( 193 | path_to_audio: str = None, 194 | cache_dir: str = "./.emb_cache", 195 | output_dir: str = "./vampnet_embeddings", 196 | layers: List[int] = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19], 197 | method: str = "tsne", 198 | n_components: int = 2, 199 | ): 200 | path_to_audio = Path(path_to_audio) 201 | assert path_to_audio.exists(), f"{path_to_audio} does not exist" 202 | 203 | cache_dir = Path(cache_dir) 204 | output_dir = Path(output_dir) 205 | output_dir.mkdir(exist_ok=True, parents=True) 206 | 207 | # load our interface 208 | # argbind will automatically load the default config, 209 | interface = Interface() 210 | 211 | # we expect path_to_audio to consist of a folder for each label, so let's get the list of labels 212 | labels = [Path(x).name for x in path_to_audio.iterdir() if x.is_dir()] 213 | print(f"Found {len(labels)} labels") 214 | print(f"labels: {labels}") 215 | 216 | # collect audio files, labels, and embeddings 217 | annotated_embeddings = [] 218 | for label in labels: 219 | audio_files = list(at.util.find_audio(path_to_audio / label)) 220 | print(f"Found {len(audio_files)} audio files for label {label}") 221 | 
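        # embeddings are cached one zip per (label, file) pair (embedding.npy
        # plus data.json, written by AnnotatedEmbedding.save above), so re-runs
        # skip the expensive forward pass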
222 |         for audio_file in tqdm.tqdm(audio_files, desc=f"embedding label {label}"):
223 |             # check if we have a cached embedding for this file
224 |             cached_path = cache_dir / f"{label}_{audio_file.stem}.emb"
225 |             if cached_path.exists():
226 |                 # if so, load it
227 |                 if DEBUG:
228 |                     print(f"loading cached embedding for {cached_path.stem}")
229 |                 embedding = AnnotatedEmbedding.load(cached_path)
230 |             else:
231 |                 try:
232 |                     sig = AudioSignal(audio_file)
233 |                 except Exception as e:
234 |                     print(f"failed to load {audio_file.name} with error {e}")
235 |                     print(f"skipping {audio_file.name}")
236 |                     continue
237 | 
238 |                 # gets the embedding
239 |                 emb = vampnet_embed(sig, interface).cpu().numpy()
240 | 
241 |                 # create an embedding we can save/load
242 |                 embedding = AnnotatedEmbedding(
243 |                     label=label, filename=audio_file.name, embedding=emb
244 |                 )
245 | 
246 |                 # cache the embeddings
247 |                 cached_path.parent.mkdir(exist_ok=True, parents=True)
248 |                 embedding.save(cached_path)
249 |             annotated_embeddings.append(embedding)
250 | 
251 |     # now, let's do a dim reduction on the embeddings and visualize them.
252 |     for layer in tqdm.tqdm(layers, desc="dim reduction"):
253 |         dim_reduce(
254 |             annotated_embeddings,
255 |             layer,
256 |             output_dir=output_dir,
257 |             n_components=n_components,
258 |             method=method,
259 |         )
260 | 
261 | 
262 | if __name__ == "__main__":
263 |     args = argbind.parse_args()
264 |     with argbind.scope(args):
265 |         main()
266 | 
-------------------------------------------------------------------------------- /scripts/utils/gtzan_embeddings.py: --------------------------------------------------------------------------------
1 | """
2 | TODO: train a linear probe
3 | usage:
4 |     python gtzan_embeddings.py --args.load conf/interface.yml --Interface.device cuda --path_to_gtzan /path/to/gtzan/genres_original --output_dir /path/to/output
5 | """
6 | from pathlib import Path
7 | from typing import List
8 | 
9 | import audiotools as at
10 | from audiotools import AudioSignal
11 | import argbind
12 | import torch
13 | import numpy as np
14 | import zipfile
15 | import json
16 | 
17 | from vampnet.interface import Interface
18 | import tqdm
19 | 
20 | # bind the Interface to argbind
21 | Interface = argbind.bind(Interface)
22 | 
23 | DEBUG = False
24 | 
25 | def smart_plotly_export(fig, save_path):
26 |     img_format = save_path.split('.')[-1]
27 |     if img_format == 'html':
28 |         fig.write_html(save_path)
29 |     elif img_format == 'bytes':
30 |         return fig.to_image(format='png')
31 |     #TODO: come back and make this prettier
32 |     elif img_format == 'numpy':
33 |         import io
34 |         from PIL import Image
35 | 
36 |         def plotly_fig2array(fig):
37 |             # convert Plotly fig to an array
38 |             fig_bytes = fig.to_image(format="png", width=1200, height=700)
39 |             buf = io.BytesIO(fig_bytes)
40 |             img = Image.open(buf)
41 |             return np.asarray(img)
42 | 
43 |         return plotly_fig2array(fig)
44 |     elif img_format in ('jpeg', 'png', 'webp'):
45 |         fig.write_image(save_path)
46 |     else:
47 |         raise ValueError("invalid image format")
48 | 
49 | def dim_reduce(emb, labels, save_path, n_components=3, method='tsne', title=''):
50 |     """
51 |     dimensionality reduction for visualization!
52 |     saves an html plotly figure to save_path
53 |     parameters:
54 |         emb (np.ndarray): the samples to be reduced with shape (samples, features)
55 |         labels (list): list of labels for embedding
56 |         save_path (str): path where the figure will be saved (format inferred from the extension)
57 |         method (str): umap, tsne, or pca
58 |         title (str): title for the figure
59 |     returns:
60 |         the return value of smart_plotly_export (None for html)
61 |     """
62 |     import pandas as pd
63 |     import plotly.express as px
64 |     if method == 'umap':
65 |         from umap import UMAP
66 |         reducer = UMAP(n_components=n_components)
67 |     elif method == 'tsne':
68 |         from sklearn.manifold import TSNE
69 |         reducer = TSNE(n_components=n_components)
70 |     elif method == 'pca':
71 |         from sklearn.decomposition import PCA
72 |         reducer = PCA(n_components=n_components)
73 |     else:
74 |         raise ValueError(f"invalid method: {method}")
75 | 
76 |     proj = reducer.fit_transform(emb)
77 | 
78 |     if n_components == 2:
79 |         df = pd.DataFrame(dict(
80 |             x=proj[:, 0],
81 |             y=proj[:, 1],
82 |             instrument=labels
83 |         ))
84 |         fig = px.scatter(df, x='x', y='y', color='instrument',
85 |                          title=title+f"_{method}")
86 | 
87 |     elif n_components == 3:
88 |         df = pd.DataFrame(dict(
89 |             x=proj[:, 0],
90 |             y=proj[:, 1],
91 |             z=proj[:, 2],
92 |             instrument=labels
93 |         ))
94 |         fig = px.scatter_3d(df, x='x', y='y', z='z',
95 |                             color='instrument',
96 |                             title=title)
97 |     else:
98 |         raise ValueError("can't plot more than 3 components")
99 | 
100 |     fig.update_traces(marker=dict(size=6,
101 |                                   line=dict(width=1,
102 |                                             color='DarkSlateGrey')),
103 |                       selector=dict(mode='markers'))
104 | 
105 |     return smart_plotly_export(fig, save_path)
106 | 
107 | 
108 | 
109 | # per JukeMIR, we want the embeddings from the middle layer?
110 | def vampnet_embed(sig: AudioSignal, interface: Interface, layer=10):
111 |     with torch.inference_mode():
112 |         # preprocess the signal
113 |         sig = interface.preprocess(sig)
114 | 
115 |         # get the coarse vampnet model
116 |         vampnet = interface.coarse
117 | 
118 |         # get the tokens
119 |         z = interface.encode(sig)[:, :vampnet.n_codebooks, :]
120 |         z_latents = vampnet.embedding.from_codes(z, interface.codec)
121 | 
122 |         # do a forward pass through the model, get the embeddings
123 |         _z, embeddings = vampnet(z_latents, return_activations=True)
124 |         # print(f"got embeddings with shape {embeddings.shape}")
125 |         # [layer, batch, time, n_dims]
126 |         # [20, 1, 600ish, 768]
127 | 
128 | 
129 |         # squeeze batch dim (1 bc layer should be dim 0)
130 |         assert embeddings.shape[1] == 1, f"expected batch dim to be 1, got {embeddings.shape[1]}"
131 |         embeddings = embeddings.squeeze(1)
132 | 
133 |         num_layers = embeddings.shape[0]
134 |         assert layer < num_layers, f"layer {layer} is out of bounds for model with {num_layers} layers"
135 | 
136 |         # do meanpooling over the time dimension
137 |         embeddings = embeddings.mean(dim=-2)
138 |         # [20, 768]
139 | 
140 |         # return the embeddings
141 |         return embeddings
142 | 
143 | from dataclasses import dataclass, fields
144 | @dataclass
145 | class Embedding:
146 |     genre: str
147 |     filename: str
148 |     embedding: np.ndarray
149 | 
150 |     def save(self, path):
151 |         """Save the Embedding object to a given path as a zip file."""
152 |         with zipfile.ZipFile(path, 'w') as archive:
153 | 
154 |             # Save numpy array
155 |             with archive.open('embedding.npy', 'w') as f:
156 |                 np.save(f, self.embedding)
157 | 
158 |             # Save non-numpy data as json
159 |             non_numpy_data = {f.name: getattr(self, f.name) for f in fields(self) if f.name != 'embedding'}
160 |             with archive.open('data.json', 'w') as f:
161 | 
f.write(json.dumps(non_numpy_data).encode('utf-8')) 162 | 163 | @classmethod 164 | def load(cls, path): 165 | """Load the Embedding object from a given zip path.""" 166 | with zipfile.ZipFile(path, 'r') as archive: 167 | 168 | # Load numpy array 169 | with archive.open('embedding.npy') as f: 170 | embedding = np.load(f) 171 | 172 | # Load non-numpy data from json 173 | with archive.open('data.json') as f: 174 | data = json.loads(f.read().decode('utf-8')) 175 | 176 | return cls(embedding=embedding, **data) 177 | 178 | 179 | @argbind.bind(without_prefix=True) 180 | def main( 181 | path_to_gtzan: str = None, 182 | cache_dir: str = "./.gtzan_emb_cache", 183 | output_dir: str = "./gtzan_vampnet_embeddings", 184 | layers: List[int] = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19] 185 | ): 186 | path_to_gtzan = Path(path_to_gtzan) 187 | assert path_to_gtzan.exists(), f"{path_to_gtzan} does not exist" 188 | 189 | cache_dir = Path(cache_dir) 190 | output_dir = Path(output_dir) 191 | output_dir.mkdir(exist_ok=True, parents=True) 192 | 193 | # load our interface 194 | # argbind will automatically load the default config, 195 | interface = Interface() 196 | 197 | # gtzan should have a folder for each genre, so let's get the list of genres 198 | genres = [Path(x).name for x in path_to_gtzan.iterdir() if x.is_dir()] 199 | print(f"Found {len(genres)} genres") 200 | print(f"genres: {genres}") 201 | 202 | # collect audio files, genres, and embeddings 203 | data = [] 204 | for genre in genres: 205 | audio_files = list(at.util.find_audio(path_to_gtzan / genre)) 206 | print(f"Found {len(audio_files)} audio files for genre {genre}") 207 | 208 | for audio_file in tqdm.tqdm(audio_files, desc=f"embedding genre {genre}"): 209 | # check if we have a cached embedding for this file 210 | cached_path = (cache_dir / f"{genre}_{audio_file.stem}.emb") 211 | if cached_path.exists(): 212 | # if so, load it 213 | if DEBUG: 214 | print(f"loading cached embedding for {cached_path.stem}") 215 | embedding = Embedding.load(cached_path) 216 | else: 217 | try: 218 | sig = AudioSignal(audio_file) 219 | except Exception as e: 220 | print(f"failed to load {audio_file.name} with error {e}") 221 | print(f"skipping {audio_file.name}") 222 | continue 223 | 224 | # gets the embedding 225 | emb = vampnet_embed(sig, interface).cpu().numpy() 226 | 227 | # create an embedding we can save/load 228 | embedding = Embedding( 229 | genre=genre, 230 | filename=audio_file.name, 231 | embedding=emb 232 | ) 233 | 234 | # cache the embeddings 235 | cached_path.parent.mkdir(exist_ok=True, parents=True) 236 | embedding.save(cached_path) 237 | data.append(embedding) 238 | 239 | # now, let's do a dim reduction on the embeddings 240 | # and visualize them. 
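    # (shape note: np.stack below yields (n_files, n_layers, n_dims); each plot
    #  slices out one layer, so a separate t-SNE projection is fit per layer.)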
241 | 242 | # collect a list of embeddings and labels 243 | embeddings = [d.embedding for d in data] 244 | labels = [d.genre for d in data] 245 | 246 | # convert the embeddings to a numpy array 247 | embeddings = np.stack(embeddings) 248 | 249 | # do dimensionality reduction for each layer we're given 250 | for layer in tqdm.tqdm(layers, desc="dim reduction"): 251 | dim_reduce( 252 | embeddings[:, layer, :], labels, 253 | save_path=str(output_dir / f'vampnet-gtzan-layer={layer}.html'), 254 | n_components=2, method='tsne', 255 | title=f'vampnet-gtzan-layer={layer}' 256 | ) 257 | 258 | 259 | 260 | 261 | if __name__ == "__main__": 262 | args = argbind.parse_args() 263 | with argbind.scope(args): 264 | main() -------------------------------------------------------------------------------- /vampnet/newmask.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | from .util import scalar_to_batch_tensor 6 | 7 | def _gamma(r): 8 | return (r * torch.pi / 2).cos().clamp(1e-10, 1.0) 9 | 10 | def _invgamma(y): 11 | if not torch.is_tensor(y): 12 | y = torch.tensor(y)[None] 13 | return 2 * y.acos() / torch.pi 14 | 15 | def full_mask(x: torch.Tensor): 16 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 17 | return torch.ones_like(x).int() 18 | 19 | def empty_mask(x: torch.Tensor): 20 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 21 | return torch.zeros_like(x).int() 22 | 23 | def apply_mask( 24 | x: torch.Tensor, 25 | mask: torch.Tensor, 26 | mask_token: int 27 | ): 28 | assert mask.ndim == 3, f"mask must be (batch, n_codebooks, seq), but got {mask.ndim}" 29 | assert mask.shape == x.shape, f"mask must be same shape as x, but got {mask.shape} and {x.shape}" 30 | assert mask.dtype == torch.int, f"mask must be int dtype, but got {mask.dtype}" 31 | assert ~torch.any(mask > 1), "mask must be binary" 32 | assert ~torch.any(mask < 0), "mask must be binary" 33 | mask = mask.int() 34 | 35 | fill_x = torch.full_like(x, mask_token) 36 | x = x * (1 - mask) + fill_x * mask 37 | 38 | return x 39 | 40 | def random( 41 | x: torch.Tensor, 42 | r: torch.Tensor 43 | ): 44 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 45 | if not isinstance(r, torch.Tensor): 46 | r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device) 47 | 48 | r = _gamma(r)[:, None, None] 49 | probs = torch.ones_like(x) * r 50 | 51 | mask = torch.bernoulli(probs) 52 | mask = mask.round().int() 53 | 54 | return mask, torch.zeros_like(mask).bool() 55 | 56 | def random_along_time(x: torch.Tensor, r: torch.Tensor): 57 | assert x.ndim == 3, "x must be (batch, channel, seq)" 58 | if not isinstance(r, torch.Tensor): 59 | r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device) 60 | 61 | x = x[:, 0, :] 62 | r = _gamma(r)[:, None] 63 | probs = torch.ones_like(x) * r 64 | 65 | mask = torch.bernoulli(probs) 66 | mask = mask.round().int() 67 | 68 | return mask 69 | 70 | 71 | def stemgen_random(x: torch.Tensor, r: torch.Tensor): 72 | assert x.ndim == 3, "x must be (batch, n_codebooks, seq)" 73 | if not isinstance(r, torch.Tensor): 74 | r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device) 75 | 76 | # Assuming x is your input tensor and r is the probability for the Bernoulli distribution 77 | nb, nc, nt = x.shape 78 | 79 | # Randomly sample a codebook level to infer for each item in the batch 80 | c = torch.randint(0, nc, (nb,)).to(x.device) 81 | 82 | # Create a mask tensor of the same shape as x, initially filled with ones 83 | mask = 
torch.ones_like(x).long().to(x.device)
84 |     ignore_indices_mask = torch.zeros_like(x).long().to(x.device)
85 | 
86 |     # Iterate over each item in the batch
87 |     for i in range(nb):
88 |         # Create the Bernoulli mask for the sampled level
89 |         level_mask = torch.bernoulli(torch.ones(nt).to(x.device) * r[i]).long()
90 | 
91 |         # Apply the mask to the sampled level
92 |         mask[i, c[i]] = level_mask
93 | 
94 |         # All levels below the sampled level are unmasked (zeros)
95 |         mask[i, :c[i]] = 0
96 |         ignore_indices_mask[i, :c[i]] = 1
97 | 
98 |         # All levels above the sampled level are masked (ones)
99 |         mask[i, c[i]+1:] = 1
100 |         ignore_indices_mask[i, c[i]+1:] = 1
101 | 
102 |     # save a debug mask to np txt
103 |     # import numpy as np
104 |     # np.savetxt("mask.txt", mask[0].cpu().numpy(), fmt="%d")
105 |     # np.savetxt("ignore_indices_mask.txt", ignore_indices_mask[0].cpu().numpy(), fmt="%d")
106 | 
107 |     return mask.int(), ignore_indices_mask.bool()
108 | 
109 | 
110 | def hugo_random(x: torch.Tensor, r: torch.Tensor):
111 |     assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
112 |     if not isinstance(r, torch.Tensor):
113 |         r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
114 | 
115 |     r = _gamma(r)[:, None, None]
116 | 
117 |     nb, nc, nt = x.shape
118 | 
119 |     probs = torch.ones_like(x) * r
120 |     mask = torch.bernoulli(probs)
121 |     # alternatively, the mask level could be the cumsum of the mask
122 |     mask = mask.round().long().to(x.device)
123 |     mask_levels = nc - mask.sum(dim=1) - 1
124 | 
125 |     # create a new mask, where all levels below the mask level are masked
126 |     # shape (nb, nc, nt) where new_mask[i, CB:, t] = 1, CB = mask_level[i, t]
127 |     # mask = mask_levels[:, :, None] > torch.arange(nc)[None, None, :]
128 |     mask = (mask_levels[:, None, :] < torch.arange(nc, device=x.device)[None, :, None]).long()
129 | 
130 |     ignore_levels = mask_levels + 1
131 |     ignore_indices_mask = (ignore_levels[:, None, :] < torch.arange(nc, device=x.device)[None, :, None]).long()
132 | 
133 |     # for _b in range(nb):
134 |     #     for _t in range(nt):
135 |     #         for _c in range(nc):
136 |     #             if mask[_b, _c, _t] == 1:
137 |     #                 mask[_b, _c:, _t] = 1
138 |     #                 ignore_indices_mask[_b, _c + 1:, _t] = 1
139 |     #                 break
140 | 
141 |     return mask.long(), ignore_indices_mask.bool()
142 | 
143 | 
144 | def better_cond_random_but_not_working(x: torch.Tensor, r: torch.Tensor):
145 |     assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
146 |     if not isinstance(r, torch.Tensor):
147 |         r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
148 | 
149 |     r = _gamma(r)[:, None, None]
150 | 
151 |     nb, nc, nt = x.shape
152 | 
153 |     probs = torch.ones_like(x) * r
154 |     mask = torch.bernoulli(probs)
155 | 
156 |     mask = mask.round().long().to(x.device)
157 | 
158 |     # there cannot be anything unmasked if there's a masked token
159 |     # in the same timestep and below it
160 |     for i in range(nb):
161 |         for j in range(nc):
162 |             for t in range(nt):
163 |                 if mask[i, j, t] == 1:
164 |                     mask[i, j:, t] = 1
165 |                     break
166 | 
167 |     # an ignore indices mask, since we can truly only predict one token
168 |     # per timestep
169 |     ignore_indices = torch.zeros_like(x)
170 |     for i in range(nb):
171 |         for j in range(nc):
172 |             for t in range(nt):
173 |                 if mask[i, j, t] == 1:
174 |                     ignore_indices[i, j, t+1:] = 1
175 |                     break
176 |     return mask.int(), ignore_indices
177 | 
178 | 
179 | @torch.jit.script_if_tracing
180 | def linear_random(
181 |     x: torch.Tensor,
182 |     r: torch.Tensor,
183 | ):
184 |     assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
185 |     if not
isinstance(r, torch.Tensor):
186 |         r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
187 |     r = r[:, None, None]
188 | 
189 |     probs = torch.ones_like(x).to(x.device).float()
190 |     # expand to batch and codebook dims
191 |     probs = probs.expand(x.shape[0], x.shape[1], -1)
192 |     probs = probs * r
193 | 
194 |     mask = torch.bernoulli(probs)
195 |     mask = mask.round().int()
196 | 
197 |     return mask
198 | 
199 | @torch.jit.script_if_tracing
200 | def inpaint(x: torch.Tensor, n_prefix: int, n_suffix: int,):
201 |     assert n_prefix is not None
202 |     assert n_suffix is not None
203 | 
204 |     mask = full_mask(x)
205 | 
206 |     # if we have a prefix or suffix, set their mask prob to 0
207 |     if n_prefix > 0:
208 |         if not isinstance(n_prefix, torch.Tensor):
209 |             n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device)
210 |         for i, n in enumerate(n_prefix):
211 |             if n > 0:
212 |                 mask[i, :, :n] = 0.0
213 |     if n_suffix > 0:
214 |         if not isinstance(n_suffix, torch.Tensor):
215 |             n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device)
216 |         for i, n in enumerate(n_suffix):
217 |             if n > 0:
218 |                 mask[i, :, -n:] = 0.0
219 |     return mask
220 | 
221 | @torch.jit.script_if_tracing
222 | def periodic_mask(x: torch.Tensor, period: int,
223 |                   width: int = 1, random_roll: bool = False,):
224 |     mask = full_mask(x)
225 |     if period == 0:
226 |         return full_mask(x)
227 | 
228 |     if not isinstance(period, torch.Tensor):
229 |         period = scalar_to_batch_tensor(period, x.shape[0])
230 |     if period.ndim == 0:
231 |         period = period[None]
232 | 
233 |     for i, factor in enumerate(period):
234 |         if factor == 0:
235 |             continue
236 |         for j in range(mask.shape[-1]):
237 |             if j % factor == 0:
238 |                 # figure out how wide the mask should be
239 |                 j_start = max(0, j - width // 2 )
240 |                 j_end = min(mask.shape[-1] - 1, j + width // 2 ) + 1
241 |                 # bernoulli with p=1 always unmasks these positions (the asserts pin that down)
242 |                 j_mask = torch.bernoulli(torch.ones(j_end - j_start))
243 |                 assert torch.all(j_mask == 1)
244 |                 j_fill = torch.ones_like(j_mask) * (1 - j_mask)
245 |                 assert torch.all(j_fill == 0)
246 |                 # fill
247 |                 mask[i, :, j_start:j_end] = j_fill
248 | 
249 |     return mask
250 | 
251 | def codebook_unmask(
252 |     mask: torch.Tensor,
253 |     n_conditioning_codebooks: int
254 | ):
255 |     if n_conditioning_codebooks is None:
256 |         return mask
257 |     # if we have any conditioning codebooks, set their mask to 0
258 |     mask = mask.clone()
259 |     mask[:, :n_conditioning_codebooks, :] = 0
260 |     return mask
261 | 
262 | def codebook_mask(mask: torch.Tensor, val1: int, val2: int = None):
263 |     mask = mask.clone()
264 |     mask[:, val1:, :] = 1
265 |     # val2 = val2 or val1
266 |     # vs = torch.linspace(val1, val2, mask.shape[1])
267 |     # for t, v in enumerate(vs):
268 |     #     v = int(v)
269 |     #     mask[:, v:, t] = 1
270 | 
271 |     return mask
272 | 
273 | @torch.jit.script_if_tracing
274 | def mask_and(
275 |     mask1: torch.Tensor,
276 |     mask2: torch.Tensor
277 | ):
278 |     assert mask1.shape == mask2.shape, "masks must be same shape"
279 |     return torch.min(mask1, mask2)
280 | 
281 | def drop_ones(mask: torch.Tensor, p: float):
282 |     oldshp = mask.shape
283 |     mask = mask.view(-1)
284 | 
285 |     # find ones idxs
286 |     ones_idxs = torch.where(mask == 1)[0]
287 |     # shuffle idxs
288 |     ones_idxs_idxs = torch.randperm(len(ones_idxs))
289 |     ones_idxs = ones_idxs[ones_idxs_idxs]
290 |     # drop p% of ones
291 |     ones_idxs = ones_idxs[:int(len(ones_idxs) * p)]
292 |     # set those idxs to 0
293 |     mask[ones_idxs] = 0
294 | 
295 |     mask = mask.view(oldshp)
296 |     return mask
297 | 
298 | 
299 | def mask_or(
300 | 
    mask1: torch.Tensor,
301 |     mask2: torch.Tensor
302 | ):
303 |     assert mask1.shape == mask2.shape, f"masks must be same shape, but got {mask1.shape} and {mask2.shape}"
304 |     assert mask1.max() <= 1, "mask1 must be binary"
305 |     assert mask2.max() <= 1, "mask2 must be binary"
306 |     assert mask1.min() >= 0, "mask1 must be binary"
307 |     assert mask2.min() >= 0, "mask2 must be binary"
308 |     return (mask1 + mask2).clamp(0, 1)
309 | 
310 | def time_stretch_mask(
311 |     x: torch.Tensor,
312 |     stretch_factor: int,
313 | ):
314 |     assert stretch_factor >= 1, "stretch factor must be >= 1"
315 |     c_seq_len = x.shape[-1]
316 |     x = x.repeat_interleave(stretch_factor, dim=-1)
317 | 
318 |     # trim back to the original length
319 |     x = x[:, :, :c_seq_len]
320 | 
321 |     mask = periodic_mask(x, stretch_factor, width=1)
322 |     return mask
323 | 
324 | def onset_mask(
325 |     onset_frame_idxs: torch.Tensor,
326 |     z: torch.Tensor,
327 |     width: int = 1,
328 | ):
329 |     if len(onset_frame_idxs) == 0:
330 |         print("no onsets detected")
331 |     # print("onset_frame_idxs", onset_frame_idxs)
332 |     # print("mask shape", z.shape)
333 | 
334 |     mask = torch.ones_like(z).int()
335 |     for idx in onset_frame_idxs:
336 |         mask[:, :, max(idx - width, 0):idx + width] = 0  # clamp the start so early onsets don't wrap around
337 | 
338 |     return mask.int()
339 | 
340 | def tria_mask(
341 |     codes: torch.Tensor,
342 |     min_amt: float = 0.1,
343 |     max_amt: float = 0.4,
344 | ):
345 |     """
346 |     unmasks a prefix of the codes tensor,
347 |     in the range provided
348 |     """
349 | 
350 |     mask = full_mask(codes)
351 |     nb, nc, nt = codes.shape
352 |     for i in range(nb):
353 |         amt = torch.rand(1) * (max_amt - min_amt) + min_amt
354 |         amt = int(amt * nt)
355 |         mask[i, :, :amt] = 0
356 | 
357 |     return mask
358 | 
359 | 
360 | 
361 | 
362 | 
363 | 
364 | if __name__ == "__main__":
365 |     from audiotools import AudioSignal  # not imported at module level; only needed for this smoke test
366 |     sig = AudioSignal("assets/example.wav")
-------------------------------------------------------------------------------- /unloop/max/click.maxpat: --------------------------------------------------------------------------------
1 | {
2 | "patcher" : {
3 | "fileversion" : 1,
4 | "appversion" : {
5 | "major" : 8,
6 | "minor" : 6,
7 | "revision" : 5,
8 | "architecture" : "x64",
9 | "modernui" : 1
10 | }
11 | ,
12 | "classnamespace" : "box",
13 | "rect" : [ 59.0, 106.0, 640.0, 480.0 ],
14 | "bglocked" : 0,
15 | "openinpresentation" : 0,
16 | "default_fontsize" : 12.0,
17 | "default_fontface" : 0,
18 | "default_fontname" : "Arial",
19 | "gridonopen" : 1,
20 | "gridsize" : [ 15.0, 15.0 ],
21 | "gridsnaponopen" : 1,
22 | "objectsnaponopen" : 1,
23 | "statusbarvisible" : 2,
24 | "toolbarvisible" : 1,
25 | "lefttoolbarpinned" : 0,
26 | "toptoolbarpinned" : 0,
27 | "righttoolbarpinned" : 0,
28 | "bottomtoolbarpinned" : 0,
29 | "toolbars_unpinned_last_save" : 0,
30 | "tallnewobj" : 0,
31 | "boxanimatetime" : 200,
32 | "enablehscroll" : 1,
33 | "enablevscroll" : 1,
34 | "devicewidth" : 0.0,
35 | "description" : "",
36 | "digest" : "",
37 | "tags" : "",
38 | "style" : "",
39 | "subpatcher_template" : "",
40 | "assistshowspatchername" : 0,
41 | "boxes" : [ {
42 | "box" : {
43 | "id" : "obj-323",
44 | "maxclass" : "newobj",
45 | "numinlets" : 2,
46 | "numoutlets" : 1,
47 | "outlettype" : [ "" ],
48 | "patching_rect" : [ 129.751264274120331, 104.0, 32.0, 22.0 ],
49 | "text" : "gate"
50 | }
51 | 
52 | }
53 | , {
54 | "box" : {
55 | "id" : "obj-321",
56 | "maxclass" : "newobj",
57 | "numinlets" : 2,
58 | "numoutlets" : 1,
59 | "outlettype" : [ "" ],
60 | "patching_rect" : [ 59.239391028881073, 100.0, 32.0, 22.0 ],
61 | "text" : "gate"
62 | }
63 | 
64 | }
65 | , {
66 | "box"
: { 67 | "format" : 6, 68 | "id" : "obj-319", 69 | "maxclass" : "flonum", 70 | "numinlets" : 1, 71 | "numoutlets" : 2, 72 | "outlettype" : [ "", "bang" ], 73 | "parameter_enable" : 0, 74 | "patching_rect" : [ 108.0, 266.418817639350891, 50.0, 22.0 ] 75 | } 76 | 77 | } 78 | , { 79 | "box" : { 80 | "id" : "obj-317", 81 | "maxclass" : "newobj", 82 | "numinlets" : 1, 83 | "numoutlets" : 2, 84 | "outlettype" : [ "bang", "bang" ], 85 | "patching_rect" : [ 129.751264274120331, 135.0, 32.0, 22.0 ], 86 | "text" : "t b b" 87 | } 88 | 89 | } 90 | , { 91 | "box" : { 92 | "id" : "obj-316", 93 | "maxclass" : "newobj", 94 | "numinlets" : 1, 95 | "numoutlets" : 2, 96 | "outlettype" : [ "bang", "bang" ], 97 | "patching_rect" : [ 59.239391028881073, 129.0, 32.0, 22.0 ], 98 | "text" : "t b b" 99 | } 100 | 101 | } 102 | , { 103 | "box" : { 104 | "id" : "obj-311", 105 | "maxclass" : "message", 106 | "numinlets" : 2, 107 | "numoutlets" : 1, 108 | "outlettype" : [ "" ], 109 | "patching_rect" : [ 150.735043168067932, 223.683774471282959, 29.5, 22.0 ], 110 | "text" : "600" 111 | } 112 | 113 | } 114 | , { 115 | "box" : { 116 | "id" : "obj-310", 117 | "maxclass" : "message", 118 | "numinlets" : 2, 119 | "numoutlets" : 1, 120 | "outlettype" : [ "" ], 121 | "patching_rect" : [ 108.0, 223.683774471282959, 29.5, 22.0 ], 122 | "text" : "400" 123 | } 124 | 125 | } 126 | , { 127 | "box" : { 128 | "id" : "obj-307", 129 | "maxclass" : "newobj", 130 | "numinlets" : 1, 131 | "numoutlets" : 1, 132 | "outlettype" : [ "" ], 133 | "patching_rect" : [ 579.222337603569031, 292.05984354019165, 77.0, 22.0 ], 134 | "text" : "loadmess 80" 135 | } 136 | 137 | } 138 | , { 139 | "box" : { 140 | "id" : "obj-284", 141 | "maxclass" : "newobj", 142 | "numinlets" : 2, 143 | "numoutlets" : 1, 144 | "outlettype" : [ "int" ], 145 | "patching_rect" : [ 509.991567671298981, 297.188048720359802, 29.5, 22.0 ], 146 | "text" : "* 8" 147 | } 148 | 149 | } 150 | , { 151 | "box" : { 152 | "id" : "obj-286", 153 | "maxclass" : "newobj", 154 | "numinlets" : 4, 155 | "numoutlets" : 1, 156 | "outlettype" : [ "signal" ], 157 | "patching_rect" : [ 451.871908962726593, 353.598305702209473, 106.0, 22.0 ], 158 | "text" : "reson~ 1. 100. 10." 159 | } 160 | 161 | } 162 | , { 163 | "box" : { 164 | "id" : "obj-281", 165 | "maxclass" : "newobj", 166 | "numinlets" : 2, 167 | "numoutlets" : 1, 168 | "outlettype" : [ "signal" ], 169 | "patching_rect" : [ 50.0, 432.230785131454468, 40.0, 22.0 ], 170 | "text" : "*~ 40." 171 | } 172 | 173 | } 174 | , { 175 | "box" : { 176 | "format" : 6, 177 | "id" : "obj-279", 178 | "maxclass" : "flonum", 179 | "numinlets" : 1, 180 | "numoutlets" : 2, 181 | "outlettype" : [ "", "bang" ], 182 | "parameter_enable" : 0, 183 | "patching_rect" : [ 579.222337603569031, 319.410271167755127, 50.0, 22.0 ] 184 | } 185 | 186 | } 187 | , { 188 | "box" : { 189 | "id" : "obj-272", 190 | "maxclass" : "newobj", 191 | "numinlets" : 2, 192 | "numoutlets" : 1, 193 | "outlettype" : [ "int" ], 194 | "patching_rect" : [ 380.795777916908264, 300.606852173805237, 29.5, 22.0 ], 195 | "text" : "* 4" 196 | } 197 | 198 | } 199 | , { 200 | "box" : { 201 | "id" : "obj-275", 202 | "maxclass" : "newobj", 203 | "numinlets" : 4, 204 | "numoutlets" : 1, 205 | "outlettype" : [ "signal" ], 206 | "patching_rect" : [ 322.795777916908264, 356.66199141740799, 106.0, 22.0 ], 207 | "text" : "reson~ 1. 100. 10." 
208 | } 209 | 210 | } 211 | , { 212 | "box" : { 213 | "id" : "obj-264", 214 | "maxclass" : "newobj", 215 | "numinlets" : 2, 216 | "numoutlets" : 1, 217 | "outlettype" : [ "int" ], 218 | "patching_rect" : [ 247.598402619361877, 300.606852173805237, 29.5, 22.0 ], 219 | "text" : "* 2" 220 | } 221 | 222 | } 223 | , { 224 | "box" : { 225 | "id" : "obj-259", 226 | "maxclass" : "newobj", 227 | "numinlets" : 4, 228 | "numoutlets" : 1, 229 | "outlettype" : [ "signal" ], 230 | "patching_rect" : [ 189.47874391078949, 356.66199141740799, 106.0, 22.0 ], 231 | "text" : "reson~ 1. 100. 10." 232 | } 233 | 234 | } 235 | , { 236 | "box" : { 237 | "id" : "obj-200", 238 | "maxclass" : "newobj", 239 | "numinlets" : 1, 240 | "numoutlets" : 1, 241 | "outlettype" : [ "signal" ], 242 | "patching_rect" : [ 50.0, 266.418817639350891, 39.0, 22.0 ], 243 | "text" : "click~" 244 | } 245 | 246 | } 247 | , { 248 | "box" : { 249 | "id" : "obj-4", 250 | "maxclass" : "newobj", 251 | "numinlets" : 4, 252 | "numoutlets" : 1, 253 | "outlettype" : [ "signal" ], 254 | "patching_rect" : [ 50.0, 356.66199141740799, 106.0, 22.0 ], 255 | "text" : "reson~ 1. 100. 10." 256 | } 257 | 258 | } 259 | , { 260 | "box" : { 261 | "comment" : "1 enables click, 0 disables.", 262 | "id" : "obj-324", 263 | "index" : 1, 264 | "maxclass" : "inlet", 265 | "numinlets" : 0, 266 | "numoutlets" : 1, 267 | "outlettype" : [ "int" ], 268 | "patching_rect" : [ 59.239391028881073, 40.0, 30.0, 30.0 ] 269 | } 270 | 271 | } 272 | , { 273 | "box" : { 274 | "comment" : "bang for beat", 275 | "id" : "obj-325", 276 | "index" : 2, 277 | "maxclass" : "inlet", 278 | "numinlets" : 0, 279 | "numoutlets" : 1, 280 | "outlettype" : [ "bang" ], 281 | "patching_rect" : [ 104.239391028881073, 40.0, 30.0, 30.0 ] 282 | } 283 | 284 | } 285 | , { 286 | "box" : { 287 | "comment" : "bang for downbeat", 288 | "id" : "obj-326", 289 | "index" : 3, 290 | "maxclass" : "inlet", 291 | "numinlets" : 0, 292 | "numoutlets" : 1, 293 | "outlettype" : [ "bang" ], 294 | "patching_rect" : [ 175.239391028881073, 40.0, 30.0, 30.0 ] 295 | } 296 | 297 | } 298 | , { 299 | "box" : { 300 | "comment" : "signal out", 301 | "id" : "obj-327", 302 | "index" : 1, 303 | "maxclass" : "outlet", 304 | "numinlets" : 1, 305 | "numoutlets" : 0, 306 | "patching_rect" : [ 49.999889028881171, 514.23083500000007, 30.0, 30.0 ] 307 | } 308 | 309 | } 310 | ], 311 | "lines" : [ { 312 | "patchline" : { 313 | "destination" : [ "obj-259", 0 ], 314 | "order" : 1, 315 | "source" : [ "obj-200", 0 ] 316 | } 317 | 318 | } 319 | , { 320 | "patchline" : { 321 | "destination" : [ "obj-275", 0 ], 322 | "order" : 0, 323 | "source" : [ "obj-200", 0 ] 324 | } 325 | 326 | } 327 | , { 328 | "patchline" : { 329 | "destination" : [ "obj-4", 0 ], 330 | "order" : 2, 331 | "source" : [ "obj-200", 0 ] 332 | } 333 | 334 | } 335 | , { 336 | "patchline" : { 337 | "destination" : [ "obj-281", 0 ], 338 | "source" : [ "obj-259", 0 ] 339 | } 340 | 341 | } 342 | , { 343 | "patchline" : { 344 | "destination" : [ "obj-259", 2 ], 345 | "source" : [ "obj-264", 0 ] 346 | } 347 | 348 | } 349 | , { 350 | "patchline" : { 351 | "destination" : [ "obj-275", 2 ], 352 | "source" : [ "obj-272", 0 ] 353 | } 354 | 355 | } 356 | , { 357 | "patchline" : { 358 | "destination" : [ "obj-281", 0 ], 359 | "source" : [ "obj-275", 0 ] 360 | } 361 | 362 | } 363 | , { 364 | "patchline" : { 365 | "destination" : [ "obj-259", 3 ], 366 | "order" : 2, 367 | "source" : [ "obj-279", 0 ] 368 | } 369 | 370 | } 371 | , { 372 | "patchline" : { 373 | "destination" : [ "obj-275", 3 ], 374 | 
"order" : 1, 375 | "source" : [ "obj-279", 0 ] 376 | } 377 | 378 | } 379 | , { 380 | "patchline" : { 381 | "destination" : [ "obj-286", 3 ], 382 | "order" : 0, 383 | "source" : [ "obj-279", 0 ] 384 | } 385 | 386 | } 387 | , { 388 | "patchline" : { 389 | "destination" : [ "obj-4", 3 ], 390 | "order" : 3, 391 | "source" : [ "obj-279", 0 ] 392 | } 393 | 394 | } 395 | , { 396 | "patchline" : { 397 | "destination" : [ "obj-327", 0 ], 398 | "source" : [ "obj-281", 0 ] 399 | } 400 | 401 | } 402 | , { 403 | "patchline" : { 404 | "destination" : [ "obj-286", 2 ], 405 | "source" : [ "obj-284", 0 ] 406 | } 407 | 408 | } 409 | , { 410 | "patchline" : { 411 | "destination" : [ "obj-281", 0 ], 412 | "source" : [ "obj-286", 0 ] 413 | } 414 | 415 | } 416 | , { 417 | "patchline" : { 418 | "destination" : [ "obj-279", 0 ], 419 | "source" : [ "obj-307", 0 ] 420 | } 421 | 422 | } 423 | , { 424 | "patchline" : { 425 | "destination" : [ "obj-319", 0 ], 426 | "source" : [ "obj-310", 0 ] 427 | } 428 | 429 | } 430 | , { 431 | "patchline" : { 432 | "destination" : [ "obj-319", 0 ], 433 | "source" : [ "obj-311", 0 ] 434 | } 435 | 436 | } 437 | , { 438 | "patchline" : { 439 | "destination" : [ "obj-200", 0 ], 440 | "source" : [ "obj-316", 1 ] 441 | } 442 | 443 | } 444 | , { 445 | "patchline" : { 446 | "destination" : [ "obj-310", 0 ], 447 | "source" : [ "obj-316", 0 ] 448 | } 449 | 450 | } 451 | , { 452 | "patchline" : { 453 | "destination" : [ "obj-200", 0 ], 454 | "source" : [ "obj-317", 0 ] 455 | } 456 | 457 | } 458 | , { 459 | "patchline" : { 460 | "destination" : [ "obj-311", 0 ], 461 | "source" : [ "obj-317", 1 ] 462 | } 463 | 464 | } 465 | , { 466 | "patchline" : { 467 | "destination" : [ "obj-264", 0 ], 468 | "order" : 2, 469 | "source" : [ "obj-319", 0 ] 470 | } 471 | 472 | } 473 | , { 474 | "patchline" : { 475 | "destination" : [ "obj-272", 0 ], 476 | "order" : 1, 477 | "source" : [ "obj-319", 0 ] 478 | } 479 | 480 | } 481 | , { 482 | "patchline" : { 483 | "destination" : [ "obj-284", 0 ], 484 | "order" : 0, 485 | "source" : [ "obj-319", 0 ] 486 | } 487 | 488 | } 489 | , { 490 | "patchline" : { 491 | "destination" : [ "obj-4", 2 ], 492 | "order" : 3, 493 | "source" : [ "obj-319", 0 ] 494 | } 495 | 496 | } 497 | , { 498 | "patchline" : { 499 | "destination" : [ "obj-316", 0 ], 500 | "source" : [ "obj-321", 0 ] 501 | } 502 | 503 | } 504 | , { 505 | "patchline" : { 506 | "destination" : [ "obj-317", 0 ], 507 | "source" : [ "obj-323", 0 ] 508 | } 509 | 510 | } 511 | , { 512 | "patchline" : { 513 | "destination" : [ "obj-321", 0 ], 514 | "order" : 1, 515 | "source" : [ "obj-324", 0 ] 516 | } 517 | 518 | } 519 | , { 520 | "patchline" : { 521 | "destination" : [ "obj-323", 0 ], 522 | "order" : 0, 523 | "source" : [ "obj-324", 0 ] 524 | } 525 | 526 | } 527 | , { 528 | "patchline" : { 529 | "destination" : [ "obj-321", 1 ], 530 | "source" : [ "obj-325", 0 ] 531 | } 532 | 533 | } 534 | , { 535 | "patchline" : { 536 | "destination" : [ "obj-323", 1 ], 537 | "source" : [ "obj-326", 0 ] 538 | } 539 | 540 | } 541 | , { 542 | "patchline" : { 543 | "destination" : [ "obj-281", 0 ], 544 | "source" : [ "obj-4", 0 ] 545 | } 546 | 547 | } 548 | ] 549 | } 550 | 551 | } 552 | --------------------------------------------------------------------------------