├── LICENSE ├── README.md ├── assets ├── audios │ ├── clotho_birds_long.wav │ ├── cut_liszt_5s.mp3 │ ├── gtzan_blues.00002.au │ └── librispeech_1688-142285-0000.flac └── figs │ ├── framework.png │ └── gtzan.png ├── audio_understanding ├── audio_encoders │ ├── panns.py │ ├── piano_transcription_crnn.py │ └── whisper.py ├── data │ └── samplers.py ├── datasets │ ├── audiocaps.py │ ├── clotho.py │ ├── gtzan.py │ ├── librispeech.py │ ├── maestro.py │ └── wavcaps.py ├── llm │ ├── llama.py │ └── rope.py ├── target_transforms │ └── midi.py ├── tokenizers │ ├── bert.py │ └── bert_midi.py └── utils.py ├── configs ├── asr_librispeech.yaml ├── audio_caption_clotho.yaml ├── music_tagging_gtzan.yaml └── piano_transcription_maestro.yaml ├── env.sh ├── inference.py ├── scripts ├── download_clotho.sh ├── download_gtzan.sh ├── download_librispeech.sh └── download_maestro.sh ├── train.py └── train_accelerate.py /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/README.md -------------------------------------------------------------------------------- /assets/audios/clotho_birds_long.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/audios/clotho_birds_long.wav -------------------------------------------------------------------------------- /assets/audios/cut_liszt_5s.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/audios/cut_liszt_5s.mp3 -------------------------------------------------------------------------------- /assets/audios/gtzan_blues.00002.au: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/audios/gtzan_blues.00002.au -------------------------------------------------------------------------------- /assets/audios/librispeech_1688-142285-0000.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/audios/librispeech_1688-142285-0000.flac -------------------------------------------------------------------------------- /assets/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/figs/framework.png -------------------------------------------------------------------------------- /assets/figs/gtzan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/assets/figs/gtzan.png -------------------------------------------------------------------------------- /audio_understanding/audio_encoders/panns.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/audio_encoders/panns.py -------------------------------------------------------------------------------- /audio_understanding/audio_encoders/piano_transcription_crnn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/audio_encoders/piano_transcription_crnn.py -------------------------------------------------------------------------------- /audio_understanding/audio_encoders/whisper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/audio_encoders/whisper.py -------------------------------------------------------------------------------- /audio_understanding/data/samplers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/data/samplers.py -------------------------------------------------------------------------------- /audio_understanding/datasets/audiocaps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/audiocaps.py -------------------------------------------------------------------------------- /audio_understanding/datasets/clotho.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/clotho.py -------------------------------------------------------------------------------- /audio_understanding/datasets/gtzan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/gtzan.py -------------------------------------------------------------------------------- /audio_understanding/datasets/librispeech.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/librispeech.py -------------------------------------------------------------------------------- /audio_understanding/datasets/maestro.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/maestro.py -------------------------------------------------------------------------------- /audio_understanding/datasets/wavcaps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/datasets/wavcaps.py -------------------------------------------------------------------------------- /audio_understanding/llm/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/llm/llama.py -------------------------------------------------------------------------------- /audio_understanding/llm/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/llm/rope.py -------------------------------------------------------------------------------- /audio_understanding/target_transforms/midi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/target_transforms/midi.py -------------------------------------------------------------------------------- /audio_understanding/tokenizers/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/tokenizers/bert.py -------------------------------------------------------------------------------- /audio_understanding/tokenizers/bert_midi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/tokenizers/bert_midi.py -------------------------------------------------------------------------------- /audio_understanding/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/audio_understanding/utils.py -------------------------------------------------------------------------------- /configs/asr_librispeech.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/configs/asr_librispeech.yaml -------------------------------------------------------------------------------- /configs/audio_caption_clotho.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/configs/audio_caption_clotho.yaml -------------------------------------------------------------------------------- /configs/music_tagging_gtzan.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/configs/music_tagging_gtzan.yaml -------------------------------------------------------------------------------- /configs/piano_transcription_maestro.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/configs/piano_transcription_maestro.yaml -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/env.sh -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/inference.py -------------------------------------------------------------------------------- /scripts/download_clotho.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/scripts/download_clotho.sh -------------------------------------------------------------------------------- /scripts/download_gtzan.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/scripts/download_gtzan.sh -------------------------------------------------------------------------------- /scripts/download_librispeech.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/scripts/download_librispeech.sh -------------------------------------------------------------------------------- /scripts/download_maestro.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/scripts/download_maestro.sh -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/train.py -------------------------------------------------------------------------------- /train_accelerate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiuqiangkong/audio_understanding/HEAD/train_accelerate.py --------------------------------------------------------------------------------