├── models ├── __init__.py ├── one_tier │ └── wavent.py └── two_tier │ ├── two_tier_generate32k.py │ ├── two_tier_generate16k.py.ol │ └── two_tier_generate16k.py ├── datasets ├── __init__.py ├── .DS_Store ├── music │ ├── .DS_Store │ ├── drum-preprocess.sh │ ├── prune_flacs.py │ ├── sum_flacs.py │ ├── SNAREdrum-preprocessERRORS.md │ ├── preprocess.sh │ ├── build_features.py │ ├── download_archive_preprocess.sh │ ├── preprocess.py │ ├── log_mp3s │ ├── new_experiment32k.py │ ├── new_experiment16k.py │ ├── new_experiment48k.py │ ├── new_experiment16k_conditioning.py │ └── drum-preprocess.py ├── dataset.py └── dataset_conditioning.py ├── .DS_Store ├── sampleRNN-pydotprint.png ├── train48k.sh ├── .gitignore ├── clean_results.py ├── lib ├── generate.py └── __init__.py ├── LICENSE └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/.DS_Store -------------------------------------------------------------------------------- /datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/datasets/.DS_Store -------------------------------------------------------------------------------- /datasets/music/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/datasets/music/.DS_Store -------------------------------------------------------------------------------- /sampleRNN-pydotprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/sampleRNN-pydotprint.png -------------------------------------------------------------------------------- /datasets/music/drum-preprocess.sh: -------------------------------------------------------------------------------- 1 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 2 | echo "Preprocessing" 3 | python drum-preprocess.py "$SCRIPTPATH" 4 | echo "Done!" 5 | 6 | echo "Writing datasets" 7 | python _drum2npy.py 8 | echo "Done!" 9 | -------------------------------------------------------------------------------- /datasets/music/prune_flacs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import glob 4 | 5 | DIR = "." 6 | fs = glob.glob(DIR+"/*.flac") 7 | for f in fs: 8 | size = float(subprocess.check_output('ffprobe -i "{}/{}" -show_entries format=duration -v quiet -of csv="p=0"'.format(DIR, f), shell=True)) 9 | if size != 3.762563: 10 | print f 11 | print size -------------------------------------------------------------------------------- /datasets/music/sum_flacs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import glob 4 | 5 | DIR = "." 6 | fs = glob.glob(DIR+"/*.wav") 7 | t = 0 8 | print 'counting...' 
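# Each loop iteration below shells out to ffprobe, which prints just the
# container duration in seconds (e.g. "3.762563") via -show_entries
# format=duration with csv output. An equivalent shell-free call, shown only
# as an illustrative sketch (this script keeps the shell=True form):
#   duration = float(subprocess.check_output(
#       ['ffprobe', '-i', f, '-show_entries', 'format=duration',
#        '-v', 'quiet', '-of', 'csv=p=0']))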
9 | for f in fs: 10 | size = float(subprocess.check_output('ffprobe -i "{}/{}" -show_entries format=duration -v quiet -of csv="p=0"'.format(DIR, f), shell=True)) 11 | t = t + size 12 | print t, ' seconds' -------------------------------------------------------------------------------- /train48k.sh: -------------------------------------------------------------------------------- 1 | THEANO_FLAGS=mode=FAST_RUN,device=cuda$1,floatX=float32 python -u models/two_tier/two_tier48k.py --exp $2 --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set $2 2 | sleep 1; 3 | while true; 4 | do 5 | THEANO_FLAGS=mode=FAST_RUN,device=cuda$1,floatX=float32 python -u models/two_tier/two_tier48k.py --exp $2 --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set $2 --resume; sleep 1; done; 6 | -------------------------------------------------------------------------------- /datasets/music/SNAREdrum-preprocessERRORS.md: -------------------------------------------------------------------------------- 1 | # SNARE drum-preprocess.py ERRORS 2 | ## sample-rnn 3 | ## 4/6/2017 4 | 5 | B 6 | = 7 | 8 | ./p295d.flac 4.28575 9 | 10 | ./p1290d.flac 3.980813 11 | 12 | ./p1290u.flac 3.980813 13 | 14 | ./p295.flac 4.28575 15 | 16 | ./p295u.flac 4.28575 17 | 18 | ./p1290.flac 3.980813 19 | 20 | BR 21 | = 22 | 23 | ./p295d.flac 4.28575 24 | 25 | ./p1290d.flac 3.980813 26 | 27 | ./p1290u.flac 3.980813 28 | 29 | ./p295.flac 4.28575 30 | 31 | ./p295u.flac 4.28575 32 | 33 | ./p1290.flac 3.980813 34 | 35 | FR 36 | == 37 | 38 | ./p295d.flac 4.28575 39 | 40 | ./p1290d.flac 3.980813 41 | 42 | ./p1290u.flac 3.980813 43 | 44 | ./p295.flac 4.28575 45 | 46 | ./p295u.flac 4.28575 47 | 48 | ./p1290.flac 3.980813 49 | -------------------------------------------------------------------------------- /datasets/music/preprocess.sh: -------------------------------------------------------------------------------- 1 | # Requires 2GB of free disk space at most. 2 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 3 | echo "Converting from OGG to 16 kHz, 16-bit mono-channel WAV" 4 | # Next line with & executes in a forked shell in the background. That's parallel and not recommended. 5 | # Remove if causing problems 6 | #for file in "$DL_PATH"*_64kb.mp3; do ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" _64kb.mp3`.wav" & done 7 | for file in "$SCRIPTPATH"/*.ogg; do 8 | ffmpeg -i "$file" -ar 16000 -ac 1 "$SCRIPTPATH"/"`basename "$file" .ogg`.wav" 9 | done 10 | echo "Cleaning up" 11 | rm "$SCRIPTPATH"/*.ogg 12 | 13 | echo "Preprocessing" 14 | python preprocess.py "$SCRIPTPATH" 15 | echo "Done!" 16 | 17 | echo "Writing datasets" 18 | python _2npy.py 19 | echo "Done!"
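# A converted file can be spot-checked with ffprobe; an illustrative check
# (not part of the original script):
#   ffprobe -v quiet -show_entries stream=sample_rate,channels -of csv=p=0 some.wav
# should print "16000,1" for a correctly converted 16 kHz mono file.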
20 | -------------------------------------------------------------------------------- /datasets/music/build_features.py: -------------------------------------------------------------------------------- 1 | # Given an audio file, return a feature matrix 2 | # The feature matrix doesn't need to be upsampled to the sample rate 3 | # However long the matrix is, we assume it matches the length of the WAV 4 | # So you can use any frame_rate (hop_size) 5 | import numpy as np 6 | import librosa 7 | import librosa.onset 8 | def build_dummy_features(filename): 9 | features = np.ones((1000,1),dtype='float32') 10 | for i,_ in enumerate(features): 11 | features[i,0] = i/1000.0 12 | return features 13 | 14 | def build_onset_envelope_feature(filename): 15 | y, sr = librosa.load(filename) 16 | hop_length=128 17 | onset_env = librosa.onset.onset_strength(y=y, sr=sr, 18 | aggregate=np.median, hop_length=hop_length, fmax=8000) 19 | # normalize the onset_env to zero mean and unit variance 20 | onset_env = (onset_env - np.mean(onset_env))/np.std(onset_env) 21 | 22 | num_frames = len(onset_env) 23 | feature_matrix = np.ones((num_frames,1),dtype='float32') 24 | feature_matrix[:,0] = onset_env 25 | return feature_matrix 26 | 27 | -------------------------------------------------------------------------------- /datasets/music/download_archive_preprocess.sh: -------------------------------------------------------------------------------- 1 | # Requires 2GB of free disk space at most. 2 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 3 | DL_PATH="$SCRIPTPATH"/download/ 4 | mkdir -p "$DL_PATH" 5 | echo "Downloading files to "$DL_PATH"" 6 | # See: https://blog.archive.org/2012/04/26/downloading-in-bulk-using-wget/ 7 | wget -r -H -nc -nH --cut-dirs=1 -A .ogg -R '*_vbr.mp3' -e robots=off -P "$DL_PATH" -l1 -i ./itemlist.txt -B 'http://archive.org/download/' 8 | echo "Organizing files and folders" 9 | mv "$DL_PATH"*/*.ogg "$DL_PATH" 10 | rmdir "$DL_PATH"*/ 11 | echo "Converting from OGG to 16 kHz, 16-bit mono-channel WAV" 12 | # Next line with & executes in a forked shell in the background. That's parallel and not recommended. 13 | # Remove if causing problems 14 | #for file in "$DL_PATH"*_64kb.mp3; do ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" _64kb.mp3`.wav" & done 15 | for file in "$DL_PATH"*.ogg; do 16 | ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" .ogg`.wav" 17 | done 18 | echo "Cleaning up" 19 | rm "$DL_PATH"*.ogg 20 | 21 | echo "Preprocessing" 22 | python preprocess.py "$DL_PATH" 23 | echo "Done!" 24 | 25 | echo "Writing datasets" 26 | python _2npy.py 27 | echo "Done!" 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | #lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | *.wav 91 | datasets/music/rev-preprocess.sh 92 | 93 | datasets/music/*/ 94 | results_2t/* 95 | -------------------------------------------------------------------------------- /clean_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | cwd = os.getcwd() 4 | results_dir = os.path.join(cwd, 'results_2t') 5 | def get_subdirectories(a_dir): 6 | return [name for name in os.listdir(a_dir) 7 | if os.path.isdir(os.path.join(a_dir, name))] 8 | experiments = get_subdirectories(results_dir) 9 | num_epochs = 5 10 | hit_list = ["params_e"+str(n)+"_" for n in xrange(num_epochs)] 11 | unused_files = [] 12 | for e in experiments: 13 | e_dir = os.path.join(results_dir, e) 14 | params = os.path.join(e_dir, "params") 15 | for root, dirs, files in os.walk(params): 16 | for file in files: 17 | for hit in hit_list: 18 | if file.startswith(hit): 19 | print file 20 | unused_files.append(os.path.join(root, file)) 21 | def prompt_delete(num_prompts): 22 | num_prompts -= 1 23 | if num_prompts >= 0: 24 | prompt = raw_input("Do you want to delete these "+str(len(unused_files))+" files? [Y/n] ") 25 | if prompt == "Y" or prompt == "yes": 26 | print 'removing old epochs...'
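# Illustration of what ends up in unused_files (filename is hypothetical):
# with num_epochs = 5, hit_list is ["params_e0_", ..., "params_e4_"], so a
# checkpoint like results_2t/MYEXP/params/params_e3_iter1000.pkl matches and
# is deleted, while e.g. params_e17_... survives -- the trailing underscore
# in each prefix prevents "e1" from also matching "e17".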
27 | for uf in unused_files: 28 | os.remove(uf) 29 | elif prompt == "n" or prompt == "no": 30 | print "clean aborted: 0 files deleted" 31 | else: 32 | print "warning:", prompt, "is an unknown command" 33 | prompt_delete(num_prompts) 34 | else: 35 | print "0 files deleted: Good-bye" 36 | if len(unused_files) > 0: 37 | prompt_delete(3) 38 | else: 39 | print 'found 0 files to clean: Good-bye' 40 | -------------------------------------------------------------------------------- /datasets/music/preprocess.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess 3 | 4 | RAW_DATA_DIR=str(sys.argv[1]) 5 | OUTPUT_DIR=os.path.join(RAW_DATA_DIR, "parts") 6 | os.makedirs(OUTPUT_DIR) 7 | print RAW_DATA_DIR 8 | print OUTPUT_DIR 9 | 10 | # Step 1: write all filenames to a list 11 | with open(os.path.join(OUTPUT_DIR, 'preprocess_file_list.txt'), 'w') as f: 12 | for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR): 13 | for filename in filenames: 14 | if filename.endswith(".wav"): 15 | f.write("file '" + dirpath + '/'+ filename + "'\n") 16 | 17 | # Step 2: concatenate everything into one massive wav file 18 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(OUTPUT_DIR, OUTPUT_DIR)) 19 | audio = "preprocess_all_audio.wav" 20 | # # get the length of the resulting file 21 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 22 | print length, "DURATION" 23 | # reverse the audio file when the second CLI argument is the string "True" 24 | if len(sys.argv) > 2 and sys.argv[2] == "True": 25 | os.system("sox {}/preprocess_all_audio.wav {}/reverse_preprocess_audio.wav reverse".format(OUTPUT_DIR, OUTPUT_DIR)) 26 | audio = "reverse_preprocess_audio.wav" 27 | # # Step 3: split the big file into 8-second chunks 28 | for i in xrange((int(length)//8 - 1)/3): 29 | os.system('ffmpeg -ss {} -t 8 -i {}/{} -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(i, OUTPUT_DIR, audio, OUTPUT_DIR, i)) 30 | 31 | # # Step 4: clean up temp files 32 | #os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 33 | os.system('rm {}/preprocess_file_list.txt'.format(OUTPUT_DIR)) 34 | -------------------------------------------------------------------------------- /datasets/music/log_mp3s: -------------------------------------------------------------------------------- 1 | download$ for f in *; do ffmpeg -i $f 2>&1 | grep Duration; done 2 | Duration: 00:22:18.52, start: 0.000000, bitrate: 320 kb/s 3 | Duration: 00:15:13.07, start: 0.000000, bitrate: 320 kb/s 4 | Duration: 00:13:44.23, start: 0.000000, bitrate: 320 kb/s 5 | Duration: 00:21:17.55, start: 0.000000, bitrate: 320 kb/s 6 | Duration: 00:24:03.82, start: 0.000000, bitrate: 320 kb/s 7 | Duration: 00:23:00.14, start: 0.000000, bitrate: 320 kb/s 8 | Duration: 00:21:24.58, start: 0.000000, bitrate: 320 kb/s 9 | Duration: 00:07:09.15, start: 0.000000, bitrate: 320 kb/s 10 | Duration: 00:07:20.90, start: 0.000000, bitrate: 320 kb/s 11 | Duration: 00:09:58.42, start: 0.000000, bitrate: 320 kb/s 12 | Duration: 00:10:17.88, start: 0.000000, bitrate: 320 kb/s 13 | Duration: 00:22:07.47, start: 0.000000, bitrate: 320 kb/s 14 | Duration: 00:09:47.16, start: 0.000000, bitrate: 320 kb/s 15 | Duration: 00:08:31.91, start: 0.000000, bitrate: 320 kb/s 16 | Duration: 00:07:00.63, start: 0.000000, bitrate: 320 kb/s 17 | Duration: 00:12:31.47, start: 0.000000, bitrate: 320 kb/s 18 | Duration: 00:19:19.51, start: 0.000000, bitrate: 320 kb/s 19 | Duration: 00:40:38.57, start: 0.000000,
bitrate: 320 kb/s 20 | Duration: 00:26:01.98, start: 0.000000, bitrate: 320 kb/s 21 | Duration: 00:13:57.26, start: 0.000000, bitrate: 320 kb/s 22 | Duration: 00:16:23.42, start: 0.000000, bitrate: 320 kb/s 23 | Duration: 00:24:17.95, start: 0.025057, bitrate: 137 kb/s 24 | Duration: 00:17:26.14, start: 0.000000, bitrate: 320 kb/s 25 | Duration: 00:23:03.66, start: 0.000000, bitrate: 320 kb/s 26 | Duration: 00:20:31.32, start: 0.000000, bitrate: 320 kb/s 27 | Duration: 00:18:35.52, start: 0.000000, bitrate: 320 kb/s 28 | Duration: 00:25:45.52, start: 0.000000, bitrate: 320 kb/s 29 | Duration: 00:27:36.38, start: 0.000000, bitrate: 320 kb/s 30 | Duration: 00:16:26.45, start: 0.000000, bitrate: 320 kb/s 31 | Duration: 00:11:07.99, start: 0.000000, bitrate: 320 kb/s 32 | Duration: 00:24:12.24, start: 0.000000, bitrate: 320 kb/s 33 | Duration: 00:18:32.30, start: 0.000000, bitrate: 320 kb/s 34 | 35 | 560 minutes total 36 | -------------------------------------------------------------------------------- /datasets/music/new_experiment32k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 32000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 8 48 | num = 3200 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 32000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm
{}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | np_arr = np.array(arr) 77 | # 88/6/6 split 78 | length = len(np_arr) 79 | train_size = int(np.floor(length * .88)) # train 80 | test_size = int(np.floor(length * .06)) # test 81 | 82 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 83 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 84 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 86 | 87 | #pass dataset name through two_tier.py || three_tier.py to datasets.py -------------------------------------------------------------------------------- /datasets/music/new_experiment16k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000
{}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 12 # stride window in seconds (the chunks themselves are cut 8 s long below) 48 | num = 6400 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | np_arr = np.array(arr) 77 | # 88/6/6 split 78 | length = len(np_arr) 79 | train_size = int(np.floor(length * .88)) # train 80 | test_size = int(np.floor(length * .06)) # test 81 | 82 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 83 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 84 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 86 | 87 | #pass dataset name through two_tier.py || three_tier.py to datasets.py 88 | -------------------------------------------------------------------------------- /datasets/music/new_experiment48k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, random, time, glob, soundfile as sf 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length =
float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 48000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 8 48 | num = 6400 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 48000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | #arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | arr = [sf.read(p)[0].astype('float16') for p in paths] 77 | np_arr = np.array(arr) 78 | # 88/6/6 split 79 | length = len(np_arr) 80 | train_size = int(np.floor(length * .88)) # train 81 | test_size = int(np.floor(length * .06)) # test 82 | 83 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 84 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 86 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 87 | 88 | #pass dataset name through two_tier.py || three_tier.py to datasets.py 89 | -------------------------------------------------------------------------------- /lib/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import time 3 | import scipy.io.wavfile 4 | import glob 5 | import sys 6 | import numpy 7 | import pickle 8 | import theano 9 | import theano.tensor as T 10 | 11 | tag = sys.argv[1] 12 | name = glob.glob("../results*/" + tag + "/args.pkl")[0] 13 | params = pickle.load(open(name, "r")) 14 | print params 15 | info = {} 16 | for p in xrange(1,len(params),2): 17 | if p+1 < len(params): 18 | info[params[p][2:]] = params[p+1] 19 | print info 20 | #exit() 21 | 22 | Q_TYPE = info["q_type"] 23 | Q_LEVELS = int(info["q_levels"]) 24 | N_RNN = int(info["n_rnn"]) 25 | DIM = int(info["dim"]) 26 | FRAME_SIZE = int(info["frame_size"]) 27 | 28 | 29 | #{'dim': '1024', 'q_type': 'linear', 'learn_h0': 'True', 'weight_norm': 'True', 'q_levels': '256', 'skip_conn': 'False', 'batch_size': '128', 'n_frames': '64', 'emb_size': '256', 'exp': 'KURT2x4', 'frame_size': '16', 'which_set': 'KURT', 'rnn_type': 'GRU', 'n_rnn': '4'} 30 | 31 | ###grab this stuff 32 | #args 33 | #Q_TYPE 34 | 
#Q_LEVELS 35 | #N_RNN 36 | #DIM 37 | #FRAME_SIZE 38 | 39 | BITRATE = 16000 40 | N_SEQS = 20 # Number of samples to generate at each monitoring checkpoint. 41 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude 42 | H0_MULT = 1 43 | 44 | RESULTS_DIR = 'results_2t' # default; overridden from the experiment path below 45 | RESULTS_DIR = name.split("/")[1] 46 | print RESULTS_DIR 47 | 48 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 49 | ### Create directories ### 50 | # FOLDER_PREFIX: root, contains: 51 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 52 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 53 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 54 | 55 | print SAMPLES_PATH 56 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 57 | # to study the behaviour of the model and also to introduce some diversity 58 | # to samples in a simple way. [it's disabled for now] 59 | sequences = T.imatrix('sequences') 60 | h0 = T.tensor3('h0') 61 | reset = T.iscalar('reset') 62 | mask = T.matrix('mask') 63 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 64 | fixed_rand_h0 -= 0.5 65 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 66 | 67 | def generate_and_save_samples(): 68 | # Sampling at frame level 69 | frame_level_generate_fn = theano.function( 70 | [sequences, h0, reset], 71 | frame_level_rnn(sequences, h0, reset), 72 | on_unused_input='warn' 73 | ) 74 | def write_audio_file(name, data): 75 | data = data.astype('float32') 76 | data -= data.min() 77 | data /= data.max() 78 | data -= 0.5 79 | data *= 0.95 80 | scipy.io.wavfile.write( 81 | os.path.join(SAMPLES_PATH, name+'.wav'), 82 | BITRATE, 83 | data) 84 | 85 | total_time = time() 86 | # Generate N_SEQS sample files, each 5 seconds long 87 | N_SECS = 5 88 | LENGTH = N_SECS*BITRATE 89 | 90 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 91 | samples[:, :FRAME_SIZE] = Q_ZERO 92 | 93 | # First half zero, others fixed random at each checkpoint 94 | h0 = numpy.zeros( 95 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 96 | dtype='float32' 97 | ) 98 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 99 | frame_level_outputs = None 100 | 101 | for t in xrange(FRAME_SIZE, LENGTH): 102 | 103 | if t % FRAME_SIZE == 0: 104 | frame_level_outputs, h0 = frame_level_generate_fn( 105 | samples[:, t-FRAME_SIZE:t], 106 | h0, 107 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 108 | numpy.int32(t == FRAME_SIZE) 109 | ) 110 | 111 | samples[:, t] = sample_level_generate_fn( 112 | frame_level_outputs[:, t % FRAME_SIZE], 113 | samples[:, t-FRAME_SIZE:t], 114 | ) 115 | 116 | total_time = time() - total_time 117 | log = "{} samples of {} seconds length generated in {} seconds."
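# Note: `frame_level_rnn` and `sample_level_generate_fn` are assumed to be
# supplied by the model definition (e.g. models/two_tier/two_tier*.py); they
# are not defined anywhere in this file. The loop above follows the two-tier
# SampleRNN generation scheme: every FRAME_SIZE timesteps the frame-level RNN
# consumes the previous FRAME_SIZE samples and emits FRAME_SIZE conditioning
# vectors, then the sample-level module maps each conditioning vector plus the
# last FRAME_SIZE samples to a distribution over Q_LEVELS bins, from which
# samples[:, t] is drawn.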
118 | log = log.format(N_SEQS, N_SECS, total_time) 119 | print log, 120 | 121 | for i in xrange(N_SEQS): 122 | samp = samples[i] 123 | if Q_TYPE == 'mu-law': 124 | from datasets.dataset import mu2linear 125 | samp = mu2linear(samp) 126 | elif Q_TYPE == 'a-law': 127 | raise NotImplementedError('a-law is not implemented') 128 | write_audio_file("sample_{}_{}".format(tag, i), samp) 129 | 130 | generate_and_save_samples() -------------------------------------------------------------------------------- /datasets/music/new_experiment16k_conditioning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob, math 3 | 4 | 5 | from build_features import * 6 | 7 | PWD = os.getcwd() 8 | print 'PWD is', PWD 9 | #store dataset name 10 | DATASET_NAME = str(sys.argv[1]) 11 | DOWNLOAD_DIR = str(sys.argv[2]) 12 | print 'dl_dir is set to', DOWNLOAD_DIR 13 | #create the dataset directory 14 | print "creating directory for", DATASET_NAME 15 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 16 | os.makedirs(DATASET_DIR) 17 | #move samples from the download directory into the dataset directory 18 | print "moving samples" 19 | types = {'wav', "mp3"} 20 | for t in types: 21 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 22 | #run preprocess 23 | print "preprocessing" 24 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 25 | os.makedirs(OUTPUT_DIR) 26 | # Step 1: write all filenames to a list 27 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 28 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 29 | for filename in filenames: 30 | if filename.endswith(".wav") or filename.endswith("mp3"): 31 | f.write("file '" + dirpath + '/'+ filename + "'\n") 32 | 33 | # Step 2: concatenate everything into one massive wav file 34 | print "concatenate all files" 35 | os.system('pwd') 36 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 37 | audio = "preprocess_all_audio.wav" 38 | print "get length" 39 | # # get the length of the resulting file 40 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 41 | print length, "DURATION" 42 | print "split big file into chunks" 43 | # # Step 3: split the big file into 8-second chunks 44 | # overlapping 3 times per 8 seconds 45 | ''' 46 | for i in xrange(int((length//8)*3)-1): 47 | time = (i * 8 )/ 3 48 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 49 | ''' 50 | 51 | # size in seconds of each chunk 52 | size = 8 53 | # number of chunks 54 | num_chunks = 3200 55 | 56 | # cj (conditioning) generate the feature matrix for the entire dataset WAV 57 | features = build_onset_envelope_feature("{}/preprocess_all_audio.wav".format(OUTPUT_DIR)) 58 | # frame_rate is the number of feature frames per second 59 | # calculate it by comparing length of features to length of audio 60 | # don't confuse feature frames with the SampleRNN frames 61 | total_num_frames = features.shape[0] 62 | num_features = features.shape[1] 63 | frame_rate = len(features)/float(length) 64 | # number of frames per chunk of audio 65 | frames_per_chunk = int(math.floor((size)*frame_rate)) 66 | # a matrix of chunks x frames x features 67 | feature_matrix = np.zeros((num_chunks, frames_per_chunk, num_features), dtype='float32') 68
| 69 | 70 | 71 | for i in xrange(0, num_chunks): 72 | time = i * ((length-size)/float(num_chunks)) 73 | 74 | # build the feature_matrix 75 | # it's the feature timesliced according to the start and end times of the chunk 76 | start_frame = int(math.floor((time)*frame_rate)) 77 | end_frame = start_frame + frames_per_chunk 78 | if(len(features)<=end_frame): 79 | end_frame = len(features)-1 80 | # print "start_frame", start_frame 81 | # print "end_frame", end_frame 82 | # print "features[start:end].shape", features[start_frame:end_frame].shape 83 | # print "len(features)", len(features) 84 | # print "time", time 85 | # print "frames_per_chunk", frames_per_chunk 86 | # print "frame_rate", frame_rate 87 | # print "total_num_frames", total_num_frames 88 | # print "num_features", num_features 89 | feature_matrix[i, :end_frame-start_frame] = features[start_frame:end_frame] # slice-assign so a clipped final chunk still fits 90 | 91 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 92 | print "clean up" 93 | 94 | 95 | 96 | # # Step 4: clean up temp files 97 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 98 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 99 | print 'save as .npy' 100 | __RAND_SEED = 123 101 | def __fixed_shuffle(inp_list): 102 | if isinstance(inp_list, list): 103 | random.seed(__RAND_SEED) 104 | random.shuffle(inp_list) 105 | return 106 | #import collections 107 | #if isinstance(inp_list, (collections.Sequence)): 108 | if isinstance(inp_list, np.ndarray): 109 | np.random.seed(__RAND_SEED) 110 | np.random.shuffle(inp_list) 111 | return 112 | # destructive operations; in place; no need to return 113 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 114 | 115 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 116 | __fixed_shuffle(paths) 117 | 118 | # reorder the conditioning features to match the shuffled clip order (p{i}.flac was cut from chunk i) 119 | feature_matrix = feature_matrix[np.array([int(os.path.basename(p)[1:-5]) for p in paths])] 120 | 121 | 122 | # CJ (conditioning) 123 | # For conditioning, the np_arr should be structured as follows 124 | # np_arr[0] are the PCM samples as usual 125 | # np_arr[1] are the feature vectors 126 | 127 | # Turn the FLACs into PCM samples 128 | samples = np.array([(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths]) 129 | print samples.shape 130 | 131 | print feature_matrix.shape 132 | 133 | 134 | # 88/6/6 split 135 | length = samples.shape[0] 136 | train_size = int(np.floor(length * .88)) # train 137 | test_size = int(np.floor(length * .06)) # test 138 | 139 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), samples) 140 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), samples[:train_size]) 141 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), samples[train_size:train_size + test_size]) 142 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), samples[train_size + test_size:]) 143 | 144 | np.save(os.path.join(DATASET_DIR,'all_features.npy'), feature_matrix) 145 | np.save(os.path.join(DATASET_DIR,'features_train.npy'), feature_matrix[:train_size]) 146 | np.save(os.path.join(DATASET_DIR,'features_valid.npy'), feature_matrix[train_size:train_size + test_size]) 147 | np.save(os.path.join(DATASET_DIR,'features_test.npy'), feature_matrix[train_size + test_size:]) 148 | 149 | #pass dataset name through two_tier.py || three_tier.py to datasets.py -------------------------------------------------------------------------------- /datasets/music/drum-preprocess.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess 3 | # requires sox, ffmpeg, and
ffprobe command line tools 4 | 5 | RAW_DATA_DIR=str(sys.argv[1]) 6 | TEMP_DIR=os.path.join(RAW_DATA_DIR, "temp") 7 | FR_DIR=os.path.join(RAW_DATA_DIR, "fr-parts") 8 | BR_DIR=os.path.join(RAW_DATA_DIR, "br-parts") 9 | F_DIR=os.path.join(RAW_DATA_DIR, "f-parts") 10 | B_DIR=os.path.join(RAW_DATA_DIR, "b-parts") 11 | SAMPLE_RATE = 16000 12 | os.makedirs(TEMP_DIR) 13 | os.makedirs(FR_DIR) 14 | os.makedirs(BR_DIR) 15 | os.makedirs(F_DIR) 16 | os.makedirs(B_DIR) 17 | 18 | def createParts(): 19 | def renderFlacs(fr, br, f, b): 20 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 21 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 22 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 23 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 24 | #pitch down 25 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 26 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 27 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 28 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 29 | #pitch up 30 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 31 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 32 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 33 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 34 | #initial preparation 35 | os.system('ffmpeg -i "{}" -ac 1 -ab 16k -ar {} {}/this_temp.wav'.format(full_name, SAMPLE_RATE, TEMP_DIR)) #resample this file as mono 16000smpls/s 36 | this_length = float(subprocess.check_output('ffprobe -i {}/this_temp.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(TEMP_DIR), shell=True)) #check length of resampled audio 37 | print full_name, ':', this_length, 'DURATION' 38 | pad_length = longest_length - this_length 39 | os.system('sox {}/this_temp.wav {}/r_temp.wav reverse'.format(TEMP_DIR, TEMP_DIR)) # reverse file 40 | if pad_length > 0.: # every audiofile except the largest 41 | #create temp files 42 | os.system('ffmpeg -f lavfi -i anullsrc=channel_layout=mono:sample_rate={} -t {} {}/anullsrc_temp.wav'.format(SAMPLE_RATE, pad_length, TEMP_DIR)) #create anullsrc_temp.wav zero-pad 43 | os.system('sox {}/anullsrc_temp.wav {}/r_temp.wav {}/fr_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #FR 44 | os.system('sox {}/r_temp.wav {}/anullsrc_temp.wav {}/br_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #BR 45 | os.system('sox {}/anullsrc_temp.wav {}/this_temp.wav {}/f_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #F 46 | os.system('sox {}/this_temp.wav {}/anullsrc_temp.wav {}/b_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #B 47 | 
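# Padding/augmentation scheme used here: every drum hit is zero-padded
# (anullsrc) up to the duration of the longest file, in four variants --
# f = silence then forward audio, b = forward audio then silence,
# fr = silence then reversed audio, br = reversed audio then silence --
# and each variant is additionally rendered one semitone down (pitch -100)
# and up (pitch +100) below, so one input WAV yields 12 equal-length FLAC
# training clips.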
# extend the data set by copying and repitching each sample up+down 1 semitone 48 | os.system('sox {}/fr_temp.wav {}/fr_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#FR down 49 | os.system('sox {}/br_temp.wav {}/br_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#BR down 50 | os.system('sox {}/f_temp.wav {}/f_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#F down 51 | os.system('sox {}/b_temp.wav {}/b_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#B down 52 | os.system('sox {}/fr_temp.wav {}/fr_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#FR up 53 | os.system('sox {}/br_temp.wav {}/br_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#BR up 54 | os.system('sox {}/f_temp.wav {}/f_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#F up 55 | os.system('sox {}/b_temp.wav {}/b_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#B up 56 | #final export 57 | renderFlacs('fr', 'br', 'f', 'b') #render parts 58 | #clean up temp files 59 | os.system('rm {}/anullsrc_temp.wav'.format(TEMP_DIR)) 60 | os.system('rm {}/fr_down.wav'.format(TEMP_DIR)) 61 | os.system('rm {}/br_down.wav'.format(TEMP_DIR)) 62 | os.system('rm {}/f_down.wav'.format(TEMP_DIR)) 63 | os.system('rm {}/b_down.wav'.format(TEMP_DIR)) 64 | os.system('rm {}/fr_up.wav'.format(TEMP_DIR)) 65 | os.system('rm {}/br_up.wav'.format(TEMP_DIR)) 66 | os.system('rm {}/f_up.wav'.format(TEMP_DIR)) 67 | os.system('rm {}/b_up.wav'.format(TEMP_DIR)) 68 | else: #longest file 69 | # extend the data set by copying and repitching each sample up+down 1 semitone 70 | os.system('sox {}/this_temp.wav {}/r_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))# up 71 | os.system('sox {}/this_temp.wav {}/r_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))# down 72 | os.system('sox {}/r_temp.wav {}/this_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#r up 73 | os.system('sox {}/r_temp.wav {}/this_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#r down 74 | # final export 75 | renderFlacs('r', 'r', 'this', 'this') 76 | #clean up temp files 77 | os.system('rm {}/r_up.wav'.format(TEMP_DIR)) 78 | os.system('rm {}/r_down.wav'.format(TEMP_DIR)) 79 | os.system('rm {}/this_up.wav'.format(TEMP_DIR)) 80 | os.system('rm {}/this_down.wav'.format(TEMP_DIR)) 81 | os.system('rm {}/r_temp.wav'.format(TEMP_DIR)) 82 | os.system('rm {}/this_temp.wav'.format(TEMP_DIR)) 83 | 84 | # Step 1: Find the largest file size in the audio dataset 85 | objects = os.listdir(RAW_DATA_DIR) 86 | sofar = 0 87 | largest = "" 88 | for item in objects: 89 | if ".wav" in item: 90 | size = os.path.getsize(os.path.join(RAW_DATA_DIR, item)) 91 | if size > sofar: 92 | sofar = size 93 | largest = item 94 | 95 | print "Largest file is ", sofar, "bytes" 96 | print largest 97 | os.system('ffmpeg -i "{}" -ac 1 -ab 16k -ar {} {}/longest_temp.wav'.format(os.path.join(RAW_DATA_DIR, largest), SAMPLE_RATE, TEMP_DIR)) #resample the largest file as mono 98 | longest_length = float(subprocess.check_output('ffprobe -i {}/longest_temp.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(TEMP_DIR), shell=True)) 99 | #clean up longest temp wav 100 | os.system('rm {}/longest_temp.wav'.format(TEMP_DIR)) 101 | 102 | i = 0 103 | for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR): 104 | for filename in filenames: 105 | if filename.endswith(".wav"): 106 | full_name = dirpath + '/'+ filename # raw audio file 107 | createParts() 108 | i += 1 109 | #remove empty temp dir 110 | #os.system('rmdir {}'.format(TEMP_DIR)) 111 | -------------------------------------------------------------------------------- /datasets/dataset.py:
-------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | """ 4 | import numpy as np 5 | import random, time, os, glob 6 | 7 | def __getFile(dataset_name): 8 | return 'music/'+dataset_name+'/music_{}.npy' 9 | 10 | __base = [ 11 | ('Local', 'datasets/') 12 | ] 13 | 14 | __train = lambda s: s.format('train') 15 | __valid = lambda s: s.format('valid') 16 | __test = lambda s: s.format('test') 17 | 18 | def find_dataset(filename): 19 | for (k, v) in __base: 20 | tmp_path = os.path.join(v, filename) 21 | if os.path.exists(tmp_path): 22 | #print "Path on {}:".format(k) 23 | #print tmp_path 24 | return tmp_path 25 | #print "not found on {}".format(k) 26 | raise Exception('{} NOT FOUND!'.format(filename)) 27 | 28 | ### Basic utils ### 29 | def __round_to(x, y): 30 | """round x up to the nearest y""" 31 | return int(np.ceil(x / float(y))) * y 32 | 33 | def __normalize(data): 34 | """To range [0., 1.]""" 35 | data -= data.min(axis=1)[:, None] 36 | data /= data.max(axis=1)[:, None] 37 | return data 38 | 39 | def __linear_quantize(data, q_levels): 40 | """ 41 | floats in (0, 1) to ints in [0, q_levels-1] 42 | scales normalized across axis 1 43 | """ 44 | # Normalization is on mini-batch not whole file 45 | #eps = np.float64(1e-5) 46 | #data -= data.min(axis=1)[:, None] 47 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 48 | #data += eps/2 49 | #data = data.astype('int32') 50 | 51 | eps = np.float64(1e-5) 52 | data *= (q_levels - eps) 53 | data += eps/2 54 | data = data.astype('int32') 55 | return data 56 | 57 | def __a_law_quantize(data): 58 | """ 59 | :todo: 60 | """ 61 | raise NotImplementedError 62 | 63 | def linear2mu(x, mu=255): 64 | """ 65 | From Joao 66 | x should be normalized between -1 and 1 67 | Converts an array according to mu-law and discretizes it 68 | Note: 69 | mu2linear(linear2mu(x)) != x 70 | Because we are compressing to 8 bits here. 71 | They will sound pretty much the same, though. 72 | :usage: 73 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 74 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 75 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 76 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 77 | 0, 255, dtype('int16') 78 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 79 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 80 | -1, 0.9574371, dtype('float32') 81 | """ 82 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 83 | return ((x_mu + 1)/2 * mu).astype('int16') 84 | 85 | def mu2linear(x, mu=255): 86 | """ 87 | From Joao with modifications 88 | Converts an integer array from mu to linear 89 | For important notes and usage see: linear2mu 90 | """ 91 | mu = float(mu) 92 | x = x.astype('float32') 93 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 94 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 95 | 96 | def __mu_law_quantize(data): 97 | return linear2mu(data) 98 | 99 | def __batch_quantize(data, q_levels, q_type): 100 | """ 101 | One of 'linear', 'a-law', 'mu-law' for q_type. 102 | """ 103 | data = data.astype('float64') 104 | #data = __normalize(data) 105 | if q_type == 'linear': 106 | return __linear_quantize(data, q_levels) 107 | if q_type == 'a-law': 108 | return __a_law_quantize(data) 109 | if q_type == 'mu-law': 110 | # from [0, 1] to [-1, 1] 111 | #data = 2.*data-1. 112 | # Automatically quantized to 256 bins. 
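# Worked mu-law example (mu = 255): linear2mu computes
#   x_mu = sign(x) * log(1 + 255*|x|) / log(256)
# and rescales [-1, 1] to integer bins [0, 255], so -1.0 -> 0, 0.0 -> 127,
# 1.0 -> 255, and a quiet sample like x = 0.01 already lands at bin 156:
# the companding spends most of its resolution on small amplitudes.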
113 | return __mu_law_quantize(data) 114 | raise NotImplementedError 115 | 116 | __RAND_SEED = 123 117 | def __fixed_shuffle(inp_list): 118 | if isinstance(inp_list, list): 119 | random.seed(__RAND_SEED) 120 | random.shuffle(inp_list) 121 | return 122 | #import collections 123 | #if isinstance(inp_list, (collections.Sequence)): 124 | if isinstance(inp_list, np.ndarray): 125 | np.random.seed(__RAND_SEED) 126 | np.random.shuffle(inp_list) 127 | return 128 | # destructive operations; in place; no need to return 129 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 130 | 131 | def __make_random_batches(inp_list, batch_size): 132 | batches = [] 133 | for i in xrange(len(inp_list) / batch_size): 134 | batches.append(inp_list[i*batch_size:(i+1)*batch_size]) 135 | 136 | __fixed_shuffle(batches) 137 | return batches 138 | 139 | 140 | ### MUSIC DATASET LOADER ### 141 | def __music_feed_epoch(files, 142 | batch_size, 143 | seq_len, 144 | overlap, 145 | q_levels, 146 | q_zero, 147 | q_type, 148 | real_valued=False): 149 | """ 150 | Helper function to load music dataset. 151 | Generator that yields training inputs (subbatch, reset, submask). `subbatch` contains 152 | quantized audio data; `reset` is a boolean indicating the start of a new 153 | sequence (i.e. you should reset h0 whenever `reset` is True). 154 | Feeds subsequences which overlap by a specified amount, so that the model 155 | can always have a target for every input in a given subsequence. 156 | Assumes all flac files have the same length. 157 | yields: (subbatch, reset, submask) 158 | subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP) 159 | reset: True or False 160 | """ 161 | batches = __make_random_batches(files, batch_size) 162 | 163 | for bch in batches: 164 | # batch_seq_len = length of longest sequence in the batch, rounded up to 165 | # the nearest SEQ_LEN. 166 | batch_seq_len = len(bch[0]) # should be 8*16000 167 | batch_seq_len = __round_to(batch_seq_len, seq_len) 168 | 169 | batch = np.zeros( 170 | (batch_size, batch_seq_len), 171 | dtype='float64' 172 | ) 173 | 174 | mask = np.ones(batch.shape, dtype='float32') 175 | 176 | for i, data in enumerate(bch): 177 | #data, fs, enc = scikits.audiolab.flacread(path) 178 | # data is float16 from reading the npy file 179 | batch[i, :len(data)] = data 180 | # This shouldn't change anything. All the flac files for Music 181 | # are the same length and the mask should be 1 everywhere.
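# Shape walk-through (illustrative numbers, assuming 8 s clips at 16 kHz with
# seq_len = 512 and overlap = frame_size = 16): batch_seq_len = 128000
# (already a multiple of 512), the q_zero prefix below grows batch to
# (batch_size, 128016), and the final loop yields 250 subbatches of shape
# (batch_size, 528), each advancing by seq_len so the overlap provides a
# target for every input position.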
182 | # mask[i, len(data):] = np.float32(0) 183 | 184 | if not real_valued: 185 | batch = __batch_quantize(batch, q_levels, q_type) 186 | 187 | batch = np.concatenate([ 188 | np.full((batch_size, overlap), q_zero, dtype='int32'), 189 | batch 190 | ], axis=1) 191 | else: 192 | batch -= __music_train_mean_std[0] 193 | batch /= __music_train_mean_std[1] 194 | batch = np.concatenate([ 195 | np.full((batch_size, overlap), 0, dtype='float32'), 196 | batch 197 | ], axis=1).astype('float32') 198 | 199 | mask = np.concatenate([ 200 | np.full((batch_size, overlap), 1, dtype='float32'), 201 | mask 202 | ], axis=1) 203 | 204 | for i in xrange(batch_seq_len // seq_len): 205 | reset = np.int32(i==0) 206 | subbatch = batch[:, i*seq_len : (i+1)*seq_len + overlap] 207 | submask = mask[:, i*seq_len : (i+1)*seq_len + overlap] 208 | yield (subbatch, reset, submask) 209 | 210 | def music_train_feed_epoch(d_name, *args): 211 | """ 212 | :parameters: 213 | batch_size: int 214 | seq_len: 215 | overlap: 216 | q_levels: 217 | q_zero: 218 | q_type: One of the following: 'linear', 'a-law', or 'mu-law' 219 | 4,340 (9.65 hours) in total 220 | With batch_size = 128: 221 | 4,224 (9.39 hours) in total 222 | 3,712 (88%, 8.25 hours) for training set 223 | 256 (6%, .57 hours) for validation set 224 | 256 (6%, .57 hours) for test set 225 | Note: 226 | 32 of Beethoven's piano sonatas available on archive.org (Public Domain) 227 | :returns: 228 | A generator yielding (subbatch, reset, submask) 229 | """ 230 | # Just check if valid/test sets are also available. If not, raise. 231 | find_dataset(__valid(__getFile(d_name))) 232 | find_dataset(__test(__getFile(d_name))) 233 | # Load train set 234 | data_path = find_dataset(__train(__getFile(d_name))) 235 | files = np.load(data_path) 236 | generator = __music_feed_epoch(files, *args) 237 | return generator 238 | 239 | def music_valid_feed_epoch(d_name, *args): 240 | """ 241 | See: 242 | music_train_feed_epoch 243 | """ 244 | data_path = find_dataset(__valid(__getFile(d_name))) 245 | files = np.load(data_path) 246 | generator = __music_feed_epoch(files, *args) 247 | return generator 248 | 249 | def music_test_feed_epoch(d_name, *args): 250 | """ 251 | See: 252 | music_train_feed_epoch 253 | """ 254 | data_path = find_dataset(__test(__getFile(d_name))) 255 | files = np.load(data_path) 256 | generator = __music_feed_epoch(files, *args) 257 | return generator 258 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /datasets/dataset_conditioning.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | """ 4 | import numpy as np 5 | import random, time, os, glob 6 | 7 | def __getFile(dataset_name): 8 | return 'music/'+dataset_name+'/music_{}.npy' 9 | 10 | def __getFeatures(dataset_name): 11 | return 'music/'+dataset_name+'/features_{}.npy' 12 | 13 | __base = [ 14 | ('Local', 'datasets/') 15 | ] 16 | 17 | __train = lambda s: s.format('train') 18 | __valid = lambda s: s.format('valid') 19 | __test = lambda s: s.format('test') 20 | 21 | def find_dataset(filename): 22 | for (k, v) in __base: 23 | tmp_path = os.path.join(v, filename) 24 | if os.path.exists(tmp_path): 25 | #print "Path on {}:".format(k) 26 | #print tmp_path 27 | return tmp_path 28 | #print "not found on {}".format(k) 29 | raise Exception('{} NOT FOUND!'.format(filename)) 30 | 31 | ### Basic utils ### 32 | def __round_to(x, y): 33 | """round x up to the nearest y""" 34 | return int(np.ceil(x / float(y))) * y 35 | 36 | def __normalize(data): 37 | """To range [0., 1.]""" 38 | data -= data.min(axis=1)[:, None] 39 | data /= data.max(axis=1)[:, None] 40 | return data 41 | 42 | def __linear_quantize(data, q_levels): 43 | """ 44 | floats in (0, 1) to ints in [0, q_levels-1] 45 | scales normalized across axis 1 46 | """ 47 | # Normalization is on mini-batch not whole file 48 | #eps = np.float64(1e-5) 49 | #data -= data.min(axis=1)[:, None] 50 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 51 | #data += eps/2 52 | #data = data.astype('int32') 53 | 54 | eps = np.float64(1e-5) 55 | data *= (q_levels - eps) 56 | data += eps/2 57 | data = data.astype('int32') 58 | return data 59 | 60 | def __a_law_quantize(data): 61 | """ 62 | :todo: 63 | """ 64 | raise NotImplementedError 65 | 66 | def linear2mu(x, mu=255): 67 | """ 68 | From Joao 69 | x should be normalized between -1 and 1 70 | Converts an array according to mu-law and discretizes it 71 | Note: 72 | mu2linear(linear2mu(x)) != x 73 | Because we are compressing to 8 bits here. 74 | They will sound pretty much the same, though. 75 | :usage: 76 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 77 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 78 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 79 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 80 | 0, 255, dtype('int16') 81 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 82 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 83 | -1, 0.9574371, dtype('float32') 84 | """ 85 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 86 | return ((x_mu + 1)/2 * mu).astype('int16') 87 | 88 | def mu2linear(x, mu=255): 89 | """ 90 | From Joao with modifications 91 | Converts an integer array from mu to linear 92 | For important notes and usage see: linear2mu 93 | """ 94 | mu = float(mu) 95 | x = x.astype('float32') 96 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 97 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 98 | 99 | def __mu_law_quantize(data): 100 | return linear2mu(data) 101 | 102 | def __batch_quantize(data, q_levels, q_type): 103 | """ 104 | One of 'linear', 'a-law', 'mu-law' for q_type. 
105 |     """
106 |     data = data.astype('float64')
107 |     data = __normalize(data)
108 |     if q_type == 'linear':
109 |         return __linear_quantize(data, q_levels)
110 |     if q_type == 'a-law':
111 |         return __a_law_quantize(data)
112 |     if q_type == 'mu-law':
113 |         # from [0, 1] to [-1, 1]
114 |         data = 2.*data-1.
115 |         # Automatically quantized to 256 bins.
116 |         return __mu_law_quantize(data)
117 |     raise NotImplementedError
118 | 
119 | __RAND_SEED = 123
120 | def __fixed_shuffle(inp_list):
121 |     if isinstance(inp_list, list):
122 |         random.seed(__RAND_SEED)
123 |         random.shuffle(inp_list)
124 |         return
125 |     #import collections
126 |     #if isinstance(inp_list, (collections.Sequence)):
127 |     if isinstance(inp_list, np.ndarray):
128 |         np.random.seed(__RAND_SEED)
129 |         np.random.shuffle(inp_list)
130 |         return
131 |     # destructive operations; in place; no need to return
132 |     raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list)))
133 | 
134 | def __make_random_batches(sample_data, feature_data, batch_size):
135 |     batches = []
136 |     print "sample_data.shape", sample_data.shape
137 |     print "feature_data.shape", feature_data.shape
138 |     print "len(sample_data)", len(sample_data)
139 |     print "batch_size", batch_size
140 |     print len(sample_data) / batch_size
141 | 
142 |     for i in xrange(len(sample_data) / batch_size):
143 |         sample_batch = sample_data[i*batch_size:(i+1)*batch_size]
144 |         feature_batch = feature_data[i*batch_size:(i+1)*batch_size]
145 |         batches.append([sample_batch, feature_batch])
146 | 
147 |     print "len(batches)", len(batches)
148 |     __fixed_shuffle(batches)
149 |     return batches
150 | 
151 | 
152 | ### MUSIC DATASET LOADER ###
153 | def __music_feed_epoch(sample_data, feature_data,
154 |                        batch_size,
155 |                        seq_len,
156 |                        overlap,
157 |                        q_levels,
158 |                        q_zero,
159 |                        q_type,
160 |                        real_valued=False):
161 |     """
162 |     Helper function to load the music dataset.
163 |     Generator that yields training inputs (subbatch, reset, submask, subfeatures).
164 |     `subbatch` contains quantized audio data; `reset` is a boolean indicating
165 |     the start of a new sequence (i.e. you should reset h0 whenever `reset` is True).
166 |     Feeds subsequences which overlap by a specified amount, so that the model
167 |     can always have a target for every input in a given subsequence.
168 |     Assumes all flac files have the same length.
169 |     returns: (subbatch, reset, submask, subfeatures)
170 |         subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
171 |         reset: True or False
172 |     """
173 |     batches = __make_random_batches(sample_data, feature_data, batch_size)
174 | 
175 |     for bch in batches:
176 | 
177 |         print "len(bch)", len(bch)
178 |         print bch[0].shape
179 |         print bch[1].shape
180 |         # batch_seq_len = length of longest sequence in the batch, rounded up to
181 |         # the nearest SEQ_LEN.
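        # Worked example of the rounding (a sketch, assuming the 8-second
        # 16 kHz chunks noted below and e.g. seq_len = 1024):
        # len(bch[0][0]) = 8*16000 = 128000, and __round_to(128000, 1024)
        # = ceil(128000/1024.)*1024 = 125*1024 = 128000, so nothing changes;
        # a chunk that is not an exact multiple rounds up, and the tail of
        # `batch` (np.zeros below) stays zero-padded.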
182 |         batch_seq_len = len(bch[0][0]) # should be 8*16000
183 |         batch_seq_len = __round_to(batch_seq_len, seq_len)
184 |         print "batch_seq_len", batch_seq_len
185 | 
186 |         batch = np.zeros(
187 |             (batch_size, batch_seq_len),
188 |             dtype='float64'
189 |         )
190 | 
191 |         num_features = bch[1].shape[2]
192 |         # cj (conditioning)
193 |         features = np.zeros((batch_size, batch_seq_len, num_features), dtype='float32')
194 |         print "num_features", num_features
195 |         print "features.shape", features.shape
196 | 
197 |         mask = np.ones(batch.shape, dtype='float32')
198 | 
199 |         for i, _ in enumerate(bch[0]):
200 |             chunk_samples = bch[0][i]
201 |             chunk_features = bch[1][i]
202 |             # print "len(chunk_samples)", len(chunk_samples)
203 |             # print "len(chunk_features)", len(chunk_features)
204 |             # samples are in data[0]
205 |             #data, fs, enc = scikits.audiolab.flacread(path)
206 |             # data is float16 from reading the npy file
207 |             batch[i, :len(chunk_samples)] = chunk_samples
208 |             # This shouldn't change anything. All the flac files for Music
209 |             # are the same length and the mask should be 1 everywhere.
210 |             # mask[i, len(data):] = np.float32(0)
211 |             # print "batch.shape", batch.shape
212 | 
213 |             # feature matrix is in data[1]
214 |             x = np.linspace(0, len(chunk_features), len(chunk_samples))
215 |             xp = np.linspace(0, len(chunk_features), len(chunk_features))
216 |             ## now is the time to upsample the features to the audio rate
217 |             for j in xrange(num_features):
218 |                 fp = chunk_features[:,j]
219 |                 interpolated = np.interp(x, xp, fp)
220 |                 # print "interpolated.shape", interpolated.shape
221 |                 # print "chunk_feats.shape", chunk_features.shape
222 |                 features[i, :len(chunk_samples), j] = interpolated
223 | 
224 |         if not real_valued:
225 |             batch = __batch_quantize(batch, q_levels, q_type)
226 | 
227 |             batch = np.concatenate([
228 |                 np.full((batch_size, overlap), q_zero, dtype='int32'),
229 |                 batch
230 |             ], axis=1)
231 |         else:
232 |             batch -= __music_train_mean_std[0]
233 |             batch /= __music_train_mean_std[1]
234 |             batch = np.concatenate([
235 |                 np.full((batch_size, overlap), 0, dtype='float32'),
236 |                 batch
237 |             ], axis=1).astype('float32')
238 | 
239 | 
240 |         mask = np.concatenate([
241 |             np.full((batch_size, overlap), 1, dtype='float32'),
242 |             mask
243 |         ], axis=1)
244 | 
245 | 
246 |         # cj (conditioning): not sure what this is for
247 |         """features = np.concatenate([
248 |             np.full((batch_size, overlap, num_features), 0, dtype='float32'),
249 |             features
250 |         ], axis=1)"""
251 |         print "overlap", overlap
252 | 
253 |         for i in xrange(batch_seq_len // seq_len):
254 |             reset = np.int32(i==0)
255 |             subbatch = batch[:, i*seq_len : (i+1)*seq_len + overlap]
256 |             submask = mask[:, i*seq_len : (i+1)*seq_len + overlap]
257 |             subfeatures = features[:, i*seq_len : (i+1)*seq_len]
258 |             # calculate the mean features over the whole sequence
259 |             #subfeatures = np.mean(features, axis=1).reshape(features.shape[0], features.shape[2])
260 |             yield (subbatch, reset, submask, subfeatures)
261 | 
262 | def music_train_feed_epoch(d_name, *args):
263 |     """
264 |     :parameters:
265 |         batch_size: int
266 |         seq_len:
267 |         overlap:
268 |         q_levels:
269 |         q_zero:
270 |         q_type: One of the following: 'linear', 'a-law', or 'mu-law'
271 |     4,340 (9.65 hours) in total
272 |     With batch_size = 128:
273 |         4,224 (9.39 hours) in total
274 |         3,712 (88%, 8.25 hours) for training set
275 |         256 (6%, .57 hours) for validation set
276 |         256 (6%, .57 hours) for test set
277 |     Note:
278 |         32 of Beethoven's piano sonatas available on archive.org (Public Domain)
279 |     :returns:
280 |         A generator yielding (subbatch, reset, submask, subfeatures)
281 |     """
282 |     # Just check if valid/test sets are also available. If not, raise.
283 |     find_dataset(__valid(__getFile(d_name)))
284 |     find_dataset(__test(__getFile(d_name)))
285 |     # Load train set
286 |     data_path = find_dataset(__train(__getFile(d_name)))
287 |     sample_data = np.load(data_path)
288 | 
289 |     # get local conditioning features
290 |     data_path = find_dataset(__train(__getFeatures(d_name)))
291 |     feature_data = np.load(data_path)
292 |     print "feature file: ", data_path
293 |     print "feature_data.shape", feature_data.shape
294 | 
295 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
296 |     return generator
297 | 
298 | def get_feature_data(d_name):
299 |     data_path = find_dataset(__train(__getFeatures(d_name)))
300 |     feature_data = np.load(data_path)
301 |     return feature_data
302 | 
303 | def music_valid_feed_epoch(d_name, *args):
304 |     """
305 |     See:
306 |         music_train_feed_epoch
307 |     """
308 |     data_path = find_dataset(__valid(__getFile(d_name)))
309 |     sample_data = np.load(data_path)
310 |     # get local conditioning features (note: loaded from the train split)
311 |     data_path = find_dataset(__train(__getFeatures(d_name)))
312 |     feature_data = np.load(data_path)
313 | 
314 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
315 |     return generator
316 | 
317 | def music_test_feed_epoch(d_name, *args):
318 |     """
319 |     See:
320 |         music_train_feed_epoch
321 |     """
322 |     data_path = find_dataset(__test(__getFile(d_name)))
323 |     sample_data = np.load(data_path)
324 |     # get local conditioning features (note: loaded from the train split)
325 |     data_path = find_dataset(__train(__getFeatures(d_name)))
326 |     feature_data = np.load(data_path)
327 | 
328 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
329 |     return generator
330 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dadabots SampleRNN
2 | ## Generating Black Metal, Mathcore, Skate Punk, Beatbox, Meshuggah
3 | 
4 | Code accompanying the NIPS 2017 paper [Generating Black Metal and Math Rock: Beyond
5 | Bach, Beethoven, and Beatles](http://dadabots.com/nips2017/generating-black-metal-and-math-rock.pdf) and the MUME 2018 paper [Generating Albums with SampleRNN to Imitate Metal, Rock, and Punk Bands](http://musicalmetacreation.org/buddydrive/file/carr/).
6 | 
7 | We modified a SampleRNN architecture to generate music in modern genres such as black metal, math rock, skate punk, beatbox, etc.
8 | 
9 | This early example of neural synthesis is a proof-of-concept for how machine learning can drive new types of music software. Creating music can be as simple as specifying a set of music influences on which a model trains. We demonstrate a method for generating albums that imitate bands in experimental music genres previously unrealized by traditional synthesis techniques
10 | (e.g. additive, subtractive, FM, granular, concatenative). Unlike MIDI and symbolic models, SampleRNN generates raw audio in the time domain. This requirement becomes increasingly important in modern music styles where timbre and space are used compositionally. Long developmental compositions with rapid transitions between sections are possible by increasing the depth of the network beyond the number used for speech datasets. We are delighted by the unique characteristic artifacts of neural synthesis.
11 | 
12 | We've created [several albums](https://dadabots.bandcamp.com/) this way.
Read our papers for more explanation of how we use this as part of a creative workflow, how to choose good datasets, etc.
13 | 
14 | Dadabots is CJ Carr [[github]](https://github.com/Cortexelus) [[website]](http://cortexel.us) and Zack Zukowski [[github]](https://github.com/ZVK) [[website]](http://zackzukowski.com/)
15 | 
16 | # SampleRNN (Dadabots fork)
17 | 
18 | Original SampleRNN paper: [SampleRNN: An Unconditional End-to-End Neural Audio Generation Model](https://openreview.net/forum?id=SkxKPDv5xl).
19 | 
20 | ## Features
21 | - Load a dataset of audio
22 | - Train a model on that audio to predict "given what just happened, what comes next?"
23 | - Generate new audio by iteratively choosing "what comes next" indefinitely
24 | 
25 | ## Modifications from original code:
26 | - Auto-preprocessing (audio conversion, concatenation, chunking, and saving .npy files). We find that splitting an album into 3200 overlapping chunks of 8 seconds gives us good results.
27 | - New scripts for generating 100s of audio examples in parallel from a trained net.
28 | - New scripts for different sample rates are available (16k, 32k). 32k audio sounds better, but the nets take longer to train, and they don't learn structure as well as 16k.
29 | - Any processed datasets can be loaded into the two-tier network via arguments. This significantly speeds up the workflow without having to change code.
30 | - Sampling is picked from the distribution (not argmax). This makes sense because certain sounds (noise, texture, the "s" sound in speech) are inherently stochastic. It is also significant for avoiding traps (the generated audio getting stuck in a loop). A minimal sketch of this sampling step appears just below, at the start of the Training section.
31 | - Any number of RNN layers is now possible (until you run out of memory). This was significant to getting good results: the original limit was insufficient for music; we get good results with 5 layers.
32 | - Local conditioning. Although we haven't fully researched the possibilities of local conditioning, we coded it in.
33 | - Fix bad amplitude normalization causing DC offsets (see [issue](https://github.com/soroushmehr/sampleRNN_ICLR2017/issues/24)).
34 | 
35 | ## Dependencies
36 | 
37 | The original code lists:
38 | - cuDNN 5105
39 | - Python 2.7.12
40 | - Numpy 1.11.1
41 | - Theano 0.8.2
42 | - Lasagne 0.2.dev1
43 | - ffmpeg (libav-tools)
44 | 
45 | But we get much faster code using the next generation of GPU architecture with:
46 | - CUDA 9.2
47 | - cuDNN 8.0
48 | - Theano 1.0
49 | - NVIDIA V100 GPU
50 | 
51 | ## Setup
52 | 
53 | A detailed description of how we set up this code on Ubuntu 16.04 with an NVIDIA V100 GPU can be found here:
54 | 
55 | [DETAILED SETUP INSTRUCTIONS](https://github.com/Cortexelus/dadabots_sampleRNN/wiki/Installing-Dadabots-SampleRNN-on-Ubuntu)
56 | 
57 | 
58 | 
59 | ## Datasets
60 | To create a new dataset, place your audio here:
61 | ```
62 | datasets/music/downloads/
63 | ```
64 | then run the new experiment python script located in the datasets/music directory:
65 | 
66 | 16k sample rate:
67 | ```
68 | cd datasets/music/
69 | sudo python new_experiment16k.py krallice downloads/
70 | ```
71 | 
72 | 32k sample rate:
73 | ```
74 | cd datasets/music/
75 | sudo python new_experiment32k.py krallice downloads/
76 | ```
77 | 
78 | ## Training
79 | To train a model on an existing dataset with accelerated GPU processing, you need to run the following lines from the root of the `dadabots_sampleRNN` folder; they correspond to the best found set of hyper-parameters.
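
As noted in the modifications list above, generation samples each new audio value from the model's softmax output rather than taking the argmax (the repo calls `lib.ops.softmax_and_sample` for this). A minimal NumPy sketch of the idea, with hypothetical names:

```python
import numpy as np

def sample_next_value(logits, rng=np.random):
    # Numerically stable softmax over the Q_LEVELS quantization bins.
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # Draw one bin from the distribution. Using np.argmax(probs) here
    # instead would make generation deterministic and prone to getting
    # stuck in loops; sampling keeps noisy, textural sounds stochastic.
    return rng.choice(len(probs), p=probs)
```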
80 | 
81 | Mission control center:
82 | ```
83 | $ pwd
84 | /root/cj/dadabots_sampleRNN
85 | ```
86 | 
87 | ### Training SampleRNN (2-tier)
88 | ```
89 | $ python models/two_tier/two_tier32k.py -h
90 | usage: two_tier.py [-h] [--exp EXP] --n_frames N_FRAMES --frame_size
91 |                    FRAME_SIZE --weight_norm WEIGHT_NORM --emb_size EMB_SIZE
92 |                    --skip_conn SKIP_CONN --dim DIM --n_rnn {1,2,3,4,5}
93 |                    --rnn_type {LSTM,GRU} --learn_h0 LEARN_H0 --q_levels
94 |                    Q_LEVELS --q_type {linear,a-law,mu-law} --which_set
95 |                    {...} --batch_size {64,128,256} [--debug]
96 |                    [--resume]
97 | 
98 | two_tier.py No default value! Indicate every argument.
99 | 
100 | optional arguments:
101 |   -h, --help            show this help message and exit
102 |   --exp EXP             Experiment name (name it anything you want)
103 |   --n_frames N_FRAMES   How many "frames" to include in each Truncated BPTT
104 |                         pass
105 |   --frame_size FRAME_SIZE
106 |                         How many samples per frame
107 |   --weight_norm WEIGHT_NORM
108 |                         Adding learnable weight normalization to all the
109 |                         linear layers (except for the embedding layer)
110 |   --emb_size EMB_SIZE   Size of embedding layer (0 to disable)
111 |   --skip_conn SKIP_CONN
112 |                         Add skip connections to RNN
113 |   --dim DIM             Dimension of RNN and MLPs
114 |   --n_rnn {1,2,3,4,5,6,7,8,9,10,11,12,n,...}
115 |                         Number of layers in the stacked RNN
116 |   --rnn_type {LSTM,GRU}
117 |                         GRU or LSTM
118 |   --learn_h0 LEARN_H0   Whether to learn the initial state of RNN
119 |   --q_levels Q_LEVELS   Number of bins for quantization of audio samples.
120 |                         Should be 256 for mu-law.
121 |   --q_type {linear,a-law,mu-law}
122 |                         Quantization in linear-scale, a-law companding, or
123 |                         mu-law companding. With mu-/a-law quantization, levels
124 |                         should be set to 256
125 |   --which_set {...}
126 |                         The name of the dataset you created. In the above example "krallice"
127 |   --batch_size {64,128,256}
128 |                         size of mini-batch
129 |   --debug               Debug mode
130 |   --resume              Resume the same model from the last checkpoint. Order
131 |                         of params is important. [for now]
132 | ```
133 | 
134 | 
135 | If you're using CUDA 9 with V100 GPUs, you need "device=cuda0"
136 | 
137 | If you're using CUDA 8 with K80 GPUs or earlier, you may need "device=gpu0" instead
138 | 
139 | If you have 8 GPUs, you can run up to 8 experiments in parallel by setting the device to cuda0, cuda1, cuda2, cuda3... cuda7
140 | 
141 | 
142 | #### Our best hyperparameters
143 | 
144 | After training 100s of models with different hyperparameters, these were our best hyperparameters (at the limits of the V100 hardware) for the kind of music we wanted to generate. Further explanation of our choices can be found in our papers.
145 | 
146 | 
147 | ```
148 | THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python -u models/two_tier/two_tier16k.py --exp krallice_experiment --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set krallice
149 | ```
150 | 
151 | 
152 | ### Training SampleRNN (3-tier)
153 | 
154 | There's also a 3-tier option, but we initially had better results with 2-tier, so we don't use 3-tier. It doesn't have the modifications we made to 2-tier.
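
For orientation, the audio span covered by each truncated-BPTT pass follows directly from these flags; a back-of-envelope sketch (assuming the 16 kHz models):

```python
BITRATE = 16000  # samples per second in the 16k models

# 2-tier: samples per BPTT pass = n_frames * frame_size
two_tier_samples = 64 * 16           # --n_frames 64 --frame_size 16 -> 1024
print(two_tier_samples / float(BITRATE))    # 0.064 seconds of audio

# 3-tier ("To run" example below): --seq_len is the sample count directly
three_tier_samples = 512             # --seq_len 512
print(three_tier_samples / float(BITRATE))  # 0.032 seconds of audio
```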
155 | 
156 | ```
157 | $ python models/three_tier/three_tier.py -h
158 | usage: three_tier16k.py [-h] [--exp EXP] --seq_len SEQ_LEN --big_frame_size
159 |                         BIG_FRAME_SIZE --frame_size FRAME_SIZE --weight_norm
160 |                         WEIGHT_NORM --emb_size EMB_SIZE --skip_conn SKIP_CONN
161 |                         --dim DIM --n_rnn {1,2,3,4,5} --rnn_type {LSTM,GRU}
162 |                         --learn_h0 LEARN_H0 --q_levels Q_LEVELS --q_type
163 |                         {linear,a-law,mu-law} --which_set {ONOM,BLIZZ,MUSIC}
164 |                         --batch_size {64,128,256} [--debug] [--resume]
165 | 
166 | three_tier.py No default value! Indicate every argument.
167 | 
168 | optional arguments:
169 |   -h, --help            show this help message and exit
170 |   --exp EXP             Experiment name
171 |   --seq_len SEQ_LEN     How many samples to include in each Truncated BPTT
172 |                         pass
173 |   --big_frame_size BIG_FRAME_SIZE
174 |                         How many samples per big frame in tier 3
175 |   --frame_size FRAME_SIZE
176 |                         How many samples per frame in tier 2
177 |   --weight_norm WEIGHT_NORM
178 |                         Adding learnable weight normalization to all the
179 |                         linear layers (except for the embedding layer)
180 |   --emb_size EMB_SIZE   Size of embedding layer (> 0)
181 |   --skip_conn SKIP_CONN
182 |                         Add skip connections to RNN
183 |   --dim DIM             Dimension of RNN and MLPs
184 |   --n_rnn {1,2,3,4,5}   Number of layers in the stacked RNN
185 |   --rnn_type {LSTM,GRU}
186 |                         GRU or LSTM
187 |   --learn_h0 LEARN_H0   Whether to learn the initial state of RNN
188 |   --q_levels Q_LEVELS   Number of bins for quantization of audio samples.
189 |                         Should be 256 for mu-law.
190 |   --q_type {linear,a-law,mu-law}
191 |                         Quantization in linear-scale, a-law companding, or
192 |                         mu-law companding. With mu-/a-law quantization, levels
193 |                         should be set to 256
194 |   --which_set WHICH_SET
195 |                         any preprocessed set in the datasets/music/ directory
196 |   --batch_size {64,128,256}
197 |                         size of mini-batch
198 |   --debug               Debug mode
199 |   --resume              Resume the same model from the last checkpoint. Order
200 |                         of params is important. [for now]
201 | ```
202 | To run:
203 | ```
204 | $ THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python -u models/three_tier/three_tier.py --exp 3TIER --seq_len 512 --big_frame_size 8 --frame_size 2 --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type GRU --q_levels 256 --q_type linear --batch_size 128 --weight_norm True --learn_h0 True --which_set MUSIC
205 | 
206 | ```
207 | 
208 | ## Generating
209 | 
210 | Generate 100 songs (4 minutes each) from a trained 32k model:
211 | ```
212 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu0,floatX=float32 python -u models/two_tier/two_tier_generate32k.py --exp krallice_experiment --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set krallice --n_secs 240 --n_seqs 100
213 | ```
214 | 
215 | All the parameters have to be the same as when you trained it. Notice we're calling `two_tier_generate32k.py` with two new flags, `--n_secs` and `--n_seqs`.
216 | 
217 | It will take just as much time to generate 100 songs as 5, because they are created in parallel (up to a hardware memory limit).
218 | 
219 | This will generate from the latest checkpoint. However, we found the latest checkpoint does not always create the best music. Instead, we listen to the test audio generated at each checkpoint, choose our favorite checkpoint, and delete the newer checkpoints before generating a huge batch with this script.
220 | 
221 | 
222 | ## Creative Workflow
223 | 
224 | At this point, we suggest human curation.
Listen through the generated audio, find the best parts, and use them in your music. Read our [MUME 2018 paper](http://musicalmetacreation.org/buddydrive/file/carr/) to see how our workflow changed over the course of six albums. 225 | 226 | 227 | ## Reference 228 | If you are using this code, please cite our paper: 229 | 230 | Generating Albums with SampleRNN to Imitate Metal, Rock, and Punk Bands. CJ Carr, Zack Zukowski (MUME 2018). 231 | 232 | And the original paper: 233 | 234 | SampleRNN: An Unconditional End-to-End Neural Audio Generation Model. Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, Yoshua Bengio, 5th International Conference on Learning Representations (ICLR 2017). 235 | 236 | ## License 237 | 238 | This documentation licensed CC-BY 4.0 239 | 240 | The source code is licensed Apache 2.0 241 | 242 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | import ops 2 | #import lasagne 3 | #from theano.compile.nanguardmode import NanGuardMode 4 | 5 | import math 6 | import time 7 | import locale 8 | 9 | import numpy 10 | import theano 11 | import theano.tensor as T 12 | import theano.gof 13 | 14 | import cPickle as pickle 15 | #import pickle 16 | import warnings 17 | import sys, os, errno, glob 18 | 19 | import matplotlib 20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | # TODO: Grouping is not working on cluster! :-? 24 | # Set a locale first or you won't get grouping at all 25 | locale.setlocale(locale.LC_ALL, '') 26 | # 'en_US.UTF-8' 27 | 28 | _params = {} 29 | def param(name, *args, **kwargs): 30 | """ 31 | A wrapper for `theano.shared` which enables parameter sharing in models. 32 | 33 | Creates and returns theano shared variables similarly to `theano.shared`, 34 | except if you try to create a param with the same name as a 35 | previously-created one, `param(...)` will just return the old one instead of 36 | making a new one. 37 | 38 | This constructor also adds a `param` attribute to the shared variables it 39 | creates, so that you can easily search a graph for all params. 40 | """ 41 | 42 | if name not in _params: 43 | kwargs['name'] = name 44 | param = theano.shared(*args, **kwargs) 45 | param.param = True 46 | _params[name] = param 47 | return _params[name] 48 | 49 | def delete_params(name): 50 | to_delete = [p_name for p_name in _params if name in p_name] 51 | for p_name in to_delete: 52 | del _params[p_name] 53 | 54 | def search(node, critereon): 55 | """ 56 | Traverse the Theano graph starting at `node` and return a list of all nodes 57 | which match the `critereon` function. 
When optimizing a cost function, you 58 | can use this to get a list of all of the trainable params in the graph, like 59 | so: 60 | 61 | `lib.search(cost, lambda x: hasattr(x, "param"))` 62 | or 63 | `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)` 64 | """ 65 | 66 | def _search(node, critereon, visited): 67 | if node in visited: 68 | return [] 69 | visited.add(node) 70 | 71 | results = [] 72 | if isinstance(node, T.Apply): 73 | for inp in node.inputs: 74 | results += _search(inp, critereon, visited) 75 | else: # Variable node 76 | if critereon(node): 77 | results.append(node) 78 | if node.owner is not None: 79 | results += _search(node.owner, critereon, visited) 80 | return results 81 | 82 | return _search(node, critereon, set()) 83 | 84 | def floatX(x): 85 | """ 86 | Convert `x` to the numpy type specified in `theano.config.floatX`. 87 | """ 88 | if theano.config.floatX == 'float16': 89 | return numpy.float16(x) 90 | elif theano.config.floatX == 'float32': 91 | return numpy.float32(x) 92 | else: # Theano's default float type is float64 93 | print "Warning: lib.floatX using float64" 94 | return numpy.float64(x) 95 | 96 | def save_params(path): 97 | param_vals = {} 98 | for name, param in _params.iteritems(): 99 | param_vals[name] = param.get_value() 100 | 101 | with open(path, 'wb') as f: 102 | pickle.dump(param_vals, f) 103 | 104 | def load_params(path): 105 | with open(path, 'rb') as f: 106 | param_vals = pickle.load(f) 107 | 108 | for name, val in param_vals.iteritems(): 109 | _params[name].set_value(val) 110 | 111 | def clear_all_params(): 112 | to_delete = [p_name for p_name in _params] 113 | for p_name in to_delete: 114 | del _params[p_name] 115 | 116 | def ensure_dir(dirname): 117 | """ 118 | Ensure that a named directory exists; if it does not, attempt to create it. 119 | """ 120 | try: 121 | os.makedirs(dirname) 122 | except OSError, e: 123 | if e.errno != errno.EEXIST: 124 | raise 125 | 126 | __model_setting_file_name = 'model_settings.txt' 127 | def print_model_settings(locals_var, path=None, sys_arg=False): 128 | """ 129 | Prints all variables in upper case in locals_var, 130 | except for T which usually stands for theano.tensor. 131 | If locals() passed as input to this method, will print 132 | all the variables in upper case defined so far, that is 133 | model settings. 134 | 135 | With `path` as an address to a directory it will _append_ it 136 | as a file named `model_settings.txt` as well. 137 | 138 | With `sys_arg` set to True, log information about Python, Numpy, 139 | and Theano and passed arguments to the script will be added too. 140 | args.pkl would be overwritten, specially in case of resuming a job. 141 | But again that wouldn't be much of a problem as all the passed args 142 | to the script except for '--resume' should be the same. 143 | 144 | With both `path` and `sys_arg` passed, dumps the theano.config. 
145 | 146 | :usage: 147 | >>> import theano.tensor as T 148 | >>> import lib 149 | >>> BATCH_SIZE, DIM = 128, 512 150 | >>> DATA_PATH = '/Path/to/dataset' 151 | >>> lib.print_model_settings(locals(), path='./') 152 | """ 153 | log = "" 154 | if sys_arg: 155 | try: 156 | log += "Python:\n" 157 | log += "\tsys.version_info\t{}\n".format(str(sys.version_info)) 158 | log += "Numpy:\n" 159 | log += "\t.__version__\t{}\n".format(numpy.__version__) 160 | log += "Theano:\n" 161 | log += "\t.__version__\t{}\n".format(theano.__version__) 162 | log += "\n\nAll passed args:\n" 163 | log += str(sys.argv) 164 | log += "\n" 165 | except: 166 | print "Something went wrong during sys_arg logging. Continue anyway!" 167 | 168 | log += "\nModel settings:" 169 | all_vars = [(k,v) for (k,v) in locals_var.items() if (k.isupper() and k != 'T')] 170 | all_vars = sorted(all_vars, key=lambda x: x[0]) 171 | for var_name, var_value in all_vars: 172 | log += ("\n\t%-20s %s" % (var_name, var_value)) 173 | print log 174 | if path is not None: 175 | ensure_dir(path) 176 | # Don't override, just append if by mistake there is something in the file. 177 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 178 | f.write(log) 179 | if sys_arg: 180 | with open(os.path.join(path, 'th_conf.txt'), 'a+') as f: 181 | f.write(str(theano.config)) 182 | with open(os.path.join(path, 'args.pkl'), 'wb') as f: 183 | pickle.dump(sys.argv, f) 184 | # To load: 185 | # >>> import cPickle as pickle 186 | # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb')) 187 | 188 | def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True): 189 | """ 190 | Default criterion: 191 | lambda x: hasattr(x, 'param') and x.param==True 192 | This will return every parameter for cost from computation graph. 193 | 194 | To exclude a parameter, just set 'param' to False: 195 | >>> h0 = lib.param('h0',\ 196 | numpy.zeros((3, 2*512), dtype=theano.config.floatX)) 197 | >>> print h0.param # Default: True 198 | >>> h0.param = False 199 | 200 | In this case one still can get list of all params (False or True) by: 201 | >>> lib.get_params(cost, lambda x: hasattr(x, 'param') 202 | 203 | :returns: 204 | A list of params 205 | """ 206 | return search(cost, criterion) 207 | 208 | def print_params_info(params, path=None): 209 | """ 210 | Print information about the parameters in the given param set. 211 | 212 | With `path` as an address to a directory it will _append_ it 213 | as a file named `model_settings.txt` as well. 214 | 215 | :usage: 216 | >>> params = lib.get_params(cost) 217 | >>> lib.print_params_info(params, path='./') 218 | """ 219 | params = sorted(params, key=lambda p: p.name) 220 | values = [p.get_value(borrow=True) for p in params] 221 | shapes = [p.shape for p in values] 222 | total_param_count = 0 223 | multiply_all = lambda a, b: a*b 224 | log = "\nParams for cost:" 225 | for param, value, shape in zip(params, values, shapes): 226 | log += ("\n\t%-20s %s" % (shape, param.name)) 227 | total_param_count += reduce(multiply_all, shape) 228 | 229 | log += "\nTotal parameter count for this cost:\n\t{0}".format( 230 | locale.format("%d", total_param_count, grouping=True) 231 | ) 232 | print log 233 | 234 | if path is not None: 235 | ensure_dir(path) 236 | # Don't override, just append if by mistake there is something in the file. 
237 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 238 | f.write(log) 239 | 240 | __train_log_file_name = 'train_log.pkl' 241 | def save_training_info(values, path): 242 | """ 243 | Gets a set of values as dictionary and append them to a log file. 244 | stores in /train_log.pkl 245 | """ 246 | file_name = os.path.join(path, __train_log_file_name) 247 | try: 248 | with open(file_name, "rb") as f: 249 | log = pickle.load(f) 250 | except IOError: # first time 251 | log = {} 252 | for k in values.keys(): 253 | log[k] = [] 254 | for k, v in values.items(): 255 | log[k].append(v) 256 | with open(file_name, "wb") as f: 257 | pickle.dump(log, f) 258 | 259 | resume_key = 'last resume index' 260 | def resumable(path, 261 | iter_key='iter', 262 | epoch_key='epoch', 263 | add_resume_counter=True, 264 | other_keys=[]): 265 | """ 266 | :warning: 267 | This is a naive implementation of resuming a training session 268 | and does not save and reload the training loop. The serialization 269 | of training loop and everything is costly and error-prone. 270 | 271 | :todo: 272 | - Save and load a serializable training loop. (See warning above) 273 | - Heavily dependent on the "model" file and the names used there right 274 | now. It's really easy to miss anything. 275 | 276 | `path` should be pointing at the root directory where `train_log.pkl` 277 | (See __train_log_file_name) and `params/` reside. 278 | 279 | Always assuming all the values in the log dictionary (except `resume_key`), 280 | are lists with the same length. 281 | """ 282 | file_name = os.path.join(path, __train_log_file_name) 283 | # Raise error if does not exists. 284 | with open(file_name, "rb") as f: 285 | log = pickle.load(f) 286 | 287 | param_found = False 288 | res_path = os.path.join(path, 'params', 'params_e{}_i{}*.pkl') 289 | for reverse_idx in range(-1, -len(log[epoch_key])-1, -1): 290 | ep, it = log[epoch_key][reverse_idx], log[iter_key][reverse_idx] 291 | print "> Params file for epoch {} iter {}".format(ep, it), 292 | last_path = glob.glob(res_path.format(ep, it)) 293 | if len(last_path) == 1: 294 | res_path = last_path[0] 295 | param_found = True 296 | print "found." 297 | break 298 | elif len(last_path) == 0: 299 | print "[NOT FOUND]. FALLING BACK TO..." 300 | else: # > 1 301 | # choose one, warning, rare 302 | print "[multiple version found]:" 303 | for l_path in last_path: 304 | print l_path 305 | res_path = last_path[0] 306 | param_found = True 307 | print "Arbitrarily choosing first:\n\t{}".format(res_path) 308 | 309 | assert 'reverse_idx' in locals(), 'Empty train_log???\n{}'.format(log) 310 | # Finishing for loop with no success 311 | assert param_found, 'No matching params file with train_log' 312 | 313 | acceptable_len = reverse_idx+len(log[epoch_key])+1 314 | if acceptable_len != len(log[epoch_key]): 315 | # Backup of the old train_log 316 | with open(file_name+'.backup', 'wb') as f: 317 | pickle.dump(log, f) 318 | 319 | # Change the log file to match the last existing checkpoint. 320 | for k, v in log.items(): 321 | # Fix resume indices 322 | if k == resume_key: 323 | log[k] = [i for i in log[k] if i < acceptable_len] 324 | continue 325 | # Rest is useless with no param file. 
326 | log[k] = v[:acceptable_len] 327 | 328 | epochs = log[epoch_key] 329 | iters = log[iter_key] 330 | 331 | if add_resume_counter: 332 | resume_val = len(epochs) 333 | if not resume_key in log.keys(): 334 | log[resume_key] = [resume_val] 335 | else: 336 | if log[resume_key] == [] or log[resume_key][-1] != resume_val: 337 | log[resume_key].append(resume_val) 338 | with open(file_name, "wb") as f: 339 | pickle.dump(log, f) 340 | 341 | last_epoch = epochs[-1] 342 | last_iter = iters[-1] 343 | 344 | # The if-else statement is more readable than `next`: 345 | #iters_to_consume = next((last_iter%(i-1) for (e, i) in\ 346 | # zip(epochs, iters) if e == 1), last_iter) 347 | if last_epoch == 0: 348 | iters_to_consume = last_iter 349 | else: 350 | for e, i in zip(epochs, iters): 351 | # first time. Epoch turns from 0 to 1. 352 | # At the end of each `epoch` there should be 353 | # a monitoring step so it will gives number 354 | # number of iterations per epoch 355 | if e == 1: 356 | iters_per_epoch = i - 1 357 | break 358 | iters_to_consume = last_iter % iters_per_epoch 359 | 360 | last_other_keys = [log[k][-1] for k in other_keys] 361 | return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys 362 | 363 | def plot_traing_info(x, ylist, path): 364 | """ 365 | Loads log file and plot x and y values as provided by input. 366 | Saves as /train_log.png 367 | """ 368 | file_name = os.path.join(path, __train_log_file_name) 369 | try: 370 | with open(file_name, "rb") as f: 371 | log = pickle.load(f) 372 | except IOError: # first time 373 | warnings.warn("There is no {} file here!!!".format(file_name)) 374 | return 375 | plt.figure() 376 | x_vals = log[x] 377 | for y in ylist: 378 | y_vals = log[y] 379 | if len(y_vals) != len(x_vals): 380 | warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x)) 381 | plt.plot(x_vals, y_vals, label=y) 382 | # assert len(y_vals) == len(x_vals), "not the same len" 383 | plt.xlabel(x) 384 | plt.legend() 385 | #plt.show() 386 | plt.savefig(file_name[:-3]+'png', bbox_inches='tight') 387 | plt.close('all') 388 | 389 | def create_logging_folders(path): 390 | """ 391 | Handle structure of folders and naming here instead of training file. 392 | 393 | :todo: 394 | - Implement! 395 | """ 396 | pass 397 | 398 | def tv(var): 399 | """ 400 | :todo: 401 | - add tv() function for theano variables so that instead of calling 402 | x.tag.test_value, you can get the same thing just by calling the method 403 | in a faster way... 404 | - also for x.tag.test_value.shape 405 | """ 406 | # Based on EAFP (easier to ask for forgiveness than permission) 407 | try: 408 | return var.tag.test_value 409 | except AttributeError: 410 | print "NONE, test_value has not been set." 411 | import ipdb; ipdb.set_trace() 412 | 413 | ## Rather than LBYL (look before you leap) 414 | #if hasattr(var, 'tag'): 415 | # if hasattr(var.tag, 'test_value'): 416 | # return var.tag.test_value 417 | # else: 418 | # print "NONE, test_value has not set." 419 | # import ipdb; ipdb.set_trace() 420 | #else: 421 | # print "NONE, tag has not set." 422 | # import ipdb; ipdb.set_trace() 423 | 424 | def tvs(var): 425 | """ 426 | :returns: 427 | var.tag.test_value.shape 428 | """ 429 | return tv(var).shape 430 | 431 | def _is_symbolic(v): 432 | r"""Return `True` if any of the arguments are symbolic. 
433 | See: 434 | https://github.com/Theano/Theano/wiki/Cookbook 435 | """ 436 | symbolic = False 437 | v = list(v) 438 | for _container, _iter in [(v, xrange(len(v)))]: 439 | for _k in _iter: 440 | _v = _container[_k] 441 | if isinstance(_v, theano.gof.Variable): 442 | symbolic = True 443 | return symbolic 444 | 445 | def unique_list(inp_list): 446 | """ 447 | returns a list with unique values of inp_list. 448 | :usage: 449 | >>> inp_list = ['a', 'b', 'c'] 450 | >>> unique_inp_list = unique_list(inp_list*2) 451 | """ 452 | return list(set(inp_list)) 453 | -------------------------------------------------------------------------------- /models/one_tier/wavent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | WaveNets Audio Generation Model 4 | 5 | How-to-run example: 6 | 7 | sampleRNN$ 8 | THEANO_FLAGS=mode=FAST_RUN,device=gpu1,floatX=float32,lib.cnmem=.95 python models/one_tier/wavent.py --dim 64 --q_levels 256 --q_type linear --which_set MUSIC --batch_size 8 --wavenet_blocks 4 --dilation_layers_per_block 10 --sequence_len_to_train 1600 9 | """ 10 | import time 11 | from datetime import datetime 12 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 13 | exp_start = time.time() 14 | 15 | import os, sys 16 | sys.path.insert(1, os.getcwd()) 17 | import argparse 18 | 19 | import numpy 20 | numpy.random.seed(123) 21 | np = numpy 22 | import random 23 | random.seed(123) 24 | 25 | import theano 26 | import theano.tensor as T 27 | import theano.ifelse 28 | import lasagne 29 | import scipy.io.wavfile 30 | 31 | import lib 32 | 33 | 34 | ### Parsing passed args/hyperparameters ### 35 | def get_args(): 36 | def t_or_f(arg): 37 | ua = str(arg).upper() 38 | if 'TRUE'.startswith(ua): 39 | return True 40 | elif 'FALSE'.startswith(ua): 41 | return False 42 | else: 43 | raise ValueError('Arg is neither `True` nor `False`') 44 | 45 | def check_non_negative(value): 46 | ivalue = int(value) 47 | if ivalue < 0: 48 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 49 | return ivalue 50 | 51 | def check_positive(value): 52 | ivalue = int(value) 53 | if ivalue < 1: 54 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 55 | return ivalue 56 | 57 | def check_unit_interval(value): 58 | fvalue = float(value) 59 | if fvalue < 0 or fvalue > 1: 60 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 61 | return fvalue 62 | 63 | # No default value here. Indicate every single arguement. 64 | parser = argparse.ArgumentParser( 65 | description='two_tier.py\nNo default value! Indicate every argument.') 66 | 67 | # Hyperparameter arguements: 68 | parser.add_argument('--exp', help='Experiment name', 69 | type=str, required=False, default='_') 70 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 71 | type=check_positive, required=True) 72 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 73 | type=check_positive, required=True) 74 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. 
With mu-/a-law quantization level shoud be set as 256',\ 75 | choices=['linear', 'a-law', 'mu-law'], required=True) 76 | #parser.add_argument('--nll_coeff', help='Value of alpha in [0, 1] for cost=alpha*NLL+(1-alpha)*FFT_cost',\ 77 | # type=check_unit_interval, required=True) 78 | parser.add_argument('--which_set', help='ONOM, BLIZZ, or MUSIC', 79 | choices=['ONOM', 'BLIZZ', 'MUSIC', 'HUCK'], required=True) 80 | parser.add_argument('--batch_size', help='size of mini-batch', 81 | type=check_positive, choices=[8, 16, 32, 64, 128, 256], required=True) 82 | parser.add_argument('--wavenet_blocks', help='Number of wavnet blocks to use', 83 | type=check_positive, required=True) 84 | parser.add_argument('--dilation_layers_per_block', help='number of dilation layers per block', 85 | type=check_positive, required=True) 86 | 87 | parser.add_argument('--sequence_len_to_train', help='size of output map', 88 | type=check_positive, required=True) 89 | 90 | parser.add_argument('--debug', help='debug mode', required=False, default=False, action='store_true') 91 | 92 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 93 | required=False, default=False, action='store_true') 94 | 95 | args = parser.parse_args() 96 | 97 | # Create tag for this experiment based on passed args 98 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 99 | print "Created experiment tag for these args:" 100 | print tag 101 | 102 | return args, tag 103 | 104 | args, tag = get_args() 105 | 106 | # N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 107 | OVERLAP = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1# How many samples per frame 108 | #GLOBAL_NORM = args.global_norm 109 | DIM = args.dim # Model dimensionality. 110 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 111 | Q_TYPE = args.q_type # log- or linear-scale 112 | #NLL_COEFF = args.nll_coeff 113 | WHICH_SET = args.which_set 114 | BATCH_SIZE = args.batch_size 115 | #DATA_PATH = args.data_path 116 | 117 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 118 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 119 | 120 | # Fixed hyperparams 121 | GRAD_CLIP = 1 # Elementwise grad clip threshold 122 | BITRATE = 16000 123 | 124 | # Other constants 125 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 126 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 127 | #TRAIN_MODE = 'time-iters' 128 | # To use PRINT_TIME for validation, 129 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 130 | #TRAIN_MODE = 'iters-time' 131 | # To use PRINT_ITERS for validation, 132 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 133 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 134 | STOP_ITERS = 100000 # Stop after this many iterations 135 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 136 | STOP_TIME = 60*60*60 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 137 | N_SEQS = 10 # Number of samples to generate every time monitoring. 
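# Worked example for the how-to-run settings in the docstring at the top of
# this file (--wavenet_blocks 4 --dilation_layers_per_block 10):
#   OVERLAP = (2**10 - 1)*4 + 1 = 4093 samples of left context (the model's
#   receptive field), i.e. roughly 0.26 seconds at BITRATE = 16000.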
138 | FOLDER_PREFIX = os.path.join('results_wavenets', tag) 139 | SEQ_LEN = args.sequence_len_to_train # Total length (# of samples) of each truncated BPTT sequence 140 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 141 | 142 | LEARNING_RATE = lib.floatX(numpy.float32(0.0001)) 143 | RESUME = args.resume 144 | 145 | epoch_str = 'epoch' 146 | iter_str = 'iter' 147 | lowest_valid_str = 'lowest valid cost' 148 | corresp_test_str = 'correponding test cost' 149 | train_nll_str, valid_nll_str, test_nll_str = \ 150 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 151 | 152 | if args.debug: 153 | import warnings 154 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 155 | TRAIN_MODE = 'time-iters' 156 | PRINT_TIME = 100 157 | STOP_TIME = 300 158 | STOP_ITERS = 1000 159 | 160 | ### Create directories ### 161 | # FOLDER_PREFIX: root, contains: 162 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 163 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 164 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 165 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 166 | 167 | if not os.path.exists(FOLDER_PREFIX): 168 | os.makedirs(FOLDER_PREFIX) 169 | 170 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 171 | 172 | if not os.path.exists(PARAMS_PATH): 173 | os.makedirs(PARAMS_PATH) 174 | 175 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 176 | 177 | if not os.path.exists(SAMPLES_PATH): 178 | os.makedirs(SAMPLES_PATH) 179 | 180 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 181 | 182 | if not os.path.exists(BEST_PATH): 183 | os.makedirs(BEST_PATH) 184 | 185 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 186 | 187 | ### Creating computation graph ### 188 | 189 | def create_wavenet_block(inp, num_dilation_layer, input_dim, output_dim, name =None): 190 | assert name is not None 191 | layer_out = inp 192 | skip_contrib = [] 193 | skip_weights = lib.param(name+".parametrized_weights", lib.floatX(numpy.ones((num_dilation_layer,)))) 194 | for i in range(num_dilation_layer): 195 | layer_out, skip_c = lib.ops.dil_conv_1D( 196 | layer_out, 197 | output_dim, 198 | input_dim if i == 0 else output_dim, 199 | 2, 200 | dilation = 2**i, 201 | non_linearity = 'gated', 202 | name = name+".dilation_{}".format(i+1) 203 | ) 204 | skip_c = skip_c*skip_weights[i] 205 | 206 | skip_contrib.append(skip_c) 207 | 208 | skip_out = skip_contrib[-1] 209 | 210 | j = 0 211 | for i in range(num_dilation_layer-1): 212 | j += 2**(num_dilation_layer-i-1) 213 | skip_out = skip_out + skip_contrib[num_dilation_layer-2 - i][:,j:] 214 | 215 | return layer_out, skip_out 216 | 217 | def create_model(inp): 218 | out = (inp.astype(theano.config.floatX)/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5)) 219 | l_out = out.dimshuffle(0,1,'x') 220 | 221 | skips = [] 222 | for i in range(args.wavenet_blocks): 223 | l_out, skip_out = create_wavenet_block(l_out, args.dilation_layers_per_block, 1 if i == 0 else args.dim, args.dim, name = "block_{}".format(i+1)) 224 | skips.append(skip_out) 225 | 226 | out = skips[-1] 227 | 228 | for i in range(args.wavenet_blocks - 1): 229 | out = out + skips[args.wavenet_blocks - 2 - i][:,(2**args.dilation_layers_per_block - 1)*(i+1):] 230 | 231 | for i in range(3): 232 | out = lib.ops.conv1d("out_{}".format(i+1), out, args.dim, args.dim, 1, non_linearity='relu') 233 | 234 | out = lib.ops.conv1d("final", out, args.dim, args.q_levels, 1, non_linearity='identity') 235 | 236 | 
return out 237 | 238 | sequences = T.imatrix('sequences') 239 | h0 = T.tensor3('h0') 240 | reset = T.iscalar('reset') 241 | mask = T.matrix('mask') 242 | 243 | if args.debug: 244 | # Solely for debugging purposes. 245 | # Maybe I should set the compute_test_value=warn from here. 246 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN), dtype='int32') 247 | 248 | input_sequences = sequences[:, :-1] 249 | target_sequences = sequences[:, (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1:] 250 | 251 | target_mask = mask[:, (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1:] 252 | 253 | output = create_model(input_sequences) 254 | 255 | cost = T.nnet.categorical_crossentropy( 256 | T.nnet.softmax(output.reshape((-1, Q_LEVELS))), 257 | target_sequences.flatten() 258 | ) 259 | 260 | cost = cost.reshape(target_sequences.shape) 261 | cost = cost * target_mask 262 | # Don't use these lines; could end up with NaN 263 | # Specially at the end of audio files where mask is 264 | # all zero for some of the shorter files in mini-batch. 265 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 266 | #cost = cost.mean(axis=0) 267 | 268 | # Use this one instead. 269 | cost = cost.sum() 270 | cost = cost / target_mask.sum() 271 | 272 | # By default we report cross-entropy cost in bits. 273 | # Switch to nats by commenting out this line: 274 | # log_2(e) = 1.44269504089 275 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 276 | 277 | ### Getting the params, grads, updates, and Theano functions ### 278 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 279 | lib.print_params_info(params, path=FOLDER_PREFIX) 280 | 281 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 282 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 283 | 284 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 285 | 286 | # Training function 287 | train_fn = theano.function( 288 | [sequences, mask], 289 | cost, 290 | updates=updates, 291 | on_unused_input='warn' 292 | ) 293 | 294 | # Validation and Test function 295 | test_fn = theano.function( 296 | [sequences, mask], 297 | cost, 298 | on_unused_input='warn' 299 | ) 300 | 301 | # Sampling at frame level 302 | generate_fn = theano.function( 303 | [sequences], 304 | lib.ops.softmax_and_sample(output), 305 | on_unused_input='warn' 306 | ) 307 | 308 | 309 | def generate_and_save_samples(tag): 310 | def write_audio_file(name, data): 311 | data = data.astype('float32') 312 | data -= data.min() 313 | data /= data.max() 314 | data -= 0.5 315 | data *= 0.95 316 | scipy.io.wavfile.write( 317 | os.path.join(SAMPLES_PATH, name+'.wav'), 318 | BITRATE, 319 | data) 320 | 321 | total_time = time.time() 322 | # Generate N_SEQS' sample files, each 5 seconds long 323 | N_SECS = 5 324 | LENGTH = N_SECS*BITRATE 325 | 326 | if args.debug: 327 | LENGTH = 1024 328 | 329 | num_prev_samples_to_use = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1 330 | 331 | samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use), dtype='int32') 332 | samples[:, :num_prev_samples_to_use] = Q_ZERO 333 | 334 | for t in range(LENGTH): 335 | samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = generate_fn(samples[:, t:t + num_prev_samples_to_use+1]) 336 | if (t > 2*BITRATE) and( t < 3*BITRATE): 337 | samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = Q_ZERO 338 | 339 | total_time = time.time() - total_time 340 | log = "{} samples of {} seconds 
length generated in {} seconds." 341 | log = log.format(N_SEQS, N_SECS, total_time) 342 | print log, 343 | 344 | for i in xrange(N_SEQS): 345 | samp = samples[i, num_prev_samples_to_use: ] 346 | if Q_TYPE == 'mu-law': 347 | from datasets.dataset import mu2linear 348 | samp = mu2linear(samp) 349 | elif Q_TYPE == 'a-law': 350 | raise NotImplementedError('a-law is not implemented') 351 | write_audio_file("sample_{}_{}".format(tag, i), samp) 352 | 353 | ### Import the data_feeder ### 354 | # Handling WHICH_SET 355 | if WHICH_SET == 'ONOM': 356 | from datasets.dataset import onom_train_feed_epoch as train_feeder 357 | from datasets.dataset import onom_valid_feed_epoch as valid_feeder 358 | from datasets.dataset import onom_test_feed_epoch as test_feeder 359 | elif WHICH_SET == 'BLIZZ': 360 | from datasets.dataset import blizz_train_feed_epoch as train_feeder 361 | from datasets.dataset import blizz_valid_feed_epoch as valid_feeder 362 | from datasets.dataset import blizz_test_feed_epoch as test_feeder 363 | elif WHICH_SET == 'MUSIC': 364 | from datasets.dataset import music_train_feed_epoch as train_feeder 365 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 366 | from datasets.dataset import music_test_feed_epoch as test_feeder 367 | elif WHICH_SET == 'HUCK': 368 | from datasets.dataset import huck_train_feed_epoch as train_feeder 369 | from datasets.dataset import huck_valid_feed_epoch as valid_feeder 370 | from datasets.dataset import huck_test_feed_epoch as test_feeder 371 | 372 | 373 | def monitor(data_feeder): 374 | """ 375 | Cost and time of test_fn on a given dataset section. 376 | Pass only one of `valid_feeder` or `test_feeder`. 377 | Don't pass `train_feed`. 378 | 379 | :returns: 380 | Mean cost over the input dataset (data_feeder) 381 | Total time spent 382 | """ 383 | _total_time = 0. 384 | _costs = [] 385 | _data_feeder = data_feeder(BATCH_SIZE, 386 | SEQ_LEN, 387 | OVERLAP, 388 | Q_LEVELS, 389 | Q_ZERO, 390 | Q_TYPE) 391 | 392 | for _seqs, _reset, _mask in _data_feeder: 393 | _start_time = time.time() 394 | _cost = test_fn(_seqs, _mask) 395 | _total_time += time.time() - _start_time 396 | 397 | _costs.append(_cost) 398 | 399 | return numpy.mean(_costs), _total_time 400 | 401 | 402 | print "Wall clock time spent before training started: {:.2f}h"\ 403 | .format((time.time()-exp_start)/3600.) 404 | print "Training!" 405 | total_iters = 0 406 | total_time = 0. 407 | last_print_time = 0. 408 | last_print_iters = 0 409 | costs = [] 410 | lowest_valid_cost = numpy.finfo(numpy.float32).max 411 | corresponding_test_cost = numpy.finfo(numpy.float32).max 412 | new_lowest_cost = False 413 | end_of_batch = False 414 | epoch = 0 # Important for mostly other datasets rather than Blizz 415 | 416 | # Initial load train dataset 417 | tr_feeder = train_feeder(BATCH_SIZE, 418 | SEQ_LEN, 419 | OVERLAP, 420 | Q_LEVELS, 421 | Q_ZERO, 422 | Q_TYPE) 423 | 424 | 425 | 426 | if RESUME: 427 | # Check if checkpoint from previous run is not corrupted. 428 | # Then overwrite some of the variables above. 429 | iters_to_consume, res_path, epoch, total_iters,\ 430 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 431 | lib.resumable(path=FOLDER_PREFIX, 432 | iter_key=iter_str, 433 | epoch_key=epoch_str, 434 | add_resume_counter=True, 435 | other_keys=[lowest_valid_str, 436 | corresp_test_str, 437 | test_nll_str]) 438 | # At this point we saved the pkl file. 
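    # Why the loop below replays minibatches instead of seeking: the feeder
    # is a plain Python generator whose position cannot be checkpointed, so
    # (assuming it yields batches in the same order on every run) the
    # simplest way to resume mid-epoch is to consume and discard
    # iters_to_consume batches until the stream is back where training
    # stopped.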
439 | last_print_iters = total_iters 440 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 441 | # Consume this many iterations to fast-forward to the last position in the training data. 442 | consume_time = time.time() 443 | for i in xrange(iters_to_consume): 444 | tr_feeder.next() 445 | consume_time = time.time() - consume_time 446 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 447 | format(consume_time, iters_to_consume) 448 | 449 | lib.load_params(res_path) 450 | print "Parameters from last available checkpoint loaded from path {}".format(res_path) 451 | 452 | test_time = 0.0 453 | 454 | while True: 455 | # THIS IS ONE ITERATION 456 | if total_iters % 500 == 0: 457 | print total_iters, 458 | 459 | total_iters += 1 460 | 461 | try: 462 | # Take the next mini-batch from the train set 463 | mini_batch = tr_feeder.next() 464 | except StopIteration: 465 | # The train feeder is exhausted, i.e. one full epoch is done. 466 | # Reload it from the start, 467 | tr_feeder = train_feeder(BATCH_SIZE, 468 | SEQ_LEN, 469 | OVERLAP, 470 | Q_LEVELS, 471 | Q_ZERO, 472 | Q_TYPE) 473 | 474 | # and start taking mini-batches again. 475 | mini_batch = tr_feeder.next() 476 | epoch += 1 477 | end_of_batch = True 478 | print "[Another epoch]", 479 | 480 | seqs, reset, mask = mini_batch 481 | 482 | 483 | ## Debugging leftovers (kept commented out; safe to remove): 484 | # print seqs.shape 485 | # targ = generate_fn(seqs) 486 | # print targ.shape 487 | ##### 488 | 489 | start_time = time.time() 490 | cost = train_fn(seqs, mask) 491 | total_time += time.time() - start_time 492 | #print "This cost:", cost, "This h0.mean()", h0.mean() 493 | 494 | costs.append(cost) 495 | 496 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 497 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \ 498 | (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \ 499 | (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \ 500 | end_of_batch: 501 | print "\nValidation!", 502 | valid_cost, valid_time = monitor(valid_feeder) 503 | print "Done!" 504 | 505 | # Compute the test-set cost only when the validation cost has improved. 506 | if valid_cost < lowest_valid_cost: 507 | lowest_valid_cost = valid_cost 508 | print "\n>>> Best validation cost of {} reached. Testing!"\ 509 | .format(valid_cost), 510 | test_cost, test_time = monitor(test_feeder) 511 | print "Done!"
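            # Model-selection note: the test set is only evaluated at
            # checkpoints where the validation cost reaches a new minimum,
            # so corresponding_test_cost is an early-stopping-style estimate
            # rather than a quantity that was ever tuned against directly.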
512 | # Report last one which is the lowest on validation set: 513 | print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time) 514 | corresponding_test_cost = test_cost 515 | new_lowest_cost = True 516 | 517 | # Stdout the training progress 518 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 519 | print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n" 520 | print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n" 521 | print_info += "\tvalid cost:{:.4f}\ttotal time:{:.2f}h\n" 522 | print_info += "\ttest cost:{:.4f}\ttotal time:{:.2f}h" 523 | print_info = print_info.format(epoch, 524 | total_iters, 525 | (time.time()-exp_start)/3600, 526 | lowest_valid_cost, 527 | corresponding_test_cost, 528 | numpy.mean(costs), 529 | total_time/3600, 530 | total_time/total_iters, 531 | valid_cost, 532 | valid_time/3600, 533 | test_cost, 534 | test_time/3600) 535 | print print_info 536 | 537 | # Save and graph training progress 538 | x_axis_str = 'iter' 539 | train_nll_str, valid_nll_str, test_nll_str = \ 540 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 541 | training_info = {'epoch' : epoch, 542 | x_axis_str : total_iters, 543 | train_nll_str : numpy.mean(costs), 544 | valid_nll_str : valid_cost, 545 | test_nll_str : test_cost, 546 | 'lowest valid cost' : lowest_valid_cost, 547 | 'correponding test cost' : corresponding_test_cost, 548 | 'train time' : total_time, 549 | 'valid time' : valid_time, 550 | 'test time' : test_time, 551 | 'wall clock time' : time.time()-exp_start} 552 | lib.save_training_info(training_info, FOLDER_PREFIX) 553 | print "Train info saved!", 554 | 555 | y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str] 556 | lib.plot_traing_info(x_axis_str, y_axis_strs, FOLDER_PREFIX) 557 | print "Plotted!" 558 | 559 | # Generate and save samples 560 | print "Sampling!", 561 | tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}" 562 | tag = tag.format(epoch, 563 | total_iters, 564 | total_time/3600, 565 | numpy.mean(cost), 566 | valid_cost) 567 | tag += ("_best" if new_lowest_cost else "") 568 | # Generate samples 569 | generate_and_save_samples(tag) 570 | print "Done!" 571 | 572 | # Save params of model 573 | lib.save_params( 574 | os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag)) 575 | ) 576 | print "Params saved!" 577 | 578 | if total_iters-last_print_iters == PRINT_ITERS \ 579 | or total_time-last_print_time >= PRINT_TIME: 580 | # If we are here b/c of onom_end_of_batch, we shouldn't mess 581 | # with costs and last_print_iters 582 | costs = [] 583 | last_print_time += PRINT_TIME 584 | last_print_iters += PRINT_ITERS 585 | 586 | end_of_batch = False 587 | new_lowest_cost = False 588 | 589 | print "Validation Done!\nBack to Training..." 590 | 591 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 592 | (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \ 593 | ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \ 594 | (total_iters == STOP_ITERS or total_time >= STOP_TIME)): 595 | 596 | print "Done! 
Total iters:", total_iters, "Total time: ", total_time 597 | print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 598 | print "Wall clock time spent: {:.2f}h"\ 599 | .format((time.time()-exp_start)/3600) 600 | 601 | sys.exit() 602 | -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate32k.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguements: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='the directory name of the dataset' , 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='size of mini-batch', 110 | type=check_positive, choices=xrange(0, 129), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number wavs to generate',\ 120 | type=check_positive, required=True) 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | # NEW 126 | # Create tag for this experiment based on passed args 127 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 128 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 129 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 130 | tag = re.sub(r'_generate', "", tag) 131 | tag += '-lr'+str(LEARNING_RATE) 132 | print "Created experiment tag for these args:" 133 | print tag 134 | 135 | return args, tag 136 | 137 | args, tag = get_args() 138 | 139 | 140 | print "sup" 141 | 142 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 143 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 144 | WEIGHT_NORM = args.weight_norm 145 | EMB_SIZE = args.emb_size 146 | SKIP_CONN = args.skip_conn 147 | DIM = args.dim # Model dimensionality. 
148 | N_RNN = args.n_rnn # How many RNNs to stack 149 | RNN_TYPE = args.rnn_type 150 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 151 | LEARN_H0 = args.learn_h0 152 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 153 | Q_TYPE = args.q_type # log- or linear-scale 154 | WHICH_SET = args.which_set 155 | BATCH_SIZE = args.batch_size 156 | RESUME = args.resume 157 | N_SECS = args.n_secs 158 | N_SEQS = args.n_seqs 159 | 160 | 161 | print "hi" 162 | 163 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 164 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 165 | 166 | # Fixed hyperparams 167 | GRAD_CLIP = 1 # Elementwise grad clip threshold 168 | BITRATE = 32000 169 | 170 | # Other constants 171 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 172 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 173 | #TRAIN_MODE = 'time-iters' 174 | # To use PRINT_TIME for validation, 175 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 176 | #TRAIN_MODE = 'iters-time' 177 | # To use PRINT_ITERS for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 179 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 180 | STOP_ITERS = 100000 # Stop after this many iterations 181 | # TODO: 182 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 183 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 184 | # TODO: 185 | RESULTS_DIR = 'results_2t' 186 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 187 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 188 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 189 | 190 | 191 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 192 | 193 | 194 | epoch_str = 'epoch' 195 | iter_str = 'iter' 196 | lowest_valid_str = 'lowest valid cost' 197 | corresp_test_str = 'correponding test cost' 198 | train_nll_str, valid_nll_str, test_nll_str = \ 199 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 200 | 201 | if args.debug: 202 | import warnings 203 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 204 | TRAIN_MODE = 'time' 205 | PRINT_TIME = 100 206 | STOP_TIME = 3000 207 | STOP_ITERS = 1000 208 | 209 | ### Create directories ### 210 | # FOLDER_PREFIX: root, contains: 211 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 212 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 213 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 214 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 
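# FOLDER_PREFIX ends in the experiment tag built in get_args(). For
# reference, a rough sketch of the tag for the how-to-run flags in the
# module docstring (illustrative values, not a recorded run): sys.argv is
# concatenated, then '--' -> '-', '/' -> '-', True -> T, False -> F, and the
# -n_secs/-n_seqs/_generate pieces are stripped, giving roughly
#   models-two_tier-two_tier32k.py-expAXIS1-n_frames12-frame_size10
#   -weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F
#   -q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001
# so the results directory name encodes every hyperparameter, which is why
# resuming requires the same flags in the same order.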
215 | if not os.path.exists(FOLDER_PREFIX): 216 | os.makedirs(FOLDER_PREFIX) 217 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 218 | if not os.path.exists(PARAMS_PATH): 219 | os.makedirs(PARAMS_PATH) 220 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 221 | if not os.path.exists(SAMPLES_PATH): 222 | os.makedirs(SAMPLES_PATH) 223 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 224 | if not os.path.exists(BEST_PATH): 225 | os.makedirs(BEST_PATH) 226 | 227 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 228 | 229 | ### Import the data_feeder ### 230 | # Handling WHICH_SET 231 | from datasets.dataset import music_train_feed_epoch as train_feeder 232 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 233 | from datasets.dataset import music_test_feed_epoch as test_feeder 234 | 235 | def load_data(data_feeder): 236 | """ 237 | Helper function to deal with interface of different datasets. 238 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 239 | """ 240 | return data_feeder(WHICH_SET, BATCH_SIZE, 241 | SEQ_LEN, 242 | OVERLAP, 243 | Q_LEVELS, 244 | Q_ZERO, 245 | Q_TYPE) 246 | 247 | ### Creating computation graph ### 248 | def frame_level_rnn(input_sequences, h0, reset): 249 | """ 250 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 251 | h0.shape: (batch size, N_RNN, DIM) 252 | reset.shape: () 253 | 254 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 255 | """ 256 | frames = input_sequences.reshape(( 257 | input_sequences.shape[0], 258 | input_sequences.shape[1] // FRAME_SIZE, 259 | FRAME_SIZE 260 | )) 261 | 262 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 263 | # (a reasonable range to pass as inputs to the RNN) 264 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 265 | frames *= lib.floatX(2) 266 | # (128, 64, 4) 267 | 268 | # Initial state of RNNs 269 | learned_h0 = lib.param( 270 | 'FrameLevel.h0', 271 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 272 | ) 273 | # Handling LEARN_H0 274 | learned_h0.param = LEARN_H0 275 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 276 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 277 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 278 | 279 | # Handling RNN_TYPE 280 | # Handling SKIP_CONN 281 | if RNN_TYPE == 'GRU': 282 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 283 | N_RNN, 284 | FRAME_SIZE, 285 | DIM, 286 | frames, 287 | h0=h0, 288 | weightnorm=WEIGHT_NORM, 289 | skip_conn=SKIP_CONN) 290 | elif RNN_TYPE == 'LSTM': 291 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 292 | N_RNN, 293 | FRAME_SIZE, 294 | DIM, 295 | frames, 296 | h0=h0, 297 | weightnorm=WEIGHT_NORM, 298 | skip_conn=SKIP_CONN) 299 | 300 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 301 | output = lib.ops.Linear( 302 | 'FrameLevel.Output', 303 | DIM, 304 | FRAME_SIZE * DIM, 305 | rnns_out, 306 | initialization='he', 307 | weightnorm=WEIGHT_NORM 308 | ) 309 | # output: (2, 9, 4*dim) 310 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 311 | # output: (2, 9*4, dim) 312 | 313 | return (output, last_hidden) 314 | 315 | def sample_level_predictor(frame_level_outputs, prev_samples): 316 | """ 317 | batch size = BATCH_SIZE * SEQ_LEN 318 | SEQ_LEN = N_FRAMES * FRAME_SIZE 319 | 320 | frame_level_outputs.shape: (batch size, DIM) 321 | prev_samples.shape: (batch size, FRAME_SIZE) int32 322 | 323 | output.shape: (batch size, Q_LEVELS) 324 
| """ 325 | # Handling EMB_SIZE 326 | if EMB_SIZE == 0: 327 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 328 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 329 | last_out_shape = Q_LEVELS 330 | elif EMB_SIZE > 0: 331 | prev_samples = lib.ops.Embedding( 332 | 'SampleLevel.Embedding', 333 | Q_LEVELS, 334 | EMB_SIZE, 335 | prev_samples) 336 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 337 | last_out_shape = EMB_SIZE 338 | else: 339 | raise ValueError('EMB_SIZE cannot be negative.') 340 | 341 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 342 | 343 | out = lib.ops.Linear( 344 | 'SampleLevel.L1_PrevSamples', 345 | FRAME_SIZE * last_out_shape, 346 | DIM, 347 | prev_samples, 348 | biases=False, 349 | initialization='he', 350 | weightnorm=WEIGHT_NORM) 351 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 352 | 353 | out += frame_level_outputs 354 | # ^ (2*(9*4), dim) 355 | 356 | # L2 357 | out = lib.ops.Linear('SampleLevel.L2', 358 | DIM, 359 | DIM, 360 | out, 361 | initialization='he', 362 | weightnorm=WEIGHT_NORM) 363 | out = T.nnet.relu(out) 364 | 365 | # L3 366 | out = lib.ops.Linear('SampleLevel.L3', 367 | DIM, 368 | DIM, 369 | out, 370 | initialization='he', 371 | weightnorm=WEIGHT_NORM) 372 | out = T.nnet.relu(out) 373 | 374 | # Output 375 | # We apply the softmax later 376 | out = lib.ops.Linear('SampleLevel.Output', 377 | DIM, 378 | Q_LEVELS, 379 | out, 380 | weightnorm=WEIGHT_NORM) 381 | return out 382 | 383 | sequences = T.imatrix('sequences') 384 | h0 = T.tensor3('h0') 385 | reset = T.iscalar('reset') 386 | mask = T.matrix('mask') 387 | 388 | if args.debug: 389 | # Solely for debugging purposes. 390 | # Maybe I should set the compute_test_value=warn from here. 391 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 392 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 393 | reset.tag.test_value = numpy.array(1, dtype='int32') 394 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 395 | 396 | input_sequences = sequences[:, :-FRAME_SIZE] 397 | target_sequences = sequences[:, FRAME_SIZE:] 398 | 399 | target_mask = mask[:, FRAME_SIZE:] 400 | 401 | frame_level_outputs, new_h0 =\ 402 | frame_level_rnn(input_sequences, h0, reset) 403 | 404 | prev_samples = sequences[:, :-1] 405 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 406 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 407 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 408 | # (batch_size*n_frames*frame_size, frame_size) 409 | 410 | sample_level_outputs = sample_level_predictor( 411 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 412 | prev_samples, 413 | ) 414 | 415 | cost = T.nnet.categorical_crossentropy( 416 | T.nnet.softmax(sample_level_outputs), 417 | target_sequences.flatten() 418 | ) 419 | cost = cost.reshape(target_sequences.shape) 420 | cost = cost * target_mask 421 | # Don't use these lines; could end up with NaN 422 | # Specially at the end of audio files where mask is 423 | # all zero for some of the shorter files in mini-batch. 424 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 425 | #cost = cost.mean(axis=0) 426 | 427 | # Use this one instead. 428 | cost = cost.sum() 429 | cost = cost / target_mask.sum() 430 | 431 | # By default we report cross-entropy cost in bits. 
432 | # Switch to nats by commenting out this line: 433 | # log_2(e) = 1.44269504089 434 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 435 | 436 | ### Getting the params, grads, updates, and Theano functions ### 437 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 438 | lib.print_params_info(params, path=FOLDER_PREFIX) 439 | 440 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 441 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 442 | 443 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 444 | 445 | # Training function 446 | train_fn = theano.function( 447 | [sequences, h0, reset, mask], 448 | [cost, new_h0], 449 | updates=updates, 450 | on_unused_input='warn' 451 | ) 452 | 453 | # Validation and Test function, hence no updates 454 | test_fn = theano.function( 455 | [sequences, h0, reset, mask], 456 | [cost, new_h0], 457 | on_unused_input='warn' 458 | ) 459 | 460 | # Sampling at frame level 461 | frame_level_generate_fn = theano.function( 462 | [sequences, h0, reset], 463 | frame_level_rnn(sequences, h0, reset), 464 | on_unused_input='warn' 465 | ) 466 | 467 | # Sampling at audio sample level 468 | frame_level_outputs = T.matrix('frame_level_outputs') 469 | prev_samples = T.imatrix('prev_samples') 470 | sample_level_generate_fn = theano.function( 471 | [frame_level_outputs, prev_samples], 472 | lib.ops.softmax_and_sample( 473 | sample_level_predictor( 474 | frame_level_outputs, 475 | prev_samples, 476 | ) 477 | ), 478 | on_unused_input='warn' 479 | ) 480 | 481 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 482 | # to study the behaviour of the model and also to introduce some diversity 483 | # to samples in a simple way. [it's disabled for now] 484 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 485 | fixed_rand_h0 -= 0.5 486 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 487 | 488 | def generate_and_save_samples(tag, N_SECS=5): 489 | def write_audio_file(name, data): 490 | data = data.astype('float32') 491 | data -= data.min() 492 | data /= data.max() 493 | data -= 0.5 494 | data *= 0.95 495 | scipy.io.wavfile.write( 496 | os.path.join(SAMPLES_PATH, name+'.wav'), 497 | BITRATE, 498 | data) 499 | 500 | total_time = time() 501 | # Generate N_SEQS' sample files, each 5 seconds long 502 | LENGTH = N_SECS*BITRATE if not args.debug else 100 503 | 504 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 505 | samples[:, :FRAME_SIZE] = Q_ZERO 506 | 507 | # First half zero, others fixed random at each checkpoint 508 | h0 = numpy.zeros( 509 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 510 | dtype='float32' 511 | ) 512 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 513 | frame_level_outputs = None 514 | 515 | for t in xrange(FRAME_SIZE, LENGTH): 516 | 517 | if t % FRAME_SIZE == 0: 518 | frame_level_outputs, h0 = frame_level_generate_fn( 519 | samples[:, t-FRAME_SIZE:t], 520 | h0, 521 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 522 | numpy.int32(t == FRAME_SIZE) 523 | ) 524 | 525 | samples[:, t] = sample_level_generate_fn( 526 | frame_level_outputs[:, t % FRAME_SIZE], 527 | samples[:, t-FRAME_SIZE:t], 528 | ) 529 | 530 | total_time = time() - total_time 531 | log = "{} samples of {} seconds length generated in {} seconds." 
532 | log = log.format(N_SEQS, N_SECS, total_time) 533 | print log 534 | 535 | for i in xrange(N_SEQS): 536 | samp = samples[i] 537 | if Q_TYPE == 'mu-law': 538 | from datasets.dataset import mu2linear 539 | samp = mu2linear(samp) 540 | elif Q_TYPE == 'a-law': 541 | raise NotImplementedError('a-law is not implemented') 542 | 543 | now = datetime.datetime.now() 544 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 545 | 546 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 547 | print "writing...", file_name 548 | write_audio_file(file_name, samp) 549 | 550 | 551 | 552 | def monitor(data_feeder): 553 | """ 554 | Cost and time of test_fn on a given dataset section. 555 | Pass only one of `valid_feeder` or `test_feeder`. 556 | Don't pass `train_feed`. 557 | 558 | :returns: 559 | Mean cost over the input dataset (data_feeder) 560 | Total time spent 561 | """ 562 | _total_time = time() 563 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 564 | _costs = [] 565 | _data_feeder = load_data(data_feeder) 566 | for _seqs, _reset, _mask in _data_feeder: 567 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 568 | _costs.append(_cost) 569 | 570 | return numpy.mean(_costs), time() - _total_time 571 | 572 | print "Wall clock time spent before training started: {:.2f}h"\ 573 | .format((time()-exp_start)/3600.) 574 | print "Training!" 575 | total_iters = 0 576 | total_time = 0. 577 | last_print_time = 0. 578 | last_print_iters = 0 579 | costs = [] 580 | lowest_valid_cost = numpy.finfo(numpy.float32).max 581 | corresponding_test_cost = numpy.finfo(numpy.float32).max 582 | new_lowest_cost = False 583 | end_of_batch = False 584 | epoch = 0 585 | 586 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 587 | 588 | # Initial load train dataset 589 | tr_feeder = load_data(train_feeder) 590 | 591 | ### Handling the resume option: 592 | if True: #if Resume: 593 | # Check if checkpoint from previous run is not corrupted. 594 | # Then overwrite some of the variables above. 595 | iters_to_consume, res_path, epoch, total_iters,\ 596 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 597 | lib.resumable(path=FOLDER_PREFIX, 598 | iter_key=iter_str, 599 | epoch_key=epoch_str, 600 | add_resume_counter=True, 601 | other_keys=[lowest_valid_str, 602 | corresp_test_str, 603 | test_nll_str]) 604 | # At this point we saved the pkl file. 605 | last_print_iters = total_iters 606 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 607 | # Consumes this much iters to get to the last point in training data. 608 | consume_time = time() 609 | for i in xrange(iters_to_consume): 610 | tr_feeder.next() 611 | consume_time = time() - consume_time 612 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 613 | format(consume_time, iters_to_consume) 614 | 615 | lib.load_params(res_path) 616 | print "Parameters from last available checkpoint loaded." 617 | 618 | 619 | 620 | # 2. Stdout the training progress 621 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 622 | print_info = print_info.format(epoch, 623 | total_iters, 624 | (time()-exp_start)/3600) 625 | print print_info 626 | 627 | tag = "e{}_i{}" 628 | tag = tag.format(epoch, 629 | total_iters) 630 | 631 | # 5. Generate and save samples (time consuming) 632 | # If not successful, we still have the params to sample afterward 633 | print "Sampling!", 634 | # Generate samples 635 | generate_and_save_samples(tag, N_SECS) 636 | print "Done!" 
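# A note on generate_and_save_samples above: the frame-level RNN runs only
# once every FRAME_SIZE timesteps (emitting FRAME_SIZE conditioning vectors
# at a time), while the cheap sample-level MLP runs once per audio sample;
# that split is what keeps two-tier sampleRNN generation tractable.
# write_audio_file min-max normalizes the samples to [0, 1], recenters to
# [-0.5, 0.5], and scales by 0.95 to leave a little headroom before writing
# a float32 WAV at BITRATE Hz.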
637 | 638 | print "Wall clock time spent: {:.2f}h"\ 639 | .format((time()-exp_start)/3600) 640 | 641 | sys.exit() -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate16k.py.ol: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguements: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='the directory name of the dataset' , 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='size of mini-batch', 110 | type=check_positive, choices=xrange(0, 999), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number wavs to generate',\ 120 | type=check_positive, required=True) 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | # NEW 126 | # Create tag for this experiment based on passed args 127 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 128 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 129 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 130 | tag = re.sub(r'_generate', "", tag) 131 | tag += '-lr'+str(LEARNING_RATE) 132 | print "Created experiment tag for these args:" 133 | print tag 134 | 135 | return args, tag 136 | 137 | args, tag = get_args() 138 | 139 | 140 | print "sup" 141 | 142 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 143 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 144 | WEIGHT_NORM = args.weight_norm 145 | EMB_SIZE = args.emb_size 146 | SKIP_CONN = args.skip_conn 147 | DIM = args.dim # Model dimensionality. 
148 | N_RNN = args.n_rnn # How many RNNs to stack 149 | RNN_TYPE = args.rnn_type 150 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 151 | LEARN_H0 = args.learn_h0 152 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 153 | Q_TYPE = args.q_type # log- or linear-scale 154 | WHICH_SET = args.which_set 155 | BATCH_SIZE = args.batch_size 156 | RESUME = args.resume 157 | N_SECS = args.n_secs 158 | N_SEQS = args.n_seqs 159 | 160 | 161 | print "hi" 162 | 163 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 164 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 165 | 166 | # Fixed hyperparams 167 | GRAD_CLIP = 1 # Elementwise grad clip threshold 168 | BITRATE = 16000 169 | 170 | # Other constants 171 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 172 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 173 | #TRAIN_MODE = 'time-iters' 174 | # To use PRINT_TIME for validation, 175 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 176 | #TRAIN_MODE = 'iters-time' 177 | # To use PRINT_ITERS for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 179 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 180 | STOP_ITERS = 100000 # Stop after this many iterations 181 | # TODO: 182 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 183 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 184 | # TODO: 185 | RESULTS_DIR = 'results_2t' 186 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 187 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 188 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 189 | 190 | 191 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 192 | 193 | 194 | epoch_str = 'epoch' 195 | iter_str = 'iter' 196 | lowest_valid_str = 'lowest valid cost' 197 | corresp_test_str = 'correponding test cost' 198 | train_nll_str, valid_nll_str, test_nll_str = \ 199 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 200 | 201 | if args.debug: 202 | import warnings 203 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 204 | TRAIN_MODE = 'time' 205 | PRINT_TIME = 100 206 | STOP_TIME = 3000 207 | STOP_ITERS = 1000 208 | 209 | ### Create directories ### 210 | # FOLDER_PREFIX: root, contains: 211 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 212 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 213 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 214 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 
215 | if not os.path.exists(FOLDER_PREFIX): 216 | os.makedirs(FOLDER_PREFIX) 217 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 218 | if not os.path.exists(PARAMS_PATH): 219 | os.makedirs(PARAMS_PATH) 220 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 221 | if not os.path.exists(SAMPLES_PATH): 222 | os.makedirs(SAMPLES_PATH) 223 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 224 | if not os.path.exists(BEST_PATH): 225 | os.makedirs(BEST_PATH) 226 | 227 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 228 | 229 | ### Import the data_feeder ### 230 | # Handling WHICH_SET 231 | from datasets.dataset import music_train_feed_epoch as train_feeder 232 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 233 | from datasets.dataset import music_test_feed_epoch as test_feeder 234 | 235 | def load_data(data_feeder): 236 | """ 237 | Helper function to deal with interface of different datasets. 238 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 239 | """ 240 | return data_feeder(WHICH_SET, BATCH_SIZE, 241 | SEQ_LEN, 242 | OVERLAP, 243 | Q_LEVELS, 244 | Q_ZERO, 245 | Q_TYPE) 246 | 247 | ### Creating computation graph ### 248 | def frame_level_rnn(input_sequences, h0, reset): 249 | """ 250 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 251 | h0.shape: (batch size, N_RNN, DIM) 252 | reset.shape: () 253 | 254 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 255 | """ 256 | frames = input_sequences.reshape(( 257 | input_sequences.shape[0], 258 | input_sequences.shape[1] // FRAME_SIZE, 259 | FRAME_SIZE 260 | )) 261 | 262 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 263 | # (a reasonable range to pass as inputs to the RNN) 264 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 265 | frames *= lib.floatX(2) 266 | # (128, 64, 4) 267 | 268 | # Initial state of RNNs 269 | learned_h0 = lib.param( 270 | 'FrameLevel.h0', 271 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 272 | ) 273 | # Handling LEARN_H0 274 | learned_h0.param = LEARN_H0 275 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 276 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 277 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 278 | 279 | # Handling RNN_TYPE 280 | # Handling SKIP_CONN 281 | if RNN_TYPE == 'GRU': 282 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 283 | N_RNN, 284 | FRAME_SIZE, 285 | DIM, 286 | frames, 287 | h0=h0, 288 | weightnorm=WEIGHT_NORM, 289 | skip_conn=SKIP_CONN) 290 | elif RNN_TYPE == 'LSTM': 291 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 292 | N_RNN, 293 | FRAME_SIZE, 294 | DIM, 295 | frames, 296 | h0=h0, 297 | weightnorm=WEIGHT_NORM, 298 | skip_conn=SKIP_CONN) 299 | 300 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 301 | output = lib.ops.Linear( 302 | 'FrameLevel.Output', 303 | DIM, 304 | FRAME_SIZE * DIM, 305 | rnns_out, 306 | initialization='he', 307 | weightnorm=WEIGHT_NORM 308 | ) 309 | # output: (2, 9, 4*dim) 310 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 311 | # output: (2, 9*4, dim) 312 | 313 | return (output, last_hidden) 314 | 315 | def sample_level_predictor(frame_level_outputs, prev_samples): 316 | """ 317 | batch size = BATCH_SIZE * SEQ_LEN 318 | SEQ_LEN = N_FRAMES * FRAME_SIZE 319 | 320 | frame_level_outputs.shape: (batch size, DIM) 321 | prev_samples.shape: (batch size, FRAME_SIZE) int32 322 | 323 | output.shape: (batch size, Q_LEVELS) 324 
| """ 325 | # Handling EMB_SIZE 326 | if EMB_SIZE == 0: 327 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 328 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 329 | last_out_shape = Q_LEVELS 330 | elif EMB_SIZE > 0: 331 | prev_samples = lib.ops.Embedding( 332 | 'SampleLevel.Embedding', 333 | Q_LEVELS, 334 | EMB_SIZE, 335 | prev_samples) 336 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 337 | last_out_shape = EMB_SIZE 338 | else: 339 | raise ValueError('EMB_SIZE cannot be negative.') 340 | 341 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 342 | 343 | out = lib.ops.Linear( 344 | 'SampleLevel.L1_PrevSamples', 345 | FRAME_SIZE * last_out_shape, 346 | DIM, 347 | prev_samples, 348 | biases=False, 349 | initialization='he', 350 | weightnorm=WEIGHT_NORM) 351 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 352 | 353 | out += frame_level_outputs 354 | # ^ (2*(9*4), dim) 355 | 356 | # L2 357 | out = lib.ops.Linear('SampleLevel.L2', 358 | DIM, 359 | DIM, 360 | out, 361 | initialization='he', 362 | weightnorm=WEIGHT_NORM) 363 | out = T.nnet.relu(out) 364 | 365 | # L3 366 | out = lib.ops.Linear('SampleLevel.L3', 367 | DIM, 368 | DIM, 369 | out, 370 | initialization='he', 371 | weightnorm=WEIGHT_NORM) 372 | out = T.nnet.relu(out) 373 | 374 | # Output 375 | # We apply the softmax later 376 | out = lib.ops.Linear('SampleLevel.Output', 377 | DIM, 378 | Q_LEVELS, 379 | out, 380 | weightnorm=WEIGHT_NORM) 381 | return out 382 | 383 | sequences = T.imatrix('sequences') 384 | h0 = T.tensor3('h0') 385 | reset = T.iscalar('reset') 386 | mask = T.matrix('mask') 387 | 388 | if args.debug: 389 | # Solely for debugging purposes. 390 | # Maybe I should set the compute_test_value=warn from here. 391 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 392 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 393 | reset.tag.test_value = numpy.array(1, dtype='int32') 394 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 395 | 396 | input_sequences = sequences[:, :-FRAME_SIZE] 397 | target_sequences = sequences[:, FRAME_SIZE:] 398 | 399 | target_mask = mask[:, FRAME_SIZE:] 400 | 401 | frame_level_outputs, new_h0 =\ 402 | frame_level_rnn(input_sequences, h0, reset) 403 | 404 | prev_samples = sequences[:, :-1] 405 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 406 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 407 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 408 | # (batch_size*n_frames*frame_size, frame_size) 409 | 410 | sample_level_outputs = sample_level_predictor( 411 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 412 | prev_samples, 413 | ) 414 | 415 | cost = T.nnet.categorical_crossentropy( 416 | T.nnet.softmax(sample_level_outputs), 417 | target_sequences.flatten() 418 | ) 419 | cost = cost.reshape(target_sequences.shape) 420 | cost = cost * target_mask 421 | # Don't use these lines; could end up with NaN 422 | # Specially at the end of audio files where mask is 423 | # all zero for some of the shorter files in mini-batch. 424 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 425 | #cost = cost.mean(axis=0) 426 | 427 | # Use this one instead. 428 | cost = cost.sum() 429 | cost = cost / target_mask.sum() 430 | 431 | # By default we report cross-entropy cost in bits. 
432 | # Switch to nats by commenting out this line: 433 | # log_2(e) = 1.44269504089 434 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 435 | 436 | ### Getting the params, grads, updates, and Theano functions ### 437 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 438 | lib.print_params_info(params, path=FOLDER_PREFIX) 439 | 440 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 441 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 442 | 443 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 444 | 445 | # Training function 446 | train_fn = theano.function( 447 | [sequences, h0, reset, mask], 448 | [cost, new_h0], 449 | updates=updates, 450 | on_unused_input='warn' 451 | ) 452 | 453 | # Validation and Test function, hence no updates 454 | test_fn = theano.function( 455 | [sequences, h0, reset, mask], 456 | [cost, new_h0], 457 | on_unused_input='warn' 458 | ) 459 | 460 | # Sampling at frame level 461 | frame_level_generate_fn = theano.function( 462 | [sequences, h0, reset], 463 | frame_level_rnn(sequences, h0, reset), 464 | on_unused_input='warn' 465 | ) 466 | 467 | # Sampling at audio sample level 468 | frame_level_outputs = T.matrix('frame_level_outputs') 469 | prev_samples = T.imatrix('prev_samples') 470 | sample_level_generate_fn = theano.function( 471 | [frame_level_outputs, prev_samples], 472 | lib.ops.softmax_and_sample( 473 | sample_level_predictor( 474 | frame_level_outputs, 475 | prev_samples, 476 | ) 477 | ), 478 | on_unused_input='warn' 479 | ) 480 | 481 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 482 | # to study the behaviour of the model and also to introduce some diversity 483 | # to samples in a simple way. [it's disabled for now] 484 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 485 | fixed_rand_h0 -= 0.5 486 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 487 | 488 | def generate_and_save_samples(tag, N_SECS=5): 489 | def write_audio_file(name, data): 490 | data = data.astype('float32') 491 | data -= data.min() 492 | data /= data.max() 493 | data -= 0.5 494 | data *= 0.95 495 | scipy.io.wavfile.write( 496 | os.path.join(SAMPLES_PATH, name+'.wav'), 497 | BITRATE, 498 | data) 499 | 500 | total_time = time() 501 | # Generate N_SEQS' sample files, each 5 seconds long 502 | LENGTH = N_SECS*BITRATE if not args.debug else 100 503 | 504 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 505 | samples[:, :FRAME_SIZE] = Q_ZERO 506 | 507 | # First half zero, others fixed random at each checkpoint 508 | h0 = numpy.zeros( 509 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 510 | dtype='float32' 511 | ) 512 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 513 | frame_level_outputs = None 514 | 515 | for t in xrange(FRAME_SIZE, LENGTH): 516 | 517 | if t % FRAME_SIZE == 0: 518 | frame_level_outputs, h0 = frame_level_generate_fn( 519 | samples[:, t-FRAME_SIZE:t], 520 | h0, 521 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 522 | numpy.int32(t == FRAME_SIZE) 523 | ) 524 | 525 | samples[:, t] = sample_level_generate_fn( 526 | frame_level_outputs[:, t % FRAME_SIZE], 527 | samples[:, t-FRAME_SIZE:t], 528 | ) 529 | 530 | total_time = time() - total_time 531 | log = "{} samples of {} seconds length generated in {} seconds." 
532 | log = log.format(N_SEQS, N_SECS, total_time) 533 | print log 534 | 535 | for i in xrange(N_SEQS): 536 | samp = samples[i] 537 | if Q_TYPE == 'mu-law': 538 | from datasets.dataset import mu2linear 539 | samp = mu2linear(samp) 540 | elif Q_TYPE == 'a-law': 541 | raise NotImplementedError('a-law is not implemented') 542 | 543 | now = datetime.datetime.now() 544 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 545 | 546 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 547 | print "writing...", file_name 548 | write_audio_file(file_name, samp) 549 | 550 | 551 | 552 | def monitor(data_feeder): 553 | """ 554 | Cost and time of test_fn on a given dataset section. 555 | Pass only one of `valid_feeder` or `test_feeder`. 556 | Don't pass `train_feed`. 557 | 558 | :returns: 559 | Mean cost over the input dataset (data_feeder) 560 | Total time spent 561 | """ 562 | _total_time = time() 563 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 564 | _costs = [] 565 | _data_feeder = load_data(data_feeder) 566 | for _seqs, _reset, _mask in _data_feeder: 567 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 568 | _costs.append(_cost) 569 | 570 | return numpy.mean(_costs), time() - _total_time 571 | 572 | print "Wall clock time spent before training started: {:.2f}h"\ 573 | .format((time()-exp_start)/3600.) 574 | print "Training!" 575 | total_iters = 0 576 | total_time = 0. 577 | last_print_time = 0. 578 | last_print_iters = 0 579 | costs = [] 580 | lowest_valid_cost = numpy.finfo(numpy.float32).max 581 | corresponding_test_cost = numpy.finfo(numpy.float32).max 582 | new_lowest_cost = False 583 | end_of_batch = False 584 | epoch = 0 585 | 586 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 587 | 588 | # Initial load train dataset 589 | tr_feeder = load_data(train_feeder) 590 | 591 | ### Handling the resume option: 592 | if True: #if Resume: 593 | # Check if checkpoint from previous run is not corrupted. 594 | # Then overwrite some of the variables above. 595 | iters_to_consume, res_path, epoch, total_iters,\ 596 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 597 | lib.resumable(path=FOLDER_PREFIX, 598 | iter_key=iter_str, 599 | epoch_key=epoch_str, 600 | add_resume_counter=True, 601 | other_keys=[lowest_valid_str, 602 | corresp_test_str, 603 | test_nll_str]) 604 | # At this point we saved the pkl file. 605 | last_print_iters = total_iters 606 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 607 | # Consumes this much iters to get to the last point in training data. 608 | consume_time = time() 609 | for i in xrange(iters_to_consume): 610 | tr_feeder.next() 611 | consume_time = time() - consume_time 612 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 613 | format(consume_time, iters_to_consume) 614 | 615 | lib.load_params(res_path) 616 | print "Parameters from last available checkpoint loaded." 617 | 618 | 619 | 620 | # 2. Stdout the training progress 621 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 622 | print_info = print_info.format(epoch, 623 | total_iters, 624 | (time()-exp_start)/3600) 625 | print print_info 626 | 627 | tag = "e{}_i{}" 628 | tag = tag.format(epoch, 629 | total_iters) 630 | 631 | # 5. Generate and save samples (time consuming) 632 | # If not successful, we still have the params to sample afterward 633 | print "Sampling!", 634 | # Generate samples 635 | generate_and_save_samples(tag, N_SECS) 636 | print "Done!" 
637 | 638 | print "Wall clock time spent: {:.2f}h"\ 639 | .format((time()-exp_start)/3600) 640 | 641 | sys.exit() 642 | -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate16k.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguments: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law companding, or mu-law companding. With mu-/a-law, quantization levels should be set to 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='The directory name of the dataset', 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='Size of mini-batch', 110 | type=check_positive, choices=xrange(0, 999), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params is important.
[for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number of wavs to generate',\ 120 | type=check_positive, required=True) 121 | parser.add_argument('--temp', help='Temperature',\ 122 | type=float, required=True) 123 | 124 | args = parser.parse_args() 125 | 126 | # NEW 127 | # Create tag for this experiment based on passed args 128 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 129 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 130 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 131 | tag = re.sub(r'-temp[0-9]*[\.]?[0-9]*', "", tag) 132 | tag = re.sub(r'_generate', "", tag) 133 | tag += '-lr'+str(LEARNING_RATE) 134 | print "Created experiment tag for these args:" 135 | print tag 136 | 137 | return args, tag 138 | 139 | args, tag = get_args() 140 | 141 | 142 | print "sup" 143 | 144 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 145 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 146 | WEIGHT_NORM = args.weight_norm 147 | EMB_SIZE = args.emb_size 148 | SKIP_CONN = args.skip_conn 149 | DIM = args.dim # Model dimensionality. 150 | N_RNN = args.n_rnn # How many RNNs to stack 151 | RNN_TYPE = args.rnn_type 152 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 153 | LEARN_H0 = args.learn_h0 154 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 155 | Q_TYPE = args.q_type # log- or linear-scale 156 | WHICH_SET = args.which_set 157 | BATCH_SIZE = args.batch_size 158 | RESUME = args.resume 159 | N_SECS = args.n_secs 160 | N_SEQS = args.n_seqs 161 | TEMPERATURE = args.temp 162 | 163 | 164 | print "hi" 165 | 166 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 167 | raise ValueError('For mu-law, quantization levels should be exactly 256!') 168 | 169 | # Fixed hyperparams 170 | GRAD_CLIP = 1 # Elementwise grad clip threshold 171 | BITRATE = 16000 172 | 173 | # Other constants 174 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 175 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 176 | #TRAIN_MODE = 'time-iters' 177 | # To use PRINT_TIME for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment. 179 | #TRAIN_MODE = 'iters-time' 180 | # To use PRINT_ITERS for validation, 181 | # and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment. 182 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 183 | STOP_ITERS = 100000 # Stop after this many iterations 184 | # TODO: 185 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 186 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
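# ---- Editor's sketch (hypothetical helper; the real conversions, e.g.
# mu2linear imported further down, live in datasets/dataset.py) ----
# Why the check above pins Q_LEVELS to 256 for mu-law: mu-law companding
# with mu = 255 maps a linear amplitude x in [-1, 1] to
#     y = sign(x) * log(1 + mu*|x|) / log(1 + mu)
# which is then discretized into mu + 1 = 256 bins, i.e. one byte per sample.
def _mu_law_compand_sketch(x, mu=255):
    # x: numpy array of floats in [-1, 1]; returns companded values in [-1, 1].
    return numpy.sign(x) * numpy.log1p(mu * numpy.abs(x)) / numpy.log1p(mu)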
187 | # TODO: 188 | RESULTS_DIR = 'results_2t' 189 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 190 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 191 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude 192 | 193 | 194 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 195 | 196 | 197 | epoch_str = 'epoch' 198 | iter_str = 'iter' 199 | lowest_valid_str = 'lowest valid cost' 200 | corresp_test_str = 'correponding test cost' 201 | train_nll_str, valid_nll_str, test_nll_str = \ 202 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 203 | 204 | if args.debug: 205 | import warnings 206 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 207 | TRAIN_MODE = 'time' 208 | PRINT_TIME = 100 209 | STOP_TIME = 3000 210 | STOP_ITERS = 1000 211 | 212 | ### Create directories ### 213 | # FOLDER_PREFIX: root, contains: 214 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 215 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 216 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 217 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 218 | if not os.path.exists(FOLDER_PREFIX): 219 | os.makedirs(FOLDER_PREFIX) 220 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 221 | if not os.path.exists(PARAMS_PATH): 222 | os.makedirs(PARAMS_PATH) 223 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 224 | if not os.path.exists(SAMPLES_PATH): 225 | os.makedirs(SAMPLES_PATH) 226 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 227 | if not os.path.exists(BEST_PATH): 228 | os.makedirs(BEST_PATH) 229 | 230 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 231 | 232 | ### Import the data_feeder ### 233 | # Handling WHICH_SET 234 | from datasets.dataset import music_train_feed_epoch as train_feeder 235 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 236 | from datasets.dataset import music_test_feed_epoch as test_feeder 237 | 238 | def load_data(data_feeder): 239 | """ 240 | Helper function to deal with the interface of different datasets. 241 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`.
242 | """ 243 | return data_feeder(WHICH_SET, BATCH_SIZE, 244 | SEQ_LEN, 245 | OVERLAP, 246 | Q_LEVELS, 247 | Q_ZERO, 248 | Q_TYPE) 249 | 250 | ### Creating computation graph ### 251 | def frame_level_rnn(input_sequences, h0, reset): 252 | """ 253 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 254 | h0.shape: (batch size, N_RNN, DIM) 255 | reset.shape: () 256 | 257 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 258 | """ 259 | frames = input_sequences.reshape(( 260 | input_sequences.shape[0], 261 | input_sequences.shape[1] // FRAME_SIZE, 262 | FRAME_SIZE 263 | )) 264 | 265 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 266 | # (a reasonable range to pass as inputs to the RNN) 267 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 268 | frames *= lib.floatX(2) 269 | # (128, 64, 4) 270 | 271 | # Initial state of RNNs 272 | learned_h0 = lib.param( 273 | 'FrameLevel.h0', 274 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 275 | ) 276 | # Handling LEARN_H0 277 | learned_h0.param = LEARN_H0 278 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 279 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 280 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 281 | 282 | # Handling RNN_TYPE 283 | # Handling SKIP_CONN 284 | if RNN_TYPE == 'GRU': 285 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 286 | N_RNN, 287 | FRAME_SIZE, 288 | DIM, 289 | frames, 290 | h0=h0, 291 | weightnorm=WEIGHT_NORM, 292 | skip_conn=SKIP_CONN) 293 | elif RNN_TYPE == 'LSTM': 294 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 295 | N_RNN, 296 | FRAME_SIZE, 297 | DIM, 298 | frames, 299 | h0=h0, 300 | weightnorm=WEIGHT_NORM, 301 | skip_conn=SKIP_CONN) 302 | 303 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 304 | output = lib.ops.Linear( 305 | 'FrameLevel.Output', 306 | DIM, 307 | FRAME_SIZE * DIM, 308 | rnns_out, 309 | initialization='he', 310 | weightnorm=WEIGHT_NORM 311 | ) 312 | # output: (2, 9, 4*dim) 313 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 314 | # output: (2, 9*4, dim) 315 | 316 | return (output, last_hidden) 317 | 318 | def sample_level_predictor(frame_level_outputs, prev_samples): 319 | """ 320 | batch size = BATCH_SIZE * SEQ_LEN 321 | SEQ_LEN = N_FRAMES * FRAME_SIZE 322 | 323 | frame_level_outputs.shape: (batch size, DIM) 324 | prev_samples.shape: (batch size, FRAME_SIZE) int32 325 | 326 | output.shape: (batch size, Q_LEVELS) 327 | """ 328 | # Handling EMB_SIZE 329 | if EMB_SIZE == 0: 330 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 331 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 332 | last_out_shape = Q_LEVELS 333 | elif EMB_SIZE > 0: 334 | prev_samples = lib.ops.Embedding( 335 | 'SampleLevel.Embedding', 336 | Q_LEVELS, 337 | EMB_SIZE, 338 | prev_samples) 339 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 340 | last_out_shape = EMB_SIZE 341 | else: 342 | raise ValueError('EMB_SIZE cannot be negative.') 343 | 344 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 345 | 346 | out = lib.ops.Linear( 347 | 'SampleLevel.L1_PrevSamples', 348 | FRAME_SIZE * last_out_shape, 349 | DIM, 350 | prev_samples, 351 | biases=False, 352 | initialization='he', 353 | weightnorm=WEIGHT_NORM) 354 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 355 | 356 | out += frame_level_outputs 357 | # ^ (2*(9*4), dim) 358 | 359 | # L2 360 | out = lib.ops.Linear('SampleLevel.L2', 361 | DIM, 362 | DIM, 363 | 
out, 364 | initialization='he', 365 | weightnorm=WEIGHT_NORM) 366 | out = T.nnet.relu(out) 367 | 368 | # L3 369 | out = lib.ops.Linear('SampleLevel.L3', 370 | DIM, 371 | DIM, 372 | out, 373 | initialization='he', 374 | weightnorm=WEIGHT_NORM) 375 | out = T.nnet.relu(out) 376 | 377 | # Output 378 | # We apply the softmax later 379 | out = lib.ops.Linear('SampleLevel.Output', 380 | DIM, 381 | Q_LEVELS, 382 | out, 383 | weightnorm=WEIGHT_NORM) 384 | return out 385 | 386 | sequences = T.imatrix('sequences') 387 | h0 = T.tensor3('h0') 388 | reset = T.iscalar('reset') 389 | mask = T.matrix('mask') 390 | 391 | if args.debug: 392 | # Solely for debugging purposes. 393 | # Maybe I should set the compute_test_value=warn from here. 394 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 395 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 396 | reset.tag.test_value = numpy.array(1, dtype='int32') 397 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 398 | 399 | input_sequences = sequences[:, :-FRAME_SIZE] 400 | target_sequences = sequences[:, FRAME_SIZE:] 401 | 402 | target_mask = mask[:, FRAME_SIZE:] 403 | 404 | frame_level_outputs, new_h0 =\ 405 | frame_level_rnn(input_sequences, h0, reset) 406 | 407 | prev_samples = sequences[:, :-1] 408 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 409 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 410 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 411 | # (batch_size*n_frames*frame_size, frame_size) 412 | 413 | sample_level_outputs = sample_level_predictor( 414 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 415 | prev_samples, 416 | ) 417 | 418 | cost = T.nnet.categorical_crossentropy( 419 | T.nnet.softmax(sample_level_outputs), 420 | target_sequences.flatten() 421 | ) 422 | cost = cost.reshape(target_sequences.shape) 423 | cost = cost * target_mask 424 | # Don't use these lines; could end up with NaN 425 | # Especially at the end of audio files where the mask is 426 | # all zero for some of the shorter files in the mini-batch. 427 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 428 | #cost = cost.mean(axis=0) 429 | 430 | # Use this one instead. 431 | cost = cost.sum() 432 | cost = cost / target_mask.sum() 433 | 434 | # By default we report cross-entropy cost in bits.
435 | # Switch to nats by commenting out this line: 436 | # log_2(e) = 1.44269504089 437 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 438 | 439 | ### Getting the params, grads, updates, and Theano functions ### 440 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 441 | lib.print_params_info(params, path=FOLDER_PREFIX) 442 | 443 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 444 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 445 | 446 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 447 | 448 | # Training function 449 | train_fn = theano.function( 450 | [sequences, h0, reset, mask], 451 | [cost, new_h0], 452 | updates=updates, 453 | on_unused_input='warn' 454 | ) 455 | 456 | # Validation and Test function, hence no updates 457 | test_fn = theano.function( 458 | [sequences, h0, reset, mask], 459 | [cost, new_h0], 460 | on_unused_input='warn' 461 | ) 462 | 463 | # Sampling at frame level 464 | frame_level_generate_fn = theano.function( 465 | [sequences, h0, reset], 466 | frame_level_rnn(sequences, h0, reset), 467 | on_unused_input='warn' 468 | ) 469 | 470 | # Sampling at audio sample level 471 | frame_level_outputs = T.matrix('frame_level_outputs') 472 | prev_samples = T.imatrix('prev_samples') 473 | sample_level_generate_fn = theano.function( 474 | [frame_level_outputs, prev_samples], 475 | lib.ops.softmax_and_sample( 476 | sample_level_predictor( 477 | frame_level_outputs, 478 | prev_samples, 479 | )/TEMPERATURE 480 | ), 481 | on_unused_input='warn' 482 | ) 483 | 484 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 485 | # to study the behaviour of the model and also to introduce some diversity 486 | # to samples in a simple way. [it's disabled for now] 487 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 488 | fixed_rand_h0 -= 0.5 489 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 490 | 491 | def generate_and_save_samples(tag, N_SECS=5): 492 | def write_audio_file(name, data): 493 | data = data.astype('float32') 494 | data -= data.min() 495 | data /= data.max() 496 | data -= 0.5 497 | data *= 0.95 498 | scipy.io.wavfile.write( 499 | os.path.join(SAMPLES_PATH, name+'.wav'), 500 | BITRATE, 501 | data) 502 | 503 | total_time = time() 504 | # Generate N_SEQS sample files, each N_SECS seconds long 505 | LENGTH = N_SECS*BITRATE if not args.debug else 100 506 | 507 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 508 | samples[:, :FRAME_SIZE] = Q_ZERO 509 | 510 | # First half zero, others fixed random at each checkpoint 511 | h0 = numpy.zeros( 512 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 513 | dtype='float32' 514 | ) 515 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 516 | frame_level_outputs = None 517 | 518 | for t in xrange(FRAME_SIZE, LENGTH): 519 | 520 | if t % FRAME_SIZE == 0: 521 | frame_level_outputs, h0 = frame_level_generate_fn( 522 | samples[:, t-FRAME_SIZE:t], 523 | h0, 524 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 525 | numpy.int32(t == FRAME_SIZE) 526 | ) 527 | 528 | samples[:, t] = sample_level_generate_fn( 529 | frame_level_outputs[:, t % FRAME_SIZE], 530 | samples[:, t-FRAME_SIZE:t], 531 | ) 532 | 533 | total_time = time() - total_time 534 | log = "{} samples of {} seconds length generated in {} seconds."
535 | log = log.format(N_SEQS, N_SECS, total_time) 536 | print log 537 | 538 | for i in xrange(N_SEQS): 539 | samp = samples[i] 540 | if Q_TYPE == 'mu-law': 541 | from datasets.dataset import mu2linear 542 | samp = mu2linear(samp) 543 | elif Q_TYPE == 'a-law': 544 | raise NotImplementedError('a-law is not implemented') 545 | 546 | now = datetime.datetime.now() 547 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 548 | 549 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 550 | print "writing...", file_name 551 | write_audio_file(file_name, samp) 552 | 553 | 554 | 555 | def monitor(data_feeder): 556 | """ 557 | Cost and time of test_fn on a given dataset section. 558 | Pass only one of `valid_feeder` or `test_feeder`. 559 | Don't pass `train_feeder`. 560 | 561 | :returns: 562 | Mean cost over the input dataset (data_feeder) 563 | Total time spent 564 | """ 565 | _total_time = time() 566 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 567 | _costs = [] 568 | _data_feeder = load_data(data_feeder) 569 | for _seqs, _reset, _mask in _data_feeder: 570 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 571 | _costs.append(_cost) 572 | 573 | return numpy.mean(_costs), time() - _total_time 574 | 575 | print "Wall clock time spent before training started: {:.2f}h"\ 576 | .format((time()-exp_start)/3600.) 577 | print "Training!" 578 | total_iters = 0 579 | total_time = 0. 580 | last_print_time = 0. 581 | last_print_iters = 0 582 | costs = [] 583 | lowest_valid_cost = numpy.finfo(numpy.float32).max 584 | corresponding_test_cost = numpy.finfo(numpy.float32).max 585 | new_lowest_cost = False 586 | end_of_batch = False 587 | epoch = 0 588 | 589 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 590 | 591 | # Initial load of the train dataset 592 | tr_feeder = load_data(train_feeder) 593 | 594 | ### Handling the resume option: 595 | if True: #if Resume: 596 | # Check that the checkpoint from the previous run is not corrupted. 597 | # Then overwrite some of the variables above. 598 | iters_to_consume, res_path, epoch, total_iters,\ 599 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 600 | lib.resumable(path=FOLDER_PREFIX, 601 | iter_key=iter_str, 602 | epoch_key=epoch_str, 603 | add_resume_counter=True, 604 | other_keys=[lowest_valid_str, 605 | corresp_test_str, 606 | test_nll_str]) 607 | # At this point we saved the pkl file. 608 | last_print_iters = total_iters 609 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 610 | # Consume this many iterations to get back to the last position in the training data. 611 | consume_time = time() 612 | for i in xrange(iters_to_consume): 613 | tr_feeder.next() 614 | consume_time = time() - consume_time 615 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 616 | format(consume_time, iters_to_consume) 617 | 618 | lib.load_params(res_path) 619 | print "Parameters from last available checkpoint loaded." 620 | 621 | 622 | 623 | # 2. Print the training progress to stdout 624 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 625 | print_info = print_info.format(epoch, 626 | total_iters, 627 | (time()-exp_start)/3600) 628 | print print_info 629 | 630 | tag = "e{}_i{}" 631 | tag = tag.format(epoch, 632 | total_iters) 633 | 634 | # 5. Generate and save samples (time consuming) 635 | # If not successful, we still have the params to sample afterward 636 | print "Sampling!", 637 | # Generate samples 638 | generate_and_save_samples(tag, N_SECS) 639 | print "Done!"
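# ---- Editor's sketch (hypothetical, numpy-only; never called) ----
# What dividing the logits by TEMPERATURE in sample_level_generate_fn does,
# outside Theano: temperature < 1 sharpens the softmax toward its argmax,
# temperature > 1 flattens it toward uniform, and 1.0 leaves it unchanged.
def _sample_with_temperature_sketch(logits, temperature):
    # logits: (n_seqs, Q_LEVELS) float array; returns one sampled bin per row.
    scaled = logits / temperature
    probs = numpy.exp(scaled - scaled.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return numpy.array([numpy.random.choice(p.shape[0], p=p) for p in probs])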
640 | 641 | print "Wall clock time spent: {:.2f}h"\ 642 | .format((time()-exp_start)/3600) 643 | 644 | sys.exit() 645 | --------------------------------------------------------------------------------
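A minimal sketch of the two-tier sampling loop that generate_and_save_samples implements above, assuming hypothetical stand-ins frame_fn and sample_fn for the compiled Theano functions frame_level_generate_fn and sample_level_generate_fn. It only shows the control flow: the frame-level RNN advances once every frame_size samples, and the sample-level predictor emits one sample per step.

    import numpy

    def two_tier_sampling_sketch(frame_fn, sample_fn, n_seqs, length,
                                 frame_size, q_zero):
        # Quantized output buffer, seeded with frame_size samples of silence.
        samples = numpy.full((n_seqs, length), q_zero, dtype='int32')
        h0 = None          # recurrent state; frame_fn is assumed to treat
                           # None as "use the initial state"
        frame_outs = None  # conditioning vectors for the current frame
        for t in xrange(frame_size, length):
            if t % frame_size == 0:
                # Frame level: advance the RNN once per completed frame.
                frame_outs, h0 = frame_fn(samples[:, t - frame_size:t], h0)
            # Sample level: one new sample conditioned on the frame embedding
            # and the previous frame_size samples.
            samples[:, t] = sample_fn(frame_outs[:, t % frame_size],
                                      samples[:, t - frame_size:t])
        return samples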