├── models ├── __init__.py ├── one_tier │ └── wavent.py └── two_tier │ ├── two_tier_generate32k.py │ ├── two_tier_generate16k.py.ol │ └── two_tier_generate16k.py ├── datasets ├── __init__.py ├── .DS_Store ├── music │ ├── .DS_Store │ ├── drum-preprocess.sh │ ├── prune_flacs.py │ ├── sum_flacs.py │ ├── SNAREdrum-preprocessERRORS.md │ ├── preprocess.sh │ ├── build_features.py │ ├── download_archive_preprocess.sh │ ├── preprocess.py │ ├── log_mp3s │ ├── new_experiment32k.py │ ├── new_experiment16k.py │ ├── new_experiment48k.py │ ├── new_experiment16k_conditioning.py │ └── drum-preprocess.py ├── dataset.py └── dataset_conditioning.py ├── .DS_Store ├── sampleRNN-pydotprint.png ├── train48k.sh ├── .gitignore ├── clean_results.py ├── lib ├── generate.py └── __init__.py ├── LICENSE └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/.DS_Store -------------------------------------------------------------------------------- /datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/datasets/.DS_Store -------------------------------------------------------------------------------- /datasets/music/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/datasets/music/.DS_Store -------------------------------------------------------------------------------- /sampleRNN-pydotprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dada-bots/dadabots_sampleRNN/HEAD/sampleRNN-pydotprint.png -------------------------------------------------------------------------------- /datasets/music/drum-preprocess.sh: -------------------------------------------------------------------------------- 1 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 2 | echo "Preprocessing" 3 | python drum-preprocess.py "$SCRIPTPATH" 4 | echo "Done!" 5 | 6 | echo "Writing datasets" 7 | python _drum2npy.py 8 | echo "Done!" 9 | -------------------------------------------------------------------------------- /datasets/music/prune_flacs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import glob 4 | 5 | DIR = "." 6 | fs = glob.glob(DIR+"/*.flac") 7 | for f in fs: 8 | size = float(subprocess.check_output('ffprobe -i "{}/{}" -show_entries format=duration -v quiet -of csv="p=0"'.format(DIR, f), shell=True)) 9 | if size != 3.762563: 10 | print f 11 | print size -------------------------------------------------------------------------------- /datasets/music/sum_flacs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import glob 4 | 5 | DIR = "." 6 | fs = glob.glob(DIR+"/*.wav") 7 | t = 0 8 | print 'counting...' 
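# Each loop iteration below shells out to ffprobe, which prints just the
# container duration in seconds (e.g. "3.762563") via -show_entries
# format=duration with csv output. An equivalent shell-free call, shown only
# as an illustrative sketch (this script keeps the shell=True form):
#   duration = float(subprocess.check_output(
#       ['ffprobe', '-i', f, '-show_entries', 'format=duration',
#        '-v', 'quiet', '-of', 'csv=p=0']))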
9 | for f in fs: 10 | size = float(subprocess.check_output('ffprobe -i "{}/{}" -show_entries format=duration -v quiet -of csv="p=0"'.format(DIR, f), shell=True)) 11 | t = t + size 12 | print t, ' seconds' -------------------------------------------------------------------------------- /train48k.sh: -------------------------------------------------------------------------------- 1 | THEANO_FLAGS=mode=FAST_RUN,device=cuda$1,floatX=float32 python -u models/two_tier/two_tier48k.py --exp $2 --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set $2 2 | sleep 1; 3 | while true; 4 | do 5 | THEANO_FLAGS=mode=FAST_RUN,device=cuda$1,floatX=float32 python -u models/two_tier/two_tier48k.py --exp $2 --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set $2 --resume; sleep 1; done; 6 | -------------------------------------------------------------------------------- /datasets/music/SNAREdrum-preprocessERRORS.md: -------------------------------------------------------------------------------- 1 | # SNARE drum-preprocess.py ERRORS 2 | ## sample-rnn 3 | ## 4/6/2017 4 | 5 | B 6 | = 7 | 8 | ./p295d.flac 4.28575 9 | 10 | ./p1290d.flac 3.980813 11 | 12 | ./p1290u.flac 3.980813 13 | 14 | ./p295.flac 4.28575 15 | 16 | ./p295u.flac 4.28575 17 | 18 | ./p1290.flac 3.980813 19 | 20 | BR 21 | = 22 | 23 | ./p295d.flac 4.28575 24 | 25 | ./p1290d.flac 3.980813 26 | 27 | ./p1290u.flac 3.980813 28 | 29 | ./p295.flac 4.28575 30 | 31 | ./p295u.flac 4.28575 32 | 33 | ./p1290.flac 3.980813 34 | 35 | FR 36 | == 37 | 38 | ./p295d.flac 4.28575 39 | 40 | ./p1290d.flac 3.980813 41 | 42 | ./p1290u.flac 3.980813 43 | 44 | ./p295.flac 4.28575 45 | 46 | ./p295u.flac 4.28575 47 | 48 | ./p1290.flac 3.980813 49 | -------------------------------------------------------------------------------- /datasets/music/preprocess.sh: -------------------------------------------------------------------------------- 1 | # Requires 2GB of free disk space at most. 2 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 3 | echo "Converting from OGG to 16 kHz, 16-bit mono-channel WAV" 4 | # Next line with & executes in a forked shell in the background. That's parallel and not recommended. 5 | # Remove if causing problems 6 | #for file in "$DL_PATH"*_64kb.mp3; do ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" _64kb.mp3`.wav" & done 7 | for file in "$SCRIPTPATH"/*.ogg; do 8 | ffmpeg -i "$file" -ar 16000 -ac 1 "$SCRIPTPATH"/"`basename "$file" .ogg`.wav" 9 | done 10 | echo "Cleaning up" 11 | rm "$SCRIPTPATH"/*.ogg 12 | 13 | echo "Preprocessing" 14 | python preprocess.py "$SCRIPTPATH" 15 | echo "Done!" 16 | 17 | echo "Writing datasets" 18 | python _2npy.py 19 | echo "Done!"
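# A converted file can be spot-checked with ffprobe; an illustrative check
# (not part of the original script):
#   ffprobe -v quiet -show_entries stream=sample_rate,channels -of csv=p=0 some.wav
# should print "16000,1" for a correctly converted 16 kHz mono file.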
20 | -------------------------------------------------------------------------------- /datasets/music/build_features.py: -------------------------------------------------------------------------------- 1 | # Given an audio file, return a feature matrix 2 | # The feature matrix doesn't need to be upsampled to the sample rate 3 | # However long the matrix is, we assume it matches the length of the WAV 4 | # So you can use any frame_rate (hop_size) 5 | import numpy as np 6 | import librosa 7 | import librosa.onset 8 | def build_dummy_features(filename): 9 | features = np.ones((1000,1),dtype='float32') 10 | for i,_ in enumerate(features): 11 | features[i,0] = i/1000.0 12 | return features 13 | 14 | def build_onset_envelope_feature(filename): 15 | y, sr = librosa.load(filename) 16 | hop_length=128 17 | onset_env = librosa.onset.onset_strength(y=y, sr=sr, 18 | aggregate=np.median, hop_length=hop_length, fmax=8000) 19 | # normalize the onset_env to zero mean and unit variance 20 | onset_env = (onset_env - np.mean(onset_env))/np.std(onset_env) 21 | 22 | num_frames = len(onset_env) 23 | feature_matrix = np.ones((num_frames,1),dtype='float32') 24 | feature_matrix[:,0] = onset_env 25 | return feature_matrix 26 | 27 | -------------------------------------------------------------------------------- /datasets/music/download_archive_preprocess.sh: -------------------------------------------------------------------------------- 1 | # Requires 2GB of free disk space at most. 2 | SCRIPTPATH=$( cd "$(dirname "$0")" ; pwd -P ) 3 | DL_PATH="$SCRIPTPATH"/download/ 4 | mkdir -p "$DL_PATH" 5 | echo "Downloading files to "$DL_PATH"" 6 | # See: https://blog.archive.org/2012/04/26/downloading-in-bulk-using-wget/ 7 | wget -r -H -nc -nH --cut-dirs=1 -A .ogg -R '*_vbr.mp3' -e robots=off -P "$DL_PATH" -l1 -i ./itemlist.txt -B 'http://archive.org/download/' 8 | echo "Organizing files and folders" 9 | mv "$DL_PATH"*/*.ogg "$DL_PATH" 10 | rmdir "$DL_PATH"*/ 11 | echo "Converting from OGG to 16 kHz, 16-bit mono-channel WAV" 12 | # Next line with & executes in a forked shell in the background. That's parallel and not recommended. 13 | # Remove if causing problems 14 | #for file in "$DL_PATH"*_64kb.mp3; do ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" _64kb.mp3`.wav" & done 15 | for file in "$DL_PATH"*.ogg; do 16 | ffmpeg -i "$file" -ar 16000 -ac 1 "$DL_PATH""`basename "$file" .ogg`.wav" 17 | done 18 | echo "Cleaning up" 19 | rm "$DL_PATH"*.ogg 20 | 21 | echo "Preprocessing" 22 | python preprocess.py "$DL_PATH" 23 | echo "Done!" 24 | 25 | echo "Writing datasets" 26 | python _2npy.py 27 | echo "Done!" 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | #lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | *.wav 91 | datasets/music/rev-preprocess.sh 92 | 93 | datasets/music/*/ 94 | results_2t/* 95 | -------------------------------------------------------------------------------- /clean_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | cwd = os.getcwd() 4 | results_dir = os.path.join(cwd, 'results_2t') 5 | def get_subdirectories(a_dir): 6 | return [name for name in os.listdir(a_dir) 7 | if os.path.isdir(os.path.join(a_dir, name))] 8 | experiments = get_subdirectories(results_dir) 9 | num_epochs = 5 10 | hit_list = ["params_e"+str(n)+"_" for n in xrange(num_epochs)] 11 | unused_files = [] 12 | for e in experiments: 13 | e_dir = os.path.join(results_dir, e) 14 | params = os.path.join(e_dir, "params") 15 | for root, dirs, files in os.walk(params): 16 | for file in files: 17 | for hit in hit_list: 18 | if file.startswith(hit): 19 | print file 20 | unused_files.append(os.path.join(root, file)) 21 | def prompt_delete(num_prompts): 22 | num_prompts -= 1 23 | if num_prompts >= 0: 24 | prompt = raw_input("Do you want to delete these "+str(len(unused_files))+" files? [Y/n] ") 25 | if prompt == "Y" or prompt == "yes": 26 | print 'removing old epochs...'
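# Illustration of what ends up in unused_files (filename is hypothetical):
# with num_epochs = 5, hit_list is ["params_e0_", ..., "params_e4_"], so a
# checkpoint like results_2t/MYEXP/params/params_e3_iter1000.pkl matches and
# is deleted, while e.g. params_e17_... survives -- the trailing underscore
# in each prefix prevents "e1" from also matching "e17".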
27 | for uf in unused_files: 28 | os.remove(uf) 29 | elif prompt == "n" or prompt == "no": 30 | print "clean aborted: 0 files deleted" 31 | else: 32 | print "warning:", prompt, "is an unknown command" 33 | prompt_delete(num_prompts) 34 | else: 35 | print "0 files deleted: Good-bye" 36 | if len(unused_files) > 0: 37 | prompt_delete(3) 38 | else: 39 | print 'found 0 files to clean: Good-bye' 40 | -------------------------------------------------------------------------------- /datasets/music/preprocess.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess 3 | 4 | RAW_DATA_DIR=str(sys.argv[1]) 5 | OUTPUT_DIR=os.path.join(RAW_DATA_DIR, "parts") 6 | os.makedirs(OUTPUT_DIR) 7 | print RAW_DATA_DIR 8 | print OUTPUT_DIR 9 | 10 | # Step 1: write all filenames to a list 11 | with open(os.path.join(OUTPUT_DIR, 'preprocess_file_list.txt'), 'w') as f: 12 | for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR): 13 | for filename in filenames: 14 | if filename.endswith(".wav"): 15 | f.write("file '" + dirpath + '/'+ filename + "'\n") 16 | 17 | # Step 2: concatenate everything into one massive wav file 18 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(OUTPUT_DIR, OUTPUT_DIR)) 19 | audio = "preprocess_all_audio.wav" 20 | # # get the length of the resulting file 21 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 22 | print length, "DURATION" 23 | # reverse the audio file when the second CLI argument is the string "True" 24 | if len(sys.argv) > 2 and sys.argv[2] == "True": 25 | os.system("sox {}/preprocess_all_audio.wav {}/reverse_preprocess_audio.wav reverse".format(OUTPUT_DIR, OUTPUT_DIR)) 26 | audio = "reverse_preprocess_audio.wav" 27 | # # Step 3: split the big file into 8-second chunks 28 | for i in xrange((int(length)//8 - 1)/3): 29 | os.system('ffmpeg -ss {} -t 8 -i {}/{} -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(i, OUTPUT_DIR, audio, OUTPUT_DIR, i)) 30 | 31 | # # Step 4: clean up temp files 32 | #os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 33 | os.system('rm {}/preprocess_file_list.txt'.format(OUTPUT_DIR)) 34 | -------------------------------------------------------------------------------- /datasets/music/log_mp3s: -------------------------------------------------------------------------------- 1 | download$ for f in *; do ffmpeg -i $f 2>&1 | grep Duration; done 2 | Duration: 00:22:18.52, start: 0.000000, bitrate: 320 kb/s 3 | Duration: 00:15:13.07, start: 0.000000, bitrate: 320 kb/s 4 | Duration: 00:13:44.23, start: 0.000000, bitrate: 320 kb/s 5 | Duration: 00:21:17.55, start: 0.000000, bitrate: 320 kb/s 6 | Duration: 00:24:03.82, start: 0.000000, bitrate: 320 kb/s 7 | Duration: 00:23:00.14, start: 0.000000, bitrate: 320 kb/s 8 | Duration: 00:21:24.58, start: 0.000000, bitrate: 320 kb/s 9 | Duration: 00:07:09.15, start: 0.000000, bitrate: 320 kb/s 10 | Duration: 00:07:20.90, start: 0.000000, bitrate: 320 kb/s 11 | Duration: 00:09:58.42, start: 0.000000, bitrate: 320 kb/s 12 | Duration: 00:10:17.88, start: 0.000000, bitrate: 320 kb/s 13 | Duration: 00:22:07.47, start: 0.000000, bitrate: 320 kb/s 14 | Duration: 00:09:47.16, start: 0.000000, bitrate: 320 kb/s 15 | Duration: 00:08:31.91, start: 0.000000, bitrate: 320 kb/s 16 | Duration: 00:07:00.63, start: 0.000000, bitrate: 320 kb/s 17 | Duration: 00:12:31.47, start: 0.000000, bitrate: 320 kb/s 18 | Duration: 00:19:19.51, start: 0.000000, bitrate: 320 kb/s 19 | Duration: 00:40:38.57, start: 0.000000,
bitrate: 320 kb/s 20 | Duration: 00:26:01.98, start: 0.000000, bitrate: 320 kb/s 21 | Duration: 00:13:57.26, start: 0.000000, bitrate: 320 kb/s 22 | Duration: 00:16:23.42, start: 0.000000, bitrate: 320 kb/s 23 | Duration: 00:24:17.95, start: 0.025057, bitrate: 137 kb/s 24 | Duration: 00:17:26.14, start: 0.000000, bitrate: 320 kb/s 25 | Duration: 00:23:03.66, start: 0.000000, bitrate: 320 kb/s 26 | Duration: 00:20:31.32, start: 0.000000, bitrate: 320 kb/s 27 | Duration: 00:18:35.52, start: 0.000000, bitrate: 320 kb/s 28 | Duration: 00:25:45.52, start: 0.000000, bitrate: 320 kb/s 29 | Duration: 00:27:36.38, start: 0.000000, bitrate: 320 kb/s 30 | Duration: 00:16:26.45, start: 0.000000, bitrate: 320 kb/s 31 | Duration: 00:11:07.99, start: 0.000000, bitrate: 320 kb/s 32 | Duration: 00:24:12.24, start: 0.000000, bitrate: 320 kb/s 33 | Duration: 00:18:32.30, start: 0.000000, bitrate: 320 kb/s 34 | 35 | 560 minutes total 36 | -------------------------------------------------------------------------------- /datasets/music/new_experiment32k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 32000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 8 48 | num = 3200 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 32000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm
{}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | np_arr = np.array(arr) 77 | # 88/6/6 split 78 | length = len(np_arr) 79 | train_size = int(np.floor(length * .88)) # train 80 | test_size = int(np.floor(length * .06)) # test 81 | 82 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 83 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 84 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 86 | 87 | #pass dataset name through two_tier.py || three_tier.py to datasets.py -------------------------------------------------------------------------------- /datasets/music/new_experiment16k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000
{}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 12 # stride window in seconds (the chunks themselves are cut 8 s long below) 48 | num = 6400 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | np_arr = np.array(arr) 77 | # 88/6/6 split 78 | length = len(np_arr) 79 | train_size = int(np.floor(length * .88)) # train 80 | test_size = int(np.floor(length * .06)) # test 81 | 82 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 83 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 84 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 86 | 87 | #pass dataset name through two_tier.py || three_tier.py to datasets.py 88 | -------------------------------------------------------------------------------- /datasets/music/new_experiment48k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, random, time, glob, soundfile as sf 3 | 4 | PWD = os.getcwd() 5 | print 'PWD is', PWD 6 | #store dataset name 7 | DATASET_NAME = str(sys.argv[1]) 8 | DOWNLOAD_DIR = str(sys.argv[2]) 9 | print 'dl_dir is set to', DOWNLOAD_DIR 10 | #create the dataset directory 11 | print "creating directory for", DATASET_NAME 12 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 13 | os.makedirs(DATASET_DIR) 14 | #move samples from the download directory into the dataset directory 15 | print "moving samples" 16 | types = {'wav', "mp3"} 17 | for t in types: 18 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 19 | #run preprocess 20 | print "preprocessing" 21 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 22 | os.makedirs(OUTPUT_DIR) 23 | # Step 1: write all filenames to a list 24 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 25 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 26 | for filename in filenames: 27 | if filename.endswith(".wav") or filename.endswith("mp3"): 28 | f.write("file '" + dirpath + '/'+ filename + "'\n") 29 | 30 | # Step 2: concatenate everything into one massive wav file 31 | print "concatenate all files" 32 | os.system('pwd') 33 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 34 | audio = "preprocess_all_audio.wav" 35 | print "get length" 36 | # # get the length of the resulting file 37 | length =
float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 38 | print length, "DURATION" 39 | print "split big file into chunks" 40 | # # Step 3: split the big file into 8-second chunks 41 | # overlapping 3 times per 8 seconds 42 | ''' 43 | for i in xrange(int((length//8)*3)-1): 44 | time = (i * 8 )/ 3 45 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 48000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 46 | ''' 47 | size = 8 48 | num = 6400 49 | for i in xrange(0, num): 50 | time = i * ((length-size)/float(num)) 51 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 48000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 52 | print "clean up" 53 | # # Step 4: clean up temp files 54 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 55 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 56 | print 'save as .npy' 57 | __RAND_SEED = 123 58 | def __fixed_shuffle(inp_list): 59 | if isinstance(inp_list, list): 60 | random.seed(__RAND_SEED) 61 | random.shuffle(inp_list) 62 | return 63 | #import collections 64 | #if isinstance(inp_list, (collections.Sequence)): 65 | if isinstance(inp_list, np.ndarray): 66 | np.random.seed(__RAND_SEED) 67 | np.random.shuffle(inp_list) 68 | return 69 | # destructive operations; in place; no need to return 70 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 71 | 72 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 73 | __fixed_shuffle(paths) 74 | 75 | #arr = [(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths] 76 | arr = [sf.read(p)[0].astype('float16') for p in paths] 77 | np_arr = np.array(arr) 78 | # 88/6/6 split 79 | length = len(np_arr) 80 | train_size = int(np.floor(length * .88)) # train 81 | test_size = int(np.floor(length * .06)) # test 82 | 83 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), np_arr) 84 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), np_arr[:train_size]) 85 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), np_arr[train_size:train_size + test_size]) 86 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), np_arr[train_size + test_size:]) 87 | 88 | #pass dataset name through two_tier.py || three_tier.py to datasets.py 89 | -------------------------------------------------------------------------------- /lib/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import time 3 | import scipy.io.wavfile 4 | import glob 5 | import sys 6 | import numpy 7 | import pickle 8 | import theano 9 | import theano.tensor as T 10 | 11 | tag = sys.argv[1] 12 | name = glob.glob("../results*/" + tag + "/args.pkl")[0] 13 | params = pickle.load(open(name, "r")) 14 | print params 15 | info = {} 16 | for p in xrange(1,len(params),2): 17 | if p+1 < len(params): 18 | info[params[p][2:]] = params[p+1] 19 | print info 20 | #exit() 21 | 22 | Q_TYPE = info["q_type"] 23 | Q_LEVELS = int(info["q_levels"]) 24 | N_RNN = int(info["n_rnn"]) 25 | DIM = int(info["dim"]) 26 | FRAME_SIZE = int(info["frame_size"]) 27 | 28 | 29 | #{'dim': '1024', 'q_type': 'linear', 'learn_h0': 'True', 'weight_norm': 'True', 'q_levels': '256', 'skip_conn': 'False', 'batch_size': '128', 'n_frames': '64', 'emb_size': '256', 'exp': 'KURT2x4', 'frame_size': '16', 'which_set': 'KURT', 'rnn_type': 'GRU', 'n_rnn': '4'} 30 | 31 | ###grab this stuff 32 | #args 33 | #Q_TYPE 34 | 
#Q_LEVELS 35 | #N_RNN 36 | #DIM 37 | #FRAME_SIZE 38 | 39 | BITRATE = 16000 40 | N_SEQS = 20 # Number of samples to generate at each monitoring checkpoint. 41 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude 42 | H0_MULT = 1 43 | 44 | RESULTS_DIR = 'results_2t' # default; overridden from the experiment path below 45 | RESULTS_DIR = name.split("/")[1] 46 | print RESULTS_DIR 47 | 48 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 49 | ### Create directories ### 50 | # FOLDER_PREFIX: root, contains: 51 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 52 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 53 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 54 | 55 | print SAMPLES_PATH 56 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 57 | # to study the behaviour of the model and also to introduce some diversity 58 | # to samples in a simple way. [it's disabled for now] 59 | sequences = T.imatrix('sequences') 60 | h0 = T.tensor3('h0') 61 | reset = T.iscalar('reset') 62 | mask = T.matrix('mask') 63 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 64 | fixed_rand_h0 -= 0.5 65 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 66 | 67 | def generate_and_save_samples(): 68 | # Sampling at frame level 69 | frame_level_generate_fn = theano.function( 70 | [sequences, h0, reset], 71 | frame_level_rnn(sequences, h0, reset), 72 | on_unused_input='warn' 73 | ) 74 | def write_audio_file(name, data): 75 | data = data.astype('float32') 76 | data -= data.min() 77 | data /= data.max() 78 | data -= 0.5 79 | data *= 0.95 80 | scipy.io.wavfile.write( 81 | os.path.join(SAMPLES_PATH, name+'.wav'), 82 | BITRATE, 83 | data) 84 | 85 | total_time = time() 86 | # Generate N_SEQS sample files, each 5 seconds long 87 | N_SECS = 5 88 | LENGTH = N_SECS*BITRATE 89 | 90 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 91 | samples[:, :FRAME_SIZE] = Q_ZERO 92 | 93 | # First half zero, others fixed random at each checkpoint 94 | h0 = numpy.zeros( 95 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 96 | dtype='float32' 97 | ) 98 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 99 | frame_level_outputs = None 100 | 101 | for t in xrange(FRAME_SIZE, LENGTH): 102 | 103 | if t % FRAME_SIZE == 0: 104 | frame_level_outputs, h0 = frame_level_generate_fn( 105 | samples[:, t-FRAME_SIZE:t], 106 | h0, 107 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 108 | numpy.int32(t == FRAME_SIZE) 109 | ) 110 | 111 | samples[:, t] = sample_level_generate_fn( 112 | frame_level_outputs[:, t % FRAME_SIZE], 113 | samples[:, t-FRAME_SIZE:t], 114 | ) 115 | 116 | total_time = time() - total_time 117 | log = "{} samples of {} seconds length generated in {} seconds."
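# Note: `frame_level_rnn` and `sample_level_generate_fn` are assumed to be
# supplied by the model definition (e.g. models/two_tier/two_tier*.py); they
# are not defined anywhere in this file. The loop above follows the two-tier
# SampleRNN generation scheme: every FRAME_SIZE timesteps the frame-level RNN
# consumes the previous FRAME_SIZE samples and emits FRAME_SIZE conditioning
# vectors, then the sample-level module maps each conditioning vector plus the
# last FRAME_SIZE samples to a distribution over Q_LEVELS bins, from which
# samples[:, t] is drawn.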
118 | log = log.format(N_SEQS, N_SECS, total_time) 119 | print log, 120 | 121 | for i in xrange(N_SEQS): 122 | samp = samples[i] 123 | if Q_TYPE == 'mu-law': 124 | from datasets.dataset import mu2linear 125 | samp = mu2linear(samp) 126 | elif Q_TYPE == 'a-law': 127 | raise NotImplementedError('a-law is not implemented') 128 | write_audio_file("sample_{}_{}".format(tag, i), samp) 129 | 130 | generate_and_save_samples() -------------------------------------------------------------------------------- /datasets/music/new_experiment16k_conditioning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os, subprocess, scikits.audiolab, random, time, glob, math 3 | 4 | 5 | from build_features import * 6 | 7 | PWD = os.getcwd() 8 | print 'PWD is', PWD 9 | #store dataset name 10 | DATASET_NAME = str(sys.argv[1]) 11 | DOWNLOAD_DIR = str(sys.argv[2]) 12 | print 'dl_dir is set to', DOWNLOAD_DIR 13 | #create the dataset directory 14 | print "creating directory for", DATASET_NAME 15 | DATASET_DIR = os.path.join(PWD, DATASET_NAME) 16 | os.makedirs(DATASET_DIR) 17 | #move samples from the download directory into the dataset directory 18 | print "moving samples" 19 | types = {'wav', "mp3"} 20 | for t in types: 21 | os.system('mv {}/*.{} {}/'.format(DOWNLOAD_DIR, t, DATASET_DIR)) 22 | #run preprocess 23 | print "preprocessing" 24 | OUTPUT_DIR=os.path.join(DATASET_DIR, "parts") 25 | os.makedirs(OUTPUT_DIR) 26 | # Step 1: write all filenames to a list 27 | with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f: 28 | for dirpath, dirnames, filenames in os.walk(DATASET_DIR): 29 | for filename in filenames: 30 | if filename.endswith(".wav") or filename.endswith("mp3"): 31 | f.write("file '" + dirpath + '/'+ filename + "'\n") 32 | 33 | # Step 2: concatenate everything into one massive wav file 34 | print "concatenate all files" 35 | os.system('pwd') 36 | os.system("ffmpeg -f concat -safe 0 -i {}/preprocess_file_list.txt {}/preprocess_all_audio.wav".format(DATASET_DIR, OUTPUT_DIR)) 37 | audio = "preprocess_all_audio.wav" 38 | print "get length" 39 | # # get the length of the resulting file 40 | length = float(subprocess.check_output('ffprobe -i {}/{} -show_entries format=duration -v quiet -of csv="p=0"'.format(OUTPUT_DIR, audio), shell=True)) 41 | print length, "DURATION" 42 | print "split big file into chunks" 43 | # # Step 3: split the big file into 8-second chunks 44 | # overlapping 3 times per 8 seconds 45 | ''' 46 | for i in xrange(int((length//8)*3)-1): 47 | time = (i * 8 )/ 3 48 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 49 | ''' 50 | 51 | # size in seconds of each chunk 52 | size = 8 53 | # number of chunks 54 | num_chunks = 3200 55 | 56 | # cj (conditioning) generate the feature matrix for the entire dataset WAV 57 | features = build_onset_envelope_feature("{}/preprocess_all_audio.wav".format(OUTPUT_DIR)) 58 | # frame_rate is the number of feature frames per second 59 | # calculate it by comparing length of features to length of audio 60 | # don't confuse feature frames with the SampleRNN frames 61 | total_num_frames = features.shape[0] 62 | num_features = features.shape[1] 63 | frame_rate = len(features)/float(length) 64 | # number of frames per chunk of audio 65 | frames_per_chunk = int(math.floor((size)*frame_rate)) 66 | # a matrix of chunks x frames x features 67 | feature_matrix = np.zeros((num_chunks, frames_per_chunk, num_features), dtype='float32') 68
| 69 | 70 | 71 | for i in xrange(0, num_chunks): 72 | time = i * ((length-size)/float(num_chunks)) 73 | 74 | # build the feature_matrix 75 | # it's the feature timesliced according to the start and end times of the chunk 76 | start_frame = int(math.floor((time)*frame_rate)) 77 | end_frame = start_frame + frames_per_chunk 78 | if(len(features)<=end_frame): 79 | end_frame = len(features)-1 80 | # print "start_frame", start_frame 81 | # print "end_frame", end_frame 82 | # print "features[start:end].shape", features[start_frame:end_frame].shape 83 | # print "len(features)", len(features) 84 | # print "time", time 85 | # print "frames_per_chunk", frames_per_chunk 86 | # print "frame_rate", frame_rate 87 | # print "total_num_frames", total_num_frames 88 | # print "num_features", num_features 89 | feature_matrix[i, :end_frame-start_frame] = features[start_frame:end_frame] # slice-assign so a clipped final chunk still fits 90 | 91 | os.system('ffmpeg -ss {} -t 8 -i {}/preprocess_all_audio.wav -ac 1 -ab 16k -ar 16000 {}/p{}.flac'.format(time, OUTPUT_DIR, OUTPUT_DIR, i)) 92 | print "clean up" 93 | 94 | 95 | 96 | # # Step 4: clean up temp files 97 | os.system('rm {}/preprocess_all_audio.wav'.format(OUTPUT_DIR)) 98 | os.system('rm {}/preprocess_file_list.txt'.format(DATASET_DIR)) 99 | print 'save as .npy' 100 | __RAND_SEED = 123 101 | def __fixed_shuffle(inp_list): 102 | if isinstance(inp_list, list): 103 | random.seed(__RAND_SEED) 104 | random.shuffle(inp_list) 105 | return 106 | #import collections 107 | #if isinstance(inp_list, (collections.Sequence)): 108 | if isinstance(inp_list, np.ndarray): 109 | np.random.seed(__RAND_SEED) 110 | np.random.shuffle(inp_list) 111 | return 112 | # destructive operations; in place; no need to return 113 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 114 | 115 | paths = sorted(glob.glob(OUTPUT_DIR+"/*.flac")) 116 | __fixed_shuffle(paths) 117 | 118 | # reorder the conditioning features to match the shuffled clip order (p{i}.flac was cut from chunk i) 119 | feature_matrix = feature_matrix[np.array([int(os.path.basename(p)[1:-5]) for p in paths])] 120 | 121 | 122 | # CJ (conditioning) 123 | # For conditioning, the np_arr should be structured as follows 124 | # np_arr[0] are the PCM samples as usual 125 | # np_arr[1] are the feature vectors 126 | 127 | # Turn the FLACs into PCM samples 128 | samples = np.array([(scikits.audiolab.flacread(p)[0]).astype('float16') for p in paths]) 129 | print samples.shape 130 | 131 | print feature_matrix.shape 132 | 133 | 134 | # 88/6/6 split 135 | length = samples.shape[0] 136 | train_size = int(np.floor(length * .88)) # train 137 | test_size = int(np.floor(length * .06)) # test 138 | 139 | np.save(os.path.join(DATASET_DIR,'all_music.npy'), samples) 140 | np.save(os.path.join(DATASET_DIR,'music_train.npy'), samples[:train_size]) 141 | np.save(os.path.join(DATASET_DIR,'music_valid.npy'), samples[train_size:train_size + test_size]) 142 | np.save(os.path.join(DATASET_DIR,'music_test.npy'), samples[train_size + test_size:]) 143 | 144 | np.save(os.path.join(DATASET_DIR,'all_features.npy'), feature_matrix) 145 | np.save(os.path.join(DATASET_DIR,'features_train.npy'), feature_matrix[:train_size]) 146 | np.save(os.path.join(DATASET_DIR,'features_valid.npy'), feature_matrix[train_size:train_size + test_size]) 147 | np.save(os.path.join(DATASET_DIR,'features_test.npy'), feature_matrix[train_size + test_size:]) 148 | 149 | #pass dataset name through two_tier.py || three_tier.py to datasets.py -------------------------------------------------------------------------------- /datasets/music/drum-preprocess.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess 3 | # requires sox, ffmpeg, and
ffprobe command line tools 4 | 5 | RAW_DATA_DIR=str(sys.argv[1]) 6 | TEMP_DIR=os.path.join(RAW_DATA_DIR, "temp") 7 | FR_DIR=os.path.join(RAW_DATA_DIR, "fr-parts") 8 | BR_DIR=os.path.join(RAW_DATA_DIR, "br-parts") 9 | F_DIR=os.path.join(RAW_DATA_DIR, "f-parts") 10 | B_DIR=os.path.join(RAW_DATA_DIR, "b-parts") 11 | SAMPLE_RATE = 16000 12 | os.makedirs(TEMP_DIR) 13 | os.makedirs(FR_DIR) 14 | os.makedirs(BR_DIR) 15 | os.makedirs(F_DIR) 16 | os.makedirs(B_DIR) 17 | 18 | def createParts(): 19 | def renderFlacs(fr, br, f, b): 20 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 21 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 22 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 23 | os.system('ffmpeg -i {}/{}_temp.wav -ac 1 -ab 16k -ar {} {}/p{}.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 24 | #pitch down 25 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 26 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 27 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 28 | os.system('ffmpeg -i {}/{}_down.wav -ac 1 -ab 16k -ar {} {}/p{}d.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 29 | #pitch up 30 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, fr, SAMPLE_RATE, FR_DIR, i))#convert part to flac 31 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, br, SAMPLE_RATE, BR_DIR, i))#convert part to flac 32 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, f, SAMPLE_RATE, F_DIR, i))#convert part to flac 33 | os.system('ffmpeg -i {}/{}_up.wav -ac 1 -ab 16k -ar {} {}/p{}u.flac'.format(TEMP_DIR, b, SAMPLE_RATE, B_DIR, i))#convert part to flac 34 | #initial preparation 35 | os.system('ffmpeg -i "{}" -ac 1 -ab 16k -ar {} {}/this_temp.wav'.format(full_name, SAMPLE_RATE, TEMP_DIR)) #resample this file as mono 16000smpls/s 36 | this_length = float(subprocess.check_output('ffprobe -i {}/this_temp.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(TEMP_DIR), shell=True)) #check length of resampled audio 37 | print full_name, ':', this_length, 'DURATION' 38 | pad_length = longest_length - this_length 39 | os.system('sox {}/this_temp.wav {}/r_temp.wav reverse'.format(TEMP_DIR, TEMP_DIR)) # reverse file 40 | if pad_length > 0.: # every audiofile except the largest 41 | #create temp files 42 | os.system('ffmpeg -f lavfi -i anullsrc=channel_layout=mono:sample_rate={} -t {} {}/anullsrc_temp.wav'.format(SAMPLE_RATE, pad_length, TEMP_DIR)) #create anullsrc_temp.wav zero-pad 43 | os.system('sox {}/anullsrc_temp.wav {}/r_temp.wav {}/fr_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #FR 44 | os.system('sox {}/r_temp.wav {}/anullsrc_temp.wav {}/br_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #BR 45 | os.system('sox {}/anullsrc_temp.wav {}/this_temp.wav {}/f_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #F 46 | os.system('sox {}/this_temp.wav {}/anullsrc_temp.wav {}/b_temp.wav'.format(TEMP_DIR, TEMP_DIR, TEMP_DIR)) #B 47 | 
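# Padding/augmentation scheme used here: every drum hit is zero-padded
# (anullsrc) up to the duration of the longest file, in four variants --
# f = silence then forward audio, b = forward audio then silence,
# fr = silence then reversed audio, br = reversed audio then silence --
# and each variant is additionally rendered one semitone down (pitch -100)
# and up (pitch +100) below, so one input WAV yields 12 equal-length FLAC
# training clips.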
# extend the data set by copying and repitching each sample up+down 1 semitone 48 | os.system('sox {}/fr_temp.wav {}/fr_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#FR down 49 | os.system('sox {}/br_temp.wav {}/br_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#BR down 50 | os.system('sox {}/f_temp.wav {}/f_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#F down 51 | os.system('sox {}/b_temp.wav {}/b_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#B down 52 | os.system('sox {}/fr_temp.wav {}/fr_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#FR up 53 | os.system('sox {}/br_temp.wav {}/br_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#BR up 54 | os.system('sox {}/f_temp.wav {}/f_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#F up 55 | os.system('sox {}/b_temp.wav {}/b_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#B up 56 | #final export 57 | renderFlacs('fr', 'br', 'f', 'b') #render parts 58 | #clean up temp files 59 | os.system('rm {}/anullsrc_temp.wav'.format(TEMP_DIR)) 60 | os.system('rm {}/fr_down.wav'.format(TEMP_DIR)) 61 | os.system('rm {}/br_down.wav'.format(TEMP_DIR)) 62 | os.system('rm {}/f_down.wav'.format(TEMP_DIR)) 63 | os.system('rm {}/b_down.wav'.format(TEMP_DIR)) 64 | os.system('rm {}/fr_up.wav'.format(TEMP_DIR)) 65 | os.system('rm {}/br_up.wav'.format(TEMP_DIR)) 66 | os.system('rm {}/f_up.wav'.format(TEMP_DIR)) 67 | os.system('rm {}/b_up.wav'.format(TEMP_DIR)) 68 | else: #longest file 69 | # extend the data set by copying and repitching each sample up+down 1 semitone 70 | os.system('sox {}/this_temp.wav {}/r_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))# up 71 | os.system('sox {}/this_temp.wav {}/r_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))# down 72 | os.system('sox {}/r_temp.wav {}/this_up.wav pitch 100'.format(TEMP_DIR, TEMP_DIR))#r up 73 | os.system('sox {}/r_temp.wav {}/this_down.wav pitch -100'.format(TEMP_DIR, TEMP_DIR))#r down 74 | # final export 75 | renderFlacs('r', 'r', 'this', 'this') 76 | #clean up temp files 77 | os.system('rm {}/r_up.wav'.format(TEMP_DIR)) 78 | os.system('rm {}/r_down.wav'.format(TEMP_DIR)) 79 | os.system('rm {}/this_up.wav'.format(TEMP_DIR)) 80 | os.system('rm {}/this_down.wav'.format(TEMP_DIR)) 81 | os.system('rm {}/r_temp.wav'.format(TEMP_DIR)) 82 | os.system('rm {}/this_temp.wav'.format(TEMP_DIR)) 83 | 84 | # Step 1: Find the largest file size in the audio dataset 85 | objects = os.listdir(RAW_DATA_DIR) 86 | sofar = 0 87 | largest = "" 88 | for item in objects: 89 | if ".wav" in item: 90 | size = os.path.getsize(os.path.join(RAW_DATA_DIR, item)) 91 | if size > sofar: 92 | sofar = size 93 | largest = item 94 | 95 | print "Largest file is ", sofar, "bytes" 96 | print largest 97 | os.system('ffmpeg -i "{}" -ac 1 -ab 16k -ar {} {}/longest_temp.wav'.format(os.path.join(RAW_DATA_DIR, largest), SAMPLE_RATE, TEMP_DIR)) #resample the largest file as mono 98 | longest_length = float(subprocess.check_output('ffprobe -i {}/longest_temp.wav -show_entries format=duration -v quiet -of csv="p=0"'.format(TEMP_DIR), shell=True)) 99 | #clean up longest temp wav 100 | os.system('rm {}/longest_temp.wav'.format(TEMP_DIR)) 101 | 102 | i = 0 103 | for dirpath, dirnames, filenames in os.walk(RAW_DATA_DIR): 104 | for filename in filenames: 105 | if filename.endswith(".wav"): 106 | full_name = dirpath + '/'+ filename # raw audio file 107 | createParts() 108 | i += 1 109 | #remove empty temp dir 110 | #os.system('rmdir {}'.format(TEMP_DIR)) 111 | -------------------------------------------------------------------------------- /datasets/dataset.py:
-------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | """ 4 | import numpy as np 5 | import random, time, os, glob 6 | 7 | def __getFile(dataset_name): 8 | return 'music/'+dataset_name+'/music_{}.npy' 9 | 10 | __base = [ 11 | ('Local', 'datasets/') 12 | ] 13 | 14 | __train = lambda s: s.format('train') 15 | __valid = lambda s: s.format('valid') 16 | __test = lambda s: s.format('test') 17 | 18 | def find_dataset(filename): 19 | for (k, v) in __base: 20 | tmp_path = os.path.join(v, filename) 21 | if os.path.exists(tmp_path): 22 | #print "Path on {}:".format(k) 23 | #print tmp_path 24 | return tmp_path 25 | #print "not found on {}".format(k) 26 | raise Exception('{} NOT FOUND!'.format(filename)) 27 | 28 | ### Basic utils ### 29 | def __round_to(x, y): 30 | """round x up to the nearest y""" 31 | return int(np.ceil(x / float(y))) * y 32 | 33 | def __normalize(data): 34 | """To range [0., 1.]""" 35 | data -= data.min(axis=1)[:, None] 36 | data /= data.max(axis=1)[:, None] 37 | return data 38 | 39 | def __linear_quantize(data, q_levels): 40 | """ 41 | floats in (0, 1) to ints in [0, q_levels-1] 42 | scales normalized across axis 1 43 | """ 44 | # Normalization is on mini-batch not whole file 45 | #eps = np.float64(1e-5) 46 | #data -= data.min(axis=1)[:, None] 47 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 48 | #data += eps/2 49 | #data = data.astype('int32') 50 | 51 | eps = np.float64(1e-5) 52 | data *= (q_levels - eps) 53 | data += eps/2 54 | data = data.astype('int32') 55 | return data 56 | 57 | def __a_law_quantize(data): 58 | """ 59 | :todo: 60 | """ 61 | raise NotImplementedError 62 | 63 | def linear2mu(x, mu=255): 64 | """ 65 | From Joao 66 | x should be normalized between -1 and 1 67 | Converts an array according to mu-law and discretizes it 68 | Note: 69 | mu2linear(linear2mu(x)) != x 70 | Because we are compressing to 8 bits here. 71 | They will sound pretty much the same, though. 72 | :usage: 73 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 74 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 75 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 76 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 77 | 0, 255, dtype('int16') 78 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 79 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 80 | -1, 0.9574371, dtype('float32') 81 | """ 82 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 83 | return ((x_mu + 1)/2 * mu).astype('int16') 84 | 85 | def mu2linear(x, mu=255): 86 | """ 87 | From Joao with modifications 88 | Converts an integer array from mu to linear 89 | For important notes and usage see: linear2mu 90 | """ 91 | mu = float(mu) 92 | x = x.astype('float32') 93 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 94 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 95 | 96 | def __mu_law_quantize(data): 97 | return linear2mu(data) 98 | 99 | def __batch_quantize(data, q_levels, q_type): 100 | """ 101 | One of 'linear', 'a-law', 'mu-law' for q_type. 102 | """ 103 | data = data.astype('float64') 104 | #data = __normalize(data) 105 | if q_type == 'linear': 106 | return __linear_quantize(data, q_levels) 107 | if q_type == 'a-law': 108 | return __a_law_quantize(data) 109 | if q_type == 'mu-law': 110 | # from [0, 1] to [-1, 1] 111 | #data = 2.*data-1. 112 | # Automatically quantized to 256 bins. 
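# Worked mu-law example (mu = 255): linear2mu computes
#   x_mu = sign(x) * log(1 + 255*|x|) / log(256)
# and rescales [-1, 1] to integer bins [0, 255], so -1.0 -> 0, 0.0 -> 127,
# 1.0 -> 255, and a quiet sample like x = 0.01 already lands at bin 156:
# the companding spends most of its resolution on small amplitudes.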
113 | return __mu_law_quantize(data) 114 | raise NotImplementedError 115 | 116 | __RAND_SEED = 123 117 | def __fixed_shuffle(inp_list): 118 | if isinstance(inp_list, list): 119 | random.seed(__RAND_SEED) 120 | random.shuffle(inp_list) 121 | return 122 | #import collections 123 | #if isinstance(inp_list, (collections.Sequence)): 124 | if isinstance(inp_list, np.ndarray): 125 | np.random.seed(__RAND_SEED) 126 | np.random.shuffle(inp_list) 127 | return 128 | # destructive operations; in place; no need to return 129 | raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list))) 130 | 131 | def __make_random_batches(inp_list, batch_size): 132 | batches = [] 133 | for i in xrange(len(inp_list) / batch_size): 134 | batches.append(inp_list[i*batch_size:(i+1)*batch_size]) 135 | 136 | __fixed_shuffle(batches) 137 | return batches 138 | 139 | 140 | ### MUSIC DATASET LOADER ### 141 | def __music_feed_epoch(files, 142 | batch_size, 143 | seq_len, 144 | overlap, 145 | q_levels, 146 | q_zero, 147 | q_type, 148 | real_valued=False): 149 | """ 150 | Helper function to load music dataset. 151 | Generator that yields training inputs (subbatch, reset, submask). `subbatch` contains 152 | quantized audio data; `reset` is a boolean indicating the start of a new 153 | sequence (i.e. you should reset h0 whenever `reset` is True). 154 | Feeds subsequences which overlap by a specified amount, so that the model 155 | can always have a target for every input in a given subsequence. 156 | Assumes all flac files have the same length. 157 | yields: (subbatch, reset, submask) 158 | subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP) 159 | reset: True or False 160 | """ 161 | batches = __make_random_batches(files, batch_size) 162 | 163 | for bch in batches: 164 | # batch_seq_len = length of longest sequence in the batch, rounded up to 165 | # the nearest SEQ_LEN. 166 | batch_seq_len = len(bch[0]) # should be 8*16000 167 | batch_seq_len = __round_to(batch_seq_len, seq_len) 168 | 169 | batch = np.zeros( 170 | (batch_size, batch_seq_len), 171 | dtype='float64' 172 | ) 173 | 174 | mask = np.ones(batch.shape, dtype='float32') 175 | 176 | for i, data in enumerate(bch): 177 | #data, fs, enc = scikits.audiolab.flacread(path) 178 | # data is float16 from reading the npy file 179 | batch[i, :len(data)] = data 180 | # This shouldn't change anything. All the flac files for Music 181 | # are the same length and the mask should be 1 everywhere.
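# Shape walk-through (illustrative numbers, assuming 8 s clips at 16 kHz with
# seq_len = 512 and overlap = frame_size = 16): batch_seq_len = 128000
# (already a multiple of 512), the q_zero prefix below grows batch to
# (batch_size, 128016), and the final loop yields 250 subbatches of shape
# (batch_size, 528), each advancing by seq_len so the overlap provides a
# target for every input position.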
182 | # mask[i, len(data):] = np.float32(0) 183 | 184 | if not real_valued: 185 | batch = __batch_quantize(batch, q_levels, q_type) 186 | 187 | batch = np.concatenate([ 188 | np.full((batch_size, overlap), q_zero, dtype='int32'), 189 | batch 190 | ], axis=1) 191 | else: 192 | batch -= __music_train_mean_std[0] 193 | batch /= __music_train_mean_std[1] 194 | batch = np.concatenate([ 195 | np.full((batch_size, overlap), 0, dtype='float32'), 196 | batch 197 | ], axis=1).astype('float32') 198 | 199 | mask = np.concatenate([ 200 | np.full((batch_size, overlap), 1, dtype='float32'), 201 | mask 202 | ], axis=1) 203 | 204 | for i in xrange(batch_seq_len // seq_len): 205 | reset = np.int32(i==0) 206 | subbatch = batch[:, i*seq_len : (i+1)*seq_len + overlap] 207 | submask = mask[:, i*seq_len : (i+1)*seq_len + overlap] 208 | yield (subbatch, reset, submask) 209 | 210 | def music_train_feed_epoch(d_name, *args): 211 | """ 212 | :parameters: 213 | batch_size: int 214 | seq_len: 215 | overlap: 216 | q_levels: 217 | q_zero: 218 | q_type: One of the following: 'linear', 'a-law', or 'mu-law' 219 | 4,340 (9.65 hours) in total 220 | With batch_size = 128: 221 | 4,224 (9.39 hours) in total 222 | 3,712 (88%, 8.25 hours) for training set 223 | 256 (6%, .57 hours) for validation set 224 | 256 (6%, .57 hours) for test set 225 | Note: 226 | 32 of Beethoven's piano sonatas available on archive.org (Public Domain) 227 | :returns: 228 | A generator yielding (subbatch, reset, submask) 229 | """ 230 | # Just check if valid/test sets are also available. If not, raise. 231 | find_dataset(__valid(__getFile(d_name))) 232 | find_dataset(__test(__getFile(d_name))) 233 | # Load train set 234 | data_path = find_dataset(__train(__getFile(d_name))) 235 | files = np.load(data_path) 236 | generator = __music_feed_epoch(files, *args) 237 | return generator 238 | 239 | def music_valid_feed_epoch(d_name, *args): 240 | """ 241 | See: 242 | music_train_feed_epoch 243 | """ 244 | data_path = find_dataset(__valid(__getFile(d_name))) 245 | files = np.load(data_path) 246 | generator = __music_feed_epoch(files, *args) 247 | return generator 248 | 249 | def music_test_feed_epoch(d_name, *args): 250 | """ 251 | See: 252 | music_train_feed_epoch 253 | """ 254 | data_path = find_dataset(__test(__getFile(d_name))) 255 | files = np.load(data_path) 256 | generator = __music_feed_epoch(files, *args) 257 | return generator 258 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /datasets/dataset_conditioning.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | """ 4 | import numpy as np 5 | import random, time, os, glob 6 | 7 | def __getFile(dataset_name): 8 | return 'music/'+dataset_name+'/music_{}.npy' 9 | 10 | def __getFeatures(dataset_name): 11 | return 'music/'+dataset_name+'/features_{}.npy' 12 | 13 | __base = [ 14 | ('Local', 'datasets/') 15 | ] 16 | 17 | __train = lambda s: s.format('train') 18 | __valid = lambda s: s.format('valid') 19 | __test = lambda s: s.format('test') 20 | 21 | def find_dataset(filename): 22 | for (k, v) in __base: 23 | tmp_path = os.path.join(v, filename) 24 | if os.path.exists(tmp_path): 25 | #print "Path on {}:".format(k) 26 | #print tmp_path 27 | return tmp_path 28 | #print "not found on {}".format(k) 29 | raise Exception('{} NOT FOUND!'.format(filename)) 30 | 31 | ### Basic utils ### 32 | def __round_to(x, y): 33 | """round x up to the nearest y""" 34 | return int(np.ceil(x / float(y))) * y 35 | 36 | def __normalize(data): 37 | """To range [0., 1.]""" 38 | data -= data.min(axis=1)[:, None] 39 | data /= data.max(axis=1)[:, None] 40 | return data 41 | 42 | def __linear_quantize(data, q_levels): 43 | """ 44 | floats in (0, 1) to ints in [0, q_levels-1] 45 | scales normalized across axis 1 46 | """ 47 | # Normalization is on mini-batch not whole file 48 | #eps = np.float64(1e-5) 49 | #data -= data.min(axis=1)[:, None] 50 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 51 | #data += eps/2 52 | #data = data.astype('int32') 53 | 54 | eps = np.float64(1e-5) 55 | data *= (q_levels - eps) 56 | data += eps/2 57 | data = data.astype('int32') 58 | return data 59 | 60 | def __a_law_quantize(data): 61 | """ 62 | :todo: 63 | """ 64 | raise NotImplementedError 65 | 66 | def linear2mu(x, mu=255): 67 | """ 68 | From Joao 69 | x should be normalized between -1 and 1 70 | Converts an array according to mu-law and discretizes it 71 | Note: 72 | mu2linear(linear2mu(x)) != x 73 | Because we are compressing to 8 bits here. 74 | They will sound pretty much the same, though. 75 | :usage: 76 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 77 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 78 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 79 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 80 | 0, 255, dtype('int16') 81 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 82 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 83 | -1, 0.9574371, dtype('float32') 84 | """ 85 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 86 | return ((x_mu + 1)/2 * mu).astype('int16') 87 | 88 | def mu2linear(x, mu=255): 89 | """ 90 | From Joao with modifications 91 | Converts an integer array from mu to linear 92 | For important notes and usage see: linear2mu 93 | """ 94 | mu = float(mu) 95 | x = x.astype('float32') 96 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 97 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 98 | 99 | def __mu_law_quantize(data): 100 | return linear2mu(data) 101 | 102 | def __batch_quantize(data, q_levels, q_type): 103 | """ 104 | One of 'linear', 'a-law', 'mu-law' for q_type. 
105 |     """
106 |     data = data.astype('float64')
107 |     data = __normalize(data)
108 |     if q_type == 'linear':
109 |         return __linear_quantize(data, q_levels)
110 |     if q_type == 'a-law':
111 |         return __a_law_quantize(data)
112 |     if q_type == 'mu-law':
113 |         # from [0, 1] to [-1, 1]
114 |         data = 2.*data-1.
115 |         # Automatically quantized to 256 bins.
116 |         return __mu_law_quantize(data)
117 |     raise NotImplementedError
118 | 
119 | __RAND_SEED = 123
120 | def __fixed_shuffle(inp_list):
121 |     if isinstance(inp_list, list):
122 |         random.seed(__RAND_SEED)
123 |         random.shuffle(inp_list)
124 |         return
125 |     #import collections
126 |     #if isinstance(inp_list, (collections.Sequence)):
127 |     if isinstance(inp_list, np.ndarray):
128 |         np.random.seed(__RAND_SEED)
129 |         np.random.shuffle(inp_list)
130 |         return
131 |     # destructive operations; in place; no need to return
132 |     raise ValueError("inp_list is neither a list nor a np.ndarray but a "+str(type(inp_list)))
133 | 
134 | def __make_random_batches(sample_data, feature_data, batch_size):
135 |     batches = []
136 |     print "sample_data.shape", sample_data.shape
137 |     print "feature_data.shape", feature_data.shape
138 |     print "len(sample_data)", len(sample_data)
139 |     print "batch_size", batch_size
140 |     print len(sample_data) / batch_size
141 | 
142 |     for i in xrange(len(sample_data) / batch_size):
143 |         sample_batch = sample_data[i*batch_size:(i+1)*batch_size]
144 |         feature_batch = feature_data[i*batch_size:(i+1)*batch_size]
145 |         batches.append([sample_batch, feature_batch])
146 | 
147 |     print "len(batches)", len(batches)
148 |     __fixed_shuffle(batches)
149 |     return batches
150 | 
151 | 
152 | ### MUSIC DATASET LOADER ###
153 | def __music_feed_epoch(sample_data, feature_data,
154 |                        batch_size,
155 |                        seq_len,
156 |                        overlap,
157 |                        q_levels,
158 |                        q_zero,
159 |                        q_type,
160 |                        real_valued=False):
161 |     """
162 |     Helper function to load the music dataset.
163 |     Generator that yields training inputs (subbatch, reset, submask, subfeatures).
164 |     `subbatch` contains quantized audio data; `reset` is a boolean indicating
165 |     the start of a new sequence (i.e. you should reset h0 whenever `reset` is True).
166 |     Feeds subsequences which overlap by a specified amount, so that the model
167 |     can always have a target for every input in a given subsequence.
168 |     Assumes all flac files have the same length.
169 |     returns: (subbatch, reset, submask, subfeatures)
170 |         subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
171 |         reset: True or False
172 |     """
173 |     batches = __make_random_batches(sample_data, feature_data, batch_size)
174 | 
175 |     for bch in batches:
176 | 
177 |         print "len(bch)", len(bch)
178 |         print bch[0].shape
179 |         print bch[1].shape
180 |         # batch_seq_len = length of longest sequence in the batch, rounded up to
181 |         # the nearest SEQ_LEN.
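        # Worked example of the rounding (a sketch, assuming the 8-second
        # 16 kHz chunks noted below and e.g. seq_len = 1024):
        # len(bch[0][0]) = 8*16000 = 128000, and __round_to(128000, 1024)
        # = ceil(128000/1024.)*1024 = 125*1024 = 128000, so nothing changes;
        # a chunk that is not an exact multiple rounds up, and the tail of
        # `batch` (np.zeros below) stays zero-padded.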
182 |         batch_seq_len = len(bch[0][0]) # should be 8*16000
183 |         batch_seq_len = __round_to(batch_seq_len, seq_len)
184 |         print "batch_seq_len", batch_seq_len
185 | 
186 |         batch = np.zeros(
187 |             (batch_size, batch_seq_len),
188 |             dtype='float64'
189 |         )
190 | 
191 |         num_features = bch[1].shape[2]
192 |         # cj (conditioning)
193 |         features = np.zeros((batch_size, batch_seq_len, num_features), dtype='float32')
194 |         print "num_features", num_features
195 |         print "features.shape", features.shape
196 | 
197 |         mask = np.ones(batch.shape, dtype='float32')
198 | 
199 |         for i, _ in enumerate(bch[0]):
200 |             chunk_samples = bch[0][i]
201 |             chunk_features = bch[1][i]
202 |             # print "len(chunk_samples)", len(chunk_samples)
203 |             # print "len(chunk_features)", len(chunk_features)
204 |             # samples are in data[0]
205 |             #data, fs, enc = scikits.audiolab.flacread(path)
206 |             # data is float16 from reading the npy file
207 |             batch[i, :len(chunk_samples)] = chunk_samples
208 |             # This shouldn't change anything. All the flac files for Music
209 |             # are the same length and the mask should be 1 everywhere.
210 |             # mask[i, len(data):] = np.float32(0)
211 |             # print "batch.shape", batch.shape
212 | 
213 |             # feature matrix is in data[1]
214 |             x = np.linspace(0, len(chunk_features), len(chunk_samples))
215 |             xp = np.linspace(0, len(chunk_features), len(chunk_features))
216 |             ## now is the time to upsample the features to the audio rate
217 |             for j in xrange(num_features):
218 |                 fp = chunk_features[:,j]
219 |                 interpolated = np.interp(x, xp, fp)
220 |                 # print "interpolated.shape", interpolated.shape
221 |                 # print "chunk_feats.shape", chunk_features.shape
222 |                 features[i, :len(chunk_samples), j] = interpolated
223 | 
224 |         if not real_valued:
225 |             batch = __batch_quantize(batch, q_levels, q_type)
226 | 
227 |             batch = np.concatenate([
228 |                 np.full((batch_size, overlap), q_zero, dtype='int32'),
229 |                 batch
230 |             ], axis=1)
231 |         else:
232 |             batch -= __music_train_mean_std[0]
233 |             batch /= __music_train_mean_std[1]
234 |             batch = np.concatenate([
235 |                 np.full((batch_size, overlap), 0, dtype='float32'),
236 |                 batch
237 |             ], axis=1).astype('float32')
238 | 
239 | 
240 |         mask = np.concatenate([
241 |             np.full((batch_size, overlap), 1, dtype='float32'),
242 |             mask
243 |         ], axis=1)
244 | 
245 | 
246 |         # cj (conditioning): not sure what this is for
247 |         """features = np.concatenate([
248 |             np.full((batch_size, overlap, num_features), 0, dtype='float32'),
249 |             features
250 |         ], axis=1)"""
251 |         print "overlap", overlap
252 | 
253 |         for i in xrange(batch_seq_len // seq_len):
254 |             reset = np.int32(i==0)
255 |             subbatch = batch[:, i*seq_len : (i+1)*seq_len + overlap]
256 |             submask = mask[:, i*seq_len : (i+1)*seq_len + overlap]
257 |             subfeatures = features[:, i*seq_len : (i+1)*seq_len]
258 |             # calculate the mean features over the whole sequence
259 |             #subfeatures = np.mean(features, axis=1).reshape(features.shape[0], features.shape[2])
260 |             yield (subbatch, reset, submask, subfeatures)
261 | 
262 | def music_train_feed_epoch(d_name, *args):
263 |     """
264 |     :parameters:
265 |         batch_size: int
266 |         seq_len:
267 |         overlap:
268 |         q_levels:
269 |         q_zero:
270 |         q_type: One of the following: 'linear', 'a-law', or 'mu-law'
271 |     4,340 (9.65 hours) in total
272 |     With batch_size = 128:
273 |         4,224 (9.39 hours) in total
274 |         3,712 (88%, 8.25 hours) for training set
275 |         256 (6%, .57 hours) for validation set
276 |         256 (6%, .57 hours) for test set
277 |     Note:
278 |         32 of Beethoven's piano sonatas available on archive.org (Public Domain)
279 |     :returns:
280 |         A generator yielding (subbatch, reset, submask, subfeatures)
281 |     """
282 |     # Just check if valid/test sets are also available. If not, raise.
283 |     find_dataset(__valid(__getFile(d_name)))
284 |     find_dataset(__test(__getFile(d_name)))
285 |     # Load train set
286 |     data_path = find_dataset(__train(__getFile(d_name)))
287 |     sample_data = np.load(data_path)
288 | 
289 |     # get local conditioning features
290 |     data_path = find_dataset(__train(__getFeatures(d_name)))
291 |     feature_data = np.load(data_path)
292 |     print "feature file: ", data_path
293 |     print "feature_data.shape", feature_data.shape
294 | 
295 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
296 |     return generator
297 | 
298 | def get_feature_data(d_name):
299 |     data_path = find_dataset(__train(__getFeatures(d_name)))
300 |     feature_data = np.load(data_path)
301 |     return feature_data
302 | 
303 | def music_valid_feed_epoch(d_name, *args):
304 |     """
305 |     See:
306 |         music_train_feed_epoch
307 |     """
308 |     data_path = find_dataset(__valid(__getFile(d_name)))
309 |     sample_data = np.load(data_path)
310 |     # get local conditioning features (note: loaded from the train split)
311 |     data_path = find_dataset(__train(__getFeatures(d_name)))
312 |     feature_data = np.load(data_path)
313 | 
314 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
315 |     return generator
316 | 
317 | def music_test_feed_epoch(d_name, *args):
318 |     """
319 |     See:
320 |         music_train_feed_epoch
321 |     """
322 |     data_path = find_dataset(__test(__getFile(d_name)))
323 |     sample_data = np.load(data_path)
324 |     # get local conditioning features (note: loaded from the train split)
325 |     data_path = find_dataset(__train(__getFeatures(d_name)))
326 |     feature_data = np.load(data_path)
327 | 
328 |     generator = __music_feed_epoch(sample_data, feature_data, *args)
329 |     return generator
330 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dadabots SampleRNN
2 | ## Generating Black Metal, Mathcore, Skate Punk, Beatbox, Meshuggah
3 | 
4 | Code accompanying the NIPS 2017 paper [Generating Black Metal and Math Rock: Beyond
5 | Bach, Beethoven, and Beatles](http://dadabots.com/nips2017/generating-black-metal-and-math-rock.pdf) and the MUME 2018 paper [Generating Albums with SampleRNN to Imitate Metal, Rock, and Punk Bands](http://musicalmetacreation.org/buddydrive/file/carr/).
6 | 
7 | We modified a SampleRNN architecture to generate music in modern genres such as black metal, math rock, skate punk, beatbox, etc.
8 | 
9 | This early example of neural synthesis is a proof-of-concept for how machine learning can drive new types of music software. Creating music can be as simple as specifying a set of music influences on which a model trains. We demonstrate a method for generating albums that imitate bands in experimental music genres previously unrealized by traditional synthesis techniques
10 | (e.g. additive, subtractive, FM, granular, concatenative). Unlike MIDI and symbolic models, SampleRNN generates raw audio in the time domain. This requirement becomes increasingly important in modern music styles where timbre and space are used compositionally. Long developmental compositions with rapid transitions between sections are possible by increasing the depth of the network beyond the number used for speech datasets. We are delighted by the unique characteristic artifacts of neural synthesis.
11 | 
12 | We've created [several albums](https://dadabots.bandcamp.com/) this way.
Read our papers for more explanation of how we use this as part of a creative workflow, how to choose good datasets, etc.
13 | 
14 | Dadabots is CJ Carr [[github]](https://github.com/Cortexelus) [[website]](http://cortexel.us) and Zack Zukowski [[github]](https://github.com/ZVK) [[website]](http://zackzukowski.com/)
15 | 
16 | # SampleRNN (Dadabots fork)
17 | 
18 | Original SampleRNN paper: [SampleRNN: An Unconditional End-to-End Neural Audio Generation Model](https://openreview.net/forum?id=SkxKPDv5xl).
19 | 
20 | ## Features
21 | - Load a dataset of audio
22 | - Train a model on that audio to predict "given what just happened, what comes next?"
23 | - Generate new audio by iteratively choosing "what comes next" indefinitely
24 | 
25 | ## Modifications from original code:
26 | - Auto-preprocessing (audio conversion, concatenation, chunking, and saving .npy files). We find that splitting an album into 3200 overlapping chunks of 8 seconds gives us good results.
27 | - New scripts for generating 100s of audio examples in parallel from a trained net.
28 | - New scripts for different sample rates are available (16k, 32k). 32k audio sounds better, but the nets take longer to train, and they don't learn structure as well as 16k.
29 | - Any processed datasets can be loaded into the two-tier network via arguments. This significantly speeds up the workflow without having to change code.
30 | - Sampling is picked from the distribution (not argmax). This makes sense because certain sounds (noise, texture, the "s" sound in speech) are inherently stochastic. It is also significant for avoiding traps (the generated audio getting stuck in a loop). A minimal sketch of this sampling step appears just below, at the start of the Training section.
31 | - Any number of RNN layers is now possible (until you run out of memory). This was significant to getting good results: the original limit was insufficient for music; we get good results with 5 layers.
32 | - Local conditioning. Although we haven't fully researched the possibilities of local conditioning, we coded it in.
33 | - Fix bad amplitude normalization causing DC offsets (see [issue](https://github.com/soroushmehr/sampleRNN_ICLR2017/issues/24)).
34 | 
35 | ## Dependencies
36 | 
37 | The original code lists:
38 | - cuDNN 5105
39 | - Python 2.7.12
40 | - Numpy 1.11.1
41 | - Theano 0.8.2
42 | - Lasagne 0.2.dev1
43 | - ffmpeg (libav-tools)
44 | 
45 | But we get much faster code using the next generation of GPU architecture with:
46 | - CUDA 9.2
47 | - cuDNN 8.0
48 | - Theano 1.0
49 | - NVIDIA V100 GPU
50 | 
51 | ## Setup
52 | 
53 | A detailed description of how we set up this code on Ubuntu 16.04 with an NVIDIA V100 GPU can be found here:
54 | 
55 | [DETAILED SETUP INSTRUCTIONS](https://github.com/Cortexelus/dadabots_sampleRNN/wiki/Installing-Dadabots-SampleRNN-on-Ubuntu)
56 | 
57 | 
58 | 
59 | ## Datasets
60 | To create a new dataset, place your audio here:
61 | ```
62 | datasets/music/downloads/
63 | ```
64 | then run the new experiment python script located in the datasets/music directory:
65 | 
66 | 16k sample rate:
67 | ```
68 | cd datasets/music/
69 | sudo python new_experiment16k.py krallice downloads/
70 | ```
71 | 
72 | 32k sample rate:
73 | ```
74 | cd datasets/music/
75 | sudo python new_experiment32k.py krallice downloads/
76 | ```
77 | 
78 | ## Training
79 | To train a model on an existing dataset with accelerated GPU processing, you need to run the following lines from the root of the `dadabots_sampleRNN` folder; they correspond to the best found set of hyper-parameters.
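
As noted in the modifications list above, generation samples each new audio value from the model's softmax output rather than taking the argmax (the repo calls `lib.ops.softmax_and_sample` for this). A minimal NumPy sketch of the idea, with hypothetical names:

```python
import numpy as np

def sample_next_value(logits, rng=np.random):
    # Numerically stable softmax over the Q_LEVELS quantization bins.
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # Draw one bin from the distribution. Using np.argmax(probs) here
    # instead would make generation deterministic and prone to getting
    # stuck in loops; sampling keeps noisy, textural sounds stochastic.
    return rng.choice(len(probs), p=probs)
```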
80 | 
81 | Mission control center:
82 | ```
83 | $ pwd
84 | /root/cj/dadabots_sampleRNN
85 | ```
86 | 
87 | ### Training SampleRNN (2-tier)
88 | ```
89 | $ python models/two_tier/two_tier32k.py -h
90 | usage: two_tier.py [-h] [--exp EXP] --n_frames N_FRAMES --frame_size
91 |                    FRAME_SIZE --weight_norm WEIGHT_NORM --emb_size EMB_SIZE
92 |                    --skip_conn SKIP_CONN --dim DIM --n_rnn {1,2,3,4,5}
93 |                    --rnn_type {LSTM,GRU} --learn_h0 LEARN_H0 --q_levels
94 |                    Q_LEVELS --q_type {linear,a-law,mu-law} --which_set
95 |                    {...} --batch_size {64,128,256} [--debug]
96 |                    [--resume]
97 | 
98 | two_tier.py No default value! Indicate every argument.
99 | 
100 | optional arguments:
101 |   -h, --help            show this help message and exit
102 |   --exp EXP             Experiment name (name it anything you want)
103 |   --n_frames N_FRAMES   How many "frames" to include in each Truncated BPTT
104 |                         pass
105 |   --frame_size FRAME_SIZE
106 |                         How many samples per frame
107 |   --weight_norm WEIGHT_NORM
108 |                         Adding learnable weight normalization to all the
109 |                         linear layers (except for the embedding layer)
110 |   --emb_size EMB_SIZE   Size of embedding layer (0 to disable)
111 |   --skip_conn SKIP_CONN
112 |                         Add skip connections to RNN
113 |   --dim DIM             Dimension of RNN and MLPs
114 |   --n_rnn {1,2,3,4,5,6,7,8,9,10,11,12,n,...}
115 |                         Number of layers in the stacked RNN
116 |   --rnn_type {LSTM,GRU}
117 |                         GRU or LSTM
118 |   --learn_h0 LEARN_H0   Whether to learn the initial state of RNN
119 |   --q_levels Q_LEVELS   Number of bins for quantization of audio samples.
120 |                         Should be 256 for mu-law.
121 |   --q_type {linear,a-law,mu-law}
122 |                         Quantization in linear-scale, a-law companding, or
123 |                         mu-law companding. With mu-/a-law quantization, levels
124 |                         should be set to 256
125 |   --which_set {...}
126 |                         The name of the dataset you created. In the above example "krallice"
127 |   --batch_size {64,128,256}
128 |                         size of mini-batch
129 |   --debug               Debug mode
130 |   --resume              Resume the same model from the last checkpoint. Order
131 |                         of params is important. [for now]
132 | ```
133 | 
134 | 
135 | If you're using CUDA 9 with V100 GPUs, you need "device=cuda0"
136 | 
137 | If you're using CUDA 8 with K80 GPUs or earlier, you may need "device=gpu0" instead
138 | 
139 | If you have 8 GPUs, you can run up to 8 experiments in parallel by setting the device to cuda0, cuda1, cuda2, cuda3... cuda7
140 | 
141 | 
142 | #### Our best hyperparameters
143 | 
144 | After training 100s of models with different hyperparameters, these were our best hyperparameters (at the limits of the V100 hardware) for the kind of music we wanted to generate. Further explanation of our choices can be found in our papers.
145 | 
146 | 
147 | ```
148 | THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python -u models/two_tier/two_tier16k.py --exp krallice_experiment --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set krallice
149 | ```
150 | 
151 | 
152 | ### Training SampleRNN (3-tier)
153 | 
154 | There's also a 3-tier option, but we initially had better results with 2-tier, so we don't use 3-tier. It doesn't have the modifications we made to 2-tier.
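
For orientation, the audio span covered by each truncated-BPTT pass follows directly from these flags; a back-of-envelope sketch (assuming the 16 kHz models):

```python
BITRATE = 16000  # samples per second in the 16k models

# 2-tier: samples per BPTT pass = n_frames * frame_size
two_tier_samples = 64 * 16           # --n_frames 64 --frame_size 16 -> 1024
print(two_tier_samples / float(BITRATE))    # 0.064 seconds of audio

# 3-tier ("To run" example below): --seq_len is the sample count directly
three_tier_samples = 512             # --seq_len 512
print(three_tier_samples / float(BITRATE))  # 0.032 seconds of audio
```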
155 | 
156 | ```
157 | $ python models/three_tier/three_tier.py -h
158 | usage: three_tier16k.py [-h] [--exp EXP] --seq_len SEQ_LEN --big_frame_size
159 |                         BIG_FRAME_SIZE --frame_size FRAME_SIZE --weight_norm
160 |                         WEIGHT_NORM --emb_size EMB_SIZE --skip_conn SKIP_CONN
161 |                         --dim DIM --n_rnn {1,2,3,4,5} --rnn_type {LSTM,GRU}
162 |                         --learn_h0 LEARN_H0 --q_levels Q_LEVELS --q_type
163 |                         {linear,a-law,mu-law} --which_set {ONOM,BLIZZ,MUSIC}
164 |                         --batch_size {64,128,256} [--debug] [--resume]
165 | 
166 | three_tier.py No default value! Indicate every argument.
167 | 
168 | optional arguments:
169 |   -h, --help            show this help message and exit
170 |   --exp EXP             Experiment name
171 |   --seq_len SEQ_LEN     How many samples to include in each Truncated BPTT
172 |                         pass
173 |   --big_frame_size BIG_FRAME_SIZE
174 |                         How many samples per big frame in tier 3
175 |   --frame_size FRAME_SIZE
176 |                         How many samples per frame in tier 2
177 |   --weight_norm WEIGHT_NORM
178 |                         Adding learnable weight normalization to all the
179 |                         linear layers (except for the embedding layer)
180 |   --emb_size EMB_SIZE   Size of embedding layer (> 0)
181 |   --skip_conn SKIP_CONN
182 |                         Add skip connections to RNN
183 |   --dim DIM             Dimension of RNN and MLPs
184 |   --n_rnn {1,2,3,4,5}   Number of layers in the stacked RNN
185 |   --rnn_type {LSTM,GRU}
186 |                         GRU or LSTM
187 |   --learn_h0 LEARN_H0   Whether to learn the initial state of RNN
188 |   --q_levels Q_LEVELS   Number of bins for quantization of audio samples.
189 |                         Should be 256 for mu-law.
190 |   --q_type {linear,a-law,mu-law}
191 |                         Quantization in linear-scale, a-law companding, or
192 |                         mu-law companding. With mu-/a-law quantization, levels
193 |                         should be set to 256
194 |   --which_set WHICH_SET
195 |                         any preprocessed set in the datasets/music/ directory
196 |   --batch_size {64,128,256}
197 |                         size of mini-batch
198 |   --debug               Debug mode
199 |   --resume              Resume the same model from the last checkpoint. Order
200 |                         of params is important. [for now]
201 | ```
202 | To run:
203 | ```
204 | $ THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python -u models/three_tier/three_tier.py --exp 3TIER --seq_len 512 --big_frame_size 8 --frame_size 2 --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type GRU --q_levels 256 --q_type linear --batch_size 128 --weight_norm True --learn_h0 True --which_set MUSIC
205 | 
206 | ```
207 | 
208 | ## Generating
209 | 
210 | Generate 100 songs (4 minutes each) from a trained 32k model:
211 | ```
212 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu0,floatX=float32 python -u models/two_tier/two_tier_generate32k.py --exp krallice_experiment --n_frames 64 --frame_size 16 --emb_size 256 --skip_conn True --dim 1024 --n_rnn 5 --rnn_type LSTM --q_levels 256 --q_type mu-law --batch_size 128 --weight_norm True --learn_h0 False --which_set krallice --n_secs 240 --n_seqs 100
213 | ```
214 | 
215 | All the parameters have to be the same as when you trained it. Notice we're calling `two_tier_generate32k.py` with two new flags, `--n_secs` and `--n_seqs`.
216 | 
217 | It will take just as much time to generate 100 songs as 5, because they are created in parallel (up to a hardware memory limit).
218 | 
219 | This will generate from the latest checkpoint. However, we found the latest checkpoint does not always create the best music. Instead, we listen to the test audio generated at each checkpoint, choose our favorite checkpoint, and delete the newer checkpoints before generating a huge batch with this script.
220 | 
221 | 
222 | ## Creative Workflow
223 | 
224 | At this point, we suggest human curation.
Listen through the generated audio, find the best parts, and use them in your music. Read our [MUME 2018 paper](http://musicalmetacreation.org/buddydrive/file/carr/) to see how our workflow changed over the course of six albums. 225 | 226 | 227 | ## Reference 228 | If you are using this code, please cite our paper: 229 | 230 | Generating Albums with SampleRNN to Imitate Metal, Rock, and Punk Bands. CJ Carr, Zack Zukowski (MUME 2018). 231 | 232 | And the original paper: 233 | 234 | SampleRNN: An Unconditional End-to-End Neural Audio Generation Model. Soroush Mehri, Kundan Kumar, Ishaan Gulrajani, Rithesh Kumar, Shubham Jain, Jose Sotelo, Aaron Courville, Yoshua Bengio, 5th International Conference on Learning Representations (ICLR 2017). 235 | 236 | ## License 237 | 238 | This documentation licensed CC-BY 4.0 239 | 240 | The source code is licensed Apache 2.0 241 | 242 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | import ops 2 | #import lasagne 3 | #from theano.compile.nanguardmode import NanGuardMode 4 | 5 | import math 6 | import time 7 | import locale 8 | 9 | import numpy 10 | import theano 11 | import theano.tensor as T 12 | import theano.gof 13 | 14 | import cPickle as pickle 15 | #import pickle 16 | import warnings 17 | import sys, os, errno, glob 18 | 19 | import matplotlib 20 | matplotlib.use('Agg') 21 | import matplotlib.pyplot as plt 22 | 23 | # TODO: Grouping is not working on cluster! :-? 24 | # Set a locale first or you won't get grouping at all 25 | locale.setlocale(locale.LC_ALL, '') 26 | # 'en_US.UTF-8' 27 | 28 | _params = {} 29 | def param(name, *args, **kwargs): 30 | """ 31 | A wrapper for `theano.shared` which enables parameter sharing in models. 32 | 33 | Creates and returns theano shared variables similarly to `theano.shared`, 34 | except if you try to create a param with the same name as a 35 | previously-created one, `param(...)` will just return the old one instead of 36 | making a new one. 37 | 38 | This constructor also adds a `param` attribute to the shared variables it 39 | creates, so that you can easily search a graph for all params. 40 | """ 41 | 42 | if name not in _params: 43 | kwargs['name'] = name 44 | param = theano.shared(*args, **kwargs) 45 | param.param = True 46 | _params[name] = param 47 | return _params[name] 48 | 49 | def delete_params(name): 50 | to_delete = [p_name for p_name in _params if name in p_name] 51 | for p_name in to_delete: 52 | del _params[p_name] 53 | 54 | def search(node, critereon): 55 | """ 56 | Traverse the Theano graph starting at `node` and return a list of all nodes 57 | which match the `critereon` function. 
When optimizing a cost function, you 58 | can use this to get a list of all of the trainable params in the graph, like 59 | so: 60 | 61 | `lib.search(cost, lambda x: hasattr(x, "param"))` 62 | or 63 | `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)` 64 | """ 65 | 66 | def _search(node, critereon, visited): 67 | if node in visited: 68 | return [] 69 | visited.add(node) 70 | 71 | results = [] 72 | if isinstance(node, T.Apply): 73 | for inp in node.inputs: 74 | results += _search(inp, critereon, visited) 75 | else: # Variable node 76 | if critereon(node): 77 | results.append(node) 78 | if node.owner is not None: 79 | results += _search(node.owner, critereon, visited) 80 | return results 81 | 82 | return _search(node, critereon, set()) 83 | 84 | def floatX(x): 85 | """ 86 | Convert `x` to the numpy type specified in `theano.config.floatX`. 87 | """ 88 | if theano.config.floatX == 'float16': 89 | return numpy.float16(x) 90 | elif theano.config.floatX == 'float32': 91 | return numpy.float32(x) 92 | else: # Theano's default float type is float64 93 | print "Warning: lib.floatX using float64" 94 | return numpy.float64(x) 95 | 96 | def save_params(path): 97 | param_vals = {} 98 | for name, param in _params.iteritems(): 99 | param_vals[name] = param.get_value() 100 | 101 | with open(path, 'wb') as f: 102 | pickle.dump(param_vals, f) 103 | 104 | def load_params(path): 105 | with open(path, 'rb') as f: 106 | param_vals = pickle.load(f) 107 | 108 | for name, val in param_vals.iteritems(): 109 | _params[name].set_value(val) 110 | 111 | def clear_all_params(): 112 | to_delete = [p_name for p_name in _params] 113 | for p_name in to_delete: 114 | del _params[p_name] 115 | 116 | def ensure_dir(dirname): 117 | """ 118 | Ensure that a named directory exists; if it does not, attempt to create it. 119 | """ 120 | try: 121 | os.makedirs(dirname) 122 | except OSError, e: 123 | if e.errno != errno.EEXIST: 124 | raise 125 | 126 | __model_setting_file_name = 'model_settings.txt' 127 | def print_model_settings(locals_var, path=None, sys_arg=False): 128 | """ 129 | Prints all variables in upper case in locals_var, 130 | except for T which usually stands for theano.tensor. 131 | If locals() passed as input to this method, will print 132 | all the variables in upper case defined so far, that is 133 | model settings. 134 | 135 | With `path` as an address to a directory it will _append_ it 136 | as a file named `model_settings.txt` as well. 137 | 138 | With `sys_arg` set to True, log information about Python, Numpy, 139 | and Theano and passed arguments to the script will be added too. 140 | args.pkl would be overwritten, specially in case of resuming a job. 141 | But again that wouldn't be much of a problem as all the passed args 142 | to the script except for '--resume' should be the same. 143 | 144 | With both `path` and `sys_arg` passed, dumps the theano.config. 
145 | 146 | :usage: 147 | >>> import theano.tensor as T 148 | >>> import lib 149 | >>> BATCH_SIZE, DIM = 128, 512 150 | >>> DATA_PATH = '/Path/to/dataset' 151 | >>> lib.print_model_settings(locals(), path='./') 152 | """ 153 | log = "" 154 | if sys_arg: 155 | try: 156 | log += "Python:\n" 157 | log += "\tsys.version_info\t{}\n".format(str(sys.version_info)) 158 | log += "Numpy:\n" 159 | log += "\t.__version__\t{}\n".format(numpy.__version__) 160 | log += "Theano:\n" 161 | log += "\t.__version__\t{}\n".format(theano.__version__) 162 | log += "\n\nAll passed args:\n" 163 | log += str(sys.argv) 164 | log += "\n" 165 | except: 166 | print "Something went wrong during sys_arg logging. Continue anyway!" 167 | 168 | log += "\nModel settings:" 169 | all_vars = [(k,v) for (k,v) in locals_var.items() if (k.isupper() and k != 'T')] 170 | all_vars = sorted(all_vars, key=lambda x: x[0]) 171 | for var_name, var_value in all_vars: 172 | log += ("\n\t%-20s %s" % (var_name, var_value)) 173 | print log 174 | if path is not None: 175 | ensure_dir(path) 176 | # Don't override, just append if by mistake there is something in the file. 177 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 178 | f.write(log) 179 | if sys_arg: 180 | with open(os.path.join(path, 'th_conf.txt'), 'a+') as f: 181 | f.write(str(theano.config)) 182 | with open(os.path.join(path, 'args.pkl'), 'wb') as f: 183 | pickle.dump(sys.argv, f) 184 | # To load: 185 | # >>> import cPickle as pickle 186 | # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb')) 187 | 188 | def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True): 189 | """ 190 | Default criterion: 191 | lambda x: hasattr(x, 'param') and x.param==True 192 | This will return every parameter for cost from computation graph. 193 | 194 | To exclude a parameter, just set 'param' to False: 195 | >>> h0 = lib.param('h0',\ 196 | numpy.zeros((3, 2*512), dtype=theano.config.floatX)) 197 | >>> print h0.param # Default: True 198 | >>> h0.param = False 199 | 200 | In this case one still can get list of all params (False or True) by: 201 | >>> lib.get_params(cost, lambda x: hasattr(x, 'param') 202 | 203 | :returns: 204 | A list of params 205 | """ 206 | return search(cost, criterion) 207 | 208 | def print_params_info(params, path=None): 209 | """ 210 | Print information about the parameters in the given param set. 211 | 212 | With `path` as an address to a directory it will _append_ it 213 | as a file named `model_settings.txt` as well. 214 | 215 | :usage: 216 | >>> params = lib.get_params(cost) 217 | >>> lib.print_params_info(params, path='./') 218 | """ 219 | params = sorted(params, key=lambda p: p.name) 220 | values = [p.get_value(borrow=True) for p in params] 221 | shapes = [p.shape for p in values] 222 | total_param_count = 0 223 | multiply_all = lambda a, b: a*b 224 | log = "\nParams for cost:" 225 | for param, value, shape in zip(params, values, shapes): 226 | log += ("\n\t%-20s %s" % (shape, param.name)) 227 | total_param_count += reduce(multiply_all, shape) 228 | 229 | log += "\nTotal parameter count for this cost:\n\t{0}".format( 230 | locale.format("%d", total_param_count, grouping=True) 231 | ) 232 | print log 233 | 234 | if path is not None: 235 | ensure_dir(path) 236 | # Don't override, just append if by mistake there is something in the file. 
237 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 238 | f.write(log) 239 | 240 | __train_log_file_name = 'train_log.pkl' 241 | def save_training_info(values, path): 242 | """ 243 | Gets a set of values as dictionary and append them to a log file. 244 | stores in /train_log.pkl 245 | """ 246 | file_name = os.path.join(path, __train_log_file_name) 247 | try: 248 | with open(file_name, "rb") as f: 249 | log = pickle.load(f) 250 | except IOError: # first time 251 | log = {} 252 | for k in values.keys(): 253 | log[k] = [] 254 | for k, v in values.items(): 255 | log[k].append(v) 256 | with open(file_name, "wb") as f: 257 | pickle.dump(log, f) 258 | 259 | resume_key = 'last resume index' 260 | def resumable(path, 261 | iter_key='iter', 262 | epoch_key='epoch', 263 | add_resume_counter=True, 264 | other_keys=[]): 265 | """ 266 | :warning: 267 | This is a naive implementation of resuming a training session 268 | and does not save and reload the training loop. The serialization 269 | of training loop and everything is costly and error-prone. 270 | 271 | :todo: 272 | - Save and load a serializable training loop. (See warning above) 273 | - Heavily dependent on the "model" file and the names used there right 274 | now. It's really easy to miss anything. 275 | 276 | `path` should be pointing at the root directory where `train_log.pkl` 277 | (See __train_log_file_name) and `params/` reside. 278 | 279 | Always assuming all the values in the log dictionary (except `resume_key`), 280 | are lists with the same length. 281 | """ 282 | file_name = os.path.join(path, __train_log_file_name) 283 | # Raise error if does not exists. 284 | with open(file_name, "rb") as f: 285 | log = pickle.load(f) 286 | 287 | param_found = False 288 | res_path = os.path.join(path, 'params', 'params_e{}_i{}*.pkl') 289 | for reverse_idx in range(-1, -len(log[epoch_key])-1, -1): 290 | ep, it = log[epoch_key][reverse_idx], log[iter_key][reverse_idx] 291 | print "> Params file for epoch {} iter {}".format(ep, it), 292 | last_path = glob.glob(res_path.format(ep, it)) 293 | if len(last_path) == 1: 294 | res_path = last_path[0] 295 | param_found = True 296 | print "found." 297 | break 298 | elif len(last_path) == 0: 299 | print "[NOT FOUND]. FALLING BACK TO..." 300 | else: # > 1 301 | # choose one, warning, rare 302 | print "[multiple version found]:" 303 | for l_path in last_path: 304 | print l_path 305 | res_path = last_path[0] 306 | param_found = True 307 | print "Arbitrarily choosing first:\n\t{}".format(res_path) 308 | 309 | assert 'reverse_idx' in locals(), 'Empty train_log???\n{}'.format(log) 310 | # Finishing for loop with no success 311 | assert param_found, 'No matching params file with train_log' 312 | 313 | acceptable_len = reverse_idx+len(log[epoch_key])+1 314 | if acceptable_len != len(log[epoch_key]): 315 | # Backup of the old train_log 316 | with open(file_name+'.backup', 'wb') as f: 317 | pickle.dump(log, f) 318 | 319 | # Change the log file to match the last existing checkpoint. 320 | for k, v in log.items(): 321 | # Fix resume indices 322 | if k == resume_key: 323 | log[k] = [i for i in log[k] if i < acceptable_len] 324 | continue 325 | # Rest is useless with no param file. 
326 | log[k] = v[:acceptable_len] 327 | 328 | epochs = log[epoch_key] 329 | iters = log[iter_key] 330 | 331 | if add_resume_counter: 332 | resume_val = len(epochs) 333 | if not resume_key in log.keys(): 334 | log[resume_key] = [resume_val] 335 | else: 336 | if log[resume_key] == [] or log[resume_key][-1] != resume_val: 337 | log[resume_key].append(resume_val) 338 | with open(file_name, "wb") as f: 339 | pickle.dump(log, f) 340 | 341 | last_epoch = epochs[-1] 342 | last_iter = iters[-1] 343 | 344 | # The if-else statement is more readable than `next`: 345 | #iters_to_consume = next((last_iter%(i-1) for (e, i) in\ 346 | # zip(epochs, iters) if e == 1), last_iter) 347 | if last_epoch == 0: 348 | iters_to_consume = last_iter 349 | else: 350 | for e, i in zip(epochs, iters): 351 | # first time. Epoch turns from 0 to 1. 352 | # At the end of each `epoch` there should be 353 | # a monitoring step so it will gives number 354 | # number of iterations per epoch 355 | if e == 1: 356 | iters_per_epoch = i - 1 357 | break 358 | iters_to_consume = last_iter % iters_per_epoch 359 | 360 | last_other_keys = [log[k][-1] for k in other_keys] 361 | return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys 362 | 363 | def plot_traing_info(x, ylist, path): 364 | """ 365 | Loads log file and plot x and y values as provided by input. 366 | Saves as /train_log.png 367 | """ 368 | file_name = os.path.join(path, __train_log_file_name) 369 | try: 370 | with open(file_name, "rb") as f: 371 | log = pickle.load(f) 372 | except IOError: # first time 373 | warnings.warn("There is no {} file here!!!".format(file_name)) 374 | return 375 | plt.figure() 376 | x_vals = log[x] 377 | for y in ylist: 378 | y_vals = log[y] 379 | if len(y_vals) != len(x_vals): 380 | warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x)) 381 | plt.plot(x_vals, y_vals, label=y) 382 | # assert len(y_vals) == len(x_vals), "not the same len" 383 | plt.xlabel(x) 384 | plt.legend() 385 | #plt.show() 386 | plt.savefig(file_name[:-3]+'png', bbox_inches='tight') 387 | plt.close('all') 388 | 389 | def create_logging_folders(path): 390 | """ 391 | Handle structure of folders and naming here instead of training file. 392 | 393 | :todo: 394 | - Implement! 395 | """ 396 | pass 397 | 398 | def tv(var): 399 | """ 400 | :todo: 401 | - add tv() function for theano variables so that instead of calling 402 | x.tag.test_value, you can get the same thing just by calling the method 403 | in a faster way... 404 | - also for x.tag.test_value.shape 405 | """ 406 | # Based on EAFP (easier to ask for forgiveness than permission) 407 | try: 408 | return var.tag.test_value 409 | except AttributeError: 410 | print "NONE, test_value has not been set." 411 | import ipdb; ipdb.set_trace() 412 | 413 | ## Rather than LBYL (look before you leap) 414 | #if hasattr(var, 'tag'): 415 | # if hasattr(var.tag, 'test_value'): 416 | # return var.tag.test_value 417 | # else: 418 | # print "NONE, test_value has not set." 419 | # import ipdb; ipdb.set_trace() 420 | #else: 421 | # print "NONE, tag has not set." 422 | # import ipdb; ipdb.set_trace() 423 | 424 | def tvs(var): 425 | """ 426 | :returns: 427 | var.tag.test_value.shape 428 | """ 429 | return tv(var).shape 430 | 431 | def _is_symbolic(v): 432 | r"""Return `True` if any of the arguments are symbolic. 
433 | See: 434 | https://github.com/Theano/Theano/wiki/Cookbook 435 | """ 436 | symbolic = False 437 | v = list(v) 438 | for _container, _iter in [(v, xrange(len(v)))]: 439 | for _k in _iter: 440 | _v = _container[_k] 441 | if isinstance(_v, theano.gof.Variable): 442 | symbolic = True 443 | return symbolic 444 | 445 | def unique_list(inp_list): 446 | """ 447 | returns a list with unique values of inp_list. 448 | :usage: 449 | >>> inp_list = ['a', 'b', 'c'] 450 | >>> unique_inp_list = unique_list(inp_list*2) 451 | """ 452 | return list(set(inp_list)) 453 | -------------------------------------------------------------------------------- /models/one_tier/wavent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | WaveNets Audio Generation Model 4 | 5 | How-to-run example: 6 | 7 | sampleRNN$ 8 | THEANO_FLAGS=mode=FAST_RUN,device=gpu1,floatX=float32,lib.cnmem=.95 python models/one_tier/wavent.py --dim 64 --q_levels 256 --q_type linear --which_set MUSIC --batch_size 8 --wavenet_blocks 4 --dilation_layers_per_block 10 --sequence_len_to_train 1600 9 | """ 10 | import time 11 | from datetime import datetime 12 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 13 | exp_start = time.time() 14 | 15 | import os, sys 16 | sys.path.insert(1, os.getcwd()) 17 | import argparse 18 | 19 | import numpy 20 | numpy.random.seed(123) 21 | np = numpy 22 | import random 23 | random.seed(123) 24 | 25 | import theano 26 | import theano.tensor as T 27 | import theano.ifelse 28 | import lasagne 29 | import scipy.io.wavfile 30 | 31 | import lib 32 | 33 | 34 | ### Parsing passed args/hyperparameters ### 35 | def get_args(): 36 | def t_or_f(arg): 37 | ua = str(arg).upper() 38 | if 'TRUE'.startswith(ua): 39 | return True 40 | elif 'FALSE'.startswith(ua): 41 | return False 42 | else: 43 | raise ValueError('Arg is neither `True` nor `False`') 44 | 45 | def check_non_negative(value): 46 | ivalue = int(value) 47 | if ivalue < 0: 48 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 49 | return ivalue 50 | 51 | def check_positive(value): 52 | ivalue = int(value) 53 | if ivalue < 1: 54 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 55 | return ivalue 56 | 57 | def check_unit_interval(value): 58 | fvalue = float(value) 59 | if fvalue < 0 or fvalue > 1: 60 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 61 | return fvalue 62 | 63 | # No default value here. Indicate every single arguement. 64 | parser = argparse.ArgumentParser( 65 | description='two_tier.py\nNo default value! Indicate every argument.') 66 | 67 | # Hyperparameter arguements: 68 | parser.add_argument('--exp', help='Experiment name', 69 | type=str, required=False, default='_') 70 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 71 | type=check_positive, required=True) 72 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 73 | type=check_positive, required=True) 74 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. 
With mu-/a-law quantization level shoud be set as 256',\ 75 | choices=['linear', 'a-law', 'mu-law'], required=True) 76 | #parser.add_argument('--nll_coeff', help='Value of alpha in [0, 1] for cost=alpha*NLL+(1-alpha)*FFT_cost',\ 77 | # type=check_unit_interval, required=True) 78 | parser.add_argument('--which_set', help='ONOM, BLIZZ, or MUSIC', 79 | choices=['ONOM', 'BLIZZ', 'MUSIC', 'HUCK'], required=True) 80 | parser.add_argument('--batch_size', help='size of mini-batch', 81 | type=check_positive, choices=[8, 16, 32, 64, 128, 256], required=True) 82 | parser.add_argument('--wavenet_blocks', help='Number of wavnet blocks to use', 83 | type=check_positive, required=True) 84 | parser.add_argument('--dilation_layers_per_block', help='number of dilation layers per block', 85 | type=check_positive, required=True) 86 | 87 | parser.add_argument('--sequence_len_to_train', help='size of output map', 88 | type=check_positive, required=True) 89 | 90 | parser.add_argument('--debug', help='debug mode', required=False, default=False, action='store_true') 91 | 92 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 93 | required=False, default=False, action='store_true') 94 | 95 | args = parser.parse_args() 96 | 97 | # Create tag for this experiment based on passed args 98 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 99 | print "Created experiment tag for these args:" 100 | print tag 101 | 102 | return args, tag 103 | 104 | args, tag = get_args() 105 | 106 | # N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 107 | OVERLAP = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1# How many samples per frame 108 | #GLOBAL_NORM = args.global_norm 109 | DIM = args.dim # Model dimensionality. 110 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 111 | Q_TYPE = args.q_type # log- or linear-scale 112 | #NLL_COEFF = args.nll_coeff 113 | WHICH_SET = args.which_set 114 | BATCH_SIZE = args.batch_size 115 | #DATA_PATH = args.data_path 116 | 117 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 118 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 119 | 120 | # Fixed hyperparams 121 | GRAD_CLIP = 1 # Elementwise grad clip threshold 122 | BITRATE = 16000 123 | 124 | # Other constants 125 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 126 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 127 | #TRAIN_MODE = 'time-iters' 128 | # To use PRINT_TIME for validation, 129 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 130 | #TRAIN_MODE = 'iters-time' 131 | # To use PRINT_ITERS for validation, 132 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 133 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 134 | STOP_ITERS = 100000 # Stop after this many iterations 135 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 136 | STOP_TIME = 60*60*60 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 137 | N_SEQS = 10 # Number of samples to generate every time monitoring. 
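# Worked example for the how-to-run settings in the docstring at the top of
# this file (--wavenet_blocks 4 --dilation_layers_per_block 10):
#   OVERLAP = (2**10 - 1)*4 + 1 = 4093 samples of left context (the model's
#   receptive field), i.e. roughly 0.26 seconds at BITRATE = 16000.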
138 | FOLDER_PREFIX = os.path.join('results_wavenets', tag) 139 | SEQ_LEN = args.sequence_len_to_train # Total length (# of samples) of each truncated BPTT sequence 140 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 141 | 142 | LEARNING_RATE = lib.floatX(numpy.float32(0.0001)) 143 | RESUME = args.resume 144 | 145 | epoch_str = 'epoch' 146 | iter_str = 'iter' 147 | lowest_valid_str = 'lowest valid cost' 148 | corresp_test_str = 'correponding test cost' 149 | train_nll_str, valid_nll_str, test_nll_str = \ 150 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 151 | 152 | if args.debug: 153 | import warnings 154 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 155 | TRAIN_MODE = 'time-iters' 156 | PRINT_TIME = 100 157 | STOP_TIME = 300 158 | STOP_ITERS = 1000 159 | 160 | ### Create directories ### 161 | # FOLDER_PREFIX: root, contains: 162 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 163 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 164 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 165 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 166 | 167 | if not os.path.exists(FOLDER_PREFIX): 168 | os.makedirs(FOLDER_PREFIX) 169 | 170 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 171 | 172 | if not os.path.exists(PARAMS_PATH): 173 | os.makedirs(PARAMS_PATH) 174 | 175 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 176 | 177 | if not os.path.exists(SAMPLES_PATH): 178 | os.makedirs(SAMPLES_PATH) 179 | 180 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 181 | 182 | if not os.path.exists(BEST_PATH): 183 | os.makedirs(BEST_PATH) 184 | 185 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 186 | 187 | ### Creating computation graph ### 188 | 189 | def create_wavenet_block(inp, num_dilation_layer, input_dim, output_dim, name =None): 190 | assert name is not None 191 | layer_out = inp 192 | skip_contrib = [] 193 | skip_weights = lib.param(name+".parametrized_weights", lib.floatX(numpy.ones((num_dilation_layer,)))) 194 | for i in range(num_dilation_layer): 195 | layer_out, skip_c = lib.ops.dil_conv_1D( 196 | layer_out, 197 | output_dim, 198 | input_dim if i == 0 else output_dim, 199 | 2, 200 | dilation = 2**i, 201 | non_linearity = 'gated', 202 | name = name+".dilation_{}".format(i+1) 203 | ) 204 | skip_c = skip_c*skip_weights[i] 205 | 206 | skip_contrib.append(skip_c) 207 | 208 | skip_out = skip_contrib[-1] 209 | 210 | j = 0 211 | for i in range(num_dilation_layer-1): 212 | j += 2**(num_dilation_layer-i-1) 213 | skip_out = skip_out + skip_contrib[num_dilation_layer-2 - i][:,j:] 214 | 215 | return layer_out, skip_out 216 | 217 | def create_model(inp): 218 | out = (inp.astype(theano.config.floatX)/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5)) 219 | l_out = out.dimshuffle(0,1,'x') 220 | 221 | skips = [] 222 | for i in range(args.wavenet_blocks): 223 | l_out, skip_out = create_wavenet_block(l_out, args.dilation_layers_per_block, 1 if i == 0 else args.dim, args.dim, name = "block_{}".format(i+1)) 224 | skips.append(skip_out) 225 | 226 | out = skips[-1] 227 | 228 | for i in range(args.wavenet_blocks - 1): 229 | out = out + skips[args.wavenet_blocks - 2 - i][:,(2**args.dilation_layers_per_block - 1)*(i+1):] 230 | 231 | for i in range(3): 232 | out = lib.ops.conv1d("out_{}".format(i+1), out, args.dim, args.dim, 1, non_linearity='relu') 233 | 234 | out = lib.ops.conv1d("final", out, args.dim, args.q_levels, 1, non_linearity='identity') 235 | 236 | 
return out 237 | 238 | sequences = T.imatrix('sequences') 239 | h0 = T.tensor3('h0') 240 | reset = T.iscalar('reset') 241 | mask = T.matrix('mask') 242 | 243 | if args.debug: 244 | # Solely for debugging purposes. 245 | # Maybe I should set the compute_test_value=warn from here. 246 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN), dtype='int32') 247 | 248 | input_sequences = sequences[:, :-1] 249 | target_sequences = sequences[:, (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1:] 250 | 251 | target_mask = mask[:, (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1:] 252 | 253 | output = create_model(input_sequences) 254 | 255 | cost = T.nnet.categorical_crossentropy( 256 | T.nnet.softmax(output.reshape((-1, Q_LEVELS))), 257 | target_sequences.flatten() 258 | ) 259 | 260 | cost = cost.reshape(target_sequences.shape) 261 | cost = cost * target_mask 262 | # Don't use these lines; could end up with NaN 263 | # Specially at the end of audio files where mask is 264 | # all zero for some of the shorter files in mini-batch. 265 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 266 | #cost = cost.mean(axis=0) 267 | 268 | # Use this one instead. 269 | cost = cost.sum() 270 | cost = cost / target_mask.sum() 271 | 272 | # By default we report cross-entropy cost in bits. 273 | # Switch to nats by commenting out this line: 274 | # log_2(e) = 1.44269504089 275 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 276 | 277 | ### Getting the params, grads, updates, and Theano functions ### 278 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 279 | lib.print_params_info(params, path=FOLDER_PREFIX) 280 | 281 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 282 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 283 | 284 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 285 | 286 | # Training function 287 | train_fn = theano.function( 288 | [sequences, mask], 289 | cost, 290 | updates=updates, 291 | on_unused_input='warn' 292 | ) 293 | 294 | # Validation and Test function 295 | test_fn = theano.function( 296 | [sequences, mask], 297 | cost, 298 | on_unused_input='warn' 299 | ) 300 | 301 | # Sampling at frame level 302 | generate_fn = theano.function( 303 | [sequences], 304 | lib.ops.softmax_and_sample(output), 305 | on_unused_input='warn' 306 | ) 307 | 308 | 309 | def generate_and_save_samples(tag): 310 | def write_audio_file(name, data): 311 | data = data.astype('float32') 312 | data -= data.min() 313 | data /= data.max() 314 | data -= 0.5 315 | data *= 0.95 316 | scipy.io.wavfile.write( 317 | os.path.join(SAMPLES_PATH, name+'.wav'), 318 | BITRATE, 319 | data) 320 | 321 | total_time = time.time() 322 | # Generate N_SEQS' sample files, each 5 seconds long 323 | N_SECS = 5 324 | LENGTH = N_SECS*BITRATE 325 | 326 | if args.debug: 327 | LENGTH = 1024 328 | 329 | num_prev_samples_to_use = (2**args.dilation_layers_per_block - 1)*args.wavenet_blocks + 1 330 | 331 | samples = numpy.zeros((N_SEQS, LENGTH + num_prev_samples_to_use), dtype='int32') 332 | samples[:, :num_prev_samples_to_use] = Q_ZERO 333 | 334 | for t in range(LENGTH): 335 | samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = generate_fn(samples[:, t:t + num_prev_samples_to_use+1]) 336 | if (t > 2*BITRATE) and( t < 3*BITRATE): 337 | samples[:,num_prev_samples_to_use+t:num_prev_samples_to_use+t+1] = Q_ZERO 338 | 339 | total_time = time.time() - total_time 340 | log = "{} samples of {} seconds 
length generated in {} seconds." 341 | log = log.format(N_SEQS, N_SECS, total_time) 342 | print log, 343 | 344 | for i in xrange(N_SEQS): 345 | samp = samples[i, num_prev_samples_to_use: ] 346 | if Q_TYPE == 'mu-law': 347 | from datasets.dataset import mu2linear 348 | samp = mu2linear(samp) 349 | elif Q_TYPE == 'a-law': 350 | raise NotImplementedError('a-law is not implemented') 351 | write_audio_file("sample_{}_{}".format(tag, i), samp) 352 | 353 | ### Import the data_feeder ### 354 | # Handling WHICH_SET 355 | if WHICH_SET == 'ONOM': 356 | from datasets.dataset import onom_train_feed_epoch as train_feeder 357 | from datasets.dataset import onom_valid_feed_epoch as valid_feeder 358 | from datasets.dataset import onom_test_feed_epoch as test_feeder 359 | elif WHICH_SET == 'BLIZZ': 360 | from datasets.dataset import blizz_train_feed_epoch as train_feeder 361 | from datasets.dataset import blizz_valid_feed_epoch as valid_feeder 362 | from datasets.dataset import blizz_test_feed_epoch as test_feeder 363 | elif WHICH_SET == 'MUSIC': 364 | from datasets.dataset import music_train_feed_epoch as train_feeder 365 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 366 | from datasets.dataset import music_test_feed_epoch as test_feeder 367 | elif WHICH_SET == 'HUCK': 368 | from datasets.dataset import huck_train_feed_epoch as train_feeder 369 | from datasets.dataset import huck_valid_feed_epoch as valid_feeder 370 | from datasets.dataset import huck_test_feed_epoch as test_feeder 371 | 372 | 373 | def monitor(data_feeder): 374 | """ 375 | Cost and time of test_fn on a given dataset section. 376 | Pass only one of `valid_feeder` or `test_feeder`. 377 | Don't pass `train_feed`. 378 | 379 | :returns: 380 | Mean cost over the input dataset (data_feeder) 381 | Total time spent 382 | """ 383 | _total_time = 0. 384 | _costs = [] 385 | _data_feeder = data_feeder(BATCH_SIZE, 386 | SEQ_LEN, 387 | OVERLAP, 388 | Q_LEVELS, 389 | Q_ZERO, 390 | Q_TYPE) 391 | 392 | for _seqs, _reset, _mask in _data_feeder: 393 | _start_time = time.time() 394 | _cost = test_fn(_seqs, _mask) 395 | _total_time += time.time() - _start_time 396 | 397 | _costs.append(_cost) 398 | 399 | return numpy.mean(_costs), _total_time 400 | 401 | 402 | print "Wall clock time spent before training started: {:.2f}h"\ 403 | .format((time.time()-exp_start)/3600.) 404 | print "Training!" 405 | total_iters = 0 406 | total_time = 0. 407 | last_print_time = 0. 408 | last_print_iters = 0 409 | costs = [] 410 | lowest_valid_cost = numpy.finfo(numpy.float32).max 411 | corresponding_test_cost = numpy.finfo(numpy.float32).max 412 | new_lowest_cost = False 413 | end_of_batch = False 414 | epoch = 0 # Important for mostly other datasets rather than Blizz 415 | 416 | # Initial load train dataset 417 | tr_feeder = train_feeder(BATCH_SIZE, 418 | SEQ_LEN, 419 | OVERLAP, 420 | Q_LEVELS, 421 | Q_ZERO, 422 | Q_TYPE) 423 | 424 | 425 | 426 | if RESUME: 427 | # Check if checkpoint from previous run is not corrupted. 428 | # Then overwrite some of the variables above. 429 | iters_to_consume, res_path, epoch, total_iters,\ 430 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 431 | lib.resumable(path=FOLDER_PREFIX, 432 | iter_key=iter_str, 433 | epoch_key=epoch_str, 434 | add_resume_counter=True, 435 | other_keys=[lowest_valid_str, 436 | corresp_test_str, 437 | test_nll_str]) 438 | # At this point we saved the pkl file. 
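    # Why the loop below replays minibatches instead of seeking: the feeder
    # is a plain Python generator whose position cannot be checkpointed, so
    # (assuming it yields batches in the same order on every run) the
    # simplest way to resume mid-epoch is to consume and discard
    # iters_to_consume batches until the stream is back where training
    # stopped.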
439 | last_print_iters = total_iters 440 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 441 | # Consume this many iterations to fast-forward to the last position in the training data. 442 | consume_time = time.time() 443 | for i in xrange(iters_to_consume): 444 | tr_feeder.next() 445 | consume_time = time.time() - consume_time 446 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 447 | format(consume_time, iters_to_consume) 448 | 449 | lib.load_params(res_path) 450 | print "Parameters from last available checkpoint loaded from path {}".format(res_path) 451 | 452 | test_time = 0.0 453 | 454 | while True: 455 | # THIS IS ONE ITERATION 456 | if total_iters % 500 == 0: 457 | print total_iters, 458 | 459 | total_iters += 1 460 | 461 | try: 462 | # Take the next mini-batch from the train set 463 | mini_batch = tr_feeder.next() 464 | except StopIteration: 465 | # The train feeder is exhausted, i.e. one full epoch is done. 466 | # Reload it from the start, 467 | tr_feeder = train_feeder(BATCH_SIZE, 468 | SEQ_LEN, 469 | OVERLAP, 470 | Q_LEVELS, 471 | Q_ZERO, 472 | Q_TYPE) 473 | 474 | # and start taking mini-batches again. 475 | mini_batch = tr_feeder.next() 476 | epoch += 1 477 | end_of_batch = True 478 | print "[Another epoch]", 479 | 480 | seqs, reset, mask = mini_batch 481 | 482 | 483 | ## Debugging leftovers (kept commented out; safe to remove): 484 | # print seqs.shape 485 | # targ = generate_fn(seqs) 486 | # print targ.shape 487 | ##### 488 | 489 | start_time = time.time() 490 | cost = train_fn(seqs, mask) 491 | total_time += time.time() - start_time 492 | #print "This cost:", cost, "This h0.mean()", h0.mean() 493 | 494 | costs.append(cost) 495 | 496 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 497 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \ 498 | (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \ 499 | (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \ 500 | end_of_batch: 501 | print "\nValidation!", 502 | valid_cost, valid_time = monitor(valid_feeder) 503 | print "Done!" 504 | 505 | # Compute the test-set cost only when the validation cost has improved. 506 | if valid_cost < lowest_valid_cost: 507 | lowest_valid_cost = valid_cost 508 | print "\n>>> Best validation cost of {} reached. Testing!"\ 509 | .format(valid_cost), 510 | test_cost, test_time = monitor(test_feeder) 511 | print "Done!"
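            # Model-selection note: the test set is only evaluated at
            # checkpoints where the validation cost reaches a new minimum,
            # so corresponding_test_cost is an early-stopping-style estimate
            # rather than a quantity that was ever tuned against directly.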
512 | # Report last one which is the lowest on validation set: 513 | print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time) 514 | corresponding_test_cost = test_cost 515 | new_lowest_cost = True 516 | 517 | # Stdout the training progress 518 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 519 | print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n" 520 | print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n" 521 | print_info += "\tvalid cost:{:.4f}\ttotal time:{:.2f}h\n" 522 | print_info += "\ttest cost:{:.4f}\ttotal time:{:.2f}h" 523 | print_info = print_info.format(epoch, 524 | total_iters, 525 | (time.time()-exp_start)/3600, 526 | lowest_valid_cost, 527 | corresponding_test_cost, 528 | numpy.mean(costs), 529 | total_time/3600, 530 | total_time/total_iters, 531 | valid_cost, 532 | valid_time/3600, 533 | test_cost, 534 | test_time/3600) 535 | print print_info 536 | 537 | # Save and graph training progress 538 | x_axis_str = 'iter' 539 | train_nll_str, valid_nll_str, test_nll_str = \ 540 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 541 | training_info = {'epoch' : epoch, 542 | x_axis_str : total_iters, 543 | train_nll_str : numpy.mean(costs), 544 | valid_nll_str : valid_cost, 545 | test_nll_str : test_cost, 546 | 'lowest valid cost' : lowest_valid_cost, 547 | 'correponding test cost' : corresponding_test_cost, 548 | 'train time' : total_time, 549 | 'valid time' : valid_time, 550 | 'test time' : test_time, 551 | 'wall clock time' : time.time()-exp_start} 552 | lib.save_training_info(training_info, FOLDER_PREFIX) 553 | print "Train info saved!", 554 | 555 | y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str] 556 | lib.plot_traing_info(x_axis_str, y_axis_strs, FOLDER_PREFIX) 557 | print "Plotted!" 558 | 559 | # Generate and save samples 560 | print "Sampling!", 561 | tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}" 562 | tag = tag.format(epoch, 563 | total_iters, 564 | total_time/3600, 565 | numpy.mean(cost), 566 | valid_cost) 567 | tag += ("_best" if new_lowest_cost else "") 568 | # Generate samples 569 | generate_and_save_samples(tag) 570 | print "Done!" 571 | 572 | # Save params of model 573 | lib.save_params( 574 | os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag)) 575 | ) 576 | print "Params saved!" 577 | 578 | if total_iters-last_print_iters == PRINT_ITERS \ 579 | or total_time-last_print_time >= PRINT_TIME: 580 | # If we are here b/c of onom_end_of_batch, we shouldn't mess 581 | # with costs and last_print_iters 582 | costs = [] 583 | last_print_time += PRINT_TIME 584 | last_print_iters += PRINT_ITERS 585 | 586 | end_of_batch = False 587 | new_lowest_cost = False 588 | 589 | print "Validation Done!\nBack to Training..." 590 | 591 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 592 | (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \ 593 | ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \ 594 | (total_iters == STOP_ITERS or total_time >= STOP_TIME)): 595 | 596 | print "Done! 
Total iters:", total_iters, "Total time: ", total_time 597 | print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 598 | print "Wall clock time spent: {:.2f}h"\ 599 | .format((time.time()-exp_start)/3600) 600 | 601 | sys.exit() 602 | -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate32k.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguements: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='the directory name of the dataset' , 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='size of mini-batch', 110 | type=check_positive, choices=xrange(0, 129), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number wavs to generate',\ 120 | type=check_positive, required=True) 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | # NEW 126 | # Create tag for this experiment based on passed args 127 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 128 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 129 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 130 | tag = re.sub(r'_generate', "", tag) 131 | tag += '-lr'+str(LEARNING_RATE) 132 | print "Created experiment tag for these args:" 133 | print tag 134 | 135 | return args, tag 136 | 137 | args, tag = get_args() 138 | 139 | 140 | print "sup" 141 | 142 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 143 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 144 | WEIGHT_NORM = args.weight_norm 145 | EMB_SIZE = args.emb_size 146 | SKIP_CONN = args.skip_conn 147 | DIM = args.dim # Model dimensionality. 
148 | N_RNN = args.n_rnn # How many RNNs to stack 149 | RNN_TYPE = args.rnn_type 150 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 151 | LEARN_H0 = args.learn_h0 152 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 153 | Q_TYPE = args.q_type # log- or linear-scale 154 | WHICH_SET = args.which_set 155 | BATCH_SIZE = args.batch_size 156 | RESUME = args.resume 157 | N_SECS = args.n_secs 158 | N_SEQS = args.n_seqs 159 | 160 | 161 | print "hi" 162 | 163 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 164 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 165 | 166 | # Fixed hyperparams 167 | GRAD_CLIP = 1 # Elementwise grad clip threshold 168 | BITRATE = 32000 169 | 170 | # Other constants 171 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 172 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 173 | #TRAIN_MODE = 'time-iters' 174 | # To use PRINT_TIME for validation, 175 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 176 | #TRAIN_MODE = 'iters-time' 177 | # To use PRINT_ITERS for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 179 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 180 | STOP_ITERS = 100000 # Stop after this many iterations 181 | # TODO: 182 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 183 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 184 | # TODO: 185 | RESULTS_DIR = 'results_2t' 186 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 187 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 188 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 189 | 190 | 191 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 192 | 193 | 194 | epoch_str = 'epoch' 195 | iter_str = 'iter' 196 | lowest_valid_str = 'lowest valid cost' 197 | corresp_test_str = 'correponding test cost' 198 | train_nll_str, valid_nll_str, test_nll_str = \ 199 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 200 | 201 | if args.debug: 202 | import warnings 203 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 204 | TRAIN_MODE = 'time' 205 | PRINT_TIME = 100 206 | STOP_TIME = 3000 207 | STOP_ITERS = 1000 208 | 209 | ### Create directories ### 210 | # FOLDER_PREFIX: root, contains: 211 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 212 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 213 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 214 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 
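# FOLDER_PREFIX ends in the experiment tag built in get_args(). For
# reference, a rough sketch of the tag for the how-to-run flags in the
# module docstring (illustrative values, not a recorded run): sys.argv is
# concatenated, then '--' -> '-', '/' -> '-', True -> T, False -> F, and the
# -n_secs/-n_seqs/_generate pieces are stripped, giving roughly
#   models-two_tier-two_tier32k.py-expAXIS1-n_frames12-frame_size10
#   -weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F
#   -q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001
# so the results directory name encodes every hyperparameter, which is why
# resuming requires the same flags in the same order.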
215 | if not os.path.exists(FOLDER_PREFIX): 216 | os.makedirs(FOLDER_PREFIX) 217 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 218 | if not os.path.exists(PARAMS_PATH): 219 | os.makedirs(PARAMS_PATH) 220 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 221 | if not os.path.exists(SAMPLES_PATH): 222 | os.makedirs(SAMPLES_PATH) 223 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 224 | if not os.path.exists(BEST_PATH): 225 | os.makedirs(BEST_PATH) 226 | 227 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 228 | 229 | ### Import the data_feeder ### 230 | # Handling WHICH_SET 231 | from datasets.dataset import music_train_feed_epoch as train_feeder 232 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 233 | from datasets.dataset import music_test_feed_epoch as test_feeder 234 | 235 | def load_data(data_feeder): 236 | """ 237 | Helper function to deal with interface of different datasets. 238 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 239 | """ 240 | return data_feeder(WHICH_SET, BATCH_SIZE, 241 | SEQ_LEN, 242 | OVERLAP, 243 | Q_LEVELS, 244 | Q_ZERO, 245 | Q_TYPE) 246 | 247 | ### Creating computation graph ### 248 | def frame_level_rnn(input_sequences, h0, reset): 249 | """ 250 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 251 | h0.shape: (batch size, N_RNN, DIM) 252 | reset.shape: () 253 | 254 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 255 | """ 256 | frames = input_sequences.reshape(( 257 | input_sequences.shape[0], 258 | input_sequences.shape[1] // FRAME_SIZE, 259 | FRAME_SIZE 260 | )) 261 | 262 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 263 | # (a reasonable range to pass as inputs to the RNN) 264 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 265 | frames *= lib.floatX(2) 266 | # (128, 64, 4) 267 | 268 | # Initial state of RNNs 269 | learned_h0 = lib.param( 270 | 'FrameLevel.h0', 271 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 272 | ) 273 | # Handling LEARN_H0 274 | learned_h0.param = LEARN_H0 275 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 276 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 277 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 278 | 279 | # Handling RNN_TYPE 280 | # Handling SKIP_CONN 281 | if RNN_TYPE == 'GRU': 282 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 283 | N_RNN, 284 | FRAME_SIZE, 285 | DIM, 286 | frames, 287 | h0=h0, 288 | weightnorm=WEIGHT_NORM, 289 | skip_conn=SKIP_CONN) 290 | elif RNN_TYPE == 'LSTM': 291 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 292 | N_RNN, 293 | FRAME_SIZE, 294 | DIM, 295 | frames, 296 | h0=h0, 297 | weightnorm=WEIGHT_NORM, 298 | skip_conn=SKIP_CONN) 299 | 300 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 301 | output = lib.ops.Linear( 302 | 'FrameLevel.Output', 303 | DIM, 304 | FRAME_SIZE * DIM, 305 | rnns_out, 306 | initialization='he', 307 | weightnorm=WEIGHT_NORM 308 | ) 309 | # output: (2, 9, 4*dim) 310 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 311 | # output: (2, 9*4, dim) 312 | 313 | return (output, last_hidden) 314 | 315 | def sample_level_predictor(frame_level_outputs, prev_samples): 316 | """ 317 | batch size = BATCH_SIZE * SEQ_LEN 318 | SEQ_LEN = N_FRAMES * FRAME_SIZE 319 | 320 | frame_level_outputs.shape: (batch size, DIM) 321 | prev_samples.shape: (batch size, FRAME_SIZE) int32 322 | 323 | output.shape: (batch size, Q_LEVELS) 324 
| """ 325 | # Handling EMB_SIZE 326 | if EMB_SIZE == 0: 327 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 328 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 329 | last_out_shape = Q_LEVELS 330 | elif EMB_SIZE > 0: 331 | prev_samples = lib.ops.Embedding( 332 | 'SampleLevel.Embedding', 333 | Q_LEVELS, 334 | EMB_SIZE, 335 | prev_samples) 336 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 337 | last_out_shape = EMB_SIZE 338 | else: 339 | raise ValueError('EMB_SIZE cannot be negative.') 340 | 341 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 342 | 343 | out = lib.ops.Linear( 344 | 'SampleLevel.L1_PrevSamples', 345 | FRAME_SIZE * last_out_shape, 346 | DIM, 347 | prev_samples, 348 | biases=False, 349 | initialization='he', 350 | weightnorm=WEIGHT_NORM) 351 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 352 | 353 | out += frame_level_outputs 354 | # ^ (2*(9*4), dim) 355 | 356 | # L2 357 | out = lib.ops.Linear('SampleLevel.L2', 358 | DIM, 359 | DIM, 360 | out, 361 | initialization='he', 362 | weightnorm=WEIGHT_NORM) 363 | out = T.nnet.relu(out) 364 | 365 | # L3 366 | out = lib.ops.Linear('SampleLevel.L3', 367 | DIM, 368 | DIM, 369 | out, 370 | initialization='he', 371 | weightnorm=WEIGHT_NORM) 372 | out = T.nnet.relu(out) 373 | 374 | # Output 375 | # We apply the softmax later 376 | out = lib.ops.Linear('SampleLevel.Output', 377 | DIM, 378 | Q_LEVELS, 379 | out, 380 | weightnorm=WEIGHT_NORM) 381 | return out 382 | 383 | sequences = T.imatrix('sequences') 384 | h0 = T.tensor3('h0') 385 | reset = T.iscalar('reset') 386 | mask = T.matrix('mask') 387 | 388 | if args.debug: 389 | # Solely for debugging purposes. 390 | # Maybe I should set the compute_test_value=warn from here. 391 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 392 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 393 | reset.tag.test_value = numpy.array(1, dtype='int32') 394 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 395 | 396 | input_sequences = sequences[:, :-FRAME_SIZE] 397 | target_sequences = sequences[:, FRAME_SIZE:] 398 | 399 | target_mask = mask[:, FRAME_SIZE:] 400 | 401 | frame_level_outputs, new_h0 =\ 402 | frame_level_rnn(input_sequences, h0, reset) 403 | 404 | prev_samples = sequences[:, :-1] 405 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 406 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 407 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 408 | # (batch_size*n_frames*frame_size, frame_size) 409 | 410 | sample_level_outputs = sample_level_predictor( 411 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 412 | prev_samples, 413 | ) 414 | 415 | cost = T.nnet.categorical_crossentropy( 416 | T.nnet.softmax(sample_level_outputs), 417 | target_sequences.flatten() 418 | ) 419 | cost = cost.reshape(target_sequences.shape) 420 | cost = cost * target_mask 421 | # Don't use these lines; could end up with NaN 422 | # Specially at the end of audio files where mask is 423 | # all zero for some of the shorter files in mini-batch. 424 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 425 | #cost = cost.mean(axis=0) 426 | 427 | # Use this one instead. 428 | cost = cost.sum() 429 | cost = cost / target_mask.sum() 430 | 431 | # By default we report cross-entropy cost in bits. 
432 | # Switch to nats by commenting out this line: 433 | # log_2(e) = 1.44269504089 434 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 435 | 436 | ### Getting the params, grads, updates, and Theano functions ### 437 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 438 | lib.print_params_info(params, path=FOLDER_PREFIX) 439 | 440 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 441 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 442 | 443 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 444 | 445 | # Training function 446 | train_fn = theano.function( 447 | [sequences, h0, reset, mask], 448 | [cost, new_h0], 449 | updates=updates, 450 | on_unused_input='warn' 451 | ) 452 | 453 | # Validation and Test function, hence no updates 454 | test_fn = theano.function( 455 | [sequences, h0, reset, mask], 456 | [cost, new_h0], 457 | on_unused_input='warn' 458 | ) 459 | 460 | # Sampling at frame level 461 | frame_level_generate_fn = theano.function( 462 | [sequences, h0, reset], 463 | frame_level_rnn(sequences, h0, reset), 464 | on_unused_input='warn' 465 | ) 466 | 467 | # Sampling at audio sample level 468 | frame_level_outputs = T.matrix('frame_level_outputs') 469 | prev_samples = T.imatrix('prev_samples') 470 | sample_level_generate_fn = theano.function( 471 | [frame_level_outputs, prev_samples], 472 | lib.ops.softmax_and_sample( 473 | sample_level_predictor( 474 | frame_level_outputs, 475 | prev_samples, 476 | ) 477 | ), 478 | on_unused_input='warn' 479 | ) 480 | 481 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 482 | # to study the behaviour of the model and also to introduce some diversity 483 | # to samples in a simple way. [it's disabled for now] 484 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 485 | fixed_rand_h0 -= 0.5 486 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 487 | 488 | def generate_and_save_samples(tag, N_SECS=5): 489 | def write_audio_file(name, data): 490 | data = data.astype('float32') 491 | data -= data.min() 492 | data /= data.max() 493 | data -= 0.5 494 | data *= 0.95 495 | scipy.io.wavfile.write( 496 | os.path.join(SAMPLES_PATH, name+'.wav'), 497 | BITRATE, 498 | data) 499 | 500 | total_time = time() 501 | # Generate N_SEQS' sample files, each 5 seconds long 502 | LENGTH = N_SECS*BITRATE if not args.debug else 100 503 | 504 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 505 | samples[:, :FRAME_SIZE] = Q_ZERO 506 | 507 | # First half zero, others fixed random at each checkpoint 508 | h0 = numpy.zeros( 509 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 510 | dtype='float32' 511 | ) 512 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 513 | frame_level_outputs = None 514 | 515 | for t in xrange(FRAME_SIZE, LENGTH): 516 | 517 | if t % FRAME_SIZE == 0: 518 | frame_level_outputs, h0 = frame_level_generate_fn( 519 | samples[:, t-FRAME_SIZE:t], 520 | h0, 521 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 522 | numpy.int32(t == FRAME_SIZE) 523 | ) 524 | 525 | samples[:, t] = sample_level_generate_fn( 526 | frame_level_outputs[:, t % FRAME_SIZE], 527 | samples[:, t-FRAME_SIZE:t], 528 | ) 529 | 530 | total_time = time() - total_time 531 | log = "{} samples of {} seconds length generated in {} seconds." 
532 | log = log.format(N_SEQS, N_SECS, total_time) 533 | print log 534 | 535 | for i in xrange(N_SEQS): 536 | samp = samples[i] 537 | if Q_TYPE == 'mu-law': 538 | from datasets.dataset import mu2linear 539 | samp = mu2linear(samp) 540 | elif Q_TYPE == 'a-law': 541 | raise NotImplementedError('a-law is not implemented') 542 | 543 | now = datetime.datetime.now() 544 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 545 | 546 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 547 | print "writing...", file_name 548 | write_audio_file(file_name, samp) 549 | 550 | 551 | 552 | def monitor(data_feeder): 553 | """ 554 | Cost and time of test_fn on a given dataset section. 555 | Pass only one of `valid_feeder` or `test_feeder`. 556 | Don't pass `train_feed`. 557 | 558 | :returns: 559 | Mean cost over the input dataset (data_feeder) 560 | Total time spent 561 | """ 562 | _total_time = time() 563 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 564 | _costs = [] 565 | _data_feeder = load_data(data_feeder) 566 | for _seqs, _reset, _mask in _data_feeder: 567 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 568 | _costs.append(_cost) 569 | 570 | return numpy.mean(_costs), time() - _total_time 571 | 572 | print "Wall clock time spent before training started: {:.2f}h"\ 573 | .format((time()-exp_start)/3600.) 574 | print "Training!" 575 | total_iters = 0 576 | total_time = 0. 577 | last_print_time = 0. 578 | last_print_iters = 0 579 | costs = [] 580 | lowest_valid_cost = numpy.finfo(numpy.float32).max 581 | corresponding_test_cost = numpy.finfo(numpy.float32).max 582 | new_lowest_cost = False 583 | end_of_batch = False 584 | epoch = 0 585 | 586 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 587 | 588 | # Initial load train dataset 589 | tr_feeder = load_data(train_feeder) 590 | 591 | ### Handling the resume option: 592 | if True: #if Resume: 593 | # Check if checkpoint from previous run is not corrupted. 594 | # Then overwrite some of the variables above. 595 | iters_to_consume, res_path, epoch, total_iters,\ 596 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 597 | lib.resumable(path=FOLDER_PREFIX, 598 | iter_key=iter_str, 599 | epoch_key=epoch_str, 600 | add_resume_counter=True, 601 | other_keys=[lowest_valid_str, 602 | corresp_test_str, 603 | test_nll_str]) 604 | # At this point we saved the pkl file. 605 | last_print_iters = total_iters 606 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 607 | # Consumes this much iters to get to the last point in training data. 608 | consume_time = time() 609 | for i in xrange(iters_to_consume): 610 | tr_feeder.next() 611 | consume_time = time() - consume_time 612 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 613 | format(consume_time, iters_to_consume) 614 | 615 | lib.load_params(res_path) 616 | print "Parameters from last available checkpoint loaded." 617 | 618 | 619 | 620 | # 2. Stdout the training progress 621 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 622 | print_info = print_info.format(epoch, 623 | total_iters, 624 | (time()-exp_start)/3600) 625 | print print_info 626 | 627 | tag = "e{}_i{}" 628 | tag = tag.format(epoch, 629 | total_iters) 630 | 631 | # 5. Generate and save samples (time consuming) 632 | # If not successful, we still have the params to sample afterward 633 | print "Sampling!", 634 | # Generate samples 635 | generate_and_save_samples(tag, N_SECS) 636 | print "Done!" 
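# A note on generate_and_save_samples above: the frame-level RNN runs only
# once every FRAME_SIZE timesteps (emitting FRAME_SIZE conditioning vectors
# at a time), while the cheap sample-level MLP runs once per audio sample;
# that split is what keeps two-tier sampleRNN generation tractable.
# write_audio_file min-max normalizes the samples to [0, 1], recenters to
# [-0.5, 0.5], and scales by 0.95 to leave a little headroom before writing
# a float32 WAV at BITRATE Hz.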
637 | 638 | print "Wall clock time spent: {:.2f}h"\ 639 | .format((time()-exp_start)/3600) 640 | 641 | sys.exit() -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate16k.py.ol: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguements: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='the directory name of the dataset' , 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='size of mini-batch', 110 | type=check_positive, choices=xrange(0, 999), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number wavs to generate',\ 120 | type=check_positive, required=True) 121 | 122 | 123 | args = parser.parse_args() 124 | 125 | # NEW 126 | # Create tag for this experiment based on passed args 127 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 128 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 129 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 130 | tag = re.sub(r'_generate', "", tag) 131 | tag += '-lr'+str(LEARNING_RATE) 132 | print "Created experiment tag for these args:" 133 | print tag 134 | 135 | return args, tag 136 | 137 | args, tag = get_args() 138 | 139 | 140 | print "sup" 141 | 142 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 143 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 144 | WEIGHT_NORM = args.weight_norm 145 | EMB_SIZE = args.emb_size 146 | SKIP_CONN = args.skip_conn 147 | DIM = args.dim # Model dimensionality. 
148 | N_RNN = args.n_rnn # How many RNNs to stack 149 | RNN_TYPE = args.rnn_type 150 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 151 | LEARN_H0 = args.learn_h0 152 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 153 | Q_TYPE = args.q_type # log- or linear-scale 154 | WHICH_SET = args.which_set 155 | BATCH_SIZE = args.batch_size 156 | RESUME = args.resume 157 | N_SECS = args.n_secs 158 | N_SEQS = args.n_seqs 159 | 160 | 161 | print "hi" 162 | 163 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 164 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 165 | 166 | # Fixed hyperparams 167 | GRAD_CLIP = 1 # Elementwise grad clip threshold 168 | BITRATE = 16000 169 | 170 | # Other constants 171 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 172 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 173 | #TRAIN_MODE = 'time-iters' 174 | # To use PRINT_TIME for validation, 175 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 176 | #TRAIN_MODE = 'iters-time' 177 | # To use PRINT_ITERS for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 179 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 180 | STOP_ITERS = 100000 # Stop after this many iterations 181 | # TODO: 182 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 183 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 184 | # TODO: 185 | RESULTS_DIR = 'results_2t' 186 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 187 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 188 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 189 | 190 | 191 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 192 | 193 | 194 | epoch_str = 'epoch' 195 | iter_str = 'iter' 196 | lowest_valid_str = 'lowest valid cost' 197 | corresp_test_str = 'correponding test cost' 198 | train_nll_str, valid_nll_str, test_nll_str = \ 199 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 200 | 201 | if args.debug: 202 | import warnings 203 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 204 | TRAIN_MODE = 'time' 205 | PRINT_TIME = 100 206 | STOP_TIME = 3000 207 | STOP_ITERS = 1000 208 | 209 | ### Create directories ### 210 | # FOLDER_PREFIX: root, contains: 211 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 212 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 213 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 214 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 
215 | if not os.path.exists(FOLDER_PREFIX): 216 | os.makedirs(FOLDER_PREFIX) 217 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 218 | if not os.path.exists(PARAMS_PATH): 219 | os.makedirs(PARAMS_PATH) 220 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 221 | if not os.path.exists(SAMPLES_PATH): 222 | os.makedirs(SAMPLES_PATH) 223 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 224 | if not os.path.exists(BEST_PATH): 225 | os.makedirs(BEST_PATH) 226 | 227 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 228 | 229 | ### Import the data_feeder ### 230 | # Handling WHICH_SET 231 | from datasets.dataset import music_train_feed_epoch as train_feeder 232 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 233 | from datasets.dataset import music_test_feed_epoch as test_feeder 234 | 235 | def load_data(data_feeder): 236 | """ 237 | Helper function to deal with interface of different datasets. 238 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 239 | """ 240 | return data_feeder(WHICH_SET, BATCH_SIZE, 241 | SEQ_LEN, 242 | OVERLAP, 243 | Q_LEVELS, 244 | Q_ZERO, 245 | Q_TYPE) 246 | 247 | ### Creating computation graph ### 248 | def frame_level_rnn(input_sequences, h0, reset): 249 | """ 250 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 251 | h0.shape: (batch size, N_RNN, DIM) 252 | reset.shape: () 253 | 254 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 255 | """ 256 | frames = input_sequences.reshape(( 257 | input_sequences.shape[0], 258 | input_sequences.shape[1] // FRAME_SIZE, 259 | FRAME_SIZE 260 | )) 261 | 262 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 263 | # (a reasonable range to pass as inputs to the RNN) 264 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 265 | frames *= lib.floatX(2) 266 | # (128, 64, 4) 267 | 268 | # Initial state of RNNs 269 | learned_h0 = lib.param( 270 | 'FrameLevel.h0', 271 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 272 | ) 273 | # Handling LEARN_H0 274 | learned_h0.param = LEARN_H0 275 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 276 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 277 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 278 | 279 | # Handling RNN_TYPE 280 | # Handling SKIP_CONN 281 | if RNN_TYPE == 'GRU': 282 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 283 | N_RNN, 284 | FRAME_SIZE, 285 | DIM, 286 | frames, 287 | h0=h0, 288 | weightnorm=WEIGHT_NORM, 289 | skip_conn=SKIP_CONN) 290 | elif RNN_TYPE == 'LSTM': 291 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 292 | N_RNN, 293 | FRAME_SIZE, 294 | DIM, 295 | frames, 296 | h0=h0, 297 | weightnorm=WEIGHT_NORM, 298 | skip_conn=SKIP_CONN) 299 | 300 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 301 | output = lib.ops.Linear( 302 | 'FrameLevel.Output', 303 | DIM, 304 | FRAME_SIZE * DIM, 305 | rnns_out, 306 | initialization='he', 307 | weightnorm=WEIGHT_NORM 308 | ) 309 | # output: (2, 9, 4*dim) 310 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 311 | # output: (2, 9*4, dim) 312 | 313 | return (output, last_hidden) 314 | 315 | def sample_level_predictor(frame_level_outputs, prev_samples): 316 | """ 317 | batch size = BATCH_SIZE * SEQ_LEN 318 | SEQ_LEN = N_FRAMES * FRAME_SIZE 319 | 320 | frame_level_outputs.shape: (batch size, DIM) 321 | prev_samples.shape: (batch size, FRAME_SIZE) int32 322 | 323 | output.shape: (batch size, Q_LEVELS) 324 
| """ 325 | # Handling EMB_SIZE 326 | if EMB_SIZE == 0: 327 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 328 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 329 | last_out_shape = Q_LEVELS 330 | elif EMB_SIZE > 0: 331 | prev_samples = lib.ops.Embedding( 332 | 'SampleLevel.Embedding', 333 | Q_LEVELS, 334 | EMB_SIZE, 335 | prev_samples) 336 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 337 | last_out_shape = EMB_SIZE 338 | else: 339 | raise ValueError('EMB_SIZE cannot be negative.') 340 | 341 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 342 | 343 | out = lib.ops.Linear( 344 | 'SampleLevel.L1_PrevSamples', 345 | FRAME_SIZE * last_out_shape, 346 | DIM, 347 | prev_samples, 348 | biases=False, 349 | initialization='he', 350 | weightnorm=WEIGHT_NORM) 351 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 352 | 353 | out += frame_level_outputs 354 | # ^ (2*(9*4), dim) 355 | 356 | # L2 357 | out = lib.ops.Linear('SampleLevel.L2', 358 | DIM, 359 | DIM, 360 | out, 361 | initialization='he', 362 | weightnorm=WEIGHT_NORM) 363 | out = T.nnet.relu(out) 364 | 365 | # L3 366 | out = lib.ops.Linear('SampleLevel.L3', 367 | DIM, 368 | DIM, 369 | out, 370 | initialization='he', 371 | weightnorm=WEIGHT_NORM) 372 | out = T.nnet.relu(out) 373 | 374 | # Output 375 | # We apply the softmax later 376 | out = lib.ops.Linear('SampleLevel.Output', 377 | DIM, 378 | Q_LEVELS, 379 | out, 380 | weightnorm=WEIGHT_NORM) 381 | return out 382 | 383 | sequences = T.imatrix('sequences') 384 | h0 = T.tensor3('h0') 385 | reset = T.iscalar('reset') 386 | mask = T.matrix('mask') 387 | 388 | if args.debug: 389 | # Solely for debugging purposes. 390 | # Maybe I should set the compute_test_value=warn from here. 391 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 392 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 393 | reset.tag.test_value = numpy.array(1, dtype='int32') 394 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 395 | 396 | input_sequences = sequences[:, :-FRAME_SIZE] 397 | target_sequences = sequences[:, FRAME_SIZE:] 398 | 399 | target_mask = mask[:, FRAME_SIZE:] 400 | 401 | frame_level_outputs, new_h0 =\ 402 | frame_level_rnn(input_sequences, h0, reset) 403 | 404 | prev_samples = sequences[:, :-1] 405 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 406 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 407 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 408 | # (batch_size*n_frames*frame_size, frame_size) 409 | 410 | sample_level_outputs = sample_level_predictor( 411 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 412 | prev_samples, 413 | ) 414 | 415 | cost = T.nnet.categorical_crossentropy( 416 | T.nnet.softmax(sample_level_outputs), 417 | target_sequences.flatten() 418 | ) 419 | cost = cost.reshape(target_sequences.shape) 420 | cost = cost * target_mask 421 | # Don't use these lines; could end up with NaN 422 | # Specially at the end of audio files where mask is 423 | # all zero for some of the shorter files in mini-batch. 424 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 425 | #cost = cost.mean(axis=0) 426 | 427 | # Use this one instead. 428 | cost = cost.sum() 429 | cost = cost / target_mask.sum() 430 | 431 | # By default we report cross-entropy cost in bits. 
432 | # Switch to nats by commenting out this line: 433 | # log_2(e) = 1.44269504089 434 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 435 | 436 | ### Getting the params, grads, updates, and Theano functions ### 437 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 438 | lib.print_params_info(params, path=FOLDER_PREFIX) 439 | 440 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 441 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 442 | 443 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 444 | 445 | # Training function 446 | train_fn = theano.function( 447 | [sequences, h0, reset, mask], 448 | [cost, new_h0], 449 | updates=updates, 450 | on_unused_input='warn' 451 | ) 452 | 453 | # Validation and Test function, hence no updates 454 | test_fn = theano.function( 455 | [sequences, h0, reset, mask], 456 | [cost, new_h0], 457 | on_unused_input='warn' 458 | ) 459 | 460 | # Sampling at frame level 461 | frame_level_generate_fn = theano.function( 462 | [sequences, h0, reset], 463 | frame_level_rnn(sequences, h0, reset), 464 | on_unused_input='warn' 465 | ) 466 | 467 | # Sampling at audio sample level 468 | frame_level_outputs = T.matrix('frame_level_outputs') 469 | prev_samples = T.imatrix('prev_samples') 470 | sample_level_generate_fn = theano.function( 471 | [frame_level_outputs, prev_samples], 472 | lib.ops.softmax_and_sample( 473 | sample_level_predictor( 474 | frame_level_outputs, 475 | prev_samples, 476 | ) 477 | ), 478 | on_unused_input='warn' 479 | ) 480 | 481 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 482 | # to study the behaviour of the model and also to introduce some diversity 483 | # to samples in a simple way. [it's disabled for now] 484 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 485 | fixed_rand_h0 -= 0.5 486 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 487 | 488 | def generate_and_save_samples(tag, N_SECS=5): 489 | def write_audio_file(name, data): 490 | data = data.astype('float32') 491 | data -= data.min() 492 | data /= data.max() 493 | data -= 0.5 494 | data *= 0.95 495 | scipy.io.wavfile.write( 496 | os.path.join(SAMPLES_PATH, name+'.wav'), 497 | BITRATE, 498 | data) 499 | 500 | total_time = time() 501 | # Generate N_SEQS' sample files, each 5 seconds long 502 | LENGTH = N_SECS*BITRATE if not args.debug else 100 503 | 504 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 505 | samples[:, :FRAME_SIZE] = Q_ZERO 506 | 507 | # First half zero, others fixed random at each checkpoint 508 | h0 = numpy.zeros( 509 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 510 | dtype='float32' 511 | ) 512 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 513 | frame_level_outputs = None 514 | 515 | for t in xrange(FRAME_SIZE, LENGTH): 516 | 517 | if t % FRAME_SIZE == 0: 518 | frame_level_outputs, h0 = frame_level_generate_fn( 519 | samples[:, t-FRAME_SIZE:t], 520 | h0, 521 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 522 | numpy.int32(t == FRAME_SIZE) 523 | ) 524 | 525 | samples[:, t] = sample_level_generate_fn( 526 | frame_level_outputs[:, t % FRAME_SIZE], 527 | samples[:, t-FRAME_SIZE:t], 528 | ) 529 | 530 | total_time = time() - total_time 531 | log = "{} samples of {} seconds length generated in {} seconds." 
532 | log = log.format(N_SEQS, N_SECS, total_time) 533 | print log 534 | 535 | for i in xrange(N_SEQS): 536 | samp = samples[i] 537 | if Q_TYPE == 'mu-law': 538 | from datasets.dataset import mu2linear 539 | samp = mu2linear(samp) 540 | elif Q_TYPE == 'a-law': 541 | raise NotImplementedError('a-law is not implemented') 542 | 543 | now = datetime.datetime.now() 544 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 545 | 546 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 547 | print "writing...", file_name 548 | write_audio_file(file_name, samp) 549 | 550 | 551 | 552 | def monitor(data_feeder): 553 | """ 554 | Cost and time of test_fn on a given dataset section. 555 | Pass only one of `valid_feeder` or `test_feeder`. 556 | Don't pass `train_feed`. 557 | 558 | :returns: 559 | Mean cost over the input dataset (data_feeder) 560 | Total time spent 561 | """ 562 | _total_time = time() 563 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 564 | _costs = [] 565 | _data_feeder = load_data(data_feeder) 566 | for _seqs, _reset, _mask in _data_feeder: 567 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 568 | _costs.append(_cost) 569 | 570 | return numpy.mean(_costs), time() - _total_time 571 | 572 | print "Wall clock time spent before training started: {:.2f}h"\ 573 | .format((time()-exp_start)/3600.) 574 | print "Training!" 575 | total_iters = 0 576 | total_time = 0. 577 | last_print_time = 0. 578 | last_print_iters = 0 579 | costs = [] 580 | lowest_valid_cost = numpy.finfo(numpy.float32).max 581 | corresponding_test_cost = numpy.finfo(numpy.float32).max 582 | new_lowest_cost = False 583 | end_of_batch = False 584 | epoch = 0 585 | 586 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 587 | 588 | # Initial load train dataset 589 | tr_feeder = load_data(train_feeder) 590 | 591 | ### Handling the resume option: 592 | if True: #if Resume: 593 | # Check if checkpoint from previous run is not corrupted. 594 | # Then overwrite some of the variables above. 595 | iters_to_consume, res_path, epoch, total_iters,\ 596 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 597 | lib.resumable(path=FOLDER_PREFIX, 598 | iter_key=iter_str, 599 | epoch_key=epoch_str, 600 | add_resume_counter=True, 601 | other_keys=[lowest_valid_str, 602 | corresp_test_str, 603 | test_nll_str]) 604 | # At this point we saved the pkl file. 605 | last_print_iters = total_iters 606 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 607 | # Consumes this much iters to get to the last point in training data. 608 | consume_time = time() 609 | for i in xrange(iters_to_consume): 610 | tr_feeder.next() 611 | consume_time = time() - consume_time 612 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 613 | format(consume_time, iters_to_consume) 614 | 615 | lib.load_params(res_path) 616 | print "Parameters from last available checkpoint loaded." 617 | 618 | 619 | 620 | # 2. Stdout the training progress 621 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 622 | print_info = print_info.format(epoch, 623 | total_iters, 624 | (time()-exp_start)/3600) 625 | print print_info 626 | 627 | tag = "e{}_i{}" 628 | tag = tag.format(epoch, 629 | total_iters) 630 | 631 | # 5. Generate and save samples (time consuming) 632 | # If not successful, we still have the params to sample afterward 633 | print "Sampling!", 634 | # Generate samples 635 | generate_and_save_samples(tag, N_SECS) 636 | print "Done!" 
637 | 638 | print "Wall clock time spent: {:.2f}h"\ 639 | .format((time()-exp_start)/3600) 640 | 641 | sys.exit() 642 | -------------------------------------------------------------------------------- /models/two_tier/two_tier_generate16k.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Two-tier model, Quantized input 5 | For more info: 6 | $ python two_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | sampleRNN$ \ 13 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 14 | models/two_tier/two_tier.py --exp AXIS1 --n_frames 12 --frame_size 10 \ 15 | --weight_norm True --emb_size 64 --skip_conn False --dim 32 --n_rnn 2 \ 16 | --rnn_type LSTM --learn_h0 False --q_levels 16 --q_type linear \ 17 | --batch_size 128 --which_set MUSIC 18 | 19 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 20 | resume code as many time as possible, depending on the TRAIN_MODE. 21 | (folder name, file name, flags, their order, and the values are important) 22 | """ 23 | from time import time 24 | from datetime import datetime 25 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 26 | exp_start = time() 27 | 28 | import os, sys, glob 29 | sys.path.insert(1, os.getcwd()) 30 | import argparse 31 | import datetime 32 | import numpy 33 | numpy.random.seed(123) 34 | np = numpy 35 | import random 36 | random.seed(123) 37 | import re 38 | 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='two_tier.py\nNo default value! 
Indicate every argument.') 82 | 83 | # Hyperparameter arguments: 84 | parser.add_argument('--exp', help='Experiment name', 85 | type=str, required=False, default='_') 86 | parser.add_argument('--n_frames', help='How many "frames" to include in each\ 87 | Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--frame_size', help='How many samples per frame',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization\ 91 | to all the linear layers (except for the embedding layer)',\ 92 | type=t_or_f, required=True) 93 | parser.add_argument('--emb_size', help='Size of embedding layer (0 to disable)', type=check_non_negative, required=True) 94 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', type=t_or_f, required=True) 95 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 96 | type=check_positive, required=True) 97 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 98 | type=check_positive, choices=xrange(1,40), required=True) 99 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 100 | required=True) 101 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 102 | type=t_or_f, required=True) 103 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 104 | type=check_positive, required=True) 105 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law companding, or mu-law companding. With mu-/a-law, quantization levels should be set to 256',\ 106 | choices=['linear', 'a-law', 'mu-law'], required=True) 107 | parser.add_argument('--which_set', help='The directory name of the dataset', 108 | type=str, required=True) 109 | parser.add_argument('--batch_size', help='Size of mini-batch', 110 | type=check_positive, choices=xrange(0, 999), required=True) 111 | 112 | parser.add_argument('--debug', help='Debug mode', required=False, default=False, action='store_true') 113 | # NEW 114 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params is important.
[for now]',\ 115 | required=False, default=False, action='store_true') 116 | 117 | parser.add_argument('--n_secs', help='Seconds to generate',\ 118 | type=check_positive, required=True) 119 | parser.add_argument('--n_seqs', help='Number of wavs to generate',\ 120 | type=check_positive, required=True) 121 | parser.add_argument('--temp', help='Temperature',\ 122 | type=float, required=True) 123 | 124 | args = parser.parse_args() 125 | 126 | # NEW 127 | # Create tag for this experiment based on passed args 128 | tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 129 | tag = re.sub(r'-n_secs[0-9]+', "", tag) 130 | tag = re.sub(r'-n_seqs[0-9]+', "", tag) 131 | tag = re.sub(r'-temp[0-9]*[\.]?[0-9]*', "", tag) 132 | tag = re.sub(r'_generate', "", tag) 133 | tag += '-lr'+str(LEARNING_RATE) 134 | print "Created experiment tag for these args:" 135 | print tag 136 | 137 | return args, tag 138 | 139 | args, tag = get_args() 140 | 141 | 142 | print "sup" 143 | 144 | N_FRAMES = args.n_frames # How many 'frames' to include in each truncated BPTT pass 145 | OVERLAP = FRAME_SIZE = args.frame_size # How many samples per frame 146 | WEIGHT_NORM = args.weight_norm 147 | EMB_SIZE = args.emb_size 148 | SKIP_CONN = args.skip_conn 149 | DIM = args.dim # Model dimensionality. 150 | N_RNN = args.n_rnn # How many RNNs to stack 151 | RNN_TYPE = args.rnn_type 152 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 153 | LEARN_H0 = args.learn_h0 154 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 155 | Q_TYPE = args.q_type # log- or linear-scale 156 | WHICH_SET = args.which_set 157 | BATCH_SIZE = args.batch_size 158 | RESUME = args.resume 159 | N_SECS = args.n_secs 160 | N_SEQS = args.n_seqs 161 | TEMPERATURE = args.temp 162 | 163 | 164 | print "hi" 165 | 166 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 167 | raise ValueError('For mu-law, quantization levels should be exactly 256!') 168 | 169 | # Fixed hyperparams 170 | GRAD_CLIP = 1 # Elementwise grad clip threshold 171 | BITRATE = 16000 172 | 173 | # Other constants 174 | #TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 175 | TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 176 | #TRAIN_MODE = 'time-iters' 177 | # To use PRINT_TIME for validation, 178 | # and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment. 179 | #TRAIN_MODE = 'iters-time' 180 | # To use PRINT_ITERS for validation, 181 | # and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment. 182 | PRINT_ITERS = 10000 # Print cost, generate samples, save model checkpoint every N iterations. 183 | STOP_ITERS = 100000 # Stop after this many iterations 184 | # TODO: 185 | PRINT_TIME = 90*60 # Print cost, generate samples, save model checkpoint every N seconds. 186 | STOP_TIME = 60*60*24*3 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
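# ---- Editor's sketch (hypothetical helper; the real conversions, e.g.
# mu2linear imported further down, live in datasets/dataset.py) ----
# Why the check above pins Q_LEVELS to 256 for mu-law: mu-law companding
# with mu = 255 maps a linear amplitude x in [-1, 1] to
#     y = sign(x) * log(1 + mu*|x|) / log(1 + mu)
# which is then discretized into mu + 1 = 256 bins, i.e. one byte per sample.
def _mu_law_compand_sketch(x, mu=255):
    # x: numpy array of floats in [-1, 1]; returns companded values in [-1, 1].
    return numpy.sign(x) * numpy.log1p(mu * numpy.abs(x)) / numpy.log1p(mu)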
187 | # TODO: 188 | RESULTS_DIR = 'results_2t' 189 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 190 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 191 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude 192 | 193 | 194 | print "SEQ_LEN", SEQ_LEN, N_FRAMES, FRAME_SIZE 195 | 196 | 197 | epoch_str = 'epoch' 198 | iter_str = 'iter' 199 | lowest_valid_str = 'lowest valid cost' 200 | corresp_test_str = 'correponding test cost' 201 | train_nll_str, valid_nll_str, test_nll_str = \ 202 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 203 | 204 | if args.debug: 205 | import warnings 206 | warnings.warn('----------RUNNING IN DEBUG MODE----------') 207 | TRAIN_MODE = 'time' 208 | PRINT_TIME = 100 209 | STOP_TIME = 3000 210 | STOP_ITERS = 1000 211 | 212 | ### Create directories ### 213 | # FOLDER_PREFIX: root, contains: 214 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 215 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 216 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 217 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 218 | if not os.path.exists(FOLDER_PREFIX): 219 | os.makedirs(FOLDER_PREFIX) 220 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 221 | if not os.path.exists(PARAMS_PATH): 222 | os.makedirs(PARAMS_PATH) 223 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 224 | if not os.path.exists(SAMPLES_PATH): 225 | os.makedirs(SAMPLES_PATH) 226 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 227 | if not os.path.exists(BEST_PATH): 228 | os.makedirs(BEST_PATH) 229 | 230 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 231 | 232 | ### Import the data_feeder ### 233 | # Handling WHICH_SET 234 | from datasets.dataset import music_train_feed_epoch as train_feeder 235 | from datasets.dataset import music_valid_feed_epoch as valid_feeder 236 | from datasets.dataset import music_test_feed_epoch as test_feeder 237 | 238 | def load_data(data_feeder): 239 | """ 240 | Helper function to deal with the interface of different datasets. 241 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`.
242 | """ 243 | return data_feeder(WHICH_SET, BATCH_SIZE, 244 | SEQ_LEN, 245 | OVERLAP, 246 | Q_LEVELS, 247 | Q_ZERO, 248 | Q_TYPE) 249 | 250 | ### Creating computation graph ### 251 | def frame_level_rnn(input_sequences, h0, reset): 252 | """ 253 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) 254 | h0.shape: (batch size, N_RNN, DIM) 255 | reset.shape: () 256 | 257 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 258 | """ 259 | frames = input_sequences.reshape(( 260 | input_sequences.shape[0], 261 | input_sequences.shape[1] // FRAME_SIZE, 262 | FRAME_SIZE 263 | )) 264 | 265 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 266 | # (a reasonable range to pass as inputs to the RNN) 267 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 268 | frames *= lib.floatX(2) 269 | # (128, 64, 4) 270 | 271 | # Initial state of RNNs 272 | learned_h0 = lib.param( 273 | 'FrameLevel.h0', 274 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 275 | ) 276 | # Handling LEARN_H0 277 | learned_h0.param = LEARN_H0 278 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 279 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 280 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 281 | 282 | # Handling RNN_TYPE 283 | # Handling SKIP_CONN 284 | if RNN_TYPE == 'GRU': 285 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 286 | N_RNN, 287 | FRAME_SIZE, 288 | DIM, 289 | frames, 290 | h0=h0, 291 | weightnorm=WEIGHT_NORM, 292 | skip_conn=SKIP_CONN) 293 | elif RNN_TYPE == 'LSTM': 294 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 295 | N_RNN, 296 | FRAME_SIZE, 297 | DIM, 298 | frames, 299 | h0=h0, 300 | weightnorm=WEIGHT_NORM, 301 | skip_conn=SKIP_CONN) 302 | 303 | # rnns_out (bs, seqlen, dim) (128, 64, 512) 304 | output = lib.ops.Linear( 305 | 'FrameLevel.Output', 306 | DIM, 307 | FRAME_SIZE * DIM, 308 | rnns_out, 309 | initialization='he', 310 | weightnorm=WEIGHT_NORM 311 | ) 312 | # output: (2, 9, 4*dim) 313 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 314 | # output: (2, 9*4, dim) 315 | 316 | return (output, last_hidden) 317 | 318 | def sample_level_predictor(frame_level_outputs, prev_samples): 319 | """ 320 | batch size = BATCH_SIZE * SEQ_LEN 321 | SEQ_LEN = N_FRAMES * FRAME_SIZE 322 | 323 | frame_level_outputs.shape: (batch size, DIM) 324 | prev_samples.shape: (batch size, FRAME_SIZE) int32 325 | 326 | output.shape: (batch size, Q_LEVELS) 327 | """ 328 | # Handling EMB_SIZE 329 | if EMB_SIZE == 0: 330 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 331 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 332 | last_out_shape = Q_LEVELS 333 | elif EMB_SIZE > 0: 334 | prev_samples = lib.ops.Embedding( 335 | 'SampleLevel.Embedding', 336 | Q_LEVELS, 337 | EMB_SIZE, 338 | prev_samples) 339 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 340 | last_out_shape = EMB_SIZE 341 | else: 342 | raise ValueError('EMB_SIZE cannot be negative.') 343 | 344 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) 345 | 346 | out = lib.ops.Linear( 347 | 'SampleLevel.L1_PrevSamples', 348 | FRAME_SIZE * last_out_shape, 349 | DIM, 350 | prev_samples, 351 | biases=False, 352 | initialization='he', 353 | weightnorm=WEIGHT_NORM) 354 | # shape: (BATCH_SIZE*N_FRAMES*FRAME_SIZE, DIM) 355 | 356 | out += frame_level_outputs 357 | # ^ (2*(9*4), dim) 358 | 359 | # L2 360 | out = lib.ops.Linear('SampleLevel.L2', 361 | DIM, 362 | DIM, 363 | 
out, 364 | initialization='he', 365 | weightnorm=WEIGHT_NORM) 366 | out = T.nnet.relu(out) 367 | 368 | # L3 369 | out = lib.ops.Linear('SampleLevel.L3', 370 | DIM, 371 | DIM, 372 | out, 373 | initialization='he', 374 | weightnorm=WEIGHT_NORM) 375 | out = T.nnet.relu(out) 376 | 377 | # Output 378 | # We apply the softmax later 379 | out = lib.ops.Linear('SampleLevel.Output', 380 | DIM, 381 | Q_LEVELS, 382 | out, 383 | weightnorm=WEIGHT_NORM) 384 | return out 385 | 386 | sequences = T.imatrix('sequences') 387 | h0 = T.tensor3('h0') 388 | reset = T.iscalar('reset') 389 | mask = T.matrix('mask') 390 | 391 | if args.debug: 392 | # Solely for debugging purposes. 393 | # Maybe I should set the compute_test_value=warn from here. 394 | sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32') 395 | h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 396 | reset.tag.test_value = numpy.array(1, dtype='int32') 397 | mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32') 398 | 399 | input_sequences = sequences[:, :-FRAME_SIZE] 400 | target_sequences = sequences[:, FRAME_SIZE:] 401 | 402 | target_mask = mask[:, FRAME_SIZE:] 403 | 404 | frame_level_outputs, new_h0 =\ 405 | frame_level_rnn(input_sequences, h0, reset) 406 | 407 | prev_samples = sequences[:, :-1] 408 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 409 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 410 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 411 | # (batch_size*n_frames*frame_size, frame_size) 412 | 413 | sample_level_outputs = sample_level_predictor( 414 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 415 | prev_samples, 416 | ) 417 | 418 | cost = T.nnet.categorical_crossentropy( 419 | T.nnet.softmax(sample_level_outputs), 420 | target_sequences.flatten() 421 | ) 422 | cost = cost.reshape(target_sequences.shape) 423 | cost = cost * target_mask 424 | # Don't use these lines; could end up with NaN 425 | # Especially at the end of audio files where the mask is 426 | # all zero for some of the shorter files in the mini-batch. 427 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 428 | #cost = cost.mean(axis=0) 429 | 430 | # Use this one instead. 431 | cost = cost.sum() 432 | cost = cost / target_mask.sum() 433 | 434 | # By default we report cross-entropy cost in bits.
435 | # Switch to nats by commenting out this line: 436 | # log_2(e) = 1.44269504089 437 | cost = cost * lib.floatX(numpy.log2(numpy.e)) 438 | 439 | ### Getting the params, grads, updates, and Theano functions ### 440 | params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) 441 | lib.print_params_info(params, path=FOLDER_PREFIX) 442 | 443 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 444 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 445 | 446 | updates = lasagne.updates.adam(grads, params, learning_rate=LEARNING_RATE) 447 | 448 | # Training function 449 | train_fn = theano.function( 450 | [sequences, h0, reset, mask], 451 | [cost, new_h0], 452 | updates=updates, 453 | on_unused_input='warn' 454 | ) 455 | 456 | # Validation and Test function, hence no updates 457 | test_fn = theano.function( 458 | [sequences, h0, reset, mask], 459 | [cost, new_h0], 460 | on_unused_input='warn' 461 | ) 462 | 463 | # Sampling at frame level 464 | frame_level_generate_fn = theano.function( 465 | [sequences, h0, reset], 466 | frame_level_rnn(sequences, h0, reset), 467 | on_unused_input='warn' 468 | ) 469 | 470 | # Sampling at audio sample level 471 | frame_level_outputs = T.matrix('frame_level_outputs') 472 | prev_samples = T.imatrix('prev_samples') 473 | sample_level_generate_fn = theano.function( 474 | [frame_level_outputs, prev_samples], 475 | lib.ops.softmax_and_sample( 476 | sample_level_predictor( 477 | frame_level_outputs, 478 | prev_samples, 479 | )/TEMPERATURE 480 | ), 481 | on_unused_input='warn' 482 | ) 483 | 484 | # Uniform [-0.5, 0.5) for half of initial state for generated samples 485 | # to study the behaviour of the model and also to introduce some diversity 486 | # to samples in a simple way. [it's disabled for now] 487 | fixed_rand_h0 = numpy.random.rand(N_SEQS//2, N_RNN, H0_MULT*DIM) 488 | fixed_rand_h0 -= 0.5 489 | fixed_rand_h0 = fixed_rand_h0.astype('float32') 490 | 491 | def generate_and_save_samples(tag, N_SECS=5): 492 | def write_audio_file(name, data): 493 | data = data.astype('float32') 494 | data -= data.min() 495 | data /= data.max() 496 | data -= 0.5 497 | data *= 0.95 498 | scipy.io.wavfile.write( 499 | os.path.join(SAMPLES_PATH, name+'.wav'), 500 | BITRATE, 501 | data) 502 | 503 | total_time = time() 504 | # Generate N_SEQS sample files, each N_SECS seconds long 505 | LENGTH = N_SECS*BITRATE if not args.debug else 100 506 | 507 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 508 | samples[:, :FRAME_SIZE] = Q_ZERO 509 | 510 | # First half zero, others fixed random at each checkpoint 511 | h0 = numpy.zeros( 512 | (N_SEQS-fixed_rand_h0.shape[0], N_RNN, H0_MULT*DIM), 513 | dtype='float32' 514 | ) 515 | h0 = numpy.concatenate((h0, fixed_rand_h0), axis=0) 516 | frame_level_outputs = None 517 | 518 | for t in xrange(FRAME_SIZE, LENGTH): 519 | 520 | if t % FRAME_SIZE == 0: 521 | frame_level_outputs, h0 = frame_level_generate_fn( 522 | samples[:, t-FRAME_SIZE:t], 523 | h0, 524 | #numpy.full((N_SEQS, ), (t == FRAME_SIZE), dtype='int32'), 525 | numpy.int32(t == FRAME_SIZE) 526 | ) 527 | 528 | samples[:, t] = sample_level_generate_fn( 529 | frame_level_outputs[:, t % FRAME_SIZE], 530 | samples[:, t-FRAME_SIZE:t], 531 | ) 532 | 533 | total_time = time() - total_time 534 | log = "{} samples of {} seconds length generated in {} seconds."
535 | log = log.format(N_SEQS, N_SECS, total_time) 536 | print log 537 | 538 | for i in xrange(N_SEQS): 539 | samp = samples[i] 540 | if Q_TYPE == 'mu-law': 541 | from datasets.dataset import mu2linear 542 | samp = mu2linear(samp) 543 | elif Q_TYPE == 'a-law': 544 | raise NotImplementedError('a-law is not implemented') 545 | 546 | now = datetime.datetime.now() 547 | now_time = "{}:{}:{}".format(now.hour, now.minute, now.second) 548 | 549 | file_name = "sample_{}_{}_{}_{}".format(tag, N_SECS, now_time, i) 550 | print "writing...", file_name 551 | write_audio_file(file_name, samp) 552 | 553 | 554 | 555 | def monitor(data_feeder): 556 | """ 557 | Cost and time of test_fn on a given dataset section. 558 | Pass only one of `valid_feeder` or `test_feeder`. 559 | Don't pass `train_feeder`. 560 | 561 | :returns: 562 | Mean cost over the input dataset (data_feeder) 563 | Total time spent 564 | """ 565 | _total_time = time() 566 | _h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 567 | _costs = [] 568 | _data_feeder = load_data(data_feeder) 569 | for _seqs, _reset, _mask in _data_feeder: 570 | _cost, _h0 = test_fn(_seqs, _h0, _reset, _mask) 571 | _costs.append(_cost) 572 | 573 | return numpy.mean(_costs), time() - _total_time 574 | 575 | print "Wall clock time spent before training started: {:.2f}h"\ 576 | .format((time()-exp_start)/3600.) 577 | print "Training!" 578 | total_iters = 0 579 | total_time = 0. 580 | last_print_time = 0. 581 | last_print_iters = 0 582 | costs = [] 583 | lowest_valid_cost = numpy.finfo(numpy.float32).max 584 | corresponding_test_cost = numpy.finfo(numpy.float32).max 585 | new_lowest_cost = False 586 | end_of_batch = False 587 | epoch = 0 588 | 589 | h0 = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32') 590 | 591 | # Initial load of the train dataset 592 | tr_feeder = load_data(train_feeder) 593 | 594 | ### Handling the resume option: 595 | if True: #if Resume: 596 | # Check that the checkpoint from the previous run is not corrupted. 597 | # Then overwrite some of the variables above. 598 | iters_to_consume, res_path, epoch, total_iters,\ 599 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 600 | lib.resumable(path=FOLDER_PREFIX, 601 | iter_key=iter_str, 602 | epoch_key=epoch_str, 603 | add_resume_counter=True, 604 | other_keys=[lowest_valid_str, 605 | corresp_test_str, 606 | test_nll_str]) 607 | # At this point we saved the pkl file. 608 | last_print_iters = total_iters 609 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 610 | # Consume this many iterations to get back to the last position in the training data. 611 | consume_time = time() 612 | for i in xrange(iters_to_consume): 613 | tr_feeder.next() 614 | consume_time = time() - consume_time 615 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 616 | format(consume_time, iters_to_consume) 617 | 618 | lib.load_params(res_path) 619 | print "Parameters from last available checkpoint loaded." 620 | 621 | 622 | 623 | # 2. Print the training progress to stdout 624 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 625 | print_info = print_info.format(epoch, 626 | total_iters, 627 | (time()-exp_start)/3600) 628 | print print_info 629 | 630 | tag = "e{}_i{}" 631 | tag = tag.format(epoch, 632 | total_iters) 633 | 634 | # 5. Generate and save samples (time consuming) 635 | # If not successful, we still have the params to sample afterward 636 | print "Sampling!", 637 | # Generate samples 638 | generate_and_save_samples(tag, N_SECS) 639 | print "Done!"
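# ---- Editor's sketch (hypothetical, numpy-only; never called) ----
# What dividing the logits by TEMPERATURE in sample_level_generate_fn does,
# outside Theano: temperature < 1 sharpens the softmax toward its argmax,
# temperature > 1 flattens it toward uniform, and 1.0 leaves it unchanged.
def _sample_with_temperature_sketch(logits, temperature):
    # logits: (n_seqs, Q_LEVELS) float array; returns one sampled bin per row.
    scaled = logits / temperature
    probs = numpy.exp(scaled - scaled.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return numpy.array([numpy.random.choice(p.shape[0], p=p) for p in probs])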
640 | 641 | print "Wall clock time spent: {:.2f}h"\ 642 | .format((time()-exp_start)/3600) 643 | 644 | sys.exit() 645 | --------------------------------------------------------------------------------
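A minimal sketch of the two-tier sampling loop that generate_and_save_samples implements above, assuming hypothetical stand-ins frame_fn and sample_fn for the compiled Theano functions frame_level_generate_fn and sample_level_generate_fn. It only shows the control flow: the frame-level RNN advances once every frame_size samples, and the sample-level predictor emits one sample per step.

    import numpy

    def two_tier_sampling_sketch(frame_fn, sample_fn, n_seqs, length,
                                 frame_size, q_zero):
        # Quantized output buffer, seeded with frame_size samples of silence.
        samples = numpy.full((n_seqs, length), q_zero, dtype='int32')
        h0 = None          # recurrent state; frame_fn is assumed to treat
                           # None as "use the initial state"
        frame_outs = None  # conditioning vectors for the current frame
        for t in xrange(frame_size, length):
            if t % frame_size == 0:
                # Frame level: advance the RNN once per completed frame.
                frame_outs, h0 = frame_fn(samples[:, t - frame_size:t], h0)
            # Sample level: one new sample conditioned on the frame embedding
            # and the previous frame_size samples.
            samples[:, t] = sample_fn(frame_outs[:, t % frame_size],
                                      samples[:, t - frame_size:t])
        return samples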