├── examples ├── audio_tools.py └── test_audio_extract.py ├── README.md ├── LICENSE ├── mcts ├── toy_state_manager.py └── puct_mcts.py ├── image └── image_tools.py ├── graph └── graph_tools.py └── audio └── audio_tools.py /examples/audio_tools.py: -------------------------------------------------------------------------------- 1 | ../audio_tools.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tools 2 | Various tools for graphs, audio, images 3 | 4 | Goal is to keep as self-contained scripts for ease of integration into larger projects 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Kyle Kastner 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
STATE_MAX = 50


class RightPolicyStateManager(object):
    """
    A state manager with a goal state and action space of 2

    States are integers; action 1 moves the state up by one and action 0
    moves it down by one. ``get_valid_actions`` keeps the walk inside
    ``[0, STATE_MAX]``.

    Parameters
    ----------
    goal_state : int
        State at which ``is_finished`` reports the end of an episode.
    random_state : object
        Must provide ``choice(sequence)``; used by the default rollout
        policy (the demo script passes a ``numpy.random.RandomState``).
    rollout_limit : int
        Maximum number of steps taken by ``rollout_from_state`` before the
        current state is scored as-is.
    """
    def __init__(self, goal_state, random_state, rollout_limit=1000):
        self.rollout_limit = rollout_limit
        self.goal_state = goal_state
        self.random_state = random_state

    def get_next_state(self, state, action):
        """Return the state reached by applying `action` (0 or 1)."""
        if action == 1:
            return state + 1
        elif action == 0:
            return state - 1
        # previously fell through and returned None, which only surfaced
        # later as an unrelated TypeError
        raise ValueError("Unknown action {}, expected 0 or 1".format(action))

    def get_action_space(self):
        """Return every action that exists, valid in a given state or not."""
        # go down, 0
        # go up, 1
        return list(range(2))

    def get_valid_actions(self, state):
        """Return the actions that keep `state` inside [0, STATE_MAX]."""
        if state > 0 and state < STATE_MAX:
            return list(range(2))
        elif state == 0:
            return [1]
        elif state == STATE_MAX:
            return [0]
        # states outside [0, STATE_MAX] fall through and yield None

    def get_init_state(self):
        """Return the state an episode starts from."""
        # start in the worst state
        return 0
        #return self.random_state.randint(0, STATE_MAX)

    def rollout_fn(self, state):
        """Rollout policy: pick uniformly among the valid actions."""
        # can define custom rollout function
        return self.random_state.choice(self.get_valid_actions(state))

    def score(self, state):
        """Return 1. at the goal, else a negative distance-scaled penalty."""
        # if these numbers are big, it tends to run slower

        # example of custom finish, score
        # sparse / goal discovery reward
        #return 1. if state == self.goal_state else 0.
        # distance / goal conditioned reward
        return 1. if state == self.goal_state else -(1. / self.goal_state) * (self.goal_state - state)

    def is_finished(self, state):
        """Return the tuple ``(winner, score, end)`` for `state`."""
        # if this check is slow
        # can rewrite as _is_finished
        # then add
        # self.is_finished = MemoizeMutable(self._is_finished)
        # to __init__ instead

        # return winner, score, end
        # winner normally in [-1, 0, 1]
        # if it's one player, can just use [0, 1] and it's fine
        # score arbitrary float value
        # end in [True, False]
        return (1, 1., True) if state == self.goal_state else (0, 0., False)

    def rollout_from_state(self, state):
        """
        Play the rollout policy from `state` and return the final score.

        The rollout stops when ``is_finished`` reports the end of the
        episode or once more than ``rollout_limit`` steps have been taken.
        """
        # example rollout function
        s = state
        _, _, e = self.is_finished(s)
        if e:
            return self.score(s)

        c = 0
        while True:
            a = self.rollout_fn(s)
            s = self.get_next_state(s, a)

            # is_finished returns a (winner, score, end) tuple; only the
            # last element is the end flag. Testing the whole tuple is
            # always true and would cut every rollout off after one step.
            _, _, e = self.is_finished(s)
            c += 1
            if e:
                return self.score(s)
            if c > self.rollout_limit:
                # can also return different score if rollout limit hit
                return self.score(s)
def graphcut(im, n_splits=2, split_type="mean", rad=5, sigma_x=.3,
             sigma_p=.1, scaling=255.):
    """
    Split a grayscale image into partitions using eigenvectors of a
    pixel-similarity graph.

    A dense (n_pixels, n_pixels) similarity matrix is built from two
    Gaussian terms, one on squared pixel-coordinate distance and one on
    squared intensity difference. The generalized eigenproblem
    ``(D - sim) v = lambda D v`` is solved for the eigenvalue indices
    ``1 .. n_splits + 1``, and the image is split ``n_splits`` times by
    thresholding successive eigenvector columns.

    Parameters
    ----------
    im : 2D array, grayscale image
    n_splits : number of successive binary splits
    split_type : "mean", "median" or "zero", threshold used for each split
    rad : pixel pairs whose coordinate distance is >= rad get similarity 0
    sigma_x : width of the Gaussian on coordinate distance
    sigma_p : width of the Gaussian on intensity difference
    scaling : intensities are divided by this before comparison

    Returns
    -------
    all_matches : list whose first entry is ``[segs]``, followed by one list
        of row-subsets of ``segs`` per split level
    all_splits : one list of images per split level; each image has the
        shape of ``im`` and is zero outside its partition

    Raises
    ------
    ValueError if `split_type` is not one of the known names.
    """
    # im: grayscale image
    sz = im.shape[0] * im.shape[1]
    ind = np.arange(sz)

    # row / column of every flattened pixel index, shifted to start at 1
    I, J = ind2sub(im.shape, ind)
    I = I + 1
    J = J + 1

    scaled_im = im.ravel() / float(scaling)

    # float32 gives the wrong answer...
    scaled_im = scaled_im.astype("float64")
    # NOTE(review): this zero matrix is rebound to `scale` below before it
    # is ever read
    sim = np.zeros((sz, sz)).astype("float64")

    # Faster with broadcast tricks
    # Still wasting computation - einsum might be fastest
    x1 = I[None]
    x2 = I[:, None]
    y1 = J[None]
    y2 = J[:, None]
    # pairwise squared coordinate distance between all pixels
    dist = (x1 - x2) ** 2 + (y1 - y2) ** 2
    scale = np.exp(-(dist / (sigma_x ** 2)))
    # sim aliases scale from here on; the in-place edits below hit both
    sim = scale
    sim[np.sqrt(dist) >= rad] = 0.
    # drop the large temporaries before allocating the intensity terms
    del x1
    del x2
    del y1
    del y2
    del dist

    # pairwise squared intensity difference
    p1 = scaled_im[None]
    p2 = scaled_im[:, None]
    pdist = (p1 - p2) ** 2
    pscale = np.exp(-(pdist / (sigma_p ** 2)))

    sim *= pscale

    # every pixel is fully similar to itself
    dind = np.diag_indices_from(sim)
    sim[dind] = 1.

    # D holds the row sums of sim on its diagonal; A = D - sim
    d = np.sum(sim, axis=1)
    D = np.diag(d)
    A = (D - sim)

    # Want second smallest eigenvector onward
    # one more eigenvector than needed is requested: its column is
    # overwritten with the pixel indices a few lines below
    S, V = eigh(A, D, eigvals=(1, n_splits + 1),
                overwrite_a=True, overwrite_b=True)
    sort_ind = np.argsort(S)
    S = S[sort_ind]
    V = V[:, sort_ind]
    segs = V
    # last column carries the flat pixel index of each row so subsets of
    # rows can be mapped back onto the image
    segs[:, -1] = ind

    def cut(im, matches, ix, split_type="mean"):
        # Split the rows of `matches` in two by thresholding column `ix`
        # and return both row-subsets plus the two masked images.
        # NOTE(review): the threshold is computed from the full `segs`
        # (closure), not from the `matches` subset being split - confirm
        # this is intended for the second and later levels.
        # Can choose how to split
        if split_type == "mean":
            split = np.mean(segs[:, ix])
        elif split_type == "median":
            split = np.median(segs[:, ix])
        elif split_type == "zero":
            split = 0.
        else:
            raise ValueError("Unknown split type %s" % split_type)

        meets = np.where(matches[:, ix] >= split)[0]
        match1 = matches[meets, :]
        res1 = np.zeros_like(im)
        match_inds = match1[:, -1].astype("int32")
        # writes through ravel(); assumes it returns a view here - TODO
        # confirm for non-contiguous inputs
        res1.ravel()[match_inds] = im.ravel()[match_inds]

        meets = np.where(matches[:, ix] < split)[0]
        match2 = matches[meets, :]
        res2 = np.zeros_like(im)
        match_inds = match2[:, -1].astype("int32")
        res2.ravel()[match_inds] = im.ravel()[match_inds]
        return (match1, match2), (res1, res2)

    # Recursively split partitions
    # Currently also stores intermediates
    all_splits = []
    all_matches = [[segs]]
    for i in range(n_splits):
        # split every partition of the previous level on eigenvector i
        matched = all_matches[-1]
        current_splits = []
        current_matches = []
        for s in matched:
            matches, splits = cut(im, s, i, split_type=split_type)
            current_splits.extend(splits)
            current_matches.extend(matches)
        all_splits.append(current_splits)
        all_matches.append(current_matches)
    return all_matches, all_splits
120 | 121 | to_plot = all_splits[-1] 122 | f, axarr = plt.subplots(2, len(to_plot) // 2) 123 | for n in range(len(to_plot)): 124 | axarr.ravel()[n].imshow(to_plot[n], cmap="gray") 125 | axarr.ravel()[n].set_xticks([]) 126 | axarr.ravel()[n].set_yticks([]) 127 | plt.show() 128 | 129 | if __name__ == "__main__": 130 | test_graphcut() 131 | -------------------------------------------------------------------------------- /examples/test_audio_extract.py: -------------------------------------------------------------------------------- 1 | from audio_tools import fetch_sample_speech_tapestry, world_synthesis 2 | from audio_tools import harvest, cheaptrick, d4c, sp2mgc, mgc2sp 3 | from audio_tools import soundsc 4 | from scipy.io import wavfile 5 | from scipy import fftpack 6 | import numpy as np 7 | import time 8 | 9 | 10 | def run_world_mgc_example(): 11 | # run on chromebook 12 | # enc 839.71 13 | # synth 48.79 14 | fs, d = fetch_sample_speech_tapestry() 15 | d = d.astype("float32") / 2 ** 15 16 | 17 | # harcoded for 16k from 18 | # https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world/extract_features_for_merlin.sh 19 | mgc_alpha = 0.58 20 | #mgc_order = 59 21 | mgc_order = 59 22 | # this is actually just mcep 23 | mgc_gamma = 0.0 24 | 25 | def enc(): 26 | temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs) 27 | temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs, 28 | temporal_positions_h, f0_h, vuv_h) 29 | temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs, 30 | temporal_positions_h, f0_h, vuv_h) 31 | 32 | mgc_arr = sp2mgc(spectrogram_ct, mgc_order, mgc_alpha, mgc_gamma, 33 | verbose=True) 34 | return mgc_arr, spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c 35 | 36 | 37 | start = time.time() 38 | mgc_arr, spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c = enc() 39 | enc_done = time.time() 40 | 41 | sp_r = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fs=fs, verbose=True) 42 | synth_done = 
def run_world_dct_example():
    """
    Analyse the sample speech with WORLD, compress the spectrogram by
    keeping only the first DCT coefficients of every frame, resynthesise
    from the reconstructed spectrogram and write ``out_dct.wav``.
    """
    # on chromebook
    # enc 114.229
    # synth 5.165
    fs, d = fetch_sample_speech_tapestry()
    d = d.astype("float32") / 2 ** 15

    def enc():
        temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs)
        temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs,
                temporal_positions_h, f0_h, vuv_h)
        temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs,
                temporal_positions_h, f0_h, vuv_h)

        return spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c

    start = time.time()
    spectrogram_ct, f0_d4c, vuv_d4c, coarse_aper_d4c = enc()

    n_dct = 20
    # take the bin count from the data instead of hard-coding n_fft + 1
    n_bins = spectrogram_ct.shape[1]
    # norm="ortho" makes dct / idct exact inverses of each other; the
    # default normalisation scales the round trip by 2 * n_bins
    dct_buf = fftpack.dct(spectrogram_ct, norm="ortho")[:, :n_dct]
    idct_buf = np.zeros((dct_buf.shape[0], n_bins))
    idct_buf[:, :n_dct] = dct_buf
    ispectrogram_ct = fftpack.idct(idct_buf, norm="ortho")
    # A truncated DCT reconstruction can undershoot below zero.
    # NOTE(review): presumably world_synthesis expects a strictly positive
    # spectrogram - confirm; the floor keeps the reconstruction in the
    # value range of the analysed one.
    floor = max(float(np.min(spectrogram_ct)), np.finfo("float64").tiny)
    ispectrogram_ct = np.maximum(ispectrogram_ct, floor)
    enc_done = time.time()

    # synthesise from the DCT-compressed spectrogram; the reconstruction
    # used to be computed and then ignored in favour of the original
    y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, ispectrogram_ct, fs)
    synth_done = time.time()

    print("enc time: {}".format(enc_done - start))
    print("synth time: {}".format(synth_done - enc_done))
    #y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, sp_r, fs)
    wavfile.write("out_dct.wav", fs, soundsc(y))
def bellman_ford_paths(graph, source):
    """
    Single-source shortest paths with the Bellman-Ford algorithm.

    Parameters
    ----------
    graph : dict of dict, ``graph[u][v]`` is the weight of edge u -> v
    source : node the distances are measured from

    Returns
    -------
    d : dict mapping node -> distance from `source` (``inf`` if unreachable)
    p : dict mapping node -> predecessor on a shortest path (None if none)

    Raises
    ------
    AssertionError if a negative-weight cycle is reachable from `source`.
    """
    # returns distances and paths
    inf = float("inf")
    d = {}  # Stands for destination
    p = {}  # Stands for predecessor
    for node in graph:
        d[node] = inf
        p[node] = None
    # nodes that only ever appear as a neighbour have no entry of their own
    # in graph; register them too so relaxing into them cannot KeyError
    for node in graph:
        for neighbour in graph[node]:
            if neighbour not in d:
                d[neighbour] = inf
                p[neighbour] = None
    d[source] = 0

    edges = [(u, v, graph[u][v]) for u in graph for v in graph[u]]
    for _ in range(len(d) - 1):
        changed = False
        for u, v, w in edges:
            if d[v] > d[u] + w:
                d[v] = d[u] + w
                p[v] = u
                changed = True
        if not changed:
            # a full pass without improvement means d is already final
            break

    # Check for negative-weight cycles
    # raised explicitly rather than with an assert statement so the check
    # survives python -O; the exception type is unchanged for callers
    for u, v, w in edges:
        if d[v] > d[u] + w:
            raise AssertionError(
                "Negative-weight cycle detected on edge {} -> {}".format(u, v))
    return d, p
def dijkstra_path(graph, start, end, visited=None, distances=None,
                  predecessors=None):
    """Find the shortest path between start and end nodes in a graph

    Parameters
    ----------
    graph : dict of dict, ``graph[u][v]`` is the non-negative weight of
        edge u -> v
    start, end : nodes of `graph`
    visited, distances, predecessors : optional list / dict / dict holding
        the state of a search to resume. They are updated in place. When
        omitted, fresh containers are created for every call.

    Returns
    -------
    (distance, path) : total weight and the list of nodes from start to end

    Raises
    ------
    ValueError if `end` cannot be reached from `start`.
    """
    # The defaults used to be [] / {} literals, which Python creates once
    # and shares between calls, so a second search saw the first one's
    # state and returned stale results.
    if visited is None:
        visited = []
    if distances is None:
        distances = {}
    if predecessors is None:
        predecessors = {}
    inf = float("inf")

    current = start
    # detect if it's the first time through, set current distance to zero
    if not visited:
        distances[current] = 0
    # set mirror of visited for O(1) membership tests
    done = set(visited)

    while current != end:
        # process neighbors as per algorithm, keep track of predecessors
        for neighbor in graph[current]:
            if neighbor not in done:
                tentativedist = distances[current] + graph[current][neighbor]
                if tentativedist < distances.get(neighbor, inf):
                    distances[neighbor] = tentativedist
                    predecessors[neighbor] = current
        # neighbors processed, now mark the current node as visited
        visited.append(current)
        done.add(current)
        # finds the closest unvisited node to the start
        unvisiteds = dict((k, distances.get(k, inf))
                          for k in graph if k not in done)
        if not unvisiteds:
            raise ValueError("No path from {} to {}".format(start, end))
        current = min(unvisiteds, key=unvisiteds.get)
        if unvisiteds[current] == inf:
            # everything still unvisited is unreachable
            raise ValueError("No path from {} to {}".format(start, end))

    # we've found our end node, now find the path to it, and return
    path = []
    node = end
    while node is not None:
        path.append(node)
        node = predecessors.get(node, None)
    return distances[end], path[::-1]
def softmax(x):
    """Return the softmax of the 1D array `x` as a new array.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow; the result sums to one.
    """
    assert len(x.shape) == 1
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = np.sum(exps)
    return exps / total
class MCTS(object):
    """
    PUCT Monte Carlo tree search driven by a state manager.

    Parameters
    ----------
    state_manager : object providing ``get_next_state``, ``is_finished``,
        ``get_valid_actions``, ``get_action_space`` and
        ``rollout_from_state``
    c_puct : exploration constant handed to ``TreeNode.get_best``
    n_playout : number of playouts run for every action decision
    random_state : object providing ``choice``, ``dirichlet`` and
        ``shuffle``; required

    Raises
    ------
    ValueError if `random_state` is None.
    """
    def __init__(self, state_manager, c_puct=1.4, n_playout=1000, random_state=None):
        if random_state is None:
            raise ValueError("Must pass random_state object")
        self.random_state = random_state
        self.root = TreeNode(1., None)
        # state manager must, itself have *NO* state / updating behavior
        # internally. Otherwise we need deepcopy() in get_move_probs
        self.state_manager = state_manager
        self.c_puct = c_puct
        self.n_playout = n_playout
        # (old root, new root) pairs recorded by update_tree_root so that
        # reconstruct_tree can walk back up
        self.tree_subs_ = []
        # length of tree_subs_ above which a memory warning is printed
        self.warn_at_ = 10000

    def playout(self, state):
        """Run one playout from the root: descend, expand, roll out, back up."""
        node = self.root
        # follow the best child until a leaf is reached
        while not node.is_leaf():
            action, node = node.get_best(self.c_puct)
            state = self.state_manager.get_next_state(state, action)
        winner, score, end = self.state_manager.is_finished(state)
        if not end:
            # uniform prior probs
            actions = self.state_manager.get_valid_actions(state)
            probs = np.ones((len(actions))) / float(len(actions))
            actions_and_probs = list(zip(actions, probs))
            node.expand(actions_and_probs)
        value = self.state_manager.rollout_from_state(state)
        # negative here
        node.update(value)
        return None

    def get_action_probs(self, state, temp=1E-3):
        """
        Run ``n_playout`` playouts and return ``(actions, probs)`` computed
        from the root children's visit counts, or ``(None, None)`` when the
        root has no children.
        """
        # low temp -> nearly argmax
        for n in range(self.n_playout):
            self.playout(state)

        act_visits = [(act, node.n_visits_) for act, node in self.root.children_.items()]
        if len(act_visits) == 0:
            return None, None
        actions, visits = zip(*act_visits)
        action_probs = softmax(1. / temp * np.log(visits))
        return actions, action_probs

    def sample_action(self, state, temp=1E-3, add_noise=True,
                      dirichlet_coeff1=0.25, dirichlet_coeff2=0.3):
        """
        Sample an action from the visit-count distribution, optionally
        mixed with Dirichlet noise.

        Returns ``(action, act_probs)`` where ``act_probs`` has one entry
        per action of the full action space, or ``(None, None)`` when the
        root has no children.
        """
        vsz = len(self.state_manager.get_action_space())
        act_probs = np.zeros((vsz,))
        acts, probs = self.get_action_probs(state, temp)
        if acts is None:
            return acts, probs
        act_probs[list(acts)] = probs
        if add_noise:
            noise = self.random_state.dirichlet(dirichlet_coeff2 * np.ones(len(probs)))
            act = self.random_state.choice(acts, p=(1. - dirichlet_coeff1) * probs + dirichlet_coeff1 * noise)
        else:
            act = self.random_state.choice(acts, p=probs)
        return act, act_probs

    def get_action(self, state):
        """
        Return ``(action, act_probs)`` for the most visited action, ties
        broken at random, or ``(None, None)`` when the root has no children.
        """
        vsz = len(self.state_manager.get_action_space())
        act_probs = np.zeros((vsz,))
        # temp doesn't matter for argmax
        acts, probs = self.get_action_probs(state, temp=1.)
        if acts is None:
            return acts, probs
        act_probs[list(acts)] = probs
        maxes = np.max(act_probs)
        opts = np.where(act_probs == maxes)[0]
        if len(opts) > 1:
            # choose the one with the highest win score if equal?
            # if 2 options are *exactly* equal, just choose 1 at random
            self.random_state.shuffle(opts)
        act = opts[0]
        return act, act_probs

    def update_tree_root(self, action):
        """
        Make the child reached by `action` the new root.

        Raises
        ------
        ValueError if `action` is not a child of the current root.
        """
        if action in self.root.children_:
            self.tree_subs_.append((self.root, self.root.children_[action]))
            if len(self.tree_subs_) > self.warn_at_:
                print("WARNING: Over {} tree_subs_ detected, watch memory".format(self.warn_at_))
                # only print the warning a few times
                self.warn_at_ = 10 * self.warn_at_
            self.root = self.root.children_[action]
            self.root.parent = None
        else:
            # the message used to have two placeholders but one argument,
            # so formatting it raised IndexError instead of this ValueError
            raise ValueError("Action argument {} not in root.children_ {}".format(
                action, list(self.root.children_.keys())))

    def reconstruct_tree(self):
        """Undo every update_tree_root call, restoring the original root."""
        # walk the list back to front, putting parents back in place
        # should reconstruct tree while still preserving counts...
        # this might be a bad idea for large state spaces
        for pair in self.tree_subs_[::-1]:
            self.root.parent = pair[0]
            self.root = pair[0]
        self.tree_subs_ = []

    def reset_tree(self):
        """Throw the search tree away and start from a fresh root."""
        print("Resetting tree")
        self.root = TreeNode(1., None)
        self.tree_subs_ = []
def download(url, server_fname, local_fname=None, progress_update_percentage=5,
             bypass_certificate_check=False):
    """
    An internet download utility modified from
    http://stackoverflow.com/questions/22676/
    how-do-i-download-a-file-over-http-using-python/22776#22776

    Parameters
    ----------
    url : address to fetch
    server_fname : name printed in the progress header; also the output
        path when `local_fname` is None
    local_fname : path the payload is written to
    progress_update_percentage : a progress line is printed every time this
        many more percent of the announced size have arrived
    bypass_certificate_check : disable TLS hostname and certificate
        verification; only for hosts you trust
    """
    if bypass_certificate_check:
        import ssl
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        u = urllib.urlopen(url, context=ctx)
    else:
        u = urllib.urlopen(url)
    # the response was never closed before; try/finally is used instead of
    # "with" because the urllib2 response object is not a context manager
    try:
        if local_fname is None:
            local_fname = server_fname
        full_path = local_fname
        meta = u.info()
        try:
            file_size = int(meta.get("Content-Length"))
        except (TypeError, ValueError):
            file_size = None
        if file_size is not None and file_size <= 0:
            file_size = None
        if file_size is None:
            # no usable size announced: report bytes only, not percentages
            # of a made-up size
            print("WARNING: Cannot get file size, displaying bytes instead!")
            print("Downloading: %s Bytes: unknown" % server_fname)
        else:
            print("Downloading: %s Bytes: %s" % (server_fname, file_size))
        with open(full_path, 'wb') as f:
            file_size_dl = 0
            block_sz = int(1E7)
            p = 0
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    break
                file_size_dl += len(chunk)
                f.write(chunk)
                if file_size is None:
                    print(r"%10d" % file_size_dl)
                elif (file_size_dl * 100. / file_size) > p:
                    status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl *
                                                   100. / file_size)
                    print(status)
                    p += progress_update_percentage
    finally:
        u.close()
def fetch_sample_speech_eustace(n_samples=None):
    """
    Download (if needed) and load utterances from the EUSTACE speech corpus.

    http://www.cstr.ed.ac.uk/projects/eustace/download.html

    Parameters
    ----------
    n_samples : int or None, optional (default=None)
        Number of archive members to load. None loads every member.

    Returns
    -------
    fs : int
        Sample rate, hard-coded to 16000.

    speech : list of ndarray
        One float32 array per loaded member, scaled by 1 / 2 ** 15.

    Notes
    -----
    The label archive is downloaded and read as well, but the labels are not
    part of the return value.
    """
    # data
    url = "http://www.cstr.ed.ac.uk/projects/eustace/down/eustace_wav.zip"
    wav_path = "eustace_wav.zip"
    if not os.path.exists(wav_path):
        download(url, wav_path)

    # labels
    url = "http://www.cstr.ed.ac.uk/projects/eustace/down/eustace_labels.zip"
    labels_path = "eustace_labels.zip"
    if not os.path.exists(labels_path):
        download(url, labels_path)

    # Read wavfiles
    # 16 kHz wav
    fs = 16000
    speech = []
    print("Loading speech files...")
    # Context manager so the archive handle is released even on error
    with zipfile.ZipFile(wav_path, 'r') as zf:
        wav_names = [fname for fname in zf.namelist()
                     if ".wav" in fname.split(os.sep)[-1]]
        for wav_name in wav_names[:n_samples]:
            wav_str = zf.read(wav_name)
            # NOTE(review): the whole member is decoded as int16 samples, so
            # any container header bytes end up in the signal - confirm the
            # members are header-less or parse them with wavfile.read
            d = np.frombuffer(wav_str, dtype=np.int16)
            d = d.astype('float32') / (2 ** 15)
            speech.append(d)

    labels = []
    print("Loading label files...")
    with zipfile.ZipFile(labels_path, 'r') as zf:
        label_names = [fname for fname in zf.namelist()
                       if ".lab" in fname.split(os.sep)[-1]]
        for label_name in label_names[:n_samples]:
            label_file_str = zf.read(label_name)
            labels.append(label_file_str)
    return fs, speech
def nsgcwin(fmin, fmax, n_bins, fs, signal_len, gamma):
    """
    Nonstationary Gabor window calculation

    Parameters
    ----------
    fmin : float
        Lowest center frequency.

    fmax : float
        Upper frequency bound used to count the geometrically spaced bins.

    n_bins : int
        Number of bins per octave.

    fs : int or float
        Sample rate of the signal.

    signal_len : int
        Length of the signal in samples.

    gamma : float
        Constant added to every constant-Q bandwidth.

    Returns
    -------
    multiscale : list of ndarray
        One window per channel; ``len(multiscale[i]) == M[i]``.

    shift : ndarray, int32
        Shift, in frequency samples, between consecutive window centers.

    M : ndarray, float
        Integer-valued window lengths (at least 4), kept as floats.

    References
    ----------
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011

    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.

    Original matlab code copyright follows:

    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011

    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    # use a hanning window
    # no fractional shifts
    fftres = fs / signal_len
    fmin = float(fmin)
    fmax = float(fmax)
    gamma = float(gamma)
    nyq = fs / 2.
    b = np.floor(n_bins * np.log2(fmax / fmin))
    fbas = fmin * 2 ** (np.arange(b + 1) / float(n_bins))
    Q = 2 ** (1. / n_bins) - 2 ** (-1. / n_bins)
    cqtbw = Q * fbas + gamma
    cqtbw = cqtbw.ravel()
    maxidx = np.where(fbas + cqtbw / 2. > nyq)[0]
    if len(maxidx) > 0:
        # replicate bug in MATLAB version...
        # or is it a feature
        if sum(maxidx) == 0:
            first = len(cqtbw) - 1
        else:
            first = maxidx[0]
        fbas = fbas[:first]
        cqtbw = cqtbw[:first]
    minidx = np.where(fbas - cqtbw / 2. < 0)[0]
    if len(minidx) > 0:
        fbas = fbas[minidx[-1] + 1:]
        cqtbw = cqtbw[minidx[-1] + 1:]

    # Layout: [DC, bins..., Nyquist, mirrored bins...]
    fbas_len = len(fbas)
    fbas_new = np.zeros((2 * (len(fbas) + 1)))
    fbas_new[1:len(fbas) + 1] = fbas
    fbas = fbas_new
    fbas[fbas_len + 1] = nyq
    fbas[fbas_len + 2:] = fs - fbas[1:fbas_len + 1][::-1]
    bw = np.zeros_like(fbas)
    bw[0] = 2 * fmin
    bw[1:len(cqtbw) + 1] = cqtbw
    bw[len(cqtbw) + 1] = fbas[fbas_len + 2] - fbas[fbas_len]
    bw[-len(cqtbw):] = cqtbw[::-1]
    bw = bw / fftres
    fbas = fbas / fftres

    posit = np.zeros_like(fbas)
    posit[:fbas_len + 2] = np.floor(fbas[:fbas_len + 2])
    posit[fbas_len + 2:] = np.ceil(fbas[fbas_len + 2:])
    base_shift = -posit[-1] % signal_len
    shift = np.zeros_like(posit).astype("int32")
    shift[1:] = (posit[1:] - posit[:-1]).astype("int32")
    shift[0] = base_shift

    # Window lengths: rounded bandwidths, never shorter than min_win
    min_win = 4
    M = np.maximum(np.round(bw), min_win)

    def _win(numel):
        # Hann window evaluated at positions 0, 1/n, ..., then the negative
        # half; fftfreq produces this ordering from integer indices, which
        # avoids the float arange length edge cases
        x = np.fft.fftfreq(numel)
        return .5 + .5 * np.cos(2 * np.pi * x)

    # M holds integer values as floats; array sizes must be real ints
    multiscale = [_win(int(m)) for m in M]

    for kk in [0, fbas_len + 1]:
        if M[kk] > M[kk + 1]:
            outer = int(M[kk])
            inner = int(M[kk + 1])
            # Flat window with the neighbour's Hann window in its middle
            multiscale[kk] = np.ones(outer).astype(multiscale[0].dtype)
            i1 = outer // 2 - inner // 2
            i2 = outer // 2 + (inner + 1) // 2
            multiscale[kk][i1:i2] = _win(inner)
            multiscale[kk] = multiscale[kk] / np.sqrt(M[kk])
    return multiscale, shift, M
def nsdual(multiscale, shift, window_lens):
    """
    Calculation of nonstationary inverse gabor filters

    Parameters
    ----------
    multiscale : list of ndarray
        Analysis windows, one per channel. The list is not modified.

    shift : array-like of int
        Shift between consecutive window centers; the sum is the length of
        the frequency axis.

    window_lens : array-like
        Per-channel length used to weight each window's contribution.

    Returns
    -------
    dual_multiscale : list of ndarray
        New list holding the dual window for every channel.

    References
    ----------
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011

    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.

    Original matlab code copyright follows:

    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011

    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    N = len(shift)
    posit = np.cumsum(shift)
    seq_len = posit[-1]
    posit = posit - shift[0]

    # Accumulate the weighted squared windows at the positions they cover
    diagonal = np.zeros((seq_len,))
    win_range = []
    for ii in range(N):
        filt_len = len(multiscale[ii])
        # Integer offsets -floor(n / 2) ... ceil(n / 2) - 1
        idx = np.arange(-(filt_len // 2), (filt_len + 1) // 2)
        # builtin int: the np.int alias no longer exists in current NumPy
        ind = ((posit[ii] + idx) % seq_len).astype(int)
        win_range.append(ind)
        subdiag = window_lens[ii] * np.fft.fftshift(multiscale[ii]) ** 2
        diagonal[ind] = diagonal[ind] + subdiag

    # Build a fresh list so the caller's analysis windows stay untouched
    dual_multiscale = []
    for ii in range(N):
        dual_multiscale.append(np.fft.ifftshift(
            np.fft.fftshift(multiscale[ii]) / diagonal[win_range[ii]]))
    return dual_multiscale
495 | Constructing an invertible constant-Q transform with nonstationary Gabor 496 | frames, Proceedings of the 14th International Conference on Digital 497 | Audio Effects (DAFx 11), Paris, France, 2011 498 | 499 | Holighaus N., Dorfler M., Velasco G. A. and Grill T. 500 | A framework for invertible, real-time constant-Q transforms, submitted. 501 | 502 | Original matlab code copyright follows: 503 | 504 | AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011 505 | 506 | COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA 507 | http://nuhag.eu/ 508 | Permission is granted to modify and re-distribute this 509 | code in any manner as long as this notice is preserved. 510 | All standard disclaimers apply. 511 | """ 512 | c_l = [] 513 | c_l.append(c_dc) 514 | c_l.extend([ci for ci in c]) 515 | c_l.append(c_nyq) 516 | 517 | posit = np.cumsum(shift) 518 | seq_len = posit[-1] 519 | posit -= shift[0] 520 | out = np.zeros((seq_len,)).astype(c_l[1].dtype) 521 | 522 | for ii in range(len(c_l)): 523 | filt_len = len(multiscale[ii]) 524 | win_range = posit[ii] + np.arange(-np.floor(filt_len / 2), 525 | np.ceil(filt_len / 2)) 526 | win_range = (win_range % seq_len).astype(np.int) 527 | temp = np.fft.fft(c_l[ii]) * len(c_l[ii]) 528 | 529 | fs_new_bins = len(c_l[ii]) 530 | fk_bins = posit[ii] 531 | displace = int(fk_bins - np.floor(fk_bins / fs_new_bins) * fs_new_bins) 532 | temp = np.roll(temp, -displace) 533 | l = np.arange(len(temp) - np.floor(filt_len / 2), len(temp)) 534 | r = np.arange(np.ceil(filt_len / 2)) 535 | temp_idx = (np.concatenate((l, r)) % len(temp)).astype(np.int) 536 | temp = temp[temp_idx] 537 | lf = np.arange(filt_len - np.floor(filt_len / 2), filt_len) 538 | rf = np.arange(np.ceil(filt_len / 2)) 539 | filt_idx = np.concatenate((lf, rf)).astype(np.int) 540 | m = multiscale[ii][filt_idx] 541 | out[win_range] = out[win_range] + m * temp 542 | 543 | nyq_bin = np.floor(seq_len / 2) + 1 544 | out_idx = np.arange( 545 | nyq_bin - 
def cqt(X, fs, n_bins=48, fmin=27.5, fmax="nyq", gamma=20):
    """
    Constant Q Transform

    Parameters
    ----------
    X : ndarray
        Input signal; its length sets the transform length.

    fs : int or float
        Sample rate of ``X``.

    n_bins : int, optional (default=48)
        Bins per octave, passed to ``nsgcwin``.

    fmin : float, optional (default=27.5)
        Lowest center frequency, passed to ``nsgcwin``.

    fmax : float or "nyq", optional (default="nyq")
        Upper frequency bound; "nyq" selects ``fs / 2``.

    gamma : float, optional (default=20)
        Bandwidth offset, passed to ``nsgcwin``.

    Returns
    -------
    c : ndarray
        Stacked coefficients of every channel except the first and last.

    c_dc : ndarray
        Coefficients of the first channel.

    c_nyq : ndarray
        Coefficients of the last channel returned by ``nsgtf_real``.

    multiscale : list of ndarray
        Normalized analysis windows.

    shift : ndarray
        Window shifts from ``nsgcwin``.

    window_lens : ndarray
        Window lengths after equalization (modified in place below).

    References
    ----------
    Velasco G. A., Holighaus N., Dorfler M., Grill T.
    Constructing an invertible constant-Q transform with nonstationary Gabor
    frames, Proceedings of the 14th International Conference on Digital
    Audio Effects (DAFx 11), Paris, France, 2011

    Holighaus N., Dorfler M., Velasco G. A. and Grill T.
    A framework for invertible, real-time constant-Q transforms, submitted.

    Original matlab code copyright follows:

    AUTHOR(s) : Monika Dorfler, Gino Angelo Velasco, Nicki Holighaus, 2010-2011

    COPYRIGHT : (c) NUHAG, Dept.Math., University of Vienna, AUSTRIA
    http://nuhag.eu/
    Permission is granted to modify and re-distribute this
    code in any manner as long as this notice is preserved.
    All standard disclaimers apply.
    """
    if fmax == "nyq":
        fmax = fs / 2.
    multiscale, shift, window_lens = nsgcwin(fmin, fmax, n_bins, fs,
                                             len(X), gamma)
    bins = window_lens.shape[0] // 2 - 1
    # Give every channel between the first and the middle one the same
    # length so the coefficient rows can be stacked, then mirror them
    window_lens[1:bins + 1] = window_lens[bins + 2]
    window_lens[bins + 2:] = window_lens[1:bins + 1][::-1]
    norm = 2. * window_lens[:bins + 2] / float(len(X))
    norm = np.concatenate((norm, norm[1:-1][::-1]))
    multiscale = [norm[ii] * multiscale[ii] for ii in range(2 * (bins + 1))]

    c = nsgtf_real(X, multiscale, shift, window_lens)
    c_dc = c[0]
    c_nyq = c[-1]
    c_sub = c[1:-1]
    c = np.vstack(c_sub)
    return c, c_dc, c_nyq, multiscale, shift, window_lens
def rolling_window(X, window_size):
    """
    Return a strided view of every length ``window_size`` run along the last
    axis of ``X``, advancing one element at a time.

    The result has shape ``X.shape[:-1] + (n - window_size + 1, window_size)``
    where ``n`` is the size of the last axis. No data is copied.
    """
    # Written with 1d data in mind
    n_windows = X.shape[-1] - window_size + 1
    step_bytes = X.strides[-1]
    view_shape = X.shape[:-1] + (n_windows, window_size)
    # Moving to the next window and to the next element inside a window both
    # advance by one element of the last axis
    view_strides = X.strides + (step_bytes,)
    return np.lib.stride_tricks.as_strided(
        X, shape=view_shape, strides=view_strides)
660 | """ 661 | X = np.array(X, copy=copy) 662 | if len(X.shape) < 2: 663 | X = X[None] 664 | n_points = X.shape[1] 665 | n_windows = n_points // window_step 666 | # Padding 667 | pad_sizes = [(window_size - window_step) // 2, 668 | window_size - window_step // 2] 669 | # TODO: Handling for odd window sizes / steps 670 | X = np.hstack((np.zeros((X.shape[0], pad_sizes[0])), X, 671 | np.zeros((X.shape[0], pad_sizes[1])))) 672 | 673 | clipping_factor = 0.6 674 | b, a = sg.butter(10, np.pi * 9 / 40) 675 | voiced_unvoiced = np.zeros((n_windows, 1)) 676 | period = np.zeros((n_windows, 1)) 677 | for window in range(max(n_windows - 1, 1)): 678 | XX = X.ravel()[window * window_step + np.arange(window_size)] 679 | XX *= sg.hamming(len(XX)) 680 | XX = sg.lfilter(b, a, XX) 681 | left_max = np.max(np.abs(XX[:len(XX) // 3])) 682 | right_max = np.max(np.abs(XX[-len(XX) // 3:])) 683 | clip_value = clipping_factor * np.min([left_max, right_max]) 684 | XX_clip = np.clip(XX, clip_value, -clip_value) 685 | XX_corr = np.correlate(XX_clip, XX_clip, mode='full') 686 | center = np.argmax(XX_corr) 687 | right_XX_corr = XX_corr[center:] 688 | prev_window = max([window - 1, 0]) 689 | if voiced_unvoiced[prev_window] > 0: 690 | # Want it to be harder to turn off than turn on 691 | strength_factor = .29 692 | else: 693 | strength_factor = .3 694 | start = np.where(right_XX_corr < .3 * XX_corr[center])[0] 695 | # 20 is hardcoded but should depend on samplerate? 
696 | try: 697 | start = np.max([20, start[0]]) 698 | except IndexError: 699 | start = 20 700 | search_corr = right_XX_corr[start:] 701 | index = np.argmax(search_corr) 702 | second_max = search_corr[index] 703 | if (second_max > strength_factor * XX_corr[center]): 704 | voiced_unvoiced[window] = 1 705 | period[window] = start + index - 1 706 | else: 707 | voiced_unvoiced[window] = 0 708 | period[window] = 0 709 | return np.array(voiced_unvoiced), np.array(period) 710 | 711 | 712 | def lpc_analysis(X, order=8, window_step=128, window_size=2 * 128, 713 | emphasis=0.9, voiced_start_threshold=.9, 714 | voiced_stop_threshold=.6, truncate=False, copy=True): 715 | """ 716 | Extract LPC coefficients from a signal 717 | 718 | Based on code from: 719 | http://labrosa.ee.columbia.edu/matlab/sws/ 720 | 721 | _rParameters 722 | ---------- 723 | X : ndarray 724 | Signals to extract LPC coefficients from 725 | 726 | order : int, optional (default=8) 727 | Order of the LPC coefficients. For speech, use the general rule that the 728 | order is two times the expected number of formants plus 2. 729 | This can be formulated as 2 + 2 * (fs // 2000). For approx. signals 730 | with fs = 7000, this is 8 coefficients - 2 + 2 * (7000 // 2000). 731 | 732 | window_step : int, optional (default=128) 733 | The size (in samples) of the space between each window 734 | 735 | window_size : int, optional (default=2 * 128) 736 | The size of each window (in samples) to extract coefficients over 737 | 738 | emphasis : float, optional (default=0.9) 739 | The emphasis coefficient to use for filtering 740 | 741 | voiced_start_threshold : float, optional (default=0.9) 742 | Upper power threshold for estimating when speech has started 743 | 744 | voiced_stop_threshold : float, optional (default=0.6) 745 | Lower power threshold for estimating when speech has stopped 746 | 747 | truncate : bool, optional (default=False) 748 | Whether to cut the data at the last window or do zero padding. 
749 | 750 | copy : bool, optional (default=True) 751 | Whether to copy the input X or modify in place 752 | 753 | Returns 754 | ------- 755 | lp_coefficients : ndarray 756 | lp coefficients to describe the frame 757 | 758 | per_frame_gain : ndarray 759 | calculated gain for each frame 760 | 761 | residual_excitation : ndarray 762 | leftover energy which is not described by lp coefficents and gain 763 | 764 | voiced_frames : ndarray 765 | array of [0, 1] values which holds voiced/unvoiced decision for each 766 | frame. 767 | 768 | References 769 | ---------- 770 | D. P. W. Ellis (2004), "Sinewave Speech Analysis/Synthesis in Matlab", 771 | Web resource, available: http://www.ee.columbia.edu/ln/labrosa/matlab/sws/ 772 | """ 773 | X = np.array(X, copy=copy) 774 | if len(X.shape) < 2: 775 | X = X[None] 776 | 777 | n_points = X.shape[1] 778 | n_windows = int(n_points // window_step) 779 | if not truncate: 780 | pad_sizes = [(window_size - window_step) // 2, 781 | window_size - window_step // 2] 782 | # TODO: Handling for odd window sizes / steps 783 | X = np.hstack((np.zeros((X.shape[0], int(pad_sizes[0]))), X, 784 | np.zeros((X.shape[0], int(pad_sizes[1]))))) 785 | else: 786 | pad_sizes = [0, 0] 787 | X = X[0, :n_windows * window_step] 788 | 789 | lp_coefficients = np.zeros((n_windows, order + 1)) 790 | per_frame_gain = np.zeros((n_windows, 1)) 791 | residual_excitation = np.zeros( 792 | int(((n_windows - 1) * window_step + window_size))) 793 | # Pre-emphasis high-pass filter 794 | X = sg.lfilter([1, -emphasis], 1, X) 795 | # stride_tricks.as_strided? 
def lpc_to_frequency(lp_coefficients, per_frame_gain):
    """
    Extract resonant frequencies and magnitudes from LPC coefficients and gains.

    Parameters
    ----------
    lp_coefficients : ndarray
        LPC coefficients, such as those calculated by ``lpc_analysis``.
        One row per frame.

    per_frame_gain : ndarray
        Gain calculated for each frame, such as those calculated
        by ``lpc_analysis``

    Returns
    -------
    frequencies : ndarray
        Resonant frequencies per frame in radians, sorted ascending and
        taken from the absolute root angles, so they lie in (0, pi].
        Unused trailing entries of a row stay 0.

    magnitudes : ndarray
        Magnitudes ``gain / (1 - |root|)`` of the root belonging to each
        returned frequency.

    References
    ----------
    D. P. W. Ellis (2004), "Sinewave Speech Analysis/Synthesis in Matlab",
    Web resource, available: http://www.ee.columbia.edu/ln/labrosa/matlab/sws/
    """
    n_windows, order = lp_coefficients.shape

    frame_frequencies = np.zeros((n_windows, (order - 1) // 2))
    frame_magnitudes = np.zeros_like(frame_frequencies)

    for window in range(n_windows):
        w_coefs = lp_coefficients[window]
        g_coefs = per_frame_gain[window]
        roots = np.roots(np.hstack(([1], w_coefs[1:])))
        # Conjugate pairs collapse to one entry; np.unique returns the
        # angles sorted plus the position of one matching root for each
        frequencies, index = np.unique(
            np.abs(np.angle(roots)), return_index=True)
        # Make sure 0 doesn't show up...
        keep = frequencies > 0
        frequencies = frequencies[keep]
        index = index[keep]
        # Take the magnitude of the root that produced each frequency
        magnitudes = g_coefs / (1. - np.abs(roots[index]))
        n_found = len(frequencies)
        frame_frequencies[window, :n_found] = frequencies
        frame_magnitudes[window, :n_found] = magnitudes
    return frame_frequencies, frame_magnitudes
def lsf_to_lpc(all_lsf):
    """
    Convert line spectral frequencies back to LPC coefficients.

    ``all_lsf`` holds one set of frequencies (radians) per row; a 1D input
    is treated as a single row. Each row of ``order`` frequencies produces
    ``order + 1`` coefficients, and the result is squeezed before returning.
    """
    if len(all_lsf.shape) < 2:
        all_lsf = all_lsf[None]
    n_frames = len(all_lsf)
    order = all_lsf.shape[1]
    odd_order = order % 2 != 0
    all_lpc = np.zeros((n_frames, order + 1))
    for row, lsf in enumerate(all_lsf):
        unit_roots = np.exp(1j * lsf)
        # Even positions feed the sum polynomial, odd positions the
        # difference polynomial; each root is paired with its conjugate
        even_roots = unit_roots[::2]
        odd_roots = unit_roots[1::2]
        sum_poly = np.poly(np.hstack((even_roots, np.conj(even_roots))))
        diff_poly = np.poly(np.hstack((odd_roots, np.conj(odd_roots))))

        # Multiply the fixed factors back in
        if odd_order:
            diff_full = sg.convolve(diff_poly, [1, 0, -1])
            sum_full = sum_poly
        else:
            diff_full = sg.convolve(diff_poly, [1, -1])
            sum_full = sg.convolve(sum_poly, [1, 1])

        averaged = .5 * (sum_full + diff_full)
        # The trailing coefficient is 0, so it is dropped
        all_lpc[row] = averaged[:-1]
    return np.squeeze(all_lpc)
947 | 948 | window_step : int, optional (default=128) 949 | The size (in samples) of the space between each window 950 | 951 | emphasis : float, optional (default=0.9) 952 | The emphasis coefficient to use for filtering 953 | 954 | overlap_add : bool, optional (default=True) 955 | What type of processing to use when joining windows 956 | 957 | copy : bool, optional (default=True) 958 | Whether to copy the input X or modify in place 959 | 960 | Returns 961 | ------- 962 | synthesized : ndarray 963 | Sound vector synthesized from input arguments 964 | 965 | References 966 | ---------- 967 | D. P. W. Ellis (2004), "Sinewave Speech Analysis/Synthesis in Matlab", 968 | Web resource, available: http://www.ee.columbia.edu/ln/labrosa/matlab/sws/ 969 | """ 970 | # TODO: Incorporate better synthesis from 971 | # http://eecs.oregonstate.edu/education/docs/ece352/CompleteManual.pdf 972 | window_size = 2 * window_step 973 | [n_windows, order] = lp_coefficients.shape 974 | 975 | n_points = (n_windows + 1) * window_step 976 | n_excitation_points = n_points + window_step + window_step // 2 977 | 978 | random_state = np.random.RandomState(1999) 979 | if residual_excitation is None: 980 | # Need to generate excitation 981 | if voiced_frames is None: 982 | # No voiced/unvoiced info 983 | voiced_frames = np.ones((lp_coefficients.shape[0], 1)) 984 | residual_excitation = np.zeros((n_excitation_points)) 985 | f, m = lpc_to_frequency(lp_coefficients, per_frame_gain) 986 | t = np.linspace(0, 1, window_size, endpoint=False) 987 | hanning = sg.hanning(window_size) 988 | for window in range(n_windows): 989 | window_base = window * window_step 990 | index = window_base + np.arange(window_size) 991 | if voiced_frames[window]: 992 | sig = np.zeros_like(t) 993 | cycles = np.cumsum(f[window][0] * t) 994 | sig += sg.sawtooth(cycles, 0.001) 995 | residual_excitation[index] += hanning * sig 996 | residual_excitation[index] += hanning * 0.01 * random_state.randn( 997 | window_size) 998 | else: 999 | 
def soundsc(X, gain_scale=.9, copy=True):
    """
    Approximate implementation of soundsc from MATLAB without the audio playing.

    Parameters
    ----------
    X : ndarray
        Signal to be rescaled

    gain_scale : float
        Gain multipler, default .9 (90% of maximum representation)

    copy : bool, optional (default=True)
        Whether to make a copy of input signal or operate in place.

    Returns
    -------
    X_sc : ndarray
        int16 version of X, with the input minimum mapped to
        ``-gain_scale * 2 ** 15`` and the maximum to ``gain_scale * 2 ** 15``
        (clipped to the int16 range), suitable for writing with
        scipy.io.wavfile. A constant signal maps to all zeros and an empty
        signal to an empty array.
    """
    # asarray avoids the error np.array(copy=False) raises on NumPy 2 when
    # a copy cannot be avoided (for example list input)
    X = np.array(X, copy=True) if copy else np.asarray(X)
    if X.size == 0:
        return X.astype('int16')
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # No dynamic range to stretch; avoid dividing by zero
        return np.zeros(X.shape, dtype='int16')
    X = (X - lowest) / span
    X = 2 * X - 1
    X = gain_scale * X
    X = X * 2 ** 15
    # +2 ** 15 is not representable in int16 and would wrap around
    X = np.clip(X, -2 ** 15, 2 ** 15 - 1)
    return X.astype('int16')
* (max_mel - min_mel) 1635 | bin_freqs = mel_to_herz(min_mel + partial) 1636 | bin_bin = np.round(bin_freqs / fs * (n_fft - 1)) 1637 | for i in range(n_filts): 1638 | fs_i = bin_freqs[i + np.arange(3)] 1639 | fs_i = fs_i[1] + width * (fs_i - fs_i[1]) 1640 | lo_slope = (fft_freqs - fs_i[0]) / float(fs_i[1] - fs_i[0]) 1641 | hi_slope = (fs_i[2] - fft_freqs) / float(fs_i[2] - fs_i[1]) 1642 | weights[i, :n_fft // 2] = np.maximum( 1643 | 0, np.minimum(lo_slope, hi_slope)) 1644 | # Constant amplitude multiplier 1645 | weights = np.diag(2. / (bin_freqs[2:n_filts + 2] 1646 | - bin_freqs[:n_filts])).dot(weights) 1647 | weights[:, n_fft // 2:] = 0 1648 | return weights 1649 | 1650 | 1651 | def time_attack_agc(X, fs, t_scale=0.5, f_scale=1.): 1652 | """ 1653 | AGC based on code by Dan Ellis 1654 | 1655 | http://labrosa.ee.columbia.edu/matlab/tf_agc/ 1656 | """ 1657 | # 32 ms grid for FFT 1658 | n_fft = 2 ** int(np.log(0.032 * fs) / np.log(2)) 1659 | f_scale = float(f_scale) 1660 | window_size = n_fft 1661 | window_step = window_size // 2 1662 | X_freq = stft(X, window_size, mean_normalize=False) 1663 | fft_fs = fs / window_step 1664 | n_bands = max(10, 20 / f_scale) 1665 | mel_width = f_scale * n_bands / 10. 1666 | f_to_a = mel_freq_weights(n_fft, fs, n_bands, mel_width) 1667 | f_to_a = f_to_a[:, :n_fft // 2 + 1] 1668 | audiogram = np.abs(X_freq).dot(f_to_a.T) 1669 | fbg = np.zeros_like(audiogram) 1670 | state = np.zeros((audiogram.shape[1],)) 1671 | alpha = np.exp(-(1. / fft_fs) / t_scale) 1672 | for i in range(len(audiogram)): 1673 | state = np.maximum(alpha * state, audiogram[i]) 1674 | fbg[i] = state 1675 | 1676 | sf_to_a = np.sum(f_to_a, axis=0) 1677 | E = np.diag(1. 
/ (sf_to_a + (sf_to_a == 0))) 1678 | E = E.dot(f_to_a.T) 1679 | E = fbg.dot(E.T) 1680 | E[E <= 0] = np.min(E[E > 0]) 1681 | ts = istft(X_freq / E, window_size, mean_normalize=False) 1682 | return ts, X_freq, E 1683 | 1684 | 1685 | def hebbian_kmeans(X, n_clusters=10, n_epochs=10, W=None, learning_rate=0.01, 1686 | batch_size=100, random_state=None, verbose=True): 1687 | """ 1688 | Modified from existing code from R. Memisevic 1689 | See http://www.cs.toronto.edu/~rfm/code/hebbian_kmeans.py 1690 | """ 1691 | if W is None: 1692 | if random_state is None: 1693 | random_state = np.random.RandomState() 1694 | W = 0.1 * random_state.randn(n_clusters, X.shape[1]) 1695 | else: 1696 | assert n_clusters == W.shape[0] 1697 | X2 = (X ** 2).sum(axis=1, keepdims=True) 1698 | last_print = 0 1699 | for e in range(n_epochs): 1700 | for i in range(0, X.shape[0], batch_size): 1701 | X_i = X[i: i + batch_size] 1702 | X2_i = X2[i: i + batch_size] 1703 | D = -2 * np.dot(W, X_i.T) 1704 | D += (W ** 2).sum(axis=1, keepdims=True) 1705 | D += X2_i.T 1706 | S = (D == D.min(axis=0)[None, :]).astype("float").T 1707 | W += learning_rate * ( 1708 | np.dot(S.T, X_i) - S.sum(axis=0)[:, None] * W) 1709 | if verbose: 1710 | if e == 0 or e > (.05 * n_epochs + last_print): 1711 | last_print = e 1712 | print("Epoch %i of %i, cost %.4f" % ( 1713 | e + 1, n_epochs, D.min(axis=0).sum())) 1714 | return W 1715 | 1716 | 1717 | def complex_to_real_view(arr_c): 1718 | # Inplace view from complex to r, i as separate columns 1719 | assert arr_c.dtype in [np.complex64, np.complex128] 1720 | shp = arr_c.shape 1721 | dtype = np.float64 if arr_c.dtype == np.complex128 else np.float32 1722 | arr_r = arr_c.ravel().view(dtype=dtype).reshape(shp[0], 2 * shp[1]) 1723 | return arr_r 1724 | 1725 | 1726 | def real_to_complex_view(arr_r): 1727 | # Inplace view from real, image as columns to complex 1728 | assert arr_r.dtype not in [np.complex64, np.complex128] 1729 | shp = arr_r.shape 1730 | dtype = np.complex128 if 
def polyphase_core(x, m, f):
    """
    Compute the per-branch outputs of a polyphase decimating filter.

    Parameters
    ----------
    x : ndarray
        1D input data

    m : int
        Decimation rate

    f : ndarray
        1D filter coefficients

    Returns
    -------
    p : ndarray, shape (m, (len(x) + len(f)) // m)
        One row per polyphase branch, where the lengths are those of ``x``
        and ``f`` after zero padding to a multiple of ``m``. Summing over
        axis 0 (as ``polyphase_single_filter`` does) combines the branches.
        The array takes the dtype of ``x``.
    """
    # Hack job - append zeros to match decimation rate
    if x.shape[0] % m != 0:
        x = np.append(x, np.zeros((m - x.shape[0] % m,)))
    if f.shape[0] % m != 0:
        f = np.append(f, np.zeros((m - f.shape[0] % m,)))
    # Both lengths are exact multiples of m here, so floor division is exact
    # and keeps the shape an integer (true division yields a float on
    # Python 3, which np.zeros rejects).
    p = np.zeros((m, (x.shape[0] + f.shape[0]) // m), dtype=x.dtype)
    p[0, :-1] = np.convolve(x[::m], f[::m])
    # Invert the x values when applying filters
    for i in range(1, m):
        p[i, 1:] = np.convolve(x[m - i::m], f[i::m])
    return p
1786 | 1787 | 1788 | 1789 | """ 1790 | if window_size < 1: 1791 | raise ValueError("`window_size` must be at least 1.") 1792 | if window_size > arr.shape[-1]: 1793 | raise ValueError("`window_size` is too long.") 1794 | 1795 | orig = list(range(len(arr.shape))) 1796 | trans = list(range(len(arr.shape))) 1797 | trans[axis] = orig[-1] 1798 | trans[-1] = orig[axis] 1799 | arr = arr.transpose(trans) 1800 | 1801 | shape = arr.shape[:-1] + (arr.shape[-1] - window_size + 1, window_size) 1802 | strides = arr.strides + (arr.strides[-1],) 1803 | strided = as_strided(arr, shape=shape, strides=strides) 1804 | 1805 | if window_step > 1: 1806 | strided = strided[..., ::window_step, :] 1807 | 1808 | orig = list(range(len(strided.shape))) 1809 | trans = list(range(len(strided.shape))) 1810 | trans[-2] = orig[-1] 1811 | trans[-1] = orig[-2] 1812 | trans = trans[::-1] 1813 | strided = strided.transpose(trans) 1814 | return strided 1815 | 1816 | 1817 | def unwindow(arr, window_size, window_step=1, axis=0): 1818 | # undo windows by broadcast 1819 | if axis != 0: 1820 | raise ValueError("axis != 0 currently unsupported") 1821 | shp = arr.shape 1822 | unwindowed = np.tile(arr[:, None, ...], (1, window_step, 1, 1)) 1823 | unwindowed = unwindowed.reshape(shp[0] * window_step, *shp[1:]) 1824 | return unwindowed.mean(axis=1) 1825 | 1826 | 1827 | def xcorr_offset(x1, x2): 1828 | """ 1829 | Under MSR-LA License 1830 | 1831 | Based on MATLAB implementation from Spectrogram Inversion Toolbox 1832 | 1833 | References 1834 | ---------- 1835 | D. Griffin and J. Lim. Signal estimation from modified 1836 | short-time Fourier transform. IEEE Trans. Acoust. Speech 1837 | Signal Process., 32(2):236-243, 1984. 1838 | 1839 | Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory 1840 | Model Inversion for Sound Separation. Proc. IEEE-ICASSP, 1841 | Adelaide, 1994, II.77-80. 1842 | 1843 | Xinglei Zhu, G. Beauregard, L. Wyse. 
def invert_spectrogram(X_s, step, calculate_offset=True, set_zero_phase=True):
    """
    Under MSR-LA License

    Based on MATLAB implementation from Spectrogram Inversion Toolbox

    Overlap-adds the inverse FFT of every row of ``X_s`` using a Hamming
    window, then normalizes by the accumulated window weight.

    Parameters
    ----------
    X_s : ndarray, shape (n_frames, fftsize)
        Two-sided spectrogram, one frame per row

    step : int
        Number of samples between the starts of consecutive frames

    calculate_offset : bool, optional (default=True)
        If True, every frame after the first is shifted by the lag returned
        from ``xcorr_offset`` before being added

    set_zero_phase : bool, optional (default=True)
        If True only the real part of each frame is used, otherwise the
        complex values are inverted as given

    Returns
    -------
    wave : ndarray, shape (n_frames * step + fftsize // 2,)
        Reconstructed time domain signal

    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.

    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.

    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    size = int(X_s.shape[1] // 2)
    n_samples = X_s.shape[0] * step + size
    # Getting overflow warnings with 32 bit...
    wave = np.zeros((n_samples,), dtype='float64')
    total_windowing_sum = np.zeros((n_samples,))
    # Hamming window
    win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(size) / (size - 1))

    est_start = int(size // 2) - 1
    est_end = est_start + size
    for i in range(X_s.shape[0]):
        wave_start = int(step * i)
        wave_end = wave_start + size
        if set_zero_phase:
            spectral_slice = X_s[i].real + 0j
        else:
            # already complex
            spectral_slice = X_s[i]

        # Don't need fftshift due to different impl.
        wave_est = np.real(np.fft.ifft(spectral_slice))[::-1]
        if calculate_offset and i > 0:
            offset_size = size - step
            if offset_size <= 0:
                print("WARNING: Large step size >50% detected! "
                      "This code works best with high overlap - try "
                      "with 75% or greater")
                offset_size = step
            offset = xcorr_offset(wave[wave_start:wave_start + offset_size],
                                  wave_est[est_start:est_start + offset_size])
        else:
            offset = 0
        wave[wave_start:wave_end] += win * wave_est[
            est_start - offset:est_end - offset]
        total_windowing_sum[wave_start:wave_end] += win
    # Small constant avoids 0 / 0 where no window covered a sample
    wave = wave / (total_windowing_sum + 1E-6)
    return wave
def harvest_get_downsampled_signal(x, fs, target_fs):
    """
    Decimate ``x`` towards ``target_fs`` and remove its mean.

    The signal is edge padded (first / last sample repeated) before
    decimation and the padding is trimmed again afterwards.

    Parameters
    ----------
    x : ndarray
        1D input signal

    fs : int or float
        Sampling rate of ``x``

    target_fs : int or float
        Desired sampling rate; the decimation factor is
        ``round(fs / target_fs)``

    Returns
    -------
    y : ndarray
        Decimated, zero mean signal

    actual_fs : float
        ``fs`` divided by the integer decimation factor actually used

    Raises
    ------
    ValueError
        If ``fs < target_fs`` (upsampling is not handled)
    """
    # Validate first: a ratio that rounds to 0 would otherwise fail below
    # with an unrelated NaN conversion error.
    if fs < target_fs:
        raise ValueError("CASE NOT HANDLED IN harvest_get_downsampled_signal")

    decimation_ratio = np.round(fs / target_fs)
    # Padding length, rounded up to a whole number of decimated samples
    offset = np.ceil(140. / decimation_ratio) * decimation_ratio
    start_pad = x[0] * np.ones(int(offset), dtype=np.float32)
    end_pad = x[-1] * np.ones(int(offset), dtype=np.float32)
    x = np.concatenate((start_pad, x, end_pad), axis=0)

    try:
        y0 = sg.decimate(x, int(decimation_ratio), 3, zero_phase=True)
    except TypeError:
        # scipy versions whose decimate has no zero_phase keyword
        y0 = sg.decimate(x, int(decimation_ratio), 3)
    actual_fs = fs / decimation_ratio
    trim = int(offset / decimation_ratio)
    y = y0[trim:-trim]
    y = y - np.mean(y)
    return y, actual_fs
2005 | return window.ravel() 2006 | 2007 | 2008 | def harvest_get_f0_candidate_from_raw_event(boundary_f0, 2009 | fs, y_spectrum, y_length, temporal_positions, f0_floor, 2010 | f0_ceil): 2011 | filter_length_half = int(np.round(fs / boundary_f0 * 2)) 2012 | band_pass_filter_base = harvest_nuttall(filter_length_half * 2 + 1) 2013 | shifter = np.cos(2 * np.pi * boundary_f0 * np.arange(-filter_length_half, filter_length_half + 1) / float(fs)) 2014 | band_pass_filter = band_pass_filter_base * shifter 2015 | 2016 | index_bias = filter_length_half 2017 | # possible numerical issues if 32 bit 2018 | spectrum_low_pass_filter = np.fft.fft(band_pass_filter.astype("float64"), len(y_spectrum)) 2019 | filtered_signal = np.real(np.fft.ifft(spectrum_low_pass_filter * y_spectrum)) 2020 | index_bias = filter_length_half + 1 2021 | filtered_signal = filtered_signal[index_bias + np.arange(y_length).astype("int32")] 2022 | negative_zero_cross = harvest_zero_crossing_engine(filtered_signal, fs) 2023 | positive_zero_cross = harvest_zero_crossing_engine(-filtered_signal, fs) 2024 | d_filtered_signal = filtered_signal[1:] - filtered_signal[:-1] 2025 | peak = harvest_zero_crossing_engine(d_filtered_signal, fs) 2026 | dip = harvest_zero_crossing_engine(-d_filtered_signal, fs) 2027 | f0_candidate = harvest_get_f0_candidate_contour(negative_zero_cross, 2028 | positive_zero_cross, peak, dip, temporal_positions) 2029 | f0_candidate[f0_candidate > (boundary_f0 * 1.1)] = 0. 2030 | f0_candidate[f0_candidate < (boundary_f0 * .9)] = 0. 2031 | f0_candidate[f0_candidate > f0_ceil] = 0. 2032 | f0_candidate[f0_candidate < f0_floor] = 0. 
2033 | return f0_candidate 2034 | 2035 | 2036 | def harvest_get_f0_candidate_contour(negative_zero_cross_tup, 2037 | positive_zero_cross_tup, peak_tup, dip_tup, temporal_positions): 2038 | # 0 is inteval locations 2039 | # 1 is interval based f0 2040 | usable_channel = max(0, len(negative_zero_cross_tup[0]) - 2) 2041 | usable_channel *= max(0, len(positive_zero_cross_tup[0]) - 2) 2042 | usable_channel *= max(0, len(peak_tup[0]) - 2) 2043 | usable_channel *= max(0, len(dip_tup[0]) - 2) 2044 | if usable_channel > 0: 2045 | interpolated_f0_list = np.zeros((4, len(temporal_positions))) 2046 | nz = interp1d(negative_zero_cross_tup[0], negative_zero_cross_tup[1], 2047 | kind="linear", bounds_error=False, fill_value="extrapolate") 2048 | pz = interp1d(positive_zero_cross_tup[0], positive_zero_cross_tup[1], 2049 | kind="linear", bounds_error=False, fill_value="extrapolate") 2050 | pkz = interp1d(peak_tup[0], peak_tup[1], 2051 | kind="linear", bounds_error=False, fill_value="extrapolate") 2052 | dz = interp1d(dip_tup[0], dip_tup[1], 2053 | kind="linear", bounds_error=False, fill_value="extrapolate") 2054 | interpolated_f0_list[0, :] = nz(temporal_positions) 2055 | interpolated_f0_list[1, :] = pz(temporal_positions) 2056 | interpolated_f0_list[2, :] = pkz(temporal_positions) 2057 | interpolated_f0_list[3, :] = dz(temporal_positions) 2058 | f0_candidate = np.mean(interpolated_f0_list, axis=0) 2059 | else: 2060 | f0_candidate = temporal_positions * 0 2061 | return f0_candidate 2062 | 2063 | 2064 | def harvest_zero_crossing_engine(x, fs, debug=False): 2065 | # negative zero crossing, going from positive to negative 2066 | x_shift = x.copy() 2067 | x_shift[:-1] = x_shift[1:] 2068 | x_shift[-1] = x[-1] 2069 | # +1 here to avoid edge case at 0 2070 | points = np.arange(len(x)) + 1 2071 | negative_going_points = points * ((x_shift * x < 0) * (x_shift < x)) 2072 | edge_list = negative_going_points[negative_going_points > 0] 2073 | # -1 to correct index 2074 | fine_edge_list = 
edge_list - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1]).astype("float32") 2075 | interval_locations = (fine_edge_list[:-1] + fine_edge_list[1:]) / float(2) / fs 2076 | interval_based_f0 = float(fs) / (fine_edge_list[1:] - fine_edge_list[:-1]) 2077 | return interval_locations, interval_based_f0 2078 | 2079 | 2080 | def harvest_detect_official_f0_candidates(raw_f0_candidates): 2081 | number_of_channels, number_of_frames = raw_f0_candidates.shape 2082 | f0_candidates = np.zeros((int(np.round(number_of_channels / 10.)), number_of_frames)) 2083 | number_of_candidates = 0 2084 | threshold = 10 2085 | for i in range(number_of_frames): 2086 | tmp = raw_f0_candidates[:, i].copy() 2087 | tmp[tmp > 0] = 1. 2088 | tmp[0] = 0 2089 | tmp[-1] = 0 2090 | tmp = tmp[1:] - tmp[:-1] 2091 | st = np.where(tmp == 1)[0] 2092 | ed = np.where(tmp == -1)[0] 2093 | count = 0 2094 | for j in range(len(st)): 2095 | dif = ed[j] - st[j] 2096 | if dif >= threshold: 2097 | tmp_f0 = raw_f0_candidates[st[j] + 1: ed[j] + 1, i] 2098 | f0_candidates[count, i] = np.mean(tmp_f0) 2099 | count = count + 1 2100 | number_of_candidates = max(number_of_candidates, count) 2101 | return f0_candidates, number_of_candidates 2102 | 2103 | 2104 | def harvest_overlap_f0_candidates(f0_candidates, max_number_of_f0_candidates): 2105 | n = 3 # this is the optimized parameter... apparently 2106 | number_of_candidates = n * 2 + 1 2107 | new_f0_candidates = f0_candidates[number_of_candidates, :].copy() 2108 | new_f0_candidates = new_f0_candidates[None] 2109 | # hack to bypass magic matlab-isms of allocating when indexing OOB 2110 | new_f0_candidates = np.vstack([new_f0_candidates] + (new_f0_candidates.shape[-1] - 1) * [np.zeros_like(new_f0_candidates)]) 2111 | # this indexing is megagross, possible source for bugs! 
def harvest_get_refined_f0(x, fs, current_time, current_f0, f0_floor,
                           f0_ceil):
    """
    Refine one f0 candidate using instantaneous frequency at its harmonics.

    A Blackman-windowed segment of ``x`` centered near ``current_time`` is
    transformed, the instantaneous frequency is estimated per FFT bin from
    the spectra of the window and of its derivative, and the estimates at
    up to six harmonics of ``current_f0`` are combined with amplitude
    weighting.

    Parameters
    ----------
    x : ndarray
        1D signal

    fs : int or float
        Sampling rate of ``x``

    current_time : float
        Analysis position in seconds

    current_f0 : float
        Non-zero f0 candidate in Hz to refine

    f0_floor : float
        Lowest accepted refined f0

    f0_ceil : float
        Highest accepted refined f0

    Returns
    -------
    refined_f0 : float
        Refined f0, or 0. when the candidate is rejected

    refined_score : float
        Inverse of the mean relative deviation of the per-harmonic
        estimates from ``current_f0``, or 0. when the candidate is rejected
        (refined f0 outside ``[f0_floor, f0_ceil]`` or score below 2.5)
    """
    half_window_length = np.ceil(3. * fs / current_f0 / 2.)
    window_length_in_time = (2. * half_window_length + 1) / float(fs)
    base_time = np.arange(-half_window_length, half_window_length + 1) / float(fs)
    fft_size = int(2 ** np.ceil(np.log2((half_window_length * 2 + 1)) + 1))
    frequency_axis = np.arange(fft_size) / fft_size * float(fs)

    # base_index is 1-based (MATLAB heritage); converted to 0-based below
    base_index = np.round((current_time + base_time) * fs + 0.001)
    index_time = (base_index - 1) / float(fs)
    window_time = index_time - current_time
    part1 = np.cos(2 * np.pi * window_time / window_length_in_time)
    part2 = np.cos(4 * np.pi * window_time / window_length_in_time)
    main_window = 0.42 + 0.5 * part1 + 0.08 * part2
    # Negated central difference of the zero-extended window
    ext = np.zeros((len(main_window) + 2))
    ext[1:-1] = main_window
    diff_window = -((ext[1:-1] - ext[:-2]) + (ext[2:] - ext[1:-1])) / float(2)
    # Clamp to the valid sample range so edge frames repeat the end samples
    safe_index = np.maximum(1, np.minimum(len(x), base_index)).astype("int32") - 1
    segment = x[safe_index]
    spectrum = np.fft.fft(segment * main_window, fft_size)
    diff_spectrum = np.fft.fft(segment * diff_window, fft_size)
    numerator_i = (np.real(spectrum) * np.imag(diff_spectrum)
                   - np.imag(spectrum) * np.real(diff_spectrum))
    power_spectrum = np.abs(spectrum) ** 2
    instantaneous_frequency = (
        frequency_axis + numerator_i / power_spectrum * float(fs) / 2. / np.pi)

    number_of_harmonics = int(min(np.floor(float(fs) / 2. / current_f0), 6.))
    harmonics_index = np.arange(number_of_harmonics) + 1
    index_list = np.round(
        current_f0 * fft_size / fs * harmonics_index).astype("int32")
    instantaneous_frequency_list = instantaneous_frequency[index_list]
    amplitude_list = np.sqrt(power_spectrum[index_list])
    refined_f0 = np.sum(amplitude_list * instantaneous_frequency_list)
    refined_f0 /= np.sum(amplitude_list * harmonics_index.astype("float32"))

    variation = np.abs(
        ((instantaneous_frequency_list / harmonics_index.astype("float32"))
         - current_f0) / float(current_f0))
    refined_score = 1. / (0.000000000001 + np.mean(variation))

    if (refined_f0 < f0_floor) or (refined_f0 > f0_ceil) or (refined_score < 2.5):
        # Rejected candidates must carry no score either
        refined_f0 = 0.
        refined_score = 0.
    return refined_f0, refined_score
2231 | 2232 | for i in range(2, len(f0_base)): 2233 | if f0_base[i] == 0: 2234 | continue 2235 | reference_f0 = f0_base[i - 1] * 2 - f0_base[i - 2] 2236 | c1 = np.abs((f0_base[i] - reference_f0) / reference_f0) > allowed_range 2237 | c2 = np.abs((f0_base[i] - f0_base[i - 1]) / f0_base[i - 1]) > allowed_range 2238 | if c1 and c2: 2239 | f0_step1[i] = 0. 2240 | return f0_step1 2241 | 2242 | 2243 | def harvest_fix_step_2(f0_step1, voice_range_minimum): 2244 | f0_step2 = f0_step1.copy() 2245 | boundary_list = harvest_get_boundary_list(f0_step1) 2246 | 2247 | for i in range(1, int(len(boundary_list) / 2.) + 1): 2248 | distance = boundary_list[(2 * i) - 1] - boundary_list[(2 * i) - 2] 2249 | if distance < voice_range_minimum: 2250 | # need one more due to range not including last index 2251 | lb = boundary_list[(2 * i) - 2] 2252 | ub = boundary_list[(2 * i) - 1] + 1 2253 | f0_step2[lb:ub] = 0. 2254 | return f0_step2 2255 | 2256 | 2257 | def harvest_fix_step_3(f0_step2, f0_candidates, allowed_range, f0_scores): 2258 | f0_step3 = f0_step2.copy() 2259 | boundary_list = harvest_get_boundary_list(f0_step2) 2260 | multichannel_f0 = harvest_get_multichannel_f0(f0_step2, boundary_list) 2261 | rrange = np.zeros((int(len(boundary_list) / 2), 2)) 2262 | threshold1 = 100 2263 | threshold2 = 2200 2264 | count = 0 2265 | for i in range(1, int(len(boundary_list) / 2) + 1): 2266 | # changed to 2 * i - 2 2267 | extended_f0, tmp_range_1 = harvest_extend_f0(multichannel_f0[i - 1, :], 2268 | boundary_list[(2 * i) - 1], 2269 | min([len(f0_step2) - 1, boundary_list[(2 * i) - 1] + threshold1]), 2270 | 1, f0_candidates, allowed_range) 2271 | tmp_f0_sequence, tmp_range_0 = harvest_extend_f0(extended_f0, 2272 | boundary_list[(2 * i) - 2], 2273 | max([2, boundary_list[(2 * i) - 2] - threshold1]), -1, 2274 | f0_candidates, allowed_range) 2275 | 2276 | mean_f0 = np.mean(tmp_f0_sequence[tmp_range_0 : tmp_range_1 + 1]) 2277 | if threshold2 / mean_f0 < (tmp_range_1 - tmp_range_0): 2278 | 
multichannel_f0[count, :] = tmp_f0_sequence 2279 | rrange[count, :] = np.array([tmp_range_0, tmp_range_1]) 2280 | count = count + 1 2281 | if count > 0: 2282 | multichannel_f0 = multichannel_f0[:count, :] 2283 | rrange = rrange[:count, :] 2284 | f0_step3 = harvest_merge_f0(multichannel_f0, rrange, f0_candidates, 2285 | f0_scores) 2286 | return f0_step3 2287 | 2288 | 2289 | def harvest_merge_f0(multichannel_f0, rrange, f0_candidates, f0_scores): 2290 | number_of_channels = len(multichannel_f0) 2291 | sorted_order = np.argsort(rrange[:, 0]) 2292 | f0 = multichannel_f0[sorted_order[0], :] 2293 | for i in range(1, number_of_channels): 2294 | if rrange[sorted_order[i], 0] - rrange[sorted_order[0], 1] > 0: 2295 | # no overlapping 2296 | f0[int(rrange[sorted_order[i], 0]):int(rrange[sorted_order[i], 1])] = multichannel_f0[sorted_order[i], int(rrange[sorted_order[i], 0]):int(rrange[sorted_order[i], 1])] 2297 | cp = rrange.copy() 2298 | rrange[sorted_order[0], 0] = cp[sorted_order[i], 0] 2299 | rrange[sorted_order[0], 1] = cp[sorted_order[i], 1] 2300 | else: 2301 | cp = rrange.copy() 2302 | res = harvest_merge_f0_sub(f0, cp[sorted_order[0], 0], 2303 | cp[sorted_order[0], 1], 2304 | multichannel_f0[sorted_order[i], :], 2305 | cp[sorted_order[i], 0], 2306 | cp[sorted_order[i], 1], f0_candidates, f0_scores) 2307 | f0 = res[0] 2308 | rrange[sorted_order[0], 1] = res[1] 2309 | return f0 2310 | 2311 | 2312 | def harvest_merge_f0_sub(f0_1, st1, ed1, f0_2, st2, ed2, f0_candidates, 2313 | f0_scores): 2314 | merged_f0 = f0_1 2315 | if (st1 <= st2) and (ed1 >= ed2): 2316 | new_ed = ed1 2317 | return merged_f0, new_ed 2318 | new_ed = ed2 2319 | 2320 | score1 = 0. 2321 | score2 = 0. 
2322 | for i in range(int(st2), int(ed1) + 1): 2323 | score1 = score1 + harvest_serach_score(f0_1[i], f0_candidates[:, i], f0_scores[:, i]) 2324 | score2 = score2 + harvest_serach_score(f0_2[i], f0_candidates[:, i], f0_scores[:, i]) 2325 | if score1 > score2: 2326 | merged_f0[int(ed1):int(ed2) + 1] = f0_2[int(ed1):int(ed2) + 1] 2327 | else: 2328 | merged_f0[int(st2):int(ed2) + 1] = f0_2[int(st2):int(ed2) + 1] 2329 | return merged_f0, new_ed 2330 | 2331 | 2332 | def harvest_serach_score(f0, f0_candidates, f0_scores): 2333 | score = 0 2334 | for i in range(len(f0_candidates)): 2335 | if (f0 == f0_candidates[i]) and (score < f0_scores[i]): 2336 | score = f0_scores[i] 2337 | return score 2338 | 2339 | 2340 | def harvest_extend_f0(f0, origin, last_point, shift, f0_candidates, 2341 | allowed_range): 2342 | threshold = 4 2343 | extended_f0 = f0.copy() 2344 | tmp_f0 = extended_f0[origin] 2345 | shifted_origin = origin 2346 | count = 0 2347 | 2348 | for i in np.arange(origin, last_point + shift, shift): 2349 | # off by 1 issues 2350 | if (i + shift) >= f0_candidates.shape[1]: 2351 | continue 2352 | bf0, bs = harvest_select_best_f0(tmp_f0, 2353 | f0_candidates[:, i + shift], allowed_range) 2354 | extended_f0[i + shift] = bf0 2355 | if extended_f0[i + shift] != 0: 2356 | tmp_f0 = extended_f0[i + shift] 2357 | count = 0 2358 | shifted_origin = i + shift 2359 | else: 2360 | count = count + 1 2361 | if count == threshold: 2362 | break 2363 | return extended_f0, shifted_origin 2364 | 2365 | 2366 | def harvest_get_multichannel_f0(f0, boundary_list): 2367 | multichannel_f0 = np.zeros((int(len(boundary_list) / 2), len(f0))) 2368 | for i in range(1, int(len(boundary_list) / 2) + 1): 2369 | sl = boundary_list[(2 * i) - 2] 2370 | el = boundary_list[(2 * i) - 1] + 1 2371 | multichannel_f0[i - 1, sl:el] = f0[sl:el] 2372 | return multichannel_f0 2373 | 2374 | 2375 | def harvest_get_boundary_list(f0): 2376 | vuv = f0.copy() 2377 | vuv[vuv != 0] = 1. 
2378 | vuv[0] = 0 2379 | vuv[-1] = 0 2380 | diff_vuv = vuv[1:] - vuv[:-1] 2381 | boundary_list = np.where(diff_vuv != 0)[0] 2382 | boundary_list[::2] = boundary_list[::2] + 1 2383 | return boundary_list 2384 | 2385 | 2386 | def harvest_fix_step_4(f0_step3, threshold): 2387 | f0_step4 = f0_step3.copy() 2388 | boundary_list = harvest_get_boundary_list(f0_step3) 2389 | 2390 | for i in range(1, int(len(boundary_list) / 2.)): 2391 | distance = boundary_list[(2 * i)] - boundary_list[(2 * i) - 1] - 1 2392 | if distance >= threshold: 2393 | continue 2394 | boundary0 = f0_step3[boundary_list[(2 * i) - 1]] + 1 2395 | boundary1 = f0_step3[boundary_list[(2 * i)]] - 1 2396 | coefficient = (boundary1 - boundary0) / float((distance + 1)) 2397 | count = 1 2398 | st = boundary_list[(2 * i) - 1] + 1 2399 | ed = boundary_list[(2 * i)] 2400 | for j in range(st, ed): 2401 | f0_step4[j] = boundary0 + coefficient * count 2402 | count = count + 1 2403 | return f0_step4 2404 | 2405 | 2406 | def harvest_fix_f0_contour(f0_candidates, f0_scores): 2407 | f0_base = harvest_search_f0_base(f0_candidates, f0_scores) 2408 | f0_step1 = harvest_fix_step_1(f0_base, 0.008) # optimized? 2409 | f0_step2 = harvest_fix_step_2(f0_step1, 6) # optimized? 2410 | f0_step3 = harvest_fix_step_3(f0_step2, f0_candidates, 0.18, f0_scores) # optimized? 2411 | f0 = harvest_fix_step_4(f0_step3, 9) # optimized 2412 | vuv = f0.copy() 2413 | vuv[vuv != 0] = 1. 2414 | return f0, vuv 2415 | 2416 | 2417 | def harvest_filter_f0_contour(f0, st, ed, b, a): 2418 | smoothed_f0 = f0.copy() 2419 | smoothed_f0[:st] = smoothed_f0[st] 2420 | smoothed_f0[ed + 1:] = smoothed_f0[ed] 2421 | aaa = sg.lfilter(b, a, smoothed_f0) 2422 | bbb = sg.lfilter(b, a, aaa[::-1]) 2423 | smoothed_f0 = bbb[::-1].copy() 2424 | smoothed_f0[:st] = 0. 2425 | smoothed_f0[ed + 1:] = 0. 
def harvest_smooth_f0_contour(f0):
    """Smooth every non-zero run of ``f0`` with a fixed second-order filter.

    The contour is zero-padded by 300 samples on each side, each run is
    filtered on its own, and the padding is stripped before returning.
    """
    pad = 300
    b = np.array([0.0078202080334971724, 0.015640416066994345, 0.0078202080334971724])
    a = np.array([1.0, -1.7347257688092754, 0.76600660094326412])

    padded = np.concatenate([np.zeros(pad,), f0, np.zeros(pad,)])
    bounds = harvest_get_boundary_list(padded)
    channels = harvest_get_multichannel_f0(padded, bounds)
    for seg in range(int(len(bounds) / 2)):
        first = bounds[2 * seg]
        last = bounds[2 * seg + 1]
        filtered = harvest_filter_f0_contour(channels[seg, :], first, last, b, a)
        padded[first:last + 1] = filtered[first:last + 1]
    return padded[pad:-pad]


def _world_get_temporal_positions(x_len, fs):
    """Return time grids (seconds) with 1 ms and 5 ms spacing.

    Both grids start at 0 and stop before ``x_len / fs``.
    """
    duration = x_len / float(fs)
    basic_temporal_positions = np.arange(0, duration, 1 / float(1000))
    temporal_positions = np.arange(0, duration, 5 / float(1000))
    return basic_temporal_positions, temporal_positions


def harvest(x, fs):
    """Estimate an F0 contour for waveform ``x`` sampled at ``fs``.

    Returns ``(temporal_positions, f0, vuv, f0_candidates)``; ``f0`` and
    ``vuv`` are sampled on the 5 ms grid from the 1 ms analysis grid.
    """
    f0_floor = 71
    f0_ceil = 800
    target_fs = 8000
    channels_in_octave = 40.
    basic_positions, temporal_positions = _world_get_temporal_positions(len(x), fs)

    # geometric grid of band edges derived from the widened floor/ceil
    low_edge = f0_floor * 0.9
    high_edge = f0_ceil * 1.1
    n_bands = np.ceil(np.log2(high_edge / low_edge) * channels_in_octave)
    boundary_f0_list = np.arange(1, n_bands + 1) / float(channels_in_octave)
    boundary_f0_list = low_edge * 2.0 ** boundary_f0_list

    y, actual_fs = harvest_get_downsampled_signal(x, fs, target_fs)
    fft_size = 2. ** np.ceil(np.log2(len(y) + np.round(fs / f0_floor * 4) + 1))
    y_spectrum = np.fft.fft(y, int(fft_size))

    raw_candidates = harvest_get_raw_f0_candidates(
        len(basic_positions), boundary_f0_list, len(y), basic_positions,
        actual_fs, y_spectrum, f0_floor, f0_ceil)
    f0_candidates, n_candidates = harvest_detect_official_f0_candidates(raw_candidates)
    f0_candidates = harvest_overlap_f0_candidates(f0_candidates, n_candidates)
    f0_candidates, f0_scores = harvest_refine_candidates(
        y, actual_fs, basic_positions, f0_candidates, f0_floor, f0_ceil)
    f0_candidates, f0_scores = harvest_remove_unreliable_candidates(
        f0_candidates, f0_scores)

    connected_f0, vuv = harvest_fix_f0_contour(f0_candidates, f0_scores)
    smoothed_f0 = harvest_smooth_f0_contour(connected_f0)

    # pick the 1 ms frame nearest to each 5 ms position, clamped to the end
    frame = np.minimum(len(smoothed_f0) - 1,
                       np.round(temporal_positions * 1000)).astype("int32")
    return temporal_positions, smoothed_f0[frame], vuv[frame], f0_candidates
def cheaptrick_get_windowed_waveform(x, fs, current_f0, current_position):
    """Cut a window of about three periods of ``current_f0`` out of ``x``.

    The segment is centred on ``current_position`` (seconds), indices are
    clamped to the signal, a unit-energy raised-cosine window is applied, and
    the window-weighted mean is removed.
    """
    half = np.round(1.5 * fs / float(current_f0))
    offsets = np.arange(-half, half + 1)
    # one-based positions, as in the reference implementation
    positions = np.round(current_position * fs + 0.001) + offsets + 1
    clamped = np.minimum(len(x), np.maximum(1, np.round(positions)))
    segment = x[clamped.astype("int32") - 1]

    time_axis = offsets / float(fs) / 1.5
    window = 0.5 * np.cos(np.pi * time_axis * float(current_f0)) + 0.5
    window = window / np.sqrt(np.sum(window ** 2))

    weighted = segment * window
    return weighted - window * np.mean(weighted) / np.mean(window)


def cheaptrick_get_power_spectrum(waveform, fs, fft_size, f0):
    """Return the power spectrum of ``waveform`` with the region below ``f0`` reinforced.

    Bins below ``f0`` get the spectrum mirrored around ``f0 / 2`` added to
    them, and the upper half is rebuilt as the mirror image of the lower half.
    """
    spectrum = np.abs(np.fft.fft(waveform, fft_size)) ** 2
    freqs = np.arange(fft_size) / float(fft_size) * float(fs)

    near_dc = freqs < (f0 + fs / fft_size)
    low_freqs = freqs[near_dc]
    mirror = interp1d(f0 - low_freqs, spectrum[near_dc], kind="linear",
                      fill_value="extrapolate")
    replica = mirror(low_freqs)

    below_f0 = freqs < f0
    spectrum[below_f0] = replica[below_f0[:len(replica)]] + spectrum[below_f0]

    half = int(fft_size / 2)
    spectrum[half + 1:] = spectrum[1:half][::-1]
    return spectrum
def cheaptrick_linear_smoothing(power_spectrum, f0, fs, fft_size):
    """Average ``power_spectrum`` over a sliding band of width ``2 * f0 / 3``.

    The spectrum is laid out twice on an axis running from ``-fs`` to ``fs``,
    integrated, and the integral is differenced at ``center +/- f0 / 3``.

    Returns the smoothed values for bins ``0 .. fft_size / 2``.
    """
    bin_width = fs / float(fft_size)
    shifted_axis = np.arange(2 * fft_size) / float(fft_size) * fs - fs + bin_width / 2.
    integral = np.cumsum(np.concatenate([power_spectrum, power_spectrum]) * bin_width)
    center = np.arange(int(fft_size / 2) + 1) / float(fft_size) * fs

    low_levels = cheaptrick_interp1h(shifted_axis, integral, center - f0 / 3.)
    high_levels = cheaptrick_interp1h(shifted_axis, integral, center + f0 / 3.)
    return (high_levels - low_levels) * 1.5 / f0


def cheaptrick_interp1h(x, y, xi):
    """Linearly interpolate ``y`` (sampled on the uniform grid ``x``) at ``xi``.

    Query points outside ``[x[0], x[-1]]`` are clamped to the nearest end.
    """
    delta_x = float(x[1] - x[0])
    xi = np.maximum(x[0], np.minimum(x[-1], xi))
    position = (xi - x[0]) / delta_x
    xi_base = np.floor(position).astype("int32")
    xi_fraction = position - xi_base
    # forward differences; the last one stays 0 so x[-1] maps to y[-1]
    delta_y = np.zeros_like(y)
    delta_y[:-1] = y[1:] - y[:-1]
    return y[xi_base] + delta_y[xi_base] * xi_fraction


def cheaptrick_smoothing_with_recovery(smoothed_spectrum, f0, fs, fft_size, q1):
    """Lifter the log spectrum with a smoothing and a compensation lifter.

    Parameters
    ----------
    smoothed_spectrum : full-length (``fft_size``) positive spectrum.
    f0, fs : fundamental and sampling frequency in the same unit.
    fft_size : int, length of ``smoothed_spectrum``.
    q1 : weight of the cosine term in the compensation lifter.

    Returns the first ``fft_size / 2 + 1`` bins of the liftered spectrum.
    """
    half = int(fft_size / 2)
    quefrency_axis = np.arange(fft_size) / float(fs)

    # np.sinc(x) == sin(pi x) / (pi x) and is 1 at x == 0, so the 0/0 that
    # the explicit quotient produced at quefrency 0 never occurs.
    smoothing_lifter = np.sinc(f0 * quefrency_axis)
    smoothing_lifter[half + 1:] = smoothing_lifter[1:half][::-1].copy()

    compensation_lifter = (1 - 2. * q1) + 2. * q1 * np.cos(2 * np.pi * quefrency_axis * f0)
    compensation_lifter[half + 1:] = compensation_lifter[1:half][::-1].copy()

    tandem_cepstrum = np.fft.fft(np.log(smoothed_spectrum))
    liftered = np.fft.ifft(tandem_cepstrum * smoothing_lifter * compensation_lifter)
    return np.exp(np.real(liftered))[:half + 1]


def cheaptrick_estimate_one_slice(x, fs, current_f0,
        current_position, fft_size, q1):
    """Estimate the spectral envelope of ``x`` at one time position.

    Chains windowing, power-spectrum computation, linear smoothing and
    liftering; returns ``fft_size / 2 + 1`` envelope values.
    """
    waveform = cheaptrick_get_windowed_waveform(x, fs, current_f0,
                                                current_position)
    power_spectrum = cheaptrick_get_power_spectrum(waveform, fs, fft_size,
                                                   current_f0)
    smoothed = cheaptrick_linear_smoothing(power_spectrum, current_f0,
                                           fs, fft_size)
    # rebuild the full symmetric spectrum expected by the lifter stage
    full_spectrum = np.concatenate([smoothed, smoothed[1:-1][::-1]])
    return cheaptrick_smoothing_with_recovery(full_spectrum, current_f0, fs,
                                              fft_size, q1)
def cheaptrick(x, fs, temporal_positions, f0_sequence,
        vuv, fftlen="auto", q1=-0.15):
    """Estimate a spectral envelope for every frame.

    Frames flagged unvoiced (``vuv == 0``) or whose F0 is below the limit
    implied by the FFT size are analysed with a default F0 of 500.
    Returns ``(temporal_positions, spectrogram, fs)`` with one row per frame.
    """
    default_f0 = 500
    f0 = f0_sequence.copy()
    if fftlen == "auto":
        fftlen = int(2 ** np.ceil(np.log2(3. * float(fs) / 71 + 1)))
    fft_size = fftlen
    lowest_f0 = fs * 3.0 / (fft_size - 3.0)

    f0[vuv == 0] = default_f0
    f0[f0 < lowest_f0] = default_f0

    n_frames = len(f0)
    spectrogram = np.zeros((int(fft_size / 2.) + 1, n_frames))
    for frame in range(n_frames):
        spectrogram[:, frame] = cheaptrick_estimate_one_slice(
            x, fs, f0[frame], temporal_positions[frame], fft_size, q1)
    return temporal_positions, spectrogram.T, fs


def d4c_love_train(x, fs, current_f0, current_position, threshold):
    """Return 1 when the frame looks voiced, else 0.

    A frame with ``current_f0 == 0`` is 0 immediately. Otherwise the share of
    spectral power between roughly 100 Hz and 4000 Hz, relative to the power
    up to roughly 7900 Hz, must exceed ``threshold``.
    """
    if current_f0 == 0:
        return 0

    lowest_f0 = 40
    current_f0 = max([current_f0, lowest_f0])
    fft_size = int(2 ** np.ceil(np.log2(3. * fs / lowest_f0 + 1)))
    bin_width = float(fs) / fft_size
    boundary0 = int(np.ceil(100 / bin_width))
    boundary1 = int(np.ceil(4000 / bin_width))
    boundary2 = int(np.ceil(7900 / bin_width))

    waveform = d4c_get_windowed_waveform(x, fs, current_f0, current_position,
                                         1.5, 2)
    power_spectrum = np.abs(np.fft.fft(waveform, int(fft_size)) ** 2)
    power_spectrum[0:boundary0 + 1] = 0.
    cumulative = np.cumsum(power_spectrum)

    ratio = cumulative[boundary1] / cumulative[boundary2]
    return 1 if ratio > threshold else 0
def d4c_get_windowed_waveform(x, fs, current_f0, current_position, half_length,
        window_type):
    """Cut ``2 * half_length`` periods of ``current_f0`` out of ``x`` and window them.

    ``window_type`` 1 selects a raised-cosine window, 2 a three-term cosine
    window; anything else raises ``ValueError``. The window-weighted mean is
    removed from the result.
    """
    half = int(np.round(half_length * fs / current_f0))
    offsets = np.arange(-half, half + 1)
    # one-based positions clamped to the signal, then shifted to zero-based
    positions = np.round(current_position * fs + 0.001) + offsets + 1
    clamped = np.minimum(len(x), np.maximum(1, np.round(positions)))
    segment = x[clamped.astype("int32") - 1]

    time_axis = offsets / float(fs) / float(half_length)
    phase = np.pi * time_axis * current_f0
    if window_type == 1:
        window = 0.5 * np.cos(phase) + 0.5
    elif window_type == 2:
        window = 0.08 * np.cos(phase * 2)
        window += 0.5 * np.cos(phase) + 0.42
    else:
        raise ValueError("Unknown window type")

    weighted = segment * window
    return weighted - window * np.mean(weighted) / np.mean(window)


def d4c_get_static_centroid(x, fs, current_f0, current_position, fft_size):
    """Sum the centroids of two windows placed a quarter period either side
    of ``current_position`` and apply the DC correction."""
    quarter_period = 1. / current_f0 / 4.
    later = d4c_get_windowed_waveform(x, fs, current_f0,
                                      current_position + quarter_period, 2, 2)
    earlier = d4c_get_windowed_waveform(x, fs, current_f0,
                                        current_position - quarter_period, 2, 2)
    total = d4c_get_centroid(later, fft_size) + d4c_get_centroid(earlier, fft_size)
    return d4c_dc_correction(total, fs, fft_size, current_f0)
def d4c_get_centroid(x, fft_size):
    """Return the per-bin centroid term of the unit-energy version of ``x``.

    Combines the spectrum of ``x`` with the spectrum of ``x`` weighted by the
    one-based sample index.
    """
    n_fft = int(fft_size)
    sample_number = np.arange(1, len(x) + 1)
    unit = x / np.sqrt(np.sum(x ** 2))

    spectrum = np.fft.fft(unit, n_fft)
    weighted_spectrum = np.fft.fft(-unit * 1j * sample_number, n_fft)
    return -(weighted_spectrum.imag) * spectrum.real + spectrum.imag * weighted_spectrum.real


def d4c_dc_correction(signal, fs, fft_size, f0):
    """Add the spectrum mirrored around ``f0 / 2`` to the bins below ``f0``.

    ``signal`` is modified in place (and returned); its upper half is rebuilt
    as the mirror image of the lower half.
    """
    n_fft = int(fft_size)
    freqs = np.arange(n_fft) / n_fft * fs
    near_dc = freqs < f0 + fs / n_fft
    low_freqs = freqs[near_dc]
    replica = interp1d(f0 - low_freqs, signal[near_dc], kind="linear",
                       fill_value="extrapolate")(low_freqs)

    below_f0 = freqs < f0
    signal[below_f0] = replica[below_f0[:len(replica)]] + signal[below_f0]

    half = int(n_fft / 2.)
    signal[half + 1:] = signal[1:half][::-1]
    return signal


def d4c_linear_smoothing(group_delay, fs, fft_size, width):
    """Average ``group_delay`` over a sliding band of ``width`` (frequency units).

    Returns the smoothed values for bins ``0 .. fft_size / 2``.
    """
    bin_width = fs / float(fft_size)
    shifted_axis = np.arange(2 * fft_size) / float(fft_size) * fs - fs + bin_width / 2.
    integral = np.cumsum(np.concatenate([group_delay, group_delay]) * bin_width)
    center = np.arange(int(fft_size / 2) + 1) / float(fft_size) * fs

    low_levels = cheaptrick_interp1h(shifted_axis, integral, center - width / 2.)
    high_levels = cheaptrick_interp1h(shifted_axis, integral, center + width / 2.)
    return (high_levels - low_levels) / width
def d4c_get_smoothed_power_spectrum(waveform, fs, f0, fft_size):
    """Return the DC-corrected, ``f0``-wide smoothed power spectrum of
    ``waveform``, mirrored back to full FFT length."""
    power = np.abs(np.fft.fft(waveform, int(fft_size))) ** 2
    corrected = d4c_dc_correction(power, fs, fft_size, f0)
    half = d4c_linear_smoothing(corrected, fs, fft_size, f0)
    return np.concatenate([half, half[1:-1][::-1]])


def d4c_get_static_group_delay(static_centroid, smoothed_power_spectrum, fs, f0,
        fft_size):
    """Return the group delay with its ``f0``-wide trend removed.

    The centroid/power ratio is smoothed over ``f0 / 2``; the same curve
    smoothed over ``f0`` is then subtracted. The result has full FFT length.
    """
    n_half = int(fft_size / 2) + 1
    ratio = static_centroid / smoothed_power_spectrum

    fine = d4c_linear_smoothing(ratio, fs, fft_size, f0 / 2.)
    fine_full = np.concatenate([fine, fine[1:-1][::-1]])
    trend = d4c_linear_smoothing(fine_full, fs, fft_size, f0)

    residual = fine_full[:n_half] - trend
    return np.concatenate([residual, residual[1:-1][::-1]])


def d4c_get_coarse_aperiodicity(group_delay, fs, fft_size,
        frequency_interval, number_of_aperiodicities, window1):
    """Return one aperiodicity value (dB, column vector) per frequency band.

    Band ``k`` is centred on ``k * frequency_interval``. The windowed group
    delay is transformed, its power bins are sorted, and the share held by
    the smallest bins is reported as ``-10 * log10(share)``.
    """
    n_fft = int(fft_size)
    n_half = int(fft_size / 2) + 1
    boundary = np.round(fft_size / len(window1) * 8)
    half_window = np.floor(len(window1) / 2)
    cut = int(fft_size / 2 - boundary) - 1
    bin_width = fs / float(fft_size)

    result = np.zeros((number_of_aperiodicities, 1))
    for band in range(number_of_aperiodicities):
        center = np.floor(frequency_interval * (band + 1) / bin_width)
        lo = int(center - half_window)
        hi = int(center + half_window + 1)
        power = np.abs(np.fft.fft(group_delay[lo:hi] * window1, n_fft)) ** 2
        ranked = np.cumsum(np.sort(power[:n_half]))
        result[band] = -10 * np.log10(ranked[cut] / ranked[-1])
    return result
def d4c_estimate_one_slice(x, fs, current_f0, frequency_interval,
        current_position, fft_size, number_of_aperiodicities, window1):
    """Estimate the coarse (per-band) aperiodicity of one frame.

    Returns a ``(number_of_aperiodicities, 1)`` array; all zeros when
    ``current_f0`` is 0.
    """
    if current_f0 == 0:
        return np.zeros((number_of_aperiodicities, 1))

    static_centroid = d4c_get_static_centroid(x, fs, current_f0,
                                              current_position, fft_size)
    waveform = d4c_get_windowed_waveform(x, fs, current_f0, current_position,
                                         2, 1)
    smoothed_power_spectrum = d4c_get_smoothed_power_spectrum(
        waveform, fs, current_f0, fft_size)
    static_group_delay = d4c_get_static_group_delay(
        static_centroid, smoothed_power_spectrum, fs, current_f0, fft_size)
    return d4c_get_coarse_aperiodicity(
        static_group_delay, fs, fft_size, frequency_interval,
        number_of_aperiodicities, window1)


def d4c(x, fs, temporal_positions_h, f0_h, vuv_h, threshold="default",
        fft_size="auto"):
    """Estimate band aperiodicity for every frame.

    Parameters
    ----------
    x, fs : waveform and its sampling frequency.
    temporal_positions_h, f0_h, vuv_h : per-frame times, F0 and voiced flags
        (returned unchanged).
    threshold : voicing threshold passed to ``d4c_love_train``; the string
        ``"default"`` selects 0.85.
    fft_size : only ``"auto"`` is supported.

    Returns
    -------
    ``(temporal_positions_h, f0_h, vuv_h, aperiodicity, coarse_ap)`` where
    ``aperiodicity`` has one row per frame and ``coarse_ap`` holds the first
    coarse band value of each frame (0 for frames judged unvoiced).

    Raises
    ------
    ValueError : if ``fft_size`` is not ``"auto"``.
    """
    f0_low_limit = 47
    if fft_size == "auto":
        fft_size = 2 ** np.ceil(np.log2(4. * fs / f0_low_limit + 1.))
    else:
        raise ValueError("Only fft_size auto currently supported")
    # Honour a caller-supplied threshold; it used to be overwritten always.
    if threshold == "default":
        threshold = 0.85

    f0_low_limit_for_spectrum = 71
    fft_size_for_spectrum = 2 ** np.ceil(np.log2(3 * fs / f0_low_limit_for_spectrum + 1.))
    upper_limit = 15000
    frequency_interval = 3000
    f0 = f0_h.copy()
    temporal_positions = temporal_positions_h.copy()
    f0[vuv_h == 0] = 0.

    number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2. - frequency_interval]) / float(frequency_interval)))
    window_length = np.floor(frequency_interval / (fs / float(fft_size))) * 2 + 1
    window1 = harvest_nuttall(window_length)
    aperiodicity = np.zeros((int(fft_size_for_spectrum / 2) + 1, len(f0)))
    coarse_ap = np.zeros((1, len(f0)))

    frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum
    # Float axis: an integer axis would truncate a fractional fs / 2 and put
    # the top of frequency_axis outside the interpolation range.
    coarse_axis = np.arange(number_of_aperiodicities + 2, dtype="float64") * frequency_interval
    coarse_axis[-1] = fs / 2.

    for i in range(len(f0)):
        r = d4c_love_train(x, fs, f0[i], temporal_positions_h[i], threshold)
        if r == 0:
            # unvoiced frame: fully aperiodic (just below 1)
            aperiodicity[:, i] = 1 - 0.000000000001
            continue
        current_f0 = max([f0_low_limit, f0[i]])
        coarse_aperiodicity = d4c_estimate_one_slice(
            x, fs, current_f0, frequency_interval, temporal_positions[i],
            fft_size, number_of_aperiodicities, window1)
        coarse_ap[0, i] = coarse_aperiodicity.ravel()[0]
        coarse_aperiodicity = np.maximum(0, coarse_aperiodicity - (current_f0 - 100) * 2. / 100.)
        # dB values anchored at -60 dB (0 Hz) and ~0 dB (fs / 2)
        piece = np.concatenate([[-60], -coarse_aperiodicity.ravel(), [-0.000000000001]])
        part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
        aperiodicity[:, i] = 10 ** part
    return temporal_positions_h, f0_h, vuv_h, aperiodicity.T, coarse_ap.squeeze()
def world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
        time_axis, default_f0):
    """Place excitation pulses along ``time_axis``.

    F0 and the voiced flag are interpolated onto ``time_axis``; unvoiced
    samples use ``default_f0``. A pulse is emitted wherever the wrapped
    phase jumps by more than pi / 2 between neighbouring samples.

    Returns ``(pulse_locations, pulse_locations_index, vuv_interpolated)``.
    """
    def resample(values):
        return interp1d(temporal_positions, values, kind="linear",
                        fill_value="extrapolate")(time_axis)

    raw_f0 = resample(f0)
    voiced = resample(vuv) > 0.5

    f0_track = raw_f0 * voiced.astype("float32")
    silent = f0_track == 0
    f0_track[silent] = f0_track[silent] + default_f0

    wrapped = np.mod(np.cumsum(2 * np.pi * f0_track / float(fs)), 2 * np.pi)
    jump = np.abs(wrapped[1:] - wrapped[:-1])
    # the difference is one sample shorter than time_axis
    pulse_locations = time_axis[:-1][jump > (np.pi / 2.)]
    pulse_locations_index = np.round(pulse_locations * fs).astype("int32")
    return pulse_locations, pulse_locations_index, voiced


def world_synthesis_get_spectral_parameters(temporal_positions,
        temporal_position_index, spectrogram, amplitude_periodic,
        amplitude_random, pulse_locations):
    """Return the spectrum, periodic and aperiodic columns for one pulse.

    ``temporal_position_index`` is a one-based, possibly fractional frame
    index. When it lies between two frames, the three tables are linearly
    interpolated in time at ``pulse_locations`` clamped to those frames.
    """
    lower = int(np.floor(temporal_position_index) - 1)
    assert lower >= 0
    upper = int(np.ceil(temporal_position_index) - 1)
    t1 = temporal_positions[lower]
    t2 = temporal_positions[upper]

    if t1 == t2:
        return (spectrogram[:, lower],
                amplitude_periodic[:, lower],
                amplitude_random[:, lower])

    when = max([t1, min([t2, pulse_locations])])
    knots = np.array([t1, t2])

    def blend(table):
        pair = np.concatenate([table[:, lower][None],
                               table[:, upper][None]], axis=0)
        return interp1d(knots, pair, kind="linear", axis=0)(when.copy())

    spectrum_slice = blend(spectrogram)
    periodic_slice = blend(amplitude_periodic)
    aperiodic_slice = blend(amplitude_random)
    return spectrum_slice, periodic_slice, aperiodic_slice
"""
Filter data with an FIR filter using the overlap-add method.
from http://projects.scipy.org/scipy/attachment/ticket/837/fftfilt.py
"""
def nextpow2(x):
    """Return the first integer N such that 2**N >= abs(x)"""
    return np.ceil(np.log2(np.abs(x)))


def fftfilt(b, x, *n):
    """Filter the signal x with the FIR filter described by the
    coefficients in b using the overlap-add method.

    Parameters
    ----------
    b : FIR coefficients.
    x : real-valued signal to filter.
    n : optional FFT length; rounded up to a power of 2 and raised to at
        least ``len(b)``. When omitted, the FFT length and block size are
        chosen to minimise the estimated cost of the filtering operation.

    Returns
    -------
    float32 array of ``len(x)`` samples (the convolution truncated to the
    input length).

    Raises
    ------
    ValueError : if ``n`` is given and is not a positive integer.
    """
    N_x = len(x)
    N_b = len(b)

    # An empty signal used to produce a zero block length and never finish.
    if N_x == 0:
        return np.zeros(0, dtype=np.float32)

    if len(n):
        n = n[0]
        if n != int(n) or n <= 0:
            raise ValueError('n must be a positive integer')
        if n < N_b:
            n = N_b
        N_fft = 2 ** nextpow2(n)
    else:
        # Candidate power-of-2 FFT lengths between the filter and the
        # signal length. One length-N block costs about N * (1 + log2(N)),
        # so pick the candidate with the lowest total cost.
        N = np.zeros(0)
        if N_x > N_b:
            N = 2 ** np.arange(np.ceil(np.log2(N_b)),
                               np.floor(np.log2(N_x)))
        if N.size:
            cost = np.ceil(N_x / (N - N_b + 1)) * N * (np.log2(N) + 1)
            N_fft = N[np.argmin(cost)]
        else:
            # Filter at least as long as the signal, or no candidate fits
            # (e.g. len(b) == 5, len(x) == 6): use a single block.
            N_fft = 2 ** nextpow2(N_b + N_x - 1)

    N_fft = int(N_fft)

    # Block length so that each block's full convolution fits in N_fft.
    L = int(N_fft - N_b + 1)

    # Transform of the filter, reused for every block.
    H = np.fft.fft(b, N_fft)

    y = np.zeros(N_x, dtype=np.float32)
    i = 0
    while i < N_x:
        il = min(i + L, N_x)
        k = min(i + N_fft, N_x)
        # Real part taken explicitly: the inputs are real, and assigning the
        # complex result into y would warn about the discarded imaginary part.
        yt = np.fft.ifft(np.fft.fft(x[i:il], N_fft) * H, N_fft).real  # Overlap..
        y[i:k] = y[i:k] + yt[:k - i]  # and add
        i += L
    return y
def world_synthesis(f0_d4c, vuv_d4c, aperiodicity_d4c,
        spectrogram_ct, fs_ct, random_seed=1999):
    """Synthesise a waveform from F0, voicing, aperiodicity and spectral envelope.

    Parameters
    ----------
    f0_d4c, vuv_d4c : per-frame F0 and voiced flags.
    aperiodicity_d4c : either full aperiodicity (one row per frame) or a
        coarse form (1-D, or a single column) that is interpolated to full
        size first.
    spectrogram_ct : spectral envelope, one row per frame.
    fs_ct : sampling frequency.
    random_seed : seed of the noise generator used for the aperiodic part.

    Returns
    -------
    The synthesised waveform as a 1-D float array.
    """
    # swap 0 and 1 axis
    spectrogram_ct = spectrogram_ct.T
    fs = fs_ct
    # coarse -> fine aper
    if len(aperiodicity_d4c.shape) == 1 or aperiodicity_d4c.shape[1] == 1:
        print("Coarse aperiodicity detected - interpolating to full size")
        aper = np.zeros_like(spectrogram_ct)
        if len(aperiodicity_d4c.shape) == 1:
            aperiodicity_d4c = aperiodicity_d4c[None, :]
        else:
            aperiodicity_d4c = aperiodicity_d4c.T
        coarse_aper_d4c = aperiodicity_d4c
        frequency_interval = 3000
        upper_limit = 15000
        number_of_aperiodicities = int(np.floor(np.min([upper_limit, fs / 2. - frequency_interval]) / float(frequency_interval)))
        # Float axis so that a fractional fs / 2 is not truncated.
        coarse_axis = np.arange(number_of_aperiodicities + 2, dtype="float64") * frequency_interval
        coarse_axis[-1] = fs / 2.
        f0_low_limit_for_spectrum = 71
        fft_size_for_spectrum = 2 ** np.ceil(np.log2(3 * fs / f0_low_limit_for_spectrum + 1.))

        frequency_axis = np.arange(int(fft_size_for_spectrum / 2) + 1) * float(fs) / fft_size_for_spectrum

        for i in range(len(f0_d4c)):
            ca = coarse_aper_d4c[0, i]
            # NOTE(review): d4c() lowers the coarse value by an f0-dependent
            # amount before interpolating; the equivalent value was computed
            # here but never used, so the raw value is interpolated. Confirm
            # which is intended before changing the output.
            piece = np.concatenate([[-60], -ca.ravel(), [-0.000000000001]])
            part = interp1d(coarse_axis, piece, kind="linear")(frequency_axis) / 20.
            aper[:, i] = 10 ** part
        aperiodicity_d4c = aper
    else:
        aperiodicity_d4c = aperiodicity_d4c.T

    default_f0 = 500.
    # Use the caller's seed (it used to be hard-coded to 1999).
    random_state = np.random.RandomState(random_seed)
    spectrogram = spectrogram_ct
    aperiodicity = aperiodicity_d4c
    # Frame times are generated for up to max_len samples and then cut to
    # the number of frames in the spectrogram.
    max_len = 5000000
    _, temporal_positions = _world_get_temporal_positions(max_len, fs)
    temporal_positions = temporal_positions[:spectrogram.shape[1]]
    vuv = vuv_d4c
    f0 = f0_d4c

    time_axis = np.arange(temporal_positions[0], temporal_positions[-1],
                          1. / fs)
    y = 0. * time_axis
    r = world_synthesis_time_base_generation(temporal_positions, f0, fs, vuv,
                                             time_axis, default_f0)
    pulse_locations, pulse_locations_index, interpolated_vuv = r
    fft_size = int((len(spectrogram) - 1) * 2)
    base_index = np.arange(-fft_size / 2, fft_size / 2) + 1
    y_length = len(y)
    tmp_complex_cepstrum = np.zeros((fft_size,), dtype=np.complex128)
    latter_index = np.arange(int(fft_size / 2) + 1, fft_size + 1) - 1

    # one-based fractional frame index of every pulse
    temporal_position_index = interp1d(temporal_positions, np.arange(1, len(temporal_positions) + 1), kind="linear", fill_value="extrapolate")(pulse_locations)

    amplitude_aperiodic = aperiodicity ** 2
    amplitude_periodic = np.maximum(0.001, (1. - amplitude_aperiodic))

    for i in range(len(pulse_locations_index)):
        spectrum_slice, periodic_slice, aperiodic_slice = world_synthesis_get_spectral_parameters(
            temporal_positions, temporal_position_index[i], spectrogram,
            amplitude_periodic, amplitude_aperiodic, pulse_locations[i])
        # samples until the next pulse (0 for the last pulse)
        idx = min(len(pulse_locations_index), i + 2) - 1
        noise_size = pulse_locations_index[idx] - pulse_locations_index[i]
        output_buffer_index = np.maximum(1, np.minimum(y_length, pulse_locations_index[i] + 1 + base_index)).astype("int32") - 1

        if interpolated_vuv[pulse_locations_index[i]] >= 0.5:
            tmp_periodic_spectrum = spectrum_slice * periodic_slice
            # eps in matlab/octave
            tmp_periodic_spectrum[tmp_periodic_spectrum == 0] = 2.2204E-16
            periodic_spectrum = np.concatenate([tmp_periodic_spectrum,
                                                tmp_periodic_spectrum[1:-1][::-1]])
            tmp_cepstrum = np.real(np.fft.fft(np.log(np.abs(periodic_spectrum)) / 2.))
            tmp_complex_cepstrum[latter_index] = tmp_cepstrum[latter_index] * 2
            tmp_complex_cepstrum[0] = tmp_cepstrum[0]

            response = np.fft.fftshift(np.real(np.fft.ifft(np.exp(np.fft.ifft(
                tmp_complex_cepstrum)))))
            y[output_buffer_index] += response * np.sqrt(
                max([1, noise_size]))
            tmp_aperiodic_spectrum = spectrum_slice * aperiodic_slice
        else:
            tmp_aperiodic_spectrum = spectrum_slice

        tmp_aperiodic_spectrum[tmp_aperiodic_spectrum == 0] = 2.2204E-16
        aperiodic_spectrum = np.concatenate([tmp_aperiodic_spectrum,
                                             tmp_aperiodic_spectrum[1:-1][::-1]])
        tmp_cepstrum = np.real(np.fft.fft(np.log(np.abs(aperiodic_spectrum)) / 2.))
        tmp_complex_cepstrum[latter_index] = tmp_cepstrum[latter_index] * 2
        tmp_complex_cepstrum[0] = tmp_cepstrum[0]
        rc = np.fft.ifft(tmp_complex_cepstrum)
        erc = np.exp(rc)
        response = np.fft.fftshift(np.real(np.fft.ifft(erc)))
        noise_input = random_state.randn(max([3, noise_size]),)

        y[output_buffer_index] = y[output_buffer_index] + fftfilt(noise_input - np.mean(noise_input), response)
    return y
def _mgc_b2c(wc, c, alpha):
    """Recursively transform coefficients ``c`` with warping factor ``alpha``.

    The output has the length (and dtype) of ``wc``; ``wc`` itself is only
    used as a template and is not modified.
    """
    out = np.zeros_like(wc)
    top = len(wc) - 1
    for i in reversed(range(len(c))):
        before = out.copy()
        out[0] = c[i]
        if top >= 1:
            out[1] = (1. - alpha ** 2) * before[0] + alpha * before[1]
        for m in range(2, top + 1):
            out[m] = before[m - 1] + alpha * (before[m] - out[m - 1])
    return out


def _mgc_ptrans(p, m, alpha):
    """In-place transform of ``p[0 .. m - 1]``; ``p[m]`` seeds the recursion."""
    carry = p[m]
    for i in reversed(range(1, m)):
        updated = p[i] + alpha * carry
        carry = p[i]
        p[i] = updated

    p[0] = (1. - alpha ** 2) * p[0] + 2 * (alpha * carry)


def _mgc_qtrans(q, m, alpha):
    """In-place transform of ``q[2 .. 2 * m]``; ``q[1]`` seeds the recursion."""
    carry = q[1]
    for i in range(2, 2 * m + 1):
        updated = q[i] + alpha * carry
        carry = q[i]
        q[i] = updated


def _mgc_gain(er, c, m, g):
    """Return ``er[0] + g * sum(er[i] * c[i] for i in 1..m)``; just ``er[0]``
    when ``g`` is 0."""
    if g == 0:
        return er[0]
    total = 0.
    for i in range(1, m + 1):
        total += er[i] * c[i]
    return er[0] + g * total
def _mgc_fill_toeplitz(A, t):
    """Fill the top-left ``len(t)`` square of ``A`` in place with the
    symmetric Toeplitz matrix ``A[i, j] = t[|i - j|]``."""
    n = len(t)
    idx = np.arange(n)
    A[:n, :n] = np.asarray(t)[np.abs(idx[:, None] - idx[None, :])]


def _mgc_fill_hankel(A, t):
    """Fill the top-left ``len(t) // 2 + 1`` square of ``A`` in place with
    the Hankel matrix ``A[i, j] = t[i + j]``."""
    n = len(t) // 2 + 1
    idx = np.arange(n)
    A[:n, :n] = np.asarray(t)[idx[:, None] + idx[None, :]]


def _mgc_ignorm(c, gamma):
    """Inverse gain normalisation of ``c``, performed in place.

    Returns ``c`` for every ``gamma`` (previously ``None`` unless
    ``gamma == 0``).
    """
    if gamma == 0.:
        c[0] = np.log(c[0])
        return c
    gain = c[0] ** gamma
    c[1:] *= gain
    c[0] = (gain - 1.) / gamma
    return c


def _mgc_gnorm(c, gamma):
    """Gain normalisation of ``c``, performed in place.

    Returns ``c`` for every ``gamma`` (previously ``None`` unless
    ``gamma == 0``).
    """
    if gamma == 0.:
        c[0] = np.exp(c[0])
        return c
    gain = 1. + gamma * c[0]
    c[1:] /= gain
    c[0] = gain ** (1. / gamma)
    return c


def _mgc_b2mc(mc, alpha):
    """In-place recursion ``mc[i] += alpha * (old mc[i + 1])``, last to first."""
    carry = mc[len(mc) - 1]
    for i in reversed(range(len(mc) - 1)):
        updated = mc[i] + alpha * carry
        carry = mc[i]
        mc[i] = updated


def _mgc_mc2b(mc, alpha):
    """In-place recursion ``mc[i] -= alpha * (new mc[i + 1])``, last to first
    (undoes ``_mgc_b2mc``)."""
    for i in reversed(range(len(mc) - 1)):
        mc[i] = mc[i] - alpha * mc[i + 1]


def _mgc_gc2gc(src_ceps, src_gamma=0., dst_order=None, dst_gamma=0.):
    """Convert generalized cepstrum ``src_ceps`` from ``src_gamma`` to ``dst_gamma``.

    Parameters
    ----------
    src_ceps : 1-D array of source coefficients.
    src_gamma, dst_gamma : source and target gamma.
    dst_order : order of the result; defaults to the source order.

    Returns a new array of ``dst_order + 1`` coefficients.
    """
    if dst_order is None:
        dst_order = len(src_ceps) - 1

    dst_ceps = np.zeros((dst_order + 1,), dtype=src_ceps.dtype)
    m1 = len(src_ceps) - 1
    dst_ceps[0] = src_ceps[0]

    # Indices are kept one-based (m, k) as in the code this was ported from.
    for m in range(2, dst_order + 2):
        upper = m1 if (m1 < m - 1) else m - 2
        ks = np.arange(2, upper + 2)
        ss1 = 0.
        ss2 = 0.
        if ks.size:
            cc = src_ceps[ks - 1] * dst_ceps[m - ks]
            ss2 += ((ks - 1) * cc).sum()
            ss1 += ((m - ks) * cc).sum()

        correction = (dst_gamma * ss2 - src_gamma * ss1) / (m - 1.)
        if m <= m1 + 1:
            dst_ceps[m - 1] = src_ceps[m - 1] + correction
        else:
            dst_ceps[m - 1] = correction
    return dst_ceps
def _mgc_newton(mgc_stored, periodogram, order, alpha, gamma,
        recursion_order, iter_number, y_fft, z_fft, cr, pr, rr, ri,
        qr, qi, Tm, Hm, Tm_plus_Hm, b):
    """Perform one Newton update of the coefficients in ``mgc_stored``.

    ``mgc_stored`` is updated in place. The arrays ``y_fft`` .. ``b`` are
    caller-provided work buffers that are overwritten. ``iter_number`` is
    accepted but not used.

    Returns ``(log(eta), new_pr)`` where ``new_pr`` is a copy of ``pr`` taken
    before it is transformed.
    """
    # a lot of inplace operations to match the Julia code
    cr[1:order + 1] = mgc_stored[1:order + 1]

    if alpha != 0:
        cr_res = _mgc_b2c(cr[:recursion_order + 1], cr[:order + 1], -alpha)
        cr[:recursion_order + 1] = cr_res[:]

    # np.cast was removed in NumPy 2.0; np.asarray(..., dtype=...) is the
    # documented replacement.
    y = sp.fftpack.fft(np.asarray(cr, dtype="float64"))
    c = mgc_stored
    x = periodogram
    if gamma != 0.:
        gamma_inv = 1. / gamma
    else:
        gamma_inv = np.inf

    if gamma == -1.:
        pr[:] = copy.deepcopy(x)
        new_pr = copy.deepcopy(pr)
    elif gamma == 0.:
        pr[:] = copy.deepcopy(x) / np.exp(2 * np.real(y))
        new_pr = copy.deepcopy(pr)
    else:
        tr = 1. + gamma * np.real(y)
        ti = -gamma * np.imag(y)
        trr = tr * tr
        tii = ti * ti
        s = trr + tii
        t = x * np.power(s, (-gamma_inv))
        t /= s
        pr[:] = t
        rr[:] = tr * t
        ri[:] = ti * t
        t /= s
        qr[:] = (trr - tii) * t
        s = tr * ti * t
        qi[:] = (s + s)
        new_pr = copy.deepcopy(pr)

    y_fft[:] = copy.deepcopy(pr) + 0.j
    z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
    pr[:] = copy.deepcopy(np.real(z_fft))
    if alpha != 0.:
        idx_1 = pr[:2 * order + 1]
        idx_2 = pr[:recursion_order + 1]
        idx_3 = _mgc_b2c(idx_1, idx_2, alpha)
        pr[:2 * order + 1] = idx_3[:]

    if gamma == 0. or gamma == -1.:
        # qr and rr are taken from the already transformed pr
        qr[:2 * order + 1] = pr[:2 * order + 1]
        rr[:order + 1] = copy.deepcopy(pr[:order + 1])
    else:
        for i in range(len(qr)):
            y_fft[i] = qr[i] + 1j * qi[i]
        z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
        qr[:] = np.real(z_fft)

        for i in range(len(rr)):
            y_fft[i] = rr[i] + 1j * ri[i]
        z_fft[:] = np.fft.fft(y_fft) / len(y_fft)
        rr[:] = np.real(z_fft)

        if alpha != 0.:
            qr_new = _mgc_b2c(qr[:recursion_order + 1], qr[:recursion_order + 1], alpha)
            qr[:recursion_order + 1] = qr_new[:]
            rr_new = _mgc_b2c(rr[:order + 1], rr[:recursion_order + 1], alpha)
            rr[:order + 1] = rr_new[:]

    if alpha != 0:
        _mgc_ptrans(pr, order, alpha)
        _mgc_qtrans(qr, order, alpha)

    eta = 0.
    if gamma != -1.:
        eta = _mgc_gain(rr, c, order, gamma)
        c[0] = np.sqrt(eta)

    if gamma == -1.:
        qr[:] = 0.
    elif gamma != 0.:
        for i in range(2, 2 * order + 1):
            qr[i] *= 1. + gamma

    # Solve (Toeplitz + Hankel) * delta = rr[1:order + 1] for the update.
    te = pr[:order]
    _mgc_fill_toeplitz(Tm, te)
    he = qr[2: 2 * order + 1]
    _mgc_fill_hankel(Hm, he)

    Tm_plus_Hm[:] = Hm[:] + Tm[:]
    b[:order] = rr[1:order + 1]
    res = np.linalg.solve(Tm_plus_Hm, b)
    b[:] = res[:]

    c[1:order + 1] += res[:order]

    if gamma == -1.:
        eta = _mgc_gain(rr, c, order, gamma)
        c[0] = np.sqrt(eta)
    return np.log(eta), new_pr
+ gamma 3243 | 3244 | te = pr[:order] 3245 | _mgc_fill_toeplitz(Tm, te) 3246 | he = qr[2: 2 * order + 1] 3247 | _mgc_fill_hankel(Hm, he) 3248 | 3249 | Tm_plus_Hm[:] = Hm[:] + Tm[:] 3250 | b[:order] = rr[1:order + 1] 3251 | res = np.linalg.solve(Tm_plus_Hm, b) 3252 | b[:] = res[:] 3253 | 3254 | c[1:order + 1] += res[:order] 3255 | 3256 | if gamma == -1.: 3257 | eta = _mgc_gain(rr, c, order, gamma) 3258 | c[0] = np.sqrt(eta) 3259 | return np.log(eta), new_pr 3260 | 3261 | 3262 | def _mgc_mgcepnorm(b_gamma, alpha, gamma, otype): 3263 | if otype != 0: 3264 | raise ValueError("Not yet implemented for otype != 0") 3265 | 3266 | mgc = copy.deepcopy(b_gamma) 3267 | _mgc_ignorm(mgc, gamma) 3268 | _mgc_b2mc(mgc, alpha) 3269 | return mgc 3270 | 3271 | 3272 | def _sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2, maxiter=30, criteria=0.001, otype=0, verbose=False): 3273 | # Based on r9y9 Julia code 3274 | # https://github.com/r9y9/MelGeneralizedCepstrums.jl 3275 | periodogram = np.abs(sp) ** 2 3276 | recursion_order = len(periodogram) - 1 3277 | slen = len(periodogram) 3278 | iter_number = 1 3279 | 3280 | def _z(): 3281 | return np.zeros((slen,), dtype="float64") 3282 | 3283 | def _o(): 3284 | return np.zeros((order,), dtype="float64") 3285 | 3286 | def _o2(): 3287 | return np.zeros((order, order), dtype="float64") 3288 | 3289 | cr = _z() 3290 | pr = _z() 3291 | rr = _z() 3292 | ri = _z().astype("float128") 3293 | qr = _z() 3294 | qi = _z().astype("float128") 3295 | Tm = _o2() 3296 | Hm = _o2() 3297 | Tm_plus_Hm = _o2() 3298 | b = _o() 3299 | y = _z() + 0j 3300 | z = _z() + 0j 3301 | b_gamma = np.zeros((order + 1,), dtype="float64") 3302 | # return pr_new due to oddness with Julia having different numbers 3303 | # in pr at end of function vs back in this scope 3304 | eta0, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha, -1., 3305 | recursion_order, iter_number, y, z, cr, pr, rr, 3306 | ri, qr, qi, Tm, Hm, Tm_plus_Hm, b) 3307 | pr[:] = pr_new 3308 | """ 3309 | 
print(eta0) 3310 | print(sum(b_gamma)) 3311 | print(sum(periodogram)) 3312 | print(order) 3313 | print(alpha) 3314 | print(recursion_order) 3315 | print(sum(y)) 3316 | print(sum(cr)) 3317 | print(sum(z)) 3318 | print(sum(pr)) 3319 | print(sum(rr)) 3320 | print(sum(qi)) 3321 | print(Tm.sum()) 3322 | print(Hm.sum()) 3323 | print(sum(b)) 3324 | raise ValueError() 3325 | """ 3326 | if gamma != -1.: 3327 | d = np.zeros((order + 1,), dtype="float64") 3328 | if alpha != 0.: 3329 | _mgc_ignorm(b_gamma, -1.) 3330 | _mgc_b2mc(b_gamma, alpha) 3331 | d = copy.deepcopy(b_gamma) 3332 | _mgc_gnorm(d, -1.) 3333 | # numbers are slightly different here - numerical diffs? 3334 | else: 3335 | d = copy.deepcopy(b_gamma) 3336 | b_gamma = _mgc_gc2gc(d, -1., order, gamma) 3337 | 3338 | if alpha != 0.: 3339 | _mgc_ignorm(b_gamma, gamma) 3340 | _mgc_mc2b(b_gamma, alpha) 3341 | _mgc_gnorm(b_gamma, gamma) 3342 | 3343 | if gamma != -1.: 3344 | eta_t = eta0 3345 | for i in range(1, maxiter + 1): 3346 | eta, pr_new = _mgc_newton(b_gamma, periodogram, order, alpha, 3347 | gamma, recursion_order, i, y, z, cr, pr, rr, 3348 | ri, qr, qi, Tm, Hm, Tm_plus_Hm, b) 3349 | pr[:] = pr_new 3350 | """ 3351 | print(eta0) 3352 | print(sum(b_gamma)) 3353 | print(sum(periodogram)) 3354 | print(order) 3355 | print(alpha) 3356 | print(recursion_order) 3357 | print(sum(y)) 3358 | print(sum(cr)) 3359 | print(sum(z)) 3360 | print(sum(pr)) 3361 | print(sum(rr)) 3362 | print(sum(qi)) 3363 | print(Tm.sum()) 3364 | print(Hm.sum()) 3365 | print(sum(b)) 3366 | raise ValueError() 3367 | """ 3368 | err = np.abs((eta_t - eta) / eta) 3369 | if verbose: 3370 | print("iter %i, criterion: %f" % (i, err)) 3371 | if i >= miniter: 3372 | if err < criteria: 3373 | if verbose: 3374 | print("optimization complete at iter %i" % i) 3375 | break 3376 | eta_t = eta 3377 | mgc_arr = _mgc_mgcepnorm(b_gamma, alpha, gamma, otype) 3378 | return mgc_arr 3379 | 3380 | 3381 | _sp_convert_results = [] 3382 | 3383 | def _sp_collect_result(result): 
def sp2mgc(sp, order=20, alpha=0.35, gamma=-0.41, miniter=2,
           maxiter=30, criteria=0.001, otype=0, verbose=False):
    """
    Accepts 1D or 2D one-sided spectrum (complex or real valued).

    If 2D, assumes time is axis 0.

    Returns mel generalized cepstral coefficients: the result of
    `_sp2mgc` for 1D input, or one row per frame for 2D input.

    2D input is processed frame by frame in a multiprocessing Pool.
    An exception raised in a worker is re-raised here instead of being
    silently dropped.

    Based on r9y9 Julia code
    https://github.com/r9y9/MelGeneralizedCepstrums.jl
    """
    # Mirror the one-sided spectrum along the frequency (last) axis.
    # Indexing with `...` works for both 1D and 2D input; the previous 1D
    # branch used `sp[:, 1:]`, which raises IndexError on a 1D array.
    # NOTE(review): as in the original 2D path, the last bin is repeated in
    # the mirrored half (length 2 * n - 1); confirm this is intended.
    sp = np.concatenate((sp, sp[..., 1:][..., ::-1]), axis=-1)

    if len(sp.shape) == 1:
        return _sp2mgc(sp, order=order, alpha=alpha, gamma=gamma,
                       miniter=miniter, maxiter=maxiter, criteria=criteria,
                       otype=otype, verbose=verbose)

    n_frames = sp.shape[0]
    if n_frames == 0:
        # Nothing to convert; also avoids dividing by a zero chunksize below.
        return np.zeros((0, order + 1))

    # Slooow, use multiprocessing to speed up a bit
    c = [(i + 1, n_frames, sp[i]) for i in range(n_frames)]
    worker = functools.partial(_sp_convert, order=order, alpha=alpha,
                               gamma=gamma, miniter=miniter, maxiter=maxiter,
                               criteria=criteria, otype=otype, verbose=False)
    start = time.time()
    if verbose:
        print("Starting conversion of %i frames" % n_frames)
        print("This may take some time...")

    p = Pool()
    try:
        itr = p.map_async(worker, c)

        # `_chunksize` / `_number_left` are private attributes of the async
        # result; they are only used for progress reporting.
        sz = len(c) // itr._chunksize
        if (sz * itr._chunksize) != len(c):
            sz += 1

        last_remaining = None
        while True:
            remaining = itr._number_left
            if verbose:
                if remaining != last_remaining:
                    last_remaining = remaining
                    print("%i chunks of %i complete" % (sz - remaining, sz))
            if itr.ready():
                break
            time.sleep(.5)

        # get() returns the per-frame results and re-raises worker errors.
        results = itr.get()
    except BaseException:
        # Do not leave worker processes behind on failure / interrupt.
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()

    stop = time.time()
    if verbose:
        print("Processed %i frames in %s seconds" % (n_frames, stop - start))
    # Each result is (frame_number, mgc); sort defensively by frame number.
    final = [o[1] for o in sorted(results, key=lambda x: x[0])]
    return np.array(final)
def _mgc_freqt(wc, c, alpha):
    """
    Fill `wc` in place from `c` using a recursion controlled by `alpha`.

    `wc` is zeroed first, then the coefficients of `c` are fed in from the
    last one to the first. Nothing is returned; the result is left in `wc`.
    With `alpha == 0` the result is `c` truncated or zero-padded to
    `len(wc)`.

    NOTE(review): presumably the SPTK-style `freqt` frequency
    transformation; confirm against the reference implementation.
    """
    top = len(wc) - 1
    snapshot = np.zeros_like(wc)
    wc *= 0
    for idx in range(len(c) - 1, -1, -1):
        # Keep the previous state: the updates below overwrite wc.
        snapshot[:] = wc
        if top >= 0:
            wc[0] = c[idx] + alpha * snapshot[0]
        if top >= 1:
            wc[1] = (1. - alpha * alpha) * snapshot[0] + alpha * snapshot[1]
        for j in range(2, top + 1):
            # Uses the freshly written wc[j - 1], so order matters here.
            wc[j] = snapshot[j - 1] + alpha * (snapshot[j] - wc[j - 1])
- dst_alpha * src_alpha) 3511 | if alpha == 0.: 3512 | new_dst_ceps = copy.deepcopy(src_ceps) 3513 | _mgc_gnorm(new_dst_ceps, src_gamma) 3514 | dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma) 3515 | _mgc_ignorm(dst_ceps, dst_gamma) 3516 | else: 3517 | _mgc_freqt(dst_ceps, src_ceps, alpha) 3518 | _mgc_gnorm(dst_ceps, src_gamma) 3519 | new_dst_ceps = copy.deepcopy(dst_ceps) 3520 | dst_ceps = _mgc_gc2gc(new_dst_ceps, src_gamma, dst_order, dst_gamma) 3521 | _mgc_ignorm(dst_ceps, dst_gamma) 3522 | return dst_ceps 3523 | 3524 | 3525 | _mgc_convert_results = [] 3526 | 3527 | def _mgc_collect_result(result): 3528 | _mgc_convert_results.append(result) 3529 | 3530 | 3531 | def _mgc_convert(c_i, alpha, gamma, fftlen): 3532 | i = c_i[0] 3533 | tot_i = c_i[1] 3534 | mgc_i = c_i[2] 3535 | r_i = (i, _mgc_mgc2mgc(mgc_i, src_alpha=alpha, src_gamma=gamma, 3536 | dst_order=fftlen // 2, dst_alpha=0., dst_gamma=0.)) 3537 | return r_i 3538 | 3539 | 3540 | def mgc2sp(mgc_arr, alpha=0.35, gamma=-0.41, fftlen="auto", fs=None, 3541 | mode="world_pad", verbose=False): 3542 | """ 3543 | Accepts 1D or 2D array of mgc 3544 | 3545 | If 2D, assume time is on axis 0 3546 | 3547 | Returns reconstructed smooth spectrum 3548 | 3549 | Based on r9y9 Julia code 3550 | https://github.com/r9y9/MelGeneralizedCepstrums.jl 3551 | """ 3552 | if mode != "world_pad": 3553 | raise ValueError("Only currently supported mode is world_pad") 3554 | 3555 | if fftlen == "auto": 3556 | if fs == None: 3557 | raise ValueError("fs must be provided for fftlen 'auto'") 3558 | f0_low_limit = 71 3559 | fftlen = int(2 ** np.ceil(np.log2(3. * float(fs) / f0_low_limit + 1))) 3560 | if verbose: 3561 | print("setting fftlen to %i" % fftlen) 3562 | 3563 | if len(mgc_arr.shape) == 1: 3564 | c = _mgc_mgc2mgc(mgc_arr, alpha, gamma, fftlen // 2, 0., 0.) 
def implot(arr, scale=None, title="", cmap="gray"):
    """
    Show a 2D array as an image with matplotlib, forced to a square plot.

    Parameters
    ----------
    arr : 2D array
        Data to display.
    scale : str or None
        If "specgram", `arr` is converted to 20 * log10(|arr|), transposed
        and flipped vertically before plotting. Any other value plots `arr`
        unchanged.
    title : str
        Plot title.
    cmap : str
        Matplotlib colormap name.

    The figure is created but not shown or saved; the caller is expected to
    call `plt.show()` / `plt.savefig()`.
    """
    import matplotlib.pyplot as plt
    # Compare by value: `is` on a string literal tests object identity,
    # which is not guaranteed to hold for equal strings.
    if scale == "specgram":
        # plotting part
        mag = 20. * np.log10(np.abs(arr))
        # Transpose so time is X axis, and invert y axis so
        # frequency is low at bottom
        mag = mag.T[::-1, :]
    else:
        mag = arr
    _, ax = plt.subplots()
    ax.matshow(mag, cmap=cmap)
    plt.axis("off")
    n_rows = mag.shape[0]
    n_cols = mag.shape[1]
    # The aspect that makes the plot square with ax.set_aspect. The former
    # max/min helper reduced to this same ratio in both of its branches.
    ax.set_aspect(n_cols / float(n_rows))
    plt.title(title)
fetch_sample_music() 3674 | # MATLAB wavread does normalization 3675 | X = X.astype('float32') / (2 ** 15) 3676 | wsz = 256 3677 | wst = 128 3678 | a, g, e = lpc_analysis(X, order=8, window_step=wst, 3679 | window_size=wsz, emphasis=0.9, 3680 | copy=True) 3681 | v, p = voiced_unvoiced(X, window_size=wsz, 3682 | window_step=wst) 3683 | c = compress(e, n_components=64) 3684 | # First component of a is always 1 3685 | combined = np.hstack((a[:, 1:], g, c[:a.shape[0]])) 3686 | features = np.zeros((a.shape[0], 2 * combined.shape[1])) 3687 | start_indices = v * combined.shape[1] 3688 | start_indices = start_indices.astype('int32') 3689 | end_indices = (v + 1) * combined.shape[1] 3690 | end_indices = end_indices.astype('int32') 3691 | for i in range(features.shape[0]): 3692 | features[i, start_indices[i]:end_indices[i]] = combined[i] 3693 | 3694 | 3695 | def test_mdct_and_inverse(): 3696 | fs, X = fetch_sample_music() 3697 | X_dct = mdct_slow(X) 3698 | X_r = imdct_slow(X_dct) 3699 | assert np.all(np.abs(X_r[:len(X)] - X) < 1E-3) 3700 | assert np.abs(X_r[:len(X)] - X).mean() < 1E-6 3701 | 3702 | 3703 | def test_all(): 3704 | test_lpc_analysis_truncate() 3705 | test_feature_build() 3706 | test_lpc_to_lsf() 3707 | test_mdct_and_inverse() 3708 | 3709 | 3710 | def run_lpc_example(): 3711 | # ae.wav is from 3712 | # http://www.linguistics.ucla.edu/people/hayes/103/Charts/VChart/ae.wav 3713 | # Partially following the formant tutorial here 3714 | # http://www.mathworks.com/help/signal/ug/formant-estimation-with-lpc-coefficients.html 3715 | 3716 | samplerate, X = fetch_sample_music() 3717 | 3718 | c = overlap_dct_compress(X, 200, 400) 3719 | X_r = overlap_dct_uncompress(c, 400) 3720 | wavfile.write('lpc_uncompress.wav', samplerate, soundsc(X_r)) 3721 | 3722 | print("Calculating sinusoids") 3723 | f_hz, m = sinusoid_analysis(X, input_sample_rate=16000) 3724 | Xs_sine = sinusoid_synthesis(f_hz, m) 3725 | orig_fname = 'lpc_orig.wav' 3726 | sine_fname = 'lpc_sine_synth.wav' 3727 | 
wavfile.write(orig_fname, samplerate, soundsc(X)) 3728 | wavfile.write(sine_fname, samplerate, soundsc(Xs_sine)) 3729 | 3730 | lpc_order_list = [8, ] 3731 | dct_components_list = [200, ] 3732 | window_size_list = [400, ] 3733 | # Seems like a dct component size of ~2/3rds the step 3734 | # (1/3rd the window for 50% overlap) works well. 3735 | for lpc_order in lpc_order_list: 3736 | for dct_components in dct_components_list: 3737 | for window_size in window_size_list: 3738 | # 50% overlap 3739 | window_step = window_size // 2 3740 | a, g, e = lpc_analysis(X, order=lpc_order, 3741 | window_step=window_step, 3742 | window_size=window_size, emphasis=0.9, 3743 | copy=True) 3744 | print("Calculating LSF") 3745 | lsf = lpc_to_lsf(a) 3746 | # Not window_size - window_step! Need to implement overlap 3747 | print("Calculating compression") 3748 | c = dct_compress(e, n_components=dct_components, 3749 | window_size=window_step) 3750 | co = overlap_dct_compress(e, n_components=dct_components, 3751 | window_size=window_step) 3752 | block_excitation = dct_uncompress(c, window_size=window_step) 3753 | overlap_excitation = overlap_dct_uncompress(co, 3754 | window_size=window_step) 3755 | a_r = lsf_to_lpc(lsf) 3756 | f, m = lpc_to_frequency(a_r, g) 3757 | block_lpc = lpc_synthesis(a_r, g, block_excitation, 3758 | emphasis=0.9, 3759 | window_step=window_step) 3760 | overlap_lpc = lpc_synthesis(a_r, g, overlap_excitation, 3761 | emphasis=0.9, 3762 | window_step=window_step) 3763 | v, p = voiced_unvoiced(X, window_size=window_size, 3764 | window_step=window_step) 3765 | noisy_lpc = lpc_synthesis(a_r, g, voiced_frames=v, 3766 | emphasis=0.9, 3767 | window_step=window_step) 3768 | if dct_components is None: 3769 | dct_components = window_size 3770 | noisy_fname = 'lpc_noisy_synth_%iwin_%ilpc_%idct.wav' % ( 3771 | window_size, lpc_order, dct_components) 3772 | block_fname = 'lpc_block_synth_%iwin_%ilpc_%idct.wav' % ( 3773 | window_size, lpc_order, dct_components) 3774 | overlap_fname = 
'lpc_overlap_synth_%iwin_%ilpc_%idct.wav' % ( 3775 | window_size, lpc_order, dct_components) 3776 | wavfile.write(noisy_fname, samplerate, soundsc(noisy_lpc)) 3777 | wavfile.write(block_fname, samplerate, 3778 | soundsc(block_lpc)) 3779 | wavfile.write(overlap_fname, samplerate, 3780 | soundsc(overlap_lpc)) 3781 | 3782 | 3783 | def run_fft_vq_example(): 3784 | n_fft = 512 3785 | time_smoothing = 4 3786 | def _pre(list_of_data): 3787 | f_c = np.vstack([stft(dd, n_fft) for dd in list_of_data]) 3788 | if len(f_c) % time_smoothing != 0: 3789 | newlen = len(f_c) - len(f_c) % time_smoothing 3790 | f_c = f_c[:newlen] 3791 | f_mag = complex_to_abs(f_c) 3792 | f_phs = complex_to_angle(f_c) 3793 | f_sincos = angle_to_sin_cos(f_phs) 3794 | f_r = np.hstack((f_mag, f_sincos)) 3795 | f_r = f_r.reshape((len(f_r) // time_smoothing, 3796 | time_smoothing * f_r.shape[1])) 3797 | return f_r, n_fft 3798 | 3799 | def preprocess_train(list_of_data, random_state): 3800 | f_r, n_fft = _pre(list_of_data) 3801 | clusters = f_r 3802 | return clusters 3803 | 3804 | def apply_preprocess(list_of_data, clusters): 3805 | f_r, n_fft = _pre(list_of_data) 3806 | memberships, distances = vq(f_r, clusters) 3807 | vq_r = clusters[memberships] 3808 | vq_r = vq_r.reshape((time_smoothing * len(vq_r), 3809 | vq_r.shape[1] // time_smoothing)) 3810 | f_mag = vq_r[:, :n_fft // 2 + 1] 3811 | f_sincos = vq_r[:, n_fft // 2 + 1:] 3812 | extent = f_sincos.shape[1] // 2 3813 | f_phs = sin_cos_to_angle(f_sincos[:, :extent], f_sincos[:, extent:]) 3814 | vq_c = abs_and_angle_to_complex(f_mag, f_phs) 3815 | d_k = istft(vq_c, fftsize=n_fft) 3816 | return d_k 3817 | 3818 | random_state = np.random.RandomState(1999) 3819 | 3820 | """ 3821 | fs, d = fetch_sample_music() 3822 | sub = int(.8 * d.shape[0]) 3823 | d1 = [d[:sub]] 3824 | d2 = [d[sub:]] 3825 | """ 3826 | 3827 | fs, d = fetch_sample_speech_fruit() 3828 | d1 = d[::8] + d[1::8] + d[2::8] + d[3::8] + d[4::8] + d[5::8] + d[6::8] 3829 | d2 = d[7::8] 3830 | # make sure 
def run_dct_vq_example():
    """
    Example: vector-quantize MDCT frames of the sample music and write the
    original / reconstructed audio (with and without AGC) to wav files.

    The first 80% of the signal is used as the codebook ("train") and the
    last 20% as test data. Writes eight `dct_*.wav` files to the current
    directory.
    """
    def _pre(list_of_data):
        # Temporal window setting is crucial! - 512 seems OK for music, 256
        # fruit perhaps due to samplerates
        n_dct = 512
        f_r = np.vstack([mdct_slow(dd, n_dct) for dd in list_of_data])
        return f_r, n_dct

    def preprocess_train(list_of_data, random_state):
        # The codebook is simply every training frame; random_state is
        # accepted but not used.
        f_r, n_dct = _pre(list_of_data)
        clusters = f_r
        return clusters

    def apply_preprocess(list_of_data, clusters):
        f_r, n_dct = _pre(list_of_data)
        f_clust = f_r
        memberships, distances = vq(f_clust, clusters)
        # Replace every frame by its assigned codebook entry, then invert.
        vq_r = clusters[memberships]
        d_k = imdct_slow(vq_r, n_dct)
        return d_k

    random_state = np.random.RandomState(1999)

    # This doesn't work very well due to only taking a sample from the end as
    # test
    fs, d = fetch_sample_music()
    sub = int(.8 * d.shape[0])
    d1 = [d[:sub]]
    d2 = [d[sub:]]

    clusters = preprocess_train(d1, random_state)
    # Training data
    vq_d1 = apply_preprocess(d1, clusters)
    vq_d2 = apply_preprocess(d2, clusters)
    # Sanity check that train and test reconstructions are not the same
    # signal. The previous check compared vq_d2 with itself and asserted a
    # non-empty list, so it could never fail.
    assert not np.array_equal(vq_d1, vq_d2)

    fix_d1 = np.concatenate(d1)
    fix_d2 = np.concatenate(d2)

    wavfile.write("dct_train_no_agc.wav", fs, soundsc(fix_d1))
    wavfile.write("dct_test_no_agc.wav", fs, soundsc(fix_d2))
    wavfile.write("dct_vq_train_no_agc.wav", fs, soundsc(vq_d1))
    wavfile.write("dct_vq_test_no_agc.wav", fs, soundsc(vq_d2))

    agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5)
    agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5)
    agc_vq_d1, freq_vq_d1, energy_vq_d1 = time_attack_agc(vq_d1, fs, .5, 5)
    agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5)

    wavfile.write("dct_train_agc.wav", fs, soundsc(agc_d1))
    wavfile.write("dct_test_agc.wav", fs, soundsc(agc_d2))
    wavfile.write("dct_vq_train_agc.wav", fs, soundsc(agc_vq_d1))
    wavfile.write("dct_vq_test_agc.wav", fs, soundsc(agc_vq_d2))
So double what .m file 3932 | # says 3933 | fftsize = 512 3934 | step = 64 3935 | X_s = np.abs(stft(d, fftsize=fftsize, step=step, real=False, 3936 | compute_onesided=False)) 3937 | X_t = iterate_invert_spectrogram(X_s, fftsize, step, verbose=True) 3938 | 3939 | """ 3940 | import matplotlib.pyplot as plt 3941 | plt.specgram(d, cmap="gray") 3942 | plt.savefig("1.png") 3943 | plt.close() 3944 | plt.imshow(X_s, cmap="gray") 3945 | plt.savefig("2.png") 3946 | plt.close() 3947 | """ 3948 | 3949 | wavfile.write("phase_original.wav", fs, soundsc(d)) 3950 | wavfile.write("phase_reconstruction.wav", fs, soundsc(X_t)) 3951 | 3952 | 3953 | def run_phase_vq_example(): 3954 | def _pre(list_of_data): 3955 | # Temporal window setting is crucial! - 512 seems OK for music, 256 3956 | # fruit perhaps due to samplerates 3957 | n_fft = 256 3958 | step = 32 3959 | f_r = np.vstack([np.abs(stft(dd, n_fft, step=step, real=False, 3960 | compute_onesided=False)) 3961 | for dd in list_of_data]) 3962 | return f_r, n_fft, step 3963 | 3964 | def preprocess_train(list_of_data, random_state): 3965 | f_r, n_fft, step = _pre(list_of_data) 3966 | clusters = copy.deepcopy(f_r) 3967 | return clusters 3968 | 3969 | def apply_preprocess(list_of_data, clusters): 3970 | f_r, n_fft, step = _pre(list_of_data) 3971 | f_clust = f_r 3972 | # Nondeterministic ? 3973 | memberships, distances = vq(f_clust, clusters) 3974 | vq_r = clusters[memberships] 3975 | d_k = iterate_invert_spectrogram(vq_r, n_fft, step, verbose=True) 3976 | return d_k 3977 | 3978 | random_state = np.random.RandomState(1999) 3979 | 3980 | fs, d = fetch_sample_speech_fruit() 3981 | d1 = d[::9] 3982 | d2 = d[7::8][:5] 3983 | # make sure d1 and d2 aren't the same! 
3984 | assert [len(di) for di in d1] != [len(di) for di in d2] 3985 | 3986 | clusters = preprocess_train(d1, random_state) 3987 | fix_d1 = np.concatenate(d1) 3988 | fix_d2 = np.concatenate(d2) 3989 | vq_d2 = apply_preprocess(d2, clusters) 3990 | 3991 | wavfile.write("phase_train_no_agc.wav", fs, soundsc(fix_d1)) 3992 | wavfile.write("phase_vq_test_no_agc.wav", fs, soundsc(vq_d2)) 3993 | 3994 | agc_d1, freq_d1, energy_d1 = time_attack_agc(fix_d1, fs, .5, 5) 3995 | agc_d2, freq_d2, energy_d2 = time_attack_agc(fix_d2, fs, .5, 5) 3996 | agc_vq_d2, freq_vq_d2, energy_vq_d2 = time_attack_agc(vq_d2, fs, .5, 5) 3997 | 3998 | """ 3999 | import matplotlib.pyplot as plt 4000 | plt.specgram(agc_vq_d2, cmap="gray") 4001 | #plt.title("Fake") 4002 | plt.figure() 4003 | plt.specgram(agc_d2, cmap="gray") 4004 | #plt.title("Real") 4005 | plt.show() 4006 | """ 4007 | 4008 | wavfile.write("phase_train_agc.wav", fs, soundsc(agc_d1)) 4009 | wavfile.write("phase_test_agc.wav", fs, soundsc(agc_d2)) 4010 | wavfile.write("phase_vq_test_agc.wav", fs, soundsc(agc_vq_d2)) 4011 | 4012 | 4013 | def run_cqt_example(): 4014 | try: 4015 | fs, d = fetch_sample_file("/Users/User/cqt_resources/kempff1.wav") 4016 | except ValueError: 4017 | print("WARNING: Using sample music instead but kempff1.wav is the example") 4018 | fs, d = fetch_sample_music() 4019 | X = d[:44100] 4020 | X_cq, c_dc, c_nyq, multiscale, shift, window_lens = cqt(X, fs) 4021 | X_r = icqt(X_cq, c_dc, c_nyq, multiscale, shift, window_lens) 4022 | SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X)) 4023 | wavfile.write("cqt_original.wav", fs, soundsc(X)) 4024 | wavfile.write("cqt_reconstruction.wav", fs, soundsc(X_r)) 4025 | 4026 | 4027 | def run_fft_dct_example(): 4028 | random_state = np.random.RandomState(1999) 4029 | 4030 | fs, d = fetch_sample_speech_fruit() 4031 | n_fft = 64 4032 | X = d[0] 4033 | X_stft = stft(X, n_fft) 4034 | X_rr = complex_to_real_view(X_stft) 4035 | X_dct = fftpack.dct(X_rr, axis=-1, 
norm='ortho') 4036 | X_dct_sub = X_dct[1:] - X_dct[:-1] 4037 | std = X_dct_sub.std(axis=0, keepdims=True) 4038 | X_dct_sub += .01 * std * random_state.randn( 4039 | X_dct_sub.shape[0], X_dct_sub.shape[1]) 4040 | X_dct_unsub = np.cumsum(X_dct_sub, axis=0) 4041 | X_idct = fftpack.idct(X_dct_unsub, axis=-1, norm='ortho') 4042 | X_irr = real_to_complex_view(X_idct) 4043 | X_r = istft(X_irr, n_fft)[:len(X)] 4044 | 4045 | SNR = 20 * np.log10(np.linalg.norm(X - X_r) / np.linalg.norm(X)) 4046 | print(SNR) 4047 | 4048 | wavfile.write("fftdct_orig.wav", fs, soundsc(X)) 4049 | wavfile.write("fftdct_rec.wav", fs, soundsc(X_r)) 4050 | 4051 | 4052 | def run_world_example(): 4053 | fs, d = fetch_sample_speech_tapestry() 4054 | d = d.astype("float32") / 2 ** 15 4055 | temporal_positions_h, f0_h, vuv_h, f0_candidates_h = harvest(d, fs) 4056 | temporal_positions_ct, spectrogram_ct, fs_ct = cheaptrick(d, fs, 4057 | temporal_positions_h, f0_h, vuv_h) 4058 | temporal_positions_d4c, f0_d4c, vuv_d4c, aper_d4c, coarse_aper_d4c = d4c(d, fs, 4059 | temporal_positions_h, f0_h, vuv_h) 4060 | #y = world_synthesis(f0_d4c, vuv_d4c, aper_d4c, spectrogram_ct, fs_ct) 4061 | y = world_synthesis(f0_d4c, vuv_d4c, coarse_aper_d4c, spectrogram_ct, fs_ct) 4062 | wavfile.write("out.wav", fs, soundsc(y)) 4063 | 4064 | 4065 | def run_mgc_example(): 4066 | import matplotlib.pyplot as plt 4067 | fs, x = wavfile.read("test16k.wav") 4068 | pos = 3000 4069 | fftlen = 1024 4070 | win = np.blackman(fftlen) / np.sqrt(np.sum(np.blackman(fftlen) ** 2)) 4071 | xw = x[pos:pos + fftlen] * win 4072 | sp = 20 * np.log10(np.abs(np.fft.rfft(xw))) 4073 | mgc_order = 20 4074 | mgc_alpha = 0.41 4075 | mgc_gamma = -0.35 4076 | mgc_arr = win2mgc(xw, order=mgc_order, alpha=mgc_alpha, gamma=mgc_gamma, verbose=True) 4077 | xwsp = 20 * np.log10(np.abs(np.fft.rfft(xw))) 4078 | sp = mgc2sp(mgc_arr, mgc_alpha, mgc_gamma, fftlen) 4079 | plt.plot(xwsp) 4080 | plt.plot(20. 
def get_frame(signal, winsize, no):
    """Return analysis frame number ``no`` of ``signal``.

    Frames are ``winsize`` samples long and advance by ``winsize // 2``
    samples (50% overlap). The slice is not padded, so a frame that runs
    past the end of ``signal`` comes back shorter than ``winsize``.
    """
    start = no * (winsize // 2)
    return signal[start:start + winsize]


class LTSD(object):
    """
    LTSD VAD code from jfsantos

    Long-term spectral divergence detector. For every 50%-overlapped frame
    the per-bin maximum magnitude over ``2 * order + 1`` neighbouring frames
    is compared with an average noise spectrum estimated from the head of
    the signal; the result is a per-frame score in dB.

    Parameters
    ----------
    winsize : int
        Frame length in samples.
    window : array of length ``winsize``
        Analysis window multiplied into every frame before the FFT.
    order : int
        Number of neighbouring frames on each side used by ``compute``.
    """

    # Number of window lengths at the head of the signal treated as noise.
    # With 50% overlap this yields up to 39 frames for the noise estimate.
    NOISE_WINDOWS = 20

    def __init__(self, winsize, window, order):
        self.winsize = int(winsize)
        self.window = window
        self.order = order
        # frame index -> magnitude spectrum, valid for one compute() call
        self.amplitude = {}

    def _frame_amplitude(self, signal, l):
        """Magnitude spectrum of windowed frame ``l`` of ``signal``."""
        frame = get_frame(signal, self.winsize, l)
        return np.abs(np.fft.fft(frame * self.window))

    def get_amplitude(self, signal, l):
        """Return the (cached) magnitude spectrum of frame ``l``."""
        # dict.has_key() no longer exists in Python 3; use ``in`` instead.
        if l not in self.amplitude:
            self.amplitude[l] = self._frame_amplitude(signal, l)
        return self.amplitude[l]

    def compute_noise_avg_spectrum(self, nsignal):
        """Average magnitude spectrum over all full frames of ``nsignal``.

        Raises
        ------
        ValueError
            If ``nsignal`` is too short to hold a single full frame (the
            average would otherwise be a silent 0 / 0).
        """
        windownum = int(len(nsignal) // (self.winsize // 2) - 1)
        if windownum < 1:
            raise ValueError(
                "noise segment must contain at least one full window")
        avgamp = np.zeros(self.winsize)
        for l in range(windownum):
            avgamp += self._frame_amplitude(nsignal, l)
        return avgamp / float(windownum)

    def compute(self, signal):
        """Return the LTSD score (dB) of every frame of ``signal``.

        Frames closer than ``order`` to either end score 0. A signal too
        short for a single frame yields an empty array.
        """
        # The cache is keyed by frame index only, so it must not outlive
        # the signal it was built from.
        self.amplitude = {}
        self.windownum = max(
            0, int(len(signal) // (self.winsize // 2) - 1))
        ltsds = np.zeros(self.windownum)
        if self.windownum == 0:
            return ltsds
        # Average noise power spectrum, estimated from the head of the
        # input, which is assumed to contain no speech.
        head = signal[0:self.winsize * self.NOISE_WINDOWS]
        self.avgnoise = self.compute_noise_avg_spectrum(head) ** 2
        for l in range(self.windownum):
            # Honour the order given to the constructor (was hard-coded 5).
            ltsds[l] = self.ltsd(signal, l, self.order)
        return ltsds

    def ltse(self, signal, l, order):
        """Per-bin maximum magnitude over frames ``l - order .. l + order``."""
        maxamp = np.zeros(self.winsize)
        for idx in range(l - order, l + order + 1):
            maxamp = np.maximum(maxamp, self.get_amplitude(signal, idx))
        return maxamp

    def ltsd(self, signal, l, order):
        """LTSD score in dB of frame ``l``; 0 when the context is incomplete.

        Relies on ``self.windownum`` and ``self.avgnoise`` set by ``compute``.
        """
        if l < order or l + order >= self.windownum:
            return 0
        ratio = self.ltse(signal, l, order) ** 2 / self.avgnoise
        return 10.0 * np.log10(np.sum(ratio) / float(len(self.avgnoise)))


def ltsd_vad(x, fs, threshold=9, winsize=8192):
    """Voice activity detection based on LTSD scores.

    Parameters
    ----------
    x : 1-D array
        Input samples. The head of the signal is used as the noise
        reference, so it should not start with speech.
    fs : int or float
        Sample rate in Hz.
    threshold : float
        Frames whose LTSD score (dB) is not below this value are speech.
    winsize : int
        Analysis window length in samples; choose it based on the sample
        rate (e.g. 1024 for fs = 16000).

    Returns
    -------
    speech : array
        ``x[mask]``, the original samples flagged as speech, in the dtype
        of ``x``.
    mask : bool array, same shape as ``x``
        True where speech was detected. All False for empty, constant or
        too-short input.
    """
    x = np.asarray(x)
    f_vad = np.zeros_like(x, dtype=bool)  # np.bool was removed in NumPy 1.24
    if x.size == 0:
        return x[f_vad], f_vad

    # Work in float64 so integer input (e.g. int16) cannot overflow when
    # the minimum is subtracted.
    work = x.astype("float64")
    lo = work.min()
    span = work.max() - lo
    if span == 0:
        # Constant signal: nothing to detect, and normalising would be 0 / 0.
        return x[f_vad], f_vad

    # The detector works with 16 bit style integer samples.
    quantised = ((work - lo) / span * (2 ** 15)).astype("int32")

    s_vad = LTSD(winsize, np.hanning(winsize), 5).compute(quantised)
    if len(s_vad) < 2:
        # Fewer than two frames: no frame interval can be mapped to samples.
        return x[f_vad], f_vad

    # LTSD frames overlap by 50%, so each step covers winsize // 2 samples;
    # +1 to cover the extra edge window.
    n_samples = int(((len(s_vad) + 1) * winsize) // 2)
    time_s = n_samples / float(fs)
    time_points = np.linspace(0, time_s, len(s_vad))
    time_samples = (fs * time_points).astype(np.int32)

    # Each interval is shifted back by one window and clipped at zero.
    starts = np.maximum(time_samples[:-1] - winsize, 0)
    stops = np.maximum(time_samples[1:] - winsize, 0)
    # "not below threshold" (rather than ">=") so NaN scores still count
    # as speech, exactly as before.
    active = ~(s_vad[1:] < threshold)
    for start, stop, is_speech in zip(starts, stops, active):
        if is_speech:
            f_vad[start:stop] = True

    # Return the untouched input samples rather than a copy that has been
    # through the lossy int32 quantisation round trip.
    return x[f_vad], f_vad
4249 | Comment individually 4250 | run_ltsd_example() 4251 | run_world_mgc_example() 4252 | run_world_example() 4253 | run_mgc_example() 4254 | run_phase_reconstruction_example() 4255 | run_phase_vq_example() 4256 | run_dct_vq_example() 4257 | run_fft_vq_example() 4258 | run_lpc_example() 4259 | run_cqt_example() 4260 | run_fft_dct_example() 4261 | test_all() 4262 | """ 4263 | --------------------------------------------------------------------------------