0.5
177 | return p1*mask+p2*(~mask)
178 |
#%%
PROMPT = "a bass guitar"
# Initialize the population: uniformly random phase parameters in [0, 2*pi),
# shaped like the template parameter tensor `dummy_p` (defined earlier in the file).
p = torch.rand(dummy_p.shape).to(device) * (np.pi * 2)
records = []
# Embed the prompt duplicated (model appears to expect a batch of >=2 —
# TODO confirm) and keep only the first embedding as the target.
zt = embed_text([PROMPT] * 2)[:1]
185 |
generation = 0
# Evolutionary loop: synthesize -> embed -> score -> select -> recombine.
# Runs until interrupted (notebook cell; stop manually).
while True:
    # Render one audio clip per population member from its activated parameters.
    audio = [
        effect_chain(source_audio, activation(p[i]))
        for i in range(BATCH_SIZE)
    ]
    # Turn the batch into a float tensor on the target device.
    # NOTE(review): if effect_chain returns numpy arrays, np.stack(audio) first
    # would be faster than torch.tensor on a list — confirm its return type.
    audio = torch.tensor(audio).float().to(device)

    # Peak-normalize each sample along the time axis.
    # clamp_min guards against division by zero (NaN/Inf) for silent output.
    peaks = torch.max(torch.abs(audio), dim=1, keepdim=True)[0]
    audio = audio / peaks.clamp_min(1e-8)

    # Embed audio into the same space as the text embedding.
    za = embed_audio(audio)

    # Fitness = cosine similarity of each audio embedding to the prompt embedding.
    similarity = torch.nn.functional.cosine_similarity(za, zt)
    for b in range(BATCH_SIZE):
        records.append({
            "generation": generation,
            "similarity": similarity[b].item(),
            "p": p[b].detach().cpu().numpy(),
        })

    # Softmax turns similarities into a sampling distribution;
    # TEMPERATURE controls selection pressure (lower => greedier).
    fitness = torch.softmax(similarity / TEMPERATURE, dim=0)

    # Fitness-proportional selection of two parent sets (with replacement).
    p1 = p[torch.multinomial(fitness, BATCH_SIZE, replacement=True)]
    p2 = p[torch.multinomial(fitness, BATCH_SIZE, replacement=True)]

    # Recombine parents, then mutate, to form the next generation.
    p = crossover(p1, p2)
    p = mutate(p)

    generation += 1

    # Plot every generation; raise the modulus to report less often.
    if generation % 1 == 0:
        # Clear previous plots/audio widget.
        clear_output(wait=True)
        # Sorted fitness distribution of the current generation.
        plt.plot(torch.sort(fitness).values.detach().cpu().numpy())
        plt.show()

        # Similarity over generations for all recorded individuals.
        sns.scatterplot(data=pd.DataFrame(records), x="generation", y="similarity", alpha=0.5)
        plt.show()
        # Play all samples concatenated, best match first.
        play(audio[torch.argsort(-similarity)].flatten().detach().cpu().numpy())
233 |
234 |
235 | # %%
236 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | Generate synthesizer sounds from text prompts with a simple evolutionary algorithm.
11 |
12 | Github link
13 |
14 |
15 |
16 |
17 |
18 |
19 | Start with randomly initialized synthesizer sounds. Each iteration, the current synthesizer sounds are evaluated
20 | on how well they match the text prompt. The best sounds are then combined and mutated to generate new sounds for
21 | the next iteration. 200 generations with 50 samples take about 20 s on a 3090 (not tested on CPU).
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
79 |
80 |
81 |
82 |
83 |