├── Klatt.ParWave.pas
├── Klatt.dpr
└── README.md
/Klatt.ParWave.pas:
--------------------------------------------------------------------------------
1 | unit Klatt.ParWave;
2 |
3 | {
4 | Description : Klatt synthesizer
5 | Author : Wouter van Nifterick
6 | }
7 |
8 | interface
9 |
10 | uses System.SysUtils, Math, System.Generics.Collections;
11 |
12 |
const
  /// Upper bound on the supported output sample rate, in Hz.
  cMaxSampleRateHz = 20000;

  /// Table of 100 glottal waveform samples.
  /// NOTE(review): presumably one recorded glottal pulse intended for the
  /// "Sampled" voicing source; nothing in this unit reads it yet - confirm.
  natural_samples: array[0..99] of Integer = (
    -310, -400, 530, 356, 224, 89, 23, -10, -58, -16,
    461, 599, 536, 701, 770, 605, 497, 461, 560, 404,
    110, 224, 131, 104, -97, 155, 278, -154, -1165, -598,
    737, 125, -592, 41, 11, -247, -10, 65, 92, 80,
    -304, 71, 167, -1, 122, 233, 161, -43, 278, 479,
    485, 407, 266, 650, 134, 80, 236, 68, 260, 269,
    179, 53, 140, 275, 293, 296, 104, 257, 152, 311,
    182, 263, 245, 125, 314, 140, 44, 203, 230, -235,
    -286, 23, 107, 92, -91, 38, 464, 443, 176, 98,
    -784, -2449, -1891, -1045, -1600, -1462, -1384, -1261, -949, -730
  );
26 |
type
  /// Glottal excitation model.
  TVoicingSource = (Impulsive = 1, Natural = 2, Sampled = 3);

  /// Vocal tract topology: cascade branch plus parallel branch, or parallel only.
  TSynthesisModel = (CascadeParallel = 1, AllParallel = 2);

  /// Second-order digital resonator.
  ///   a, b, c : difference-equation coefficients
  ///   p[1..2] : the filter's two-sample memory (most recent value in p[1])
  TResonator = record
    a, b, c: Double;
    p: array[1..2] of Double;
    /// Recursive form: the memory holds previous outputs.
    function Resonate(input: Single): Single;
    /// Non-recursive form: the memory holds previous inputs.
    function AntiResonate(aInput: Single): Single;
  end;

  /// Selects which internal signal RenderParWave writes to its output buffer.
  /// OutputNone leaves the normal synthesized mix in place.
  TOutputChannel = (
    OutputNone = 0,
    OutputVoice,
    OutputAspiration,
    OutputFrics,
    OutputGlotout,
    OutputPar_glotout,
    OutputOutbypas,
    OutputSourc
  );
56 |
type
  /// One frame of Klatt control parameters: 40 consecutive Integer fields.
  /// TKlattSynth.LoadFromFile fills a frame by stepping an ^Integer through
  /// the record, so the order, number and type of these fields must not change.
  TKlattFrame = record
  public
    F0Hz10             : Integer; // Voicing fundamental frequency in tenths of Hz
    AVdb               : Integer; // Amp of voicing in dB, 0 to 70
    F1Hz               : Integer; // First formant freq in Hz, 200 to 1300
    B1Hz               : Integer; // First formant bw in Hz, 40 to 1000
    F2Hz               : Integer; // Second formant freq in Hz, 550 to 3000
    B2hz               : Integer; // Second formant bw in Hz, 40 to 1000
    F3hz               : Integer; // Third formant freq in Hz, 1200 to 4999
    B3hz               : Integer; // Third formant bw in Hz, 40 to 1000
    F4hz               : Integer; // Fourth formant freq in Hz, 1200 to 4999
    B4hz               : Integer; // Fourth formant bw in Hz, 40 to 1000
    F5hz               : Integer; // Fifth formant freq in Hz, 1200 to 4999
    B5hz               : Integer; // Fifth formant bw in Hz, 40 to 1000
    F6hz               : Integer; // Sixth formant freq in Hz, 1200 to 4999
    B6hz               : Integer; // Sixth formant bw in Hz, 40 to 2000
    NasalZeroFrequency : Integer; // Nasal zero freq in Hz, 248 to 528
    BNZhz              : Integer; // Nasal zero bw in Hz, 40 to 1000
    FNPhz              : Integer; // Nasal pole freq in Hz, 248 to 528
    BNPhz              : Integer; // Nasal pole bw in Hz, 40 to 1000
    ASP                : Integer; // Amp of aspiration in dB, 0 to 70
    Kopen              : Integer; // Number of samples in open period, 10 to 65
    Aturb              : Integer; // Breathiness in voicing, 0 to 80
    TLTdb              : Integer; // Voicing spectral tilt in dB, 0 to 24
    AF                 : Integer; // Amp of frication in dB, 0 to 80
    Kskew              : Integer; // Skewness of alternate periods, 0 to 40 in sample#/2
    A1dB               : Integer; // Amp of parallel 1st formant in dB, 0 to 80
    B1phz              : Integer; // Parallel 1st formant bw in Hz, 40 to 1000
    A2dB               : Integer; // Amp of F2 frication in dB, 0 to 80
    B2phz              : Integer; // Parallel 2nd formant bw in Hz, 40 to 1000
    A3dB               : Integer; // Amp of F3 frication in dB, 0 to 80
    B3phz              : Integer; // Parallel 3rd formant bw in Hz, 40 to 1000
    A4dB               : Integer; // Amp of F4 frication in dB, 0 to 80
    B4phz              : Integer; // Parallel 4th formant bw in Hz, 40 to 1000
    A5dB               : Integer; // Amp of F5 frication in dB, 0 to 80
    B5phz              : Integer; // Parallel 5th formant bw in Hz, 40 to 1000
    A6dB               : Integer; // Amp of F6 (same as rp[6].a), 0 to 80
    B6phz              : Integer; // Parallel 6th formant bw in Hz, 40 to 2000
    ANPdB              : Integer; // Amp of parallel nasal pole in dB, 0 to 80
    ByPassPathAmp      : Integer; // Amp of bypass frication in dB, 0 to 80
    AVpdB              : Integer; // Amp of voicing, parallel branch, in dB, 0 to 70
    Gain0dB            : Integer; // Overall gain, 60 dB is unity, 0 to 60
  end;
141 |
/// Klatt synthesizer: configuration, gains derived per frame, and the source
/// and resonator state that is carried from one output sample to the next.
/// NOTE(review): "TArray" is written here without a type argument exactly as
/// in the existing declarations - confirm the intended element type.
TKlattSynth = class
public
  // ---- configuration -----------------------------------------------------
  SynthesisModel : TSynthesisModel; // Cascade-parallel or all-parallel
  OutputChannel  : TOutputChannel;  // Output waveform selector
  SampleRateHz   : Integer;         // Number of output samples per second
  FLPhz          : Integer;         // Frequency of glottal downsample low-pass filter
  BLPhz          : Integer;         // Bandwidth of glottal downsample low-pass filter
  nfcascade      : Integer;         // Number of formants in cascade vocal tract
  VoicingSource  : TVoicingSource;  // Type of glottal source
  f0_flutter     : Integer;         // Percentage of f0 flutter, 0-100
  Quiet          : Boolean;         // True suppresses warning messages
  SamplesPerFrame: Integer;         // Number of output samples per frame

  // ---- pitch period bookkeeping (counted at 4x the output rate) ----------
  nper           : Integer;         // Position inside the current pitch period
  CurrentSample  : Integer;         // Running count of rendered output samples
  T0             : Integer;         // Fundamental period in output samples times 4
  nopen          : Integer;         // Number of samples in open phase of period
  nmod           : Integer;         // Position in period to begin noise amplitude modulation
  nrand          : Integer;         // Latest raw value from the random number generator
  pulse_shape_a  : Double;          // Shapes the glottal pulse while the glottis is open
  pulse_shape_b  : Double;          // Shapes the glottal pulse while the glottis is open
  minus_pi_t     : Double;          // -PI / SampleRateHz
  two_pi_t       : Double;          // 2 * PI / SampleRateHz
  onemd          : Single;          // 1 - Decay (spectral tilt filter)
  Decay          : Single;          // Spectral tilt filter feedback, from TLTdb

  // ---- linear gains derived from the frame's dB parameters ---------------
  amp_bypas      : Single;          // From ByPassPathAmp
  amp_voice      : Single;          // From AVdb
  amp_par_voice  : Single;          // From AVpdB
  amp_aspir      : Single;          // From ASP
  amp_frica      : Single;          // From AF
  amp_breth      : Single;          // From Aturb
  amp_gain0      : Single;          // From Gain0dB

  NaturalSamples : array of Integer; // Glottal samples for a sampled source
  original_f0    : Integer;          // f0 in Hz before flutter is applied

  // ---- resonators --------------------------------------------------------
  rnpp           : TResonator;                  // Parallel nasal pole
  rp             : array [1 .. 6] of TResonator; // Parallel formants 1..6
  rc             : array [1 .. 8] of TResonator; // Cascade formants 1..8
  rnpc           : TResonator;                  // Cascade nasal pole
  rnz            : TResonator;                  // Cascade nasal zero (antiresonator)
  rgl            : TResonator;                  // Glottal filter of the impulsive source
  rlp            : TResonator;                  // Downsampling low-pass
  rout           : TResonator;                  // Output low-pass

  nlast          : Single;          // Previous output of the noise low-pass
  TimeCount      : Integer;         // Frame counter driving the flutter sinusoids

  Frames         : array of TKlattFrame; // Frames appended by LoadFromFile

  // ---- state shared between the sources and the sample loop --------------
  vwave          : Single;          // Excitation fed to the impulsive source filter
  vwave2         : Single;          // Running sum inside the natural source
  noise, voice, vlast, glotlast, sourc: Single;
  Skew           : Integer;         // Signed skew added to T0; sign flips every period

  constructor Create;
  procedure InitParWave;

  function GenerateNoise(aNoise: Single): Single;
  function ImpulsiveSource: Single;
  function NaturalSource: Single;

  procedure SetABC(
    aResFrequencyHz: Integer;   { Frequency of resonator in Hz }
    aResBandWidthHz: Integer;   { Bandwidth of resonator in Hz }
    var aResonator: TResonator);

  procedure SetZeroABC(
    aResFreqHz      : Integer;  { Frequency of resonator in Hz }
    aResBandWidthHz : Integer;  { Bandwidth of resonator in Hz }
    var aResonator  : TResonator);

  procedure RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
  function Render: TArray;

  procedure InitFrame(var aFrame: TKlattFrame);
  procedure Flutter(var aFrame: TKlattFrame);
  procedure pitch_synch_par_reset(var aFrame: TKlattFrame);

  procedure LoadFromFile(const aFileName: String);
end;
224 |
225 | { Decibel <-> linear amplitude conversion helpers }
226 |
227 | function DBtoLIN(db: Integer):Single;
228 | function LINtoDB(n: double):double;
229 |
230 | implementation
231 |
///
/// Noise source. Draws a random integer in roughly -4096 .. +4096 and keeps
/// it in nrand, then tilts the spectrum down with a soft low-pass filter
/// whose pole lies near the origin of the z-plane:
///   output = input + (0.75 * lastoutput)
/// The previous output is kept in nlast. The value passed in aNoise is not
/// read; the parameter only exists for the caller's "x := f(x)" call shape.
///
function TKlattSynth.GenerateNoise(aNoise: Single): Single;
begin
  // Raw, unfiltered value; RenderParWave also uses it directly for breathiness.
  nrand := Round(((Random * 2) - 1) * 1024 * 4);

  Result := nrand + (0.75 * nlast);
  nlast := Result;
end;
248 |
/// Creates a synthesizer with its defaults: all-parallel model, natural
/// voicing source, 11025 Hz output, no cascade formants, normal output
/// channel, no f0 flutter, messages enabled.
constructor TKlattSynth.Create;
begin
  // Output format
  SampleRateHz := 11025;
  OutputChannel := TOutputChannel.OutputNone;

  // Vocal tract and source configuration
  SynthesisModel := TSynthesisModel.AllParallel;
  nfcascade := 0;
  VoicingSource := TVoicingSource.Natural;
  f0_flutter := 0;

  // Runtime state
  Skew := 0;
  Quiet := False;
end;
262 |
/// Converts a linear amplitude to decibels (20 * log10(n)).
/// Inputs at or below 1E-12 (including zero and negatives) yield -200.
function LINtoDB(n: double): double;
begin
  Result := -200;
  if n > 1E-12 then
    Result := Log10(n) * 20;
end;
270 |
/// Converts an amplitude in dB (0 .. 87) to a linear gain by table lookup.
/// The table value is scaled by 0.001, so 87 dB maps to 32.767 and the
/// first 13 entries map to 0. Any db outside 0 .. 87 returns 0.
/// The table doubles every 6 dB (1024, 2048, 4096 ... at 57, 63, 69 ...).
function DBtoLIN(db: Integer): Single;
const
  AmpTable: array [0 .. 87] of Single = (
    0,0,0,0,0,0,0,0,0,0,0,0,0,
    6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22,
    25, 28, 32, 35, 40, 45, 51, 57, 64, 71, 80,
    90, 101, 114, 128, 142, 159, 179, 202, 227, 256,
    284, 318, 359, 405, 455, 512, 568, 638, 719, 811,
    911, 1024, 1137, 1276, 1438, 1622, 1823, 2048, 2273,
    2552, 2875, 3244, 3645, 4096, 4547, 5104, 5751, 6488,
    7291, 8192, 9093, 10207, 11502, 12976, 14582, 16384,
    18350, 20644, 23429, 26214, 29491, 32767
  );
begin
  if (db >= Low(AmpTable)) and (db <= High(AmpTable)) then
    Result := AmpTable[db] * 0.001
  else
    Result := 0;
end;
307 |
308 |
///
/// Adds F0 flutter to the frame, following
///
/// "Analysis, synthesis and perception of voice quality variations among
/// female and male talkers" D.H. Klatt and L.C. Klatt JASA 87(2) February 1990.
///
/// The flutter is a quasi-random offset built from three sine waves
/// (12.7, 7.1 and 4.7 cycles per TimeCount unit). It is scaled by f0_flutter
/// and by original_f0, rounded, and added to aFrame.F0Hz10. TimeCount
/// advances by one on every call.
///
procedure TKlattSynth.Flutter(var aFrame: TKlattFrame);
var
  Depth, PitchScale: Double;
  WaveA, WaveB, WaveC: Double;
  DeltaF0: Double;
begin
  Depth := f0_flutter / 50;
  PitchScale := original_f0 / 100;

  WaveA := sin(2 * PI * 12.7 * TimeCount);
  WaveB := sin(2 * PI * 7.1 * TimeCount);
  WaveC := sin(2 * PI * 4.7 * TimeCount);

  // The factor 10 converts from Hz to the tenths of Hz used by F0Hz10.
  DeltaF0 := Depth * PitchScale * (WaveA + WaveB + WaveC) * 10;

  Inc(aFrame.F0Hz10, Round(DeltaF0));
  Inc(TimeCount);
end;
330 |
331 |
///
/// Converts a formant frequency and bandwidth into the difference-equation
/// constants of a resonator:
///   r = exp(-pi * bw * t)
///   c = -r**2
///   b = r * 2 * cos(2 * pi * f * t)
///   a = 1 - b - c
/// where t is the sample period (minus_pi_t and two_pi_t already include it).
/// Only a, b and c of aResonator are written; its memory p is left alone.
///
procedure TKlattSynth.SetABC(
  aResFrequencyHz: Integer;   { Frequency of resonator in Hz }
  aResBandWidthHz: Integer;   { Bandwidth of resonator in Hz }
  var aResonator: TResonator);
var
  Radius: Single;
  Angle : Double;
begin
  // Pole radius from the bandwidth
  Angle := minus_pi_t * aResBandWidthHz;
  Radius := exp(Angle);

  // Pole angle from the centre frequency
  Angle := two_pi_t * aResFrequencyHz;

  aResonator.b := Radius * cos(Angle) * 2;
  aResonator.c := -(Radius * Radius);
  aResonator.a := 1 - aResonator.b - aResonator.c;
end;
358 |
359 |
/// Converts a formant frequency and bandwidth into anti-resonator
/// difference-equation constants.
/// The frequency is negated first (and forced to -1 if the negated value is
/// not negative). Ordinary resonator coefficients are then computed and
/// converted with a' = 1/a, b' = -b/a, c' = -c/a.
procedure TKlattSynth.SetZeroABC(
  aResFreqHz      : Integer;  { Frequency of resonator in Hz }
  aResBandWidthHz : Integer;  { Bandwidth of resonator in Hz }
  var aResonator  : TResonator);
var
  ZeroFreqHz: Integer;
begin
  ZeroFreqHz := -aResFreqHz;
  if ZeroFreqHz >= 0 then
    ZeroFreqHz := -1;

  // Ordinary resonator coefficients for the (negative) frequency
  SetABC(ZeroFreqHz, aResBandWidthHz, aResonator);

  // Convert to antiresonator coefficients; a must be inverted first because
  // the new value of a is what scales b and c.
  aResonator.a := 1 / aResonator.a;
  aResonator.c := aResonator.c * -aResonator.a;
  aResonator.b := aResonator.b * -aResonator.a;
end;
393 |
/// Prepares the synthesizer for rendering at the current SampleRateHz:
/// derives the sample-period constants, tunes the glottal downsampling
/// low-pass (rlp), resets the pitch-period counters and clears the memory
/// of every resonator.
procedure TKlattSynth.InitParWave;

  // Zeroes the two-sample memory of one resonator; coefficients are kept.
  procedure ClearMemory(var aRes: TResonator);
  begin
    aRes.p[1] := 0;
    aRes.p[2] := 0;
  end;

var
  k: Integer;
begin
  // Sample-period constants used by SetABC / SetZeroABC
  minus_pi_t := -PI / SampleRateHz;
  two_pi_t := -2 * minus_pi_t;

  // Low-pass for the 4x oversampled glottal source, scaled with the sample rate
  FLPhz := Round((950 * SampleRateHz) / 10000);
  BLPhz := Round((630 * SampleRateHz) / 10000);
  SetABC(FLPhz, BLPhz, rlp);

  nper := 0;
  T0 := 0;
  nopen := 0;
  nmod := 0;

  ClearMemory(rnpp);
  for k := 1 to 6 do
    ClearMemory(rp[k]);
  for k := 1 to 8 do
    ClearMemory(rc[k]);
  ClearMemory(rnpc);
  ClearMemory(rnz);
  ClearMemory(rgl);
  ClearMemory(rlp);
  ClearMemory(rout);
end;
423 |
/// Reads whitespace-separated integer parameters from a text file and appends
/// one TKlattFrame to Frames for every 40 values.
///
/// The values are stored by stepping an ^Integer through the frame record,
/// so the n-th number of a frame lands in the n-th field of TKlattFrame.
/// Existing entries in Frames are kept; new frames are appended after them.
///
/// aFileName: path of the parameter file.
/// Raises an I/O exception (EInOutError) if the file cannot be opened or a
/// token is not a valid integer; the file is closed in either case.
procedure TKlattSynth.LoadFromFile(const aFileName: String);
const
  /// Number of control parameters
  cNumberOfParameters = 40;
var
  InFile        : TextFile;
  FrameParamPtr : ^Integer;
  ParIndex      : Integer;
  Value         : Integer;
begin
  // The pointer walk below is only valid while the record is exactly
  // cNumberOfParameters tightly packed Integers.
  Assert(SizeOf(TKlattFrame) = cNumberOfParameters * SizeOf(Integer),
    'TKlattFrame layout does not match cNumberOfParameters');

  AssignFile(InFile, aFileName);
  Reset(InFile);
  try
    // SeekEof skips blanks and line ends before testing for end of file, so
    // a trailing newline no longer produces an extra all-zero frame.
    while not SeekEof(InFile) do
    begin
      SetLength(Frames, Length(Frames) + 1);
      FrameParamPtr := @Frames[High(Frames)];
      for ParIndex := 1 to cNumberOfParameters do
      begin
        Read(InFile, Value);
        FrameParamPtr^ := Value;
        Inc(FrameParamPtr);
      end;
    end;
  finally
    // Release the file handle even when Read raises on malformed input.
    CloseFile(InFile);
  end;
end;
450 |
/// Uses the parameters of aFrame to set up gains and resonator coefficients
/// for the next frame.
///
/// Side effects on aFrame: AVdb is lowered by 7 (not below 0) and Gain0dB is
/// lowered by 3 (a result <= 0 becomes 57). The parallel-branch amplitudes
/// are applied unscaled; they are multiplied straight into each resonator's
/// 'a' coefficient.
procedure TKlattSynth.InitFrame(var aFrame: TKlattFrame);

  // Tunes one parallel-branch resonator and folds its output gain into 'a'.
  procedure SetParallel(aFreqHz, aBandWidthHz, aAmpDb: Integer; var aRes: TResonator);
  var
    LinearGain: Single;
  begin
    LinearGain := DBtoLIN(aAmpDb);
    SetABC(aFreqHz, aBandWidthHz, aRes);
    aRes.a := aRes.a * LinearGain;
  end;

begin
  // F0Hz10 is in tenths of Hz; remember the unfluttered pitch in Hz
  original_f0 := Round(aFrame.F0Hz10 / 10);

  aFrame.AVdb := Max(aFrame.AVdb - 7, 0);

  // Source gains
  amp_aspir := DBtoLIN(aFrame.ASP) * 0.05;
  amp_frica := DBtoLIN(aFrame.AF) * 0.25;
  amp_par_voice := DBtoLIN(aFrame.AVpdB);
  amp_bypas := DBtoLIN(aFrame.ByPassPathAmp) * 0.05;

  // Overall gain
  Dec(aFrame.Gain0dB, 3);
  if aFrame.Gain0dB <= 0 then
    aFrame.Gain0dB := 57;
  amp_gain0 := DBtoLIN(aFrame.Gain0dB);

  // Cascade formants: 1..4 always, 5..8 only when enabled by nfcascade
  SetABC(aFrame.F1Hz, aFrame.B1Hz, rc[1]);
  SetABC(aFrame.F2Hz, aFrame.B2hz, rc[2]);
  SetABC(aFrame.F3hz, aFrame.B3hz, rc[3]);
  SetABC(aFrame.F4hz, aFrame.B4hz, rc[4]);
  if nfcascade >= 5 then SetABC(aFrame.F5hz, aFrame.B5hz, rc[5]);
  if nfcascade >= 6 then SetABC(aFrame.F6hz, aFrame.B6hz, rc[6]);
  if nfcascade >= 7 then SetABC(6500, 500, rc[7]);
  if nfcascade >= 8 then SetABC(7500, 600, rc[8]);

  // Cascade nasal pole and nasal zero
  SetABC(aFrame.FNPhz, aFrame.BNPhz, rnpc);
  SetZeroABC(aFrame.NasalZeroFrequency, aFrame.BNZhz, rnz);

  // Parallel resonators together with their output amplitudes
  SetParallel(aFrame.F1Hz,  aFrame.B1phz, aFrame.A1dB,  rp[1]);
  SetParallel(aFrame.FNPhz, aFrame.BNPhz, aFrame.ANPdB, rnpp);
  SetParallel(aFrame.F2Hz,  aFrame.B2phz, aFrame.A2dB,  rp[2]);
  SetParallel(aFrame.F3hz,  aFrame.B3phz, aFrame.A3dB,  rp[3]);
  SetParallel(aFrame.F4hz,  aFrame.B4phz, aFrame.A4dB,  rp[4]);
  SetParallel(aFrame.F5hz,  aFrame.B5phz, aFrame.A5dB,  rp[5]);
  SetParallel(aFrame.F6hz,  aFrame.B6phz, aFrame.A6dB,  rp[6]);

  // Output low-pass filter: centred at 0 Hz, bandwidth of half the sample rate
  SetABC(0, Round(SampleRateHz / 2), rout);
end;
514 |
///
/// Impulsive voicing source: a low-pass filtered impulse train used as an
/// approximation of a natural excitation waveform. During the first three
/// ticks of a pitch period (nper = 0, 1, 2) the filter rgl is fed the
/// doublet 0, +13000000, -13000000; for the rest of the period it is fed 0.
/// rgl is retuned from nopen in pitch_synch_par_reset.
///
function TKlattSynth.ImpulsiveSource: Single;
const
  doublet: array [0 .. 2] of Single = (0, 13000000, -13000000);
begin
  vwave := 0;
  if nper < 3 then
    vwave := doublet[nper];

  Result := rgl.Resonate(vwave);
end;
532 |
///
/// Natural voicing source. While the glottis is open (nper < nopen) the
/// differentiated glottal flow is built up incrementally: pulse_shape_a is
/// lowered by pulse_shape_b, added to the running sum vwave2, and the sum
/// scaled by 0.028 is returned. In the closed phase the running sum is
/// cleared and 0 is returned. pulse_shape_a and pulse_shape_b are reset
/// pitch-synchronously in pitch_synch_par_reset.
///
function TKlattSynth.NaturalSource: Single;
begin
  // Closed phase: silence, and restart the accumulator for the next pulse
  if nper >= nopen then
  begin
    vwave2 := 0;
    Exit(0);
  end;

  pulse_shape_a := pulse_shape_a - pulse_shape_b;
  vwave2 := vwave2 + pulse_shape_a;
  Result := vwave2 * 0.028;
end;
554 |
555 |
///
/// function PITCH_SYNC_PAR_RESET
///
/// Reset selected parameters pitch-synchronously.
///
/// Constant B0 controls the shape of the glottal pulse as a function
/// of the desired duration of the open phase N0
/// (note that N0 is specified in terms of 40,000 samples/sec of speech).
///
/// Assume the voicing waveform V(t) has the form: k1 t**2 - k2 t**3
///
/// If the radiation characteristic, a temporal derivative,
/// is folded in, and we go from continuous time to discrete
/// integers n: dV/dt = vwave[n]
/// = sum over i=1,2,...,n of { a - (i * b) }
/// = a n - b/2 n**2
///
/// where the constants a and b control the detailed shape
/// and amplitude of the voicing waveform over the open
/// portion of the voicing cycle "nopen".
///
/// Let integral of dV/dt have no net dc flow --> a = (b * nopen) / 3
///
/// Let maximum of dUg(n)/dn be constant --> b = gain / (nopen * nopen)
/// meaning as nopen gets bigger, V has bigger peak proportional to n
///
/// Thus, to generate the table below for 40 <= nopen <= 263:
///
/// B0[nopen - 40] = 1920000 / (nopen * nopen)
///
/// Side effects: updates T0, nmod, nopen, the source gains, the pulse shape
/// constants, rgl, Skew, Decay and onemd; may lower aFrame.Kskew so that it
/// does not exceed the closed phase. The range warnings of the original are
/// not emitted (Quiet is not consulted here).
///
procedure TKlattSynth.pitch_synch_par_reset(var aFrame: TKlattFrame);
const
  cMinOpen = 40; // Smallest open phase, and the nopen value of B0[0]
  B0: array [0 .. 223] of uint16 = (1200, 1142, 1088, 1038, 991, 948, 907, 869, 833, 799, 768, 738, 710, 683, 658, 634, 612, 590, 570, 551, 533, 515, 499, 483, 468, 454, 440, 427, 415, 403, 391, 380, 370, 360, 350, 341, 332, 323, 315, 307, 300, 292, 285,
    278, 272, 265, 259, 253, 247, 242, 237, 231, 226, 221, 217, 212, 208, 204, 199, 195, 192, 188, 184, 180, 177, 174, 170, 167, 164, 161, 158, 155, 153, 150, 147, 145, 142, 140, 137, 135, 133, 131, 128, 126, 124, 122, 120, 119, 117, 115, 113, 111, 110,
    108, 106, 105, 103, 102, 100, 99, 97, 96, 95, 93, 92, 91, 90, 88, 87, 86, 85, 84, 83, 82, 80, 79, 78, 77, 76, 75, 75, 74, 73, 72, 71, 70, 69, 68, 68, 67, 66, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 53, 53, 52, 52,
    51, 51, 50, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 41, 41, 40, 40, 39, 39, 38, 38, 38, 38, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29,
    29, 28, 28, 28, 28, 27, 27);
var
  temp : Integer;
  temp1: Single;
begin
  if (aFrame.F0Hz10 > 0) then
  begin
    // T0 is 4* the number of samples in one pitch period
    T0 := Round((40 * SampleRateHz) / aFrame.F0Hz10);

    amp_voice := DBtoLIN(aFrame.AVdb);

    // Duration of period before amplitude modulation
    nmod := T0;
    if (aFrame.AVdb > 0) then
      nmod := nmod shr 1;

    // Breathiness of voicing waveform
    amp_breth := DBtoLIN(aFrame.Aturb) * 0.1;

    // Set open phase of glottal period where 40 <= open phase <= 263
    nopen := 4 * aFrame.Kopen;

    if ((VoicingSource = Impulsive) and (nopen > 263)) then
      nopen := 263;

    // Glottal open period cannot exceed T0: truncate
    if (nopen >= (T0 - 1)) then
      nopen := T0 - 2;

    // Minimum glottal open period is 10 samples (F0 max = 1000 Hz)
    if (nopen < cMinOpen) then
      nopen := cMinOpen;

    // Reset a & b, which determine shape of "natural" glottal waveform.
    // nopen is only capped at 263 for the impulsive source, so a large Kopen
    // with another source used to index past the end of B0. The lookup is
    // clamped to the last table entry; nopen itself is left untouched.
    pulse_shape_b := B0[Min(nopen, cMinOpen + High(B0)) - cMinOpen];
    pulse_shape_a := (pulse_shape_b * nopen) * 0.333;

    // Reset width of "impulsive" glottal pulse
    temp := Round(SampleRateHz / nopen);

    SetABC(0, temp, rgl);

    // Make gain at F1 about constant
    temp1 := nopen * 0.00833;
    rgl.a := rgl.a * temp1 * temp1;

    // Truncate skewness so as not to exceed duration of closed phase of glottal period.
    temp := T0 - nopen;
    if (aFrame.Kskew > temp) then
      aFrame.Kskew := temp;

    // Keep the sign chosen at the end of the previous period
    if (Skew >= 0) then
      Skew := aFrame.Kskew
    else
      Skew := -aFrame.Kskew;

    // Add skewness to closed portion of voicing period, then flip the sign
    // so that alternate periods are lengthened and shortened.
    T0 := T0 + Skew;
    Skew := -Skew;
  end
  else
  begin
    T0 := 4; // Default for f0 undefined
    amp_voice := 0;
    nmod := T0;
    amp_breth := 0;
    pulse_shape_a := 0;
    pulse_shape_b := 0;
  end;

  // Reset these pars pitch synchronously or at update rate if f0=0
  if ((T0 <> 4) or (CurrentSample = 0)) then
  begin
    // Set one-pole low-pass filter that tilts glottal source
    Decay := (0.033 * aFrame.TLTdb);

    if (Decay > 0) then
      onemd := 1 - Decay
    else
      onemd := 1;
  end;
end;
695 |
/// Renders every frame in Frames and returns the samples back to back
/// (SamplesPerFrame samples per frame).
///
/// The previous body never assigned Result and had an empty loop.
///
/// Preconditions: InitParWave has been called and SamplesPerFrame is set.
/// Each frame is rendered from a local copy, because RenderParWave modifies
/// the frame it is given; the entries of Frames stay untouched.
///
/// Returns nil when there are no frames or SamplesPerFrame <= 0.
function TKlattSynth.Render: TArray;
var
  Frame      : TKlattFrame;
  Buffer     : TArray;
  FrameIndex : Integer;
  SampleIndex: Integer;
  WritePos   : Integer;
begin
  Result := nil;
  if (SamplesPerFrame <= 0) or (Length(Frames) = 0) then
    Exit;

  // Scratch buffer for one frame; never smaller than the frame itself
  SetLength(Buffer, Max(cMaxSampleRateHz, SamplesPerFrame));
  SetLength(Result, Length(Frames) * SamplesPerFrame);

  WritePos := 0;
  for FrameIndex := 0 to High(Frames) do
  begin
    Frame := Frames[FrameIndex];
    RenderParWave(Frame, Buffer);

    for SampleIndex := 0 to SamplesPerFrame - 1 do
    begin
      Result[WritePos] := Buffer[SampleIndex];
      Inc(WritePos);
    end;
  end;
end;
703 |
///
/// Converts the synthesis parameters of one frame to a waveform.
///
/// aFrame : parameters of the frame; modified by InitFrame, Flutter and
///          pitch_synch_par_reset.
/// aOutput: receives SamplesPerFrame samples in elements 0 .. SamplesPerFrame-1
///          and must be at least that long.
///
/// Source and filter state (noise, voice, resonator memories, nper, ...)
/// lives in the object and carries over from frame to frame.
///
procedure TKlattSynth.RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
var
  i                         : Integer;
  temp, outbypas            : Single;
  n4                        : Integer;
  frics, glotout, aspiration: Single;
  casc_next_in, par_glotout : Single;
begin
  InitFrame(aFrame); // get parameters for next frame of speech
  Flutter(aFrame);   // add f0 flutter

  // MAIN LOOP, for each output sample of current frame:
  for i := 0 to SamplesPerFrame - 1 do
  begin
    Inc(CurrentSample);

    // Get low-passed random number for aspiration and frication noise
    noise := GenerateNoise(noise);

    // Amplitude modulate noise (reduce noise amplitude during
    // second half of glottal period) if voicing simultaneously present.
    if (nper > nmod) then
      noise := noise * 0.5;

    // Compute frication noise
    frics := amp_frica * noise;

    // Compute voicing waveform. Run glottal source simulation at 4
    // times normal sample rate to minimize quantization noise in
    // period of female voice.
    for n4 := 0 to 3 do
    begin
      // The Sampled source is not implemented: 'voice' keeps its last value.
      case (VoicingSource) of
        Impulsive : voice := ImpulsiveSource;
        Natural   : voice := NaturalSource;
      end;

      // Reset period when counter 'nper' reaches T0
      if (nper >= T0) then
      begin
        nper := 0;
        pitch_synch_par_reset(aFrame);
      end;

      // Low-pass filter voicing waveform before downsampling from 4*samrate
      // to samrate samples/sec. Resonator f=.09*samrate, bw=.06*samrate
      voice := rlp.Resonate(voice);

      // Increment counter that keeps track of 4*samrate samples per sec
      Inc(nper);
    end;

    // Tilt spectrum of voicing source down by soft low-pass filtering,
    // amount of tilt determined by TLTdb
    voice := (voice * onemd) + (vlast * Decay);
    vlast := voice;

    // Add breathiness during glottal open phase. Amount of breathiness
    // determined by parameter Aturb. Use nrand rather than noise because
    // noise is low-passed.
    if (nper < nopen) then
      voice := voice + (amp_breth * nrand);

    // Set amplitude of voicing
    glotout := amp_voice * voice;
    par_glotout := amp_par_voice * voice;

    // Compute aspiration amplitude and add to voicing source
    aspiration := amp_aspir * noise;
    glotout := glotout + aspiration;

    par_glotout := par_glotout + aspiration;

    // Cascade vocal tract, excited by laryngeal sources.
    // Nasal antiresonator, then formants FNP, F5, F4, F3, F2, F1
    if (SynthesisModel <> TSynthesisModel.AllParallel) then
    begin
      casc_next_in := rnz.AntiResonate(glotout);
      casc_next_in := rnpc.Resonate(casc_next_in);
      // Do not use unless sample rate >= 16000
      if (nfcascade >= 8) then casc_next_in := rc[8].Resonate(casc_next_in);
      // Do not use unless sample rate >= 16000
      if (nfcascade >= 7) then casc_next_in := rc[7].Resonate(casc_next_in);
      // Do not use unless long vocal tract or sample rate increased
      if (nfcascade >= 6) then casc_next_in := rc[6].Resonate(casc_next_in);
      if (nfcascade >= 5) then casc_next_in := rc[5].Resonate(casc_next_in);
      if (nfcascade >= 4) then casc_next_in := rc[4].Resonate(casc_next_in);
      if (nfcascade >= 3) then casc_next_in := rc[3].Resonate(casc_next_in);
      if (nfcascade >= 2) then casc_next_in := rc[2].Resonate(casc_next_in);
      if (nfcascade >= 1) then casc_next_in := rc[1].Resonate(casc_next_in);

      // Always store the end of the chain. Previously aOutput[i] was only
      // written when nfcascade >= 1, so with fewer formants whatever the
      // caller's buffer still held was mixed into the output.
      aOutput[i] := casc_next_in;
    end
    else
    begin
      // we are not using the cascade tract, set out to zero
      aOutput[i] := 0;
    end;

    // Excite parallel F1 and FNP by voicing waveform
    sourc := par_glotout; // Source is voicing plus aspiration

    // Standard parallel vocal tract. Formants F6, F5, F4, F3, F2,
    // outputs added with alternating sign. Sound source for the other
    // parallel resonators is frication plus first difference of
    // voicing waveform.
    aOutput[i] := aOutput[i] + rp[1].Resonate(sourc);
    aOutput[i] := aOutput[i] + rnpp.Resonate(sourc);

    sourc := frics + par_glotout - glotlast;
    glotlast := par_glotout;

    aOutput[i] := rp[6].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[5].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[4].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[3].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[2].Resonate(sourc) - aOutput[i];

    outbypas := amp_bypas * sourc;
    aOutput[i] := outbypas - aOutput[i];
    aOutput[i] := aOutput[i] / 10;

    // NOTE(review): the output low-pass, overall gain and clipping below run
    // only when a diagnostic channel is selected; with OutputNone the sample
    // leaves as the scaled mix computed above. Confirm that this is intended.
    if (OutputChannel <> OutputNone) then
    begin
      case OutputChannel of
        OutputNone       : ;
        OutputVoice      : aOutput[i] := voice;
        OutputAspiration : aOutput[i] := aspiration;
        OutputFrics      : aOutput[i] := frics;
        OutputGlotout    : aOutput[i] := glotout;
        OutputPar_glotout: aOutput[i] := par_glotout;
        OutputOutbypas   : aOutput[i] := outbypas;
        OutputSourc      : aOutput[i] := sourc;
      end;

      aOutput[i] := rout.Resonate(aOutput[i]);

      temp := aOutput[i] * amp_gain0/1000;

      // Clip to the 16 bit signed integer range
      if (temp < -32768) then
        temp := -32768;

      if (temp > 32767) then
        temp := 32767;

      aOutput[i] := temp;
    end;
  end;
end;
865 |
866 | { TResonator }
867 |
/// Anti-resonator (zero pair): the output is a weighted sum of the current
/// input and the two previous inputs,
///   y[n] = a * x[n] + b * x[n-1] + c * x[n-2]
/// The memory p therefore stores inputs, not outputs.
function TResonator.AntiResonate(aInput: Single): Single;
begin
  Result := a * aInput +
            b * p[1] +
            c * p[2];

  // Shift the input history by one sample
  p[2] := p[1];
  p[1] := aInput;
end;
880 |
/// Generic resonator (pole pair): the output is a weighted sum of the
/// current input and the two previous outputs,
///   y[n] = a * x[n] + b * y[n-1] + c * y[n-2]
/// The previous outputs are kept in this record's p array, after rounding
/// to Single.
function TResonator.Resonate(input: Single): Single;
begin
  Result := (a * input + b * p[1] + c * p[2]);

  // Shift the output history by one sample
  p[2] := p[1];
  p[1] := Result;
end;
892 |
893 | end.
894 |
--------------------------------------------------------------------------------
/Klatt.dpr:
--------------------------------------------------------------------------------
1 | program Klatt;
2 |
3 | {
4 | Description : Klatt synthesizer
5 | Author : Wouter van Nifterick
6 | }
7 |
8 | {$APPTYPE CONSOLE}
9 |
10 | {$R *.res}
11 |
12 | uses
13 | SysUtils,
14 | WvN.Util.CmdLine,
15 | Klatt.ParWave in 'Klatt.ParWave.pas';
16 |
/// Prints the command-line help to standard output.
/// The texts for -v and -r previously contained ":=" where "=" was meant
/// ("1:=impulse train", "cType := 1"); those two lines are corrected here.
procedure Usage;
begin
  Writeln('Options...');
  Writeln('-h Displays this message');
  Writeln('-i sets input filename');
  Writeln('-o sets output filename');
  Writeln(' If output filename not specified, stdout is used');
  Writeln('-q quiet - print no messages');
  Writeln('-t select output waveform');
  Writeln('-c select cascade-parallel configuration');
  Writeln(' Parallel configuration is default');
  Writeln('-n Number of formants in cascade branch.');
  Writeln(' Default is 5');
  Writeln('-s set sample rate');
  Writeln('-f set number of milliseconds per frame, default 10');
  Writeln('-v Specifies voicing source.');
  Writeln(' 1=impulse train, 2=natural simulation, 3=sampled natural');
  Writeln(' Default is a simulation of natural voicing');
  Writeln('-V Input file of samples for natural voicing.');
  Writeln('-F percentage of f0 flutter');
  Writeln(' Default is 0');
  Writeln('-r output 16 bit signed integers rather than ASCII');
  Writeln(' integers. Type = 1 gives high byte first, Type = 2 gives');
  Writeln(' low byte first.');
end;
42 |
type
  { Settings that belong to the command-line front end. Main fills them from
    the command-line options after calling Init. }
  TAppSettings = record
    InFileName        : string;   // value of the -i option
    OutFileName       : string;   // value of the -o option
    SampeFileName     : string;   // value of the -V option
    MsPerFrame        : Integer;  // value of the -f option; Init sets it to 10
    DoOutputRawSample : Boolean;  // switched on by the -r option
    OutputByteOrder   : Byte;     // numeric argument of the -r option
    procedure Init;
  end;
53 |
{ Resets every setting to its start-up value. }
procedure TAppSettings.Init;
begin
  // Default() yields a zero-initialised record: the three file names become
  // empty strings, DoOutputRawSample becomes False and OutputByteOrder 0.
  Self := Default(TAppSettings);

  // The frame length is the only setting with a non-zero start-up value.
  MsPerFrame := 10;
end;
63 |
{ Program logic: parses the command line, loads the input frames, renders them
  one frame at a time and emits the samples.

  Output routing:
    -r together with -o : every sample is written to the output file as a
                          16-bit signed value in two bytes. A -r argument of 1
                          puts the high byte first, any other value puts the
                          low byte first.
    -o without -r       : one decimal sample value per line, written to the
                          output file.
    no -o               : one decimal sample value per line on stdout. Quiet is
                          forced on so status messages do not mix with samples.

  Status messages are only printed while KlattSynth.Quiet is False.

  Ends the process with Halt(1) after showing the usage text, and with Halt(2)
  when no input file is given or when -r is used without -o. }
procedure Main;
var
  OutFile          : File of byte;
  OutText          : TextFile;
  KlattSynth       : TKlattSynth;
  // NOTE(review): the element type parameter seems to be missing here
  // (TArray<...>); confirm against the declaration of RenderParWave.
  FrameSamples     : TArray;
  FrameSampleIndex : Integer;
  SampleValue      : Int64;
  SampleWord       : Word;
  HiByte, LoByte   : Byte;
  AppSettings      : TAppSettings;
  FrameIndex       : Integer;
  HasOutFile       : Boolean;
begin
  if (ParamCount = 0) then
  begin
    Usage;
    Halt(1);
  end;

  AppSettings.Init;

  KlattSynth := TKlattSynth.Create;
  try
    SetLength(FrameSamples, cMaxSampleRateHz);

    CommandLine.ProcessKeys( procedure(const key:char; const name,value:string )
      begin
        case Key of
          'i': AppSettings.InFileName := Value;
          'o': AppSettings.OutFileName := Value;
          'q': KlattSynth.Quiet := TRUE;
          't': KlattSynth.OutputChannel := TOutputChannel(StrToInt(Value));
          'c': begin KlattSynth.SynthesisModel := CascadeParallel; KlattSynth.nfcascade := 5; end;
          's': KlattSynth.SampleRateHz := StrToInt(Value);
          'f': AppSettings.MsPerFrame := StrToInt(Value);
          // 'v': Globals.glsource := StrToInt(Value);
          'V': AppSettings.SampeFileName := Value;
          'h': begin Usage(); Halt(1); end;
          'n': KlattSynth.nfcascade := StrToInt(Value);
          'F': KlattSynth.f0_flutter := StrToInt(Value);
          'r': begin AppSettings.DoOutputRawSample := TRUE; AppSettings.OutputByteOrder := StrToInt(Value); end;
        end;
      end
    );

    // NOTE(review): the divisor is 2000, so a frame holds half as many samples
    // as SampleRateHz * MsPerFrame / 1000 would give. Confirm this is intended.
    KlattSynth.SamplesPerFrame := Round((KlattSynth.SampleRateHz * AppSettings.MsPerFrame) / 2000);

    // The write loop below reads SamplesPerFrame entries per frame, so the
    // buffer must be at least that long.
    if KlattSynth.SamplesPerFrame > Length(FrameSamples) then
      SetLength(FrameSamples, KlattSynth.SamplesPerFrame);

    {
    if SampleFileName <> '' then
    begin
    AssignFile(InFile, SampeFileName);
    Reset(InFile);
    read(InFile, Globals.num_samples);
    read(InFile, Globals.SAMPLE_FACTOR);
    SetLength(Globals.natural_samples, length(natural_samples));
    for I := 0 to Globals.num_samples - 1 do
    read(InFile, Globals.natural_samples[I]);
    CloseFile(InFile);
    end;
    }

    if AppSettings.InFileName = '' then
    begin
      Writeln('Error: No inputfile given');
      Halt(2);
    end;

    HasOutFile := AppSettings.OutFileName <> '';

    // Raw bytes are only ever written to OutFile; without -o that file is
    // never opened, so refuse the combination instead of failing on Write.
    if AppSettings.DoOutputRawSample and (not HasOutFile) then
    begin
      Writeln('Error: -r requires an output file (-o)');
      Halt(2);
    end;

    if not HasOutFile then
      KlattSynth.Quiet := True
    else if AppSettings.DoOutputRawSample then
    begin
      AssignFile(OutFile, AppSettings.OutFileName);
      Rewrite(OutFile);
    end
    else
    begin
      AssignFile(OutText, AppSettings.OutFileName);
      Rewrite(OutText);
    end;

    try
      KlattSynth.InitParWave;
      if not KlattSynth.Quiet then
        WriteLn('Reading ',AppSettings.InFileName,' ...');
      KlattSynth.LoadFromFile(AppSettings.InFileName);

      if not KlattSynth.Quiet then
      begin
        WriteLn(length(KlattSynth.Frames),' frames -> Rendering to ',length(KlattSynth.Frames)*KlattSynth.SamplesPerFrame, ' samples ...');
        WriteLn('Saving ', AppSettings.OutFileName,' ...');
      end;

      for FrameIndex := 0 to High(KlattSynth.Frames) do
      begin
        KlattSynth.RenderParWave(KlattSynth.Frames[FrameIndex], FrameSamples);

        for FrameSampleIndex := 0 to KlattSynth.SamplesPerFrame - 1 do
        begin
          SampleValue := Round(FrameSamples[FrameSampleIndex]);

          if AppSettings.DoOutputRawSample then
          begin
            // Saturate to the 16-bit signed range so that out-of-range values
            // do not wrap around when they are reduced to two bytes.
            if SampleValue > 32767 then
              SampleValue := 32767
            else if SampleValue < -32768 then
              SampleValue := -32768;

            // The low 16 bits of the two's-complement value.
            SampleWord := Word(SampleValue and $FFFF);
            HiByte := SampleWord shr 8;
            LoByte := SampleWord and $FF;

            if AppSettings.OutputByteOrder = 1 then
              Write(OutFile, HiByte, LoByte)
            else
              Write(OutFile, LoByte, HiByte);
          end
          else if HasOutFile then
            Writeln(OutText, SampleValue)
          else
            Writeln(SampleValue);
        end;
      end;
    finally
      // Close whichever file was opened above, even when rendering failed.
      if HasOutFile then
      begin
        if AppSettings.DoOutputRawSample then
          CloseFile(OutFile)
        else
          CloseFile(OutText);
      end;
    end;

    if (not KlattSynth.Quiet) then
      Writeln('Done.');
  finally
    KlattSynth.Free;
  end;
end;
161 |
begin
  try
    Main;
  except
    on E: Exception do
    begin
      Writeln(E.ClassName, ': ', E.Message);
      // Report the failure to the calling shell; without this the process
      // would still end with exit code 0 after an unhandled exception.
      ExitCode := 1;
    end;
  end;
end.
170 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # delphi-klatt-synth
2 | Klatt Speech Synthesizer
3 |
4 | This can be used to do text to speech, or generate singing sounds.
5 |
It's a command-line tool that takes a Klatt file as input and generates a sample as output.
7 |
The code is based on the Klatt Synthesizer, which was designed by Dennis H. Klatt in 1980.
DECtalk, which was Stephen Hawking's voice, was largely based on this.
10 |
--------------------------------------------------------------------------------