├── Klatt.ParWave.pas
├── Klatt.dpr
└── README.md

/Klatt.ParWave.pas:
--------------------------------------------------------------------------------
unit Klatt.ParWave;

{
  Description : Klatt synthesizer
  Author      : Wouter van Nifterick
}

interface

uses
  System.SysUtils, Math, System.Generics.Collections;

const
  // Maximum sample rate
  cMaxSampleRateHz = 20000;

  // Table of 100 integer samples.
  // NOTE(review): presumably the glottal waveform for the "Natural" voicing
  // source; its consumer is not visible in this listing - confirm.
  natural_samples: array[0..99] of integer =
  (
    -310,-400,530,356,224,89,23,-10,-58,-16,461,599,536,701,770,
    605,497,461,560,404,110,224,131,104,-97,155,278,-154,-1165,
    -598,737,125,-592,41,11,-247,-10,65,92,80,-304,71,167,-1,122,
    233,161,-43,278,479,485,407,266,650,134,80,236,68,260,269,179,
    53,140,275,293,296,104,257,152,311,182,263,245,125,314,140,44,
    203,230,-235,-286,23,107,92,-91,38,464,443,176,98,-784,-2449,
    -1891,-1045,-1600,-1462,-1384,-1261,-949,-730
  );

type
  // Selects which glottal source routine produces the voicing waveform.
  TVoicingSource = (
    Impulsive = 1,
    Natural   = 2,
    Sampled   = 3
  );

  // Vocal tract configuration.
  TSynthesisModel = (
    CascadeParallel = 1,
    AllParallel     = 2
  );

  // Second-order digital filter section: coefficients a, b, c plus the two
  // most recent stored samples p[1] (newest) and p[2].
  TResonator = record
    a: double;
    b: double;
    c: double;
    p: array[1..2] of double;
    function Resonate(input: Single): Single;
    function AntiResonate(aInput: Single): Single;
  end;

  // Signal that RenderParWave writes to its output when a value other than
  // OutputNone is selected.
  TOutputChannel = (
    OutputNone = 0,
    OutputVoice,
    OutputAspiration,
    OutputFrics,
    OutputGlotout,
    OutputPar_glotout,
    OutputOutbypas,
    OutputSourc
  );

type
  // One frame of synthesis control parameters.
  // Field order is kept exactly as declared in the original.
  TKlattFrame = record
  public
    /// Voicing fund freq in Hz
    /// (InitFrame divides this value by 10 to obtain original_f0)
    F0Hz10: Integer;
    /// Amp of voicing in dB, 0 to 70
    AVdb: Integer;
    /// First formant freq in Hz, 200 to 1300
    F1Hz: Integer;
    /// First formant bw in Hz, 40 to 1000
    B1Hz: Integer;
    /// Second formant freq in Hz, 550 to 3000
    F2Hz: Integer;
    /// Second formant bw in Hz, 40 to 1000
    B2hz: Integer;
    /// Third formant freq in Hz, 1200 to 4999
    F3hz: Integer;
    /// Third formant bw in Hz, 40 to 1000
    B3hz: Integer;
    /// Fourth formant freq in Hz, 1200 to 4999
    F4hz: Integer;
    /// Fourth formant bw in Hz, 40 to 1000
    B4hz: Integer;
    /// Fifth formant freq in Hz, 1200 to 4999
    F5hz: Integer;
    /// Fifth formant bw in Hz, 40 to 1000
    B5hz: Integer;
    /// Sixth formant freq in Hz, 1200 to 4999
    F6hz: Integer;
    /// Sixth formant bw in Hz, 40 to 2000
    B6hz: Integer;
    /// Nasal zero freq in Hz, 248 to 528
    NasalZeroFrequency: Integer;
    /// Nasal zero bw in Hz, 40 to 1000
    BNZhz: Integer;
    /// Nasal pole freq in Hz, 248 to 528
    FNPhz: Integer;
    /// Nasal pole bw in Hz, 40 to 1000
    BNPhz: Integer;
    /// Amp of aspiration in dB, 0 to 70
    ASP: Integer;
    /// # of samples in open period, 10 to 65
    Kopen: Integer;
    /// Breathiness in voicing, 0 to 80
    Aturb: Integer;
    /// Voicing spectral tilt in dB, 0 to 24
    TLTdb: Integer;
    /// Amp of frication in dB, 0 to 80
    AF: Integer;
    /// Skewness of alternate periods, 0 to 40 in sample#/2
    Kskew: Integer;
    /// Amp of par 1st formant in dB, 0 to 80
    A1dB: Integer;
    /// Par. 1st formant bw in Hz, 40 to 1000
    B1phz: Integer;
    /// Amp of F2 frication in dB, 0 to 80
    A2dB: Integer;
    /// Par. 2nd formant bw in Hz, 40 to 1000
    B2phz: Integer;
    /// Amp of F3 frication in dB, 0 to 80
    A3dB: Integer;
    /// Par. 3rd formant bw in Hz, 40 to 1000
    B3phz: Integer;
    /// Amp of F4 frication in dB, 0 to 80
    A4dB: Integer;
    /// Par. 4th formant bw in Hz, 40 to 1000
    B4phz: Integer;
    /// Amp of F5 frication in dB, 0 to 80
    A5dB: Integer;
    /// Par. 5th formant bw in Hz, 40 to 1000
    B5phz: Integer;
    /// Amp of F6 (same as rp[6]a), 0 to 80
    A6dB: Integer;
    /// Par. 6th formant bw in Hz, 40 to 2000
    B6phz: Integer;
    /// Amp of par nasal pole in dB, 0 to 80
    ANPdB: Integer;
    /// Amp of bypass fric. in dB, 0 to 80
    ByPassPathAmp: Integer;
    /// Amp of voicing, par in dB, 0 to 70
    AVpdB: Integer;
    /// Overall gain, 60 dB is unity, 0 to 60
    Gain0dB: Integer;
  end;


/// Structure for Klatt Globals
  // Holds the synthesizer configuration, the per-period/per-sample running
  // state and the resonator banks. Declaration order is kept as in the
  // original.
  // NOTE(review): the generic argument of TArray in RenderParWave/Render is
  // not visible in this listing (it looks stripped by extraction); restore it
  // from the real source - presumably a floating-point element type.
  TKlattSynth = class
  public
    // --- configuration ---
    SynthesisModel : TSynthesisModel;   // cascade-parallel or all-parallel
    OutputChannel  : TOutputChannel;    // Output waveform selector
    SampleRateHz   : integer;           // Number of output samples per second
    FLPhz          : integer;           // Frequency of glottal downsample low-pass filter
    BLPhz          : integer;           // Bandwidth of glottal downsample low-pass filter
    nfcascade      : integer;           // Number of formants in cascade vocal tract
    VoicingSource  : TVoicingSource;    // Type of glottal source
    f0_flutter     : integer;           // Percentage of f0 flutter 0-100
    Quiet          : boolean;           // when TRUE the program skips its 'Done' status line (set by -q)
    SamplesPerFrame: integer;           // number of samples per frame

    // --- running state ---
    nper           : integer;           // Counter for number of samples in a pitch period
    CurrentSample  : integer;           // incremented once per output sample by RenderParWave
    T0             : integer;           // Fundamental period in output samples times 4
    nopen          : integer;           // Number of samples in open phase of period
    nmod           : integer;           // Position in period to begin noise amp. modul
    nrand          : integer;           // Variable used by random number generator
    pulse_shape_a  : double;            // Makes waveshape of glottal pulse when open
    pulse_shape_b  : double;            // Makes waveshape of glottal pulse when open
    minus_pi_t     : double;            // multiplied by a bandwidth in SetABC ("exp(-pi bw t)")
    two_pi_t       : double;            // multiplied by a frequency in SetABC ("cos(2 pi f t)")
    onemd          : Single;            // weight of the new sample in the spectral-tilt filter
    Decay          : Single;            // weight of the previous sample in the spectral-tilt filter

    // --- linear gains derived from the dB frame parameters ---
    amp_bypas      : Single;            // AB converted to linear gain
    amp_voice      : Single;            // AVdb converted to linear gain
    amp_par_voice  : Single;            // AVpdb converted to linear gain
    amp_aspir      : Single;            // AP converted to linear gain
    amp_frica      : Single;            // AF converted to linear gain
    amp_breth      : Single;            // ATURB converted to linear gain
    amp_gain0      : Single;            // G0 converted to linear gain

    NaturalSamples : array of integer;  // array of glottal samples
    original_f0    : integer;           // original value of f0 not modified by flutter

    // --- internal storage for resonators ---
    rnpp           : TResonator;        // parallel nasal pole
    rp             : array [1 .. 6] of TResonator;  // parallel formants
    rc             : array [1 .. 8] of TResonator;  // cascade formants
    rnpc           : TResonator;        // cascade nasal pole
    rnz            : TResonator;        // nasal zero (anti-resonator)
    rgl            : TResonator;
    rlp            : TResonator;        // low-pass applied to the 4x-rate voicing source
    rout           : TResonator;        // output low-pass filter

    nlast          : Single;            // last noise
    TimeCount      : Integer;

    Frames         : array of TKlattFrame;

    constructor Create;
    procedure InitParWave;

    function GenerateNoise(aNoise: Single): Single;

    var vwave: Single;
    function ImpulsiveSource: Single;
    var vwave2: Single;
    function NaturalSource: Single;

    procedure SetABC(
      aResFrequencyHz: Integer;   { Frequency of resonator in Hz }
      aResBandWidthHz: Integer;   { Bandwidth of resonator in Hz }
      var aResonator: TResonator);

    procedure SetZeroABC(
      aResFreqHz      : Integer;  { Frequency of resonator in Hz }
      aResBandWidthHz : Integer;  { Bandwidth of resonator in Hz }
      var aResonator  : TResonator);

    var
      noise, voice, vlast, glotlast, sourc: Single;

    procedure RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
    function Render:TArray;

    var Skew: Integer;
    procedure InitFrame(var aFrame: TKlattFrame);
    procedure Flutter(var aFrame: TKlattFrame);
    procedure pitch_synch_par_reset(var aFrame: TKlattFrame);

    procedure LoadFromFile(const aFileName:String);
  end;

{ Structure for Klatt Parameters }

function DBtoLIN(db: Integer):Single;
function LINtoDB(n: double):double;

implementation


/// Random number generator.
///
/// Draws a fresh random integer into nrand using
/// Round(((Random*2)-1)*1024*4), i.e. a value in -4096..+4096 (the earlier
/// +/-8191 variant is kept below as a comment), then tilts the noise
/// spectrum down with a soft low-pass filter having a pole near the origin
/// in the z-plane: output = input + (0.75 * lastoutput).
///
/// aNoise is not read; the filtered value is returned and also remembered
/// in nlast for the next call.
function TKlattSynth.GenerateNoise(aNoise: Single): Single;
begin
  // previous formulation: random(2 * 8191) - 8191
  nrand  := Round(((Random*2)-1)*1024*4);

  // one-pole low-pass over the raw random value
  Result := nrand + (0.75 * nlast);
  nlast  := Result;
end;


/// Initialize Globals variable
///
/// Sets the synthesizer defaults: all-parallel model, 11025 Hz sample rate,
/// natural voicing source, no cascade formants, no diagnostic output channel,
/// no flutter, zero skew, messages enabled.
constructor TKlattSynth.Create;
begin
  // synthesis configuration
  SynthesisModel := TSynthesisModel.AllParallel;
  VoicingSource  := TVoicingSource.Natural;
  OutputChannel  := TOutputChannel.OutputNone;
  SampleRateHz   := 11025;
  nfcascade      := 0;

  // modulation / state
  f0_flutter     := 0;
  Skew           := 0;
  Quiet          := False;
  //natural_samples := natural_samples;
end;

/// Converts a linear amplitude to decibels (20 * log10).
/// Values that are not greater than 1E-12 yield -200.
function LINtoDB(n:double):double;
begin
  if (n > 1E-12) then
    Result := LOG10(n) * 20
  else
    Result := -200;
end;

/// Converts a dB value to a linear gain by table lookup.
/// Returns AmpTable[db] * 0.001 for db in 0..87 and 0 for anything outside
/// that range.
function DBtoLIN(db: Integer): Single;
const
  AmpTable: array [0 .. 87] of Single = (
    0,0,0,0,0,0,0,0,0,0,0,0,0,
    6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22,
    25, 28, 32, 35, 40, 45, 51, 57, 64, 71, 80,
    90, 101, 114, 128, 142, 159, 179, 202, 227, 256,
    284, 318, 359, 405, 455, 512, 568, 638, 719, 811,
    911, 1024, 1137, 1276, 1438, 1622, 1823, 2048, 2273,
    2552, 2875, 3244, 3645, 4096, 4547, 5104, 5751, 6488,
    7291, 8192, 9093, 10207, 11502, 12976, 14582, 16384,
    18350, 20644, 23429, 26214, 29491, 32767

    { Alternative (unused) table kept from the original source:
    1.295291, 1.455383, 1.635262, 1.837373, 2.064464, 2.319622, 2.606317, 2.928446,
    3.290389, 3.697066, 4.154007, 4.667423, 5.244296, 5.892467, 6.620750, 7.439045,
    8.358477, 9.391547, 10.552300, 11.856517, 13.321929, 14.968460, 16.818494,
    18.897185, 21.232792, 23.857069, 26.805696, 30.118759, 33.841303, 38.023936,
    42.723523, 48.003959, 53.937032, 60.603407, 68.093716, 76.509793, 85.966059,
    96.591078, 108.529301, 121.943035, 137.014646, 153.949041, 172.976450, 194.355562,
    218.377036, 245.367456, 275.693771, 309.768282, 348.054249, 391.072190, 439.406955,
    493.715680, 554.736719, 623.299685, 700.336724, 786.895196, 884.151905, 993.429107,
    1116.212480, 1254.171326, 1409.181265, 1583.349736, 1779.044647, 1998.926570,
    2245.984910, 2523.578550, 2835.481517, 3185.934289, 3579.701448, 4022.136458,
    4519.254448, 5077.813986, 5705.408973, 6410.571880, 7202.889753, 8093.134554,
    9093.409611, 10217.314169, 11480.128280, 12899.020540, 14493.281505, 16284.585961,
    18297.287597, 20558.750108, 23099.719223, 25954.740700, 29162.630000, 32767
    }
  );
begin
  if (db >= Low(AmpTable)) and (db <= High(AmpTable)) then
    Result := AmpTable[db] * 0.001
  else
    Result := 0;
end;


310 | /// This function adds F0 flutter, as specified in: 311 | /// 312 | /// "Analysis, synthesis and perception of voice quality variations among 313 | /// female and male talkers" D.H. Klatt and L.C. Klatt JASA 87(2) February 1990. 314 | /// 315 | /// Flutter is added by applying a quasi-random element constructed from three 316 | /// slowly varying sine waves. 317 | ///

/// Convert formant frequencies and bandwidth into resonator difference
/// equation constants.
///
/// With r = exp(minus_pi_t * bandwidth):
///   c = -r**2
///   b = 2 * r * cos(two_pi_t * frequency)
///   a = 1 - b - c
/// Only the coefficients a, b, c of aResonator are written; its stored
/// samples p[1], p[2] are left untouched.
procedure TKlattSynth.SetABC(
  aResFrequencyHz: Integer; { Frequency of resonator in Hz }
  aResBandWidthHz: Integer; { Bandwidth of resonator in Hz }
  var aResonator: TResonator);
var
  poleRadius: Single;   // r, deliberately single precision as in the original
  exponent  : Double;
  angle     : Double;
begin
  // r = exp(-pi bw t)
  exponent   := minus_pi_t * aResBandWidthHz;
  poleRadius := exp(exponent);

  // b = r * 2*cos(2 pi f t)
  angle := two_pi_t * aResFrequencyHz;
  aResonator.b := poleRadius * cos(angle) * 2;

  // c = -r**2
  aResonator.c := -(poleRadius * poleRadius);

  // a = 1 - b - c
  aResonator.a := 1 - aResonator.b - aResonator.c;
end;


/// Convert formant frequencies and bandwidth into anti-resonator difference
/// equation constants.
///
/// The frequency is negated first; if the negated value is not negative
/// (i.e. the caller passed zero or a negative frequency) -1 is used instead.
/// Ordinary resonator coefficients are then computed and converted to
/// anti-resonator form: a' = 1/a, b' = -b/a, c' = -c/a.
procedure TKlattSynth.SetZeroABC(
  aResFreqHz      : Integer; { Frequency of resonator in Hz }
  aResBandWidthHz : Integer; { Bandwidth of resonator in Hz }
  var aResonator  : TResonator);
var
  poleRadius: Single;   // r, deliberately single precision as in the original
  exponent  : Double;
  angle     : Double;
  negFreqHz : Integer;
begin
  negFreqHz := -aResFreqHz;
  if (negFreqHz >= 0) then
    negFreqHz := -1;

  // --- ordinary resonator coefficients ---
  // r = exp(-pi bw t)
  exponent   := minus_pi_t * aResBandWidthHz;
  poleRadius := exp(exponent);

  // c = -r**2
  aResonator.c := -(poleRadius * poleRadius);

  // b = r * 2*cos(2 pi f t)
  angle := two_pi_t * negFreqHz;
  aResonator.b := poleRadius * cos(angle) * 2.;

  // a = 1 - b - c
  aResonator.a := 1 - aResonator.b - aResonator.c;

  // --- convert to antiresonator coefficients (a'=1/a, b'=b/a, c'=c/a) ---
  // a must be inverted first: the b and c updates use the inverted value.
  aResonator.a := 1 / aResonator.a;
  aResonator.c := aResonator.c * -aResonator.a;
  aResonator.b := aResonator.b * -aResonator.a;
end;


/// Initialises all parameters used in parwave, sets resonator internal memory to zero.

/// Number of control parameters

/// Use parameters from the input frame to set up resonator coefficients.
///
/// Also converts the frame's dB amplitudes to linear gains.
///
/// NOTE: aFrame is modified in place - AVdb is lowered by 7 (floored at 0)
/// and Gain0dB is lowered by 3 (replaced by 57 when the result is <= 0).
/// Calling this twice on the same frame therefore compounds the adjustment.
procedure TKlattSynth.InitFrame(var aFrame: TKlattFrame);

  // Sets up one parallel-branch resonator and scales its input coefficient
  // by the linear gain that corresponds to aAmpDb.
  procedure SetParallel(aFreqHz, aBandWidthHz, aAmpDb: Integer; var aRes: TResonator);
  var
    linearAmp: Single;
  begin
    linearAmp := DBtoLIN(aAmpDb);
    SetABC(aFreqHz, aBandWidthHz, aRes);
    aRes.a := aRes.a * linearAmp;
  end;

begin
  original_f0 := Round(aFrame.F0Hz10 / 10);

  // voicing amplitude: subtract 7 dB, never below 0
  aFrame.AVdb := aFrame.AVdb - 7;
  if (aFrame.AVdb < 0) then
    aFrame.AVdb := 0;

  // source gains
  amp_aspir     := DBtoLIN(aFrame.ASP) * 0.05;
  amp_frica     := DBtoLIN(aFrame.AF) * 0.25;
  amp_par_voice := DBtoLIN(aFrame.AVpdB);
  amp_bypas     := DBtoLIN(aFrame.ByPassPathAmp) * 0.05;

  // overall gain: subtract 3 dB, fall back to 57 when that leaves nothing
  aFrame.Gain0dB := aFrame.Gain0dB - 3;
  if (aFrame.Gain0dB <= 0) then
    aFrame.Gain0dB := 57;
  amp_gain0 := DBtoLIN(aFrame.Gain0dB);

  // Set coefficients of variable cascade resonators
  if (nfcascade >= 8) then SetABC(7500, 600, rc[8]);
  if (nfcascade >= 7) then SetABC(6500, 500, rc[7]);
  if (nfcascade >= 6) then SetABC(aFrame.F6hz, aFrame.B6hz, rc[6]);
  if (nfcascade >= 5) then SetABC(aFrame.F5hz, aFrame.B5hz, rc[5]);
  SetABC(aFrame.F4hz, aFrame.B4hz, rc[4]);
  SetABC(aFrame.F3hz, aFrame.B3hz, rc[3]);
  SetABC(aFrame.F2Hz, aFrame.B2hz, rc[2]);
  SetABC(aFrame.F1Hz, aFrame.B1Hz, rc[1]);

  // Set coefficients of nasal resonator and zero antiresonator
  SetABC(aFrame.FNPhz, aFrame.BNPhz, rnpc);
  SetZeroABC(aFrame.NasalZeroFrequency, aFrame.BNZhz, rnz);

  // Set coefficients of parallel resonators, and amplitude of outputs.
  // (The per-formant scale factors 0.4, 0.150, 0.060, 0.040, 0.022, 0.030
  // and 0.60 for the nasal pole are disabled in the original as well.)
  SetParallel(aFrame.F1Hz,  aFrame.B1phz, aFrame.A1dB,  rp[1]);
  SetParallel(aFrame.FNPhz, aFrame.BNPhz, aFrame.ANPdB, rnpp);
  SetParallel(aFrame.F2Hz,  aFrame.B2phz, aFrame.A2dB,  rp[2]);
  SetParallel(aFrame.F3hz,  aFrame.B3phz, aFrame.A3dB,  rp[3]);
  SetParallel(aFrame.F4hz,  aFrame.B4phz, aFrame.A4dB,  rp[4]);
  SetParallel(aFrame.F5hz,  aFrame.B5phz, aFrame.A5dB,  rp[5]);
  SetParallel(aFrame.F6hz,  aFrame.B6phz, aFrame.A6dB,  rp[6]);

  // output low-pass filter
  SetABC(0, Round(SampleRateHz / 2), rout);
end;


516 | /// Generate a low pass filtered train of impulses as an approximation of 517 | /// a natural excitation waveform. Low-pass filter the differentiated impulse 518 | /// with a critically-damped second-order filter, time constant proportional 519 | /// to Kopen. 520 | ///

534 | /// Vwave is the differentiated glottal flow waveform, there is a weak 535 | /// spectral zero around 800 Hz, magic constants a,b reset pitch synchronously. 536 | ///

/// function PITCH_SYNC_PAR_RESET
///
/// Reset selected parameters pitch-synchronously.
///
///
/// Constant B0 controls shape of glottal pulse as a function
/// of desired duration of open phase N0
/// (Note that N0 is specified in terms of 40,000 samples/sec of speech)
///
/// Assume voicing waveform V(t) has form: k1 t**2 - k2 t**3
///
/// If the radiation characteristic, a temporal derivative,
/// is folded in, and we go from continuous time to discrete
/// integers n: dV/dt = vwave[n]
/// = sum over i=1,2,...,n of { a - (i * b) }
/// = a n - b/2 n**2
///
/// where the constants a and b control the detailed shape
/// and amplitude of the voicing waveform over the open
/// portion of the voicing cycle "nopen".
///
/// Let integral of dV/dt have no net dc flow --> a = (b * nopen) / 3
///
/// Let maximum of dUg(n)/dn be constant --> b = gain / (nopen * nopen)
/// meaning as nopen gets bigger, V has bigger peak proportional to n
///
/// Thus, to generate the table below for 40 <= nopen <= 263:
///
/// B0[nopen - 40] = 1920000 / (nopen * nopen)
///

/// Converts synthesis parameters to a waveform.
///
/// Renders SamplesPerFrame samples for one frame into aOutput[0..].
///
/// aFrame  - frame parameters; passed on to InitFrame/Flutter/
///           pitch_synch_par_reset, which may modify it.
/// aOutput - sample buffer. It is grown to SamplesPerFrame elements when it
///           is shorter, so the loop can never write past its end; a larger
///           buffer is left at its size.
///
/// Changes relative to the previous version:
///  * aOutput[i] is cleared before the cascade branch. Previously, in cascade
///    mode with nfcascade < 1, the element kept whatever the previous frame
///    had left there and the parallel branch was added on top of that.
///  * the buffer length is checked (see aOutput above).
///
/// NOTE(review): the output low-pass (rout), the amp_gain0 scaling and the
/// 16-bit clipping are only applied when a diagnostic OutputChannel is
/// selected; with OutputNone the mixed signal is returned as "sum / 10".
/// That is preserved here because the gains elsewhere look tuned for it -
/// confirm that it is intentional.
procedure TKlattSynth.RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
var
  i, n4, k                  : Integer;
  temp, outbypas            : Single;
  frics, glotout, aspiration: Single;
  casc_next_in, par_glotout : Single;
begin
  // get parameters for next frame of speech
  InitFrame(aFrame);
  // add f0 flutter
  Flutter(aFrame);

  // Make sure every index written below exists.
  if Length(aOutput) < SamplesPerFrame then
    SetLength(aOutput, SamplesPerFrame);

  // MAIN LOOP, for each output sample of current frame:
  for i := 0 to SamplesPerFrame - 1 do
  begin
    Inc(CurrentSample);

    // Get low-passed random number for aspiration and frication noise
    noise := GenerateNoise(noise);

    // Amplitude modulate noise (reduce noise amplitude during
    // second half of glottal period) if voicing simultaneously present.
    if (nper > nmod) then
      noise := noise * 0.5;

    // Compute frication noise
    frics := amp_frica * noise;

    // Compute voicing waveform. Run glottal source simulation at 4
    // times normal sample rate to minimize quantization noise in
    // period of female voice.
    for n4 := 0 to 3 do
    begin
      case (VoicingSource) of
        Impulsive : voice := ImpulsiveSource;
        Natural   : voice := NaturalSource;
        //SAMPLED : voice := sampled_source(globals);
      end;

      // Reset period when counter 'nper' reaches T0
      if (nper >= T0) then
      begin
        nper := 0;
        pitch_synch_par_reset(aFrame);
      end;

      // Low-pass filter voicing waveform before downsampling from 4*samrate
      // to samrate samples/sec. Resonator f=.09*samrate, bw=.06*samrate
      voice := rlp.Resonate(voice);

      // Increment counter that keeps track of 4*samrate samples per sec
      Inc(nper);
    end;

    // Tilt spectrum of voicing source down by soft low-pass filtering,
    // amount of tilt determined by TLTdb
    voice := (voice * onemd) + (vlast * Decay);
    vlast := voice;

    // Add breathiness during glottal open phase. Amount of breathiness
    // determined by parameter Aturb. Use nrand rather than noise because
    // noise is low-passed.
    if (nper < nopen) then
      voice := voice + (amp_breth * nrand);

    // Set amplitude of voicing
    glotout     := amp_voice * voice;
    par_glotout := amp_par_voice * voice;

    // Compute aspiration amplitude and add to voicing source
    aspiration  := amp_aspir * noise;
    glotout     := glotout + aspiration;
    par_glotout := par_glotout + aspiration;

    // Start from silence so that no stale value from a previous frame can
    // leak into this sample (all-parallel model, or no cascade formants).
    aOutput[i] := 0;

    // Cascade vocal tract, excited by laryngeal sources.
    // Nasal antiresonator, then formants FNP, F5, F4, F3, F2, F1
    if (SynthesisModel <> TSynthesisModel.AllParallel) then
    begin
      casc_next_in := rnz.AntiResonate(glotout);
      casc_next_in := rnpc.Resonate(casc_next_in);

      // rc[8], rc[7]: do not use unless sample rate >= 16000.
      // rc[6]: do not use unless long vocal tract or sample rate increased.
      for k := 8 downto 2 do
        if (nfcascade >= k) then
          casc_next_in := rc[k].Resonate(casc_next_in);

      if (nfcascade >= 1) then
        aOutput[i] := rc[1].Resonate(casc_next_in);
    end;

    // Excite parallel F1 and FNP by voicing waveform
    sourc := par_glotout; // Source is voicing plus aspiration
    aOutput[i] := aOutput[i] + rp[1].Resonate(sourc);
    aOutput[i] := aOutput[i] + rnpp.Resonate(sourc);

    // Standard parallel vocal tract Formants F6,F5,F4,F3,F2,
    // outputs added with alternating sign. Sound source for other
    // parallel resonators is frication plus first difference of
    // voicing waveform.
    sourc    := frics + par_glotout - glotlast;
    glotlast := par_glotout;

    for k := 6 downto 2 do
      aOutput[i] := rp[k].Resonate(sourc) - aOutput[i];

    outbypas   := amp_bypas * sourc;
    aOutput[i] := outbypas - aOutput[i];
    aOutput[i] := aOutput[i] / 10;

    if (OutputChannel <> OutputNone) then
    begin
      // Replace the mix by the selected diagnostic signal
      case OutputChannel of
        OutputVoice       : aOutput[i] := voice;
        OutputAspiration  : aOutput[i] := aspiration;
        OutputFrics       : aOutput[i] := frics;
        OutputGlotout     : aOutput[i] := glotout;
        OutputPar_glotout : aOutput[i] := par_glotout;
        OutputOutbypas    : aOutput[i] := outbypas;
        OutputSourc       : aOutput[i] := sourc;
      end;

      aOutput[i] := rout.Resonate(aOutput[i]);

      temp := aOutput[i] * amp_gain0/1000;

      // Clamp to the signed 16-bit range
      if (temp < -32768) then
        temp := -32768;
      if (temp > 32767) then
        temp := 32767;

      aOutput[i] := temp;
    end;
  end;
end;

{ TResonator }

/// Anti-resonator (zero pair): a * input + b * previous input +
/// c * input before that. The stored samples are the past INPUTS.
function TResonator.AntiResonate(aInput: Single): Single;
begin
  Result := a * aInput +
            b * p[1] +
            c * p[2];

  // shift the input history
  p[2] := p[1];
  p[1] := aInput;
end;

/// Generic resonator (pole pair): a * input + b * previous output +
/// c * output before that. The stored samples are the past OUTPUTS, kept in
/// this record's p[1], p[2].
function TResonator.Resonate(input: Single): Single;
begin
  Result := (a * input + b * p[1] + c * p[2]);

  // shift the output history (the single-precision result is what is stored)
  p[2] := p[1];
  p[1] := Result;
end;

end.

--------------------------------------------------------------------------------
/Klatt.dpr:
--------------------------------------------------------------------------------
program Klatt;

{
  Description : Klatt synthesizer
  Author      : Wouter van Nifterick
}

{$APPTYPE CONSOLE}

{$R *.res}

uses
  SysUtils,
  WvN.Util.CmdLine,
  Klatt.ParWave in 'Klatt.ParWave.pas';

// Prints the command line help, one line per entry of the table below.
procedure Usage;
const
  cHelp: array[0..21] of string = (
    'Options...',
    '-h Displays this message',
    '-i sets input filename',
    '-o sets output filename',
    ' If output filename not specified, stdout is used',
    '-q quiet - print no messages',
    '-t select output waveform',
    '-c select cascade-parallel configuration',
    ' Parallel configuration is default',
    '-n Number of formants in cascade branch.',
    ' Default is 5',
    '-s set sample rate',
    '-f set number of milliseconds per frame, default 10',
    '-v Specifies voicing source.',
    ' 1:=impulse train, 2=natural simulation, 3=sampled natural',
    ' Default is a simulation of natural voicing',
    '-V Input file of samples for natural voicing.',
    '-F percentage of f0 flutter',
    ' Default is 0',
    '-r output 16 bit signed integers rather than ASCII',
    ' integers. cType := 1 gives high byte first, cType = 2 gives',
    ' low byte first.'
  );
var
  line: string;
begin
  for line in cHelp do
    Writeln(line);
end;

type
  // Settings that belong to the command line tool rather than to the
  // synthesizer itself.
  TAppSettings=record
    InFileName        ,
    OutFileName       ,
    SampeFileName     : string;
    MsPerFrame        : Integer;
    DoOutputRawSample : Boolean;
    OutputByteOrder   : byte;
    procedure Init;
  end;

// Resets every field to its zero value (empty strings, False, 0) and then
// applies the one non-zero default: 10 ms per frame.
procedure TAppSettings.Init;
begin
  self := default(TAppSettings);
  MsPerFrame := 10;
end;

// Parses the command line, loads the frame file and renders every frame,
// writing the samples either as raw values to the output file (-r) or as
// decimal text lines to stdout.
//
// Changes relative to the previous version:
//  * -r without -o used to write to a file variable that was never opened;
//    this is now reported as an error (exit code 2) before any work is done.
//  * the synthesizer instance is freed and the output file is closed even
//    when rendering raises an exception.
procedure Main;
var
  OutFile          : File of byte;
  KlattSynth       : TKlattSynth;
  FrameSamples     : TArray;
  FrameSampleIndex : Integer;
  Sample           : uint16;
  AppSettings      : TAppSettings;
  FrameIndex       : Integer;
  HasOutFile       : Boolean;
begin
  if(ParamCount=0) then
  begin
    usage;
    halt(1);
  end;

  AppSettings.Init;

  KlattSynth := TKlattSynth.Create;
  try
    SetLength(FrameSamples, cMaxSampleRateHz);

    CommandLine.ProcessKeys( procedure(const key:char; const name,value:string )
      begin
        case Key of
          'i': AppSettings.InFileName := Value;
          'o': AppSettings.OutFileName := Value;
          'q': KlattSynth.Quiet := TRUE;
          't': KlattSynth.OutputChannel := TOutputChannel(StrToInt(Value));
          'c': begin KlattSynth.SynthesisModel := CascadeParallel; KlattSynth.nfcascade := 5; end;
          's': KlattSynth.SampleRateHz := StrToInt(Value);
          'f': AppSettings.MsPerFrame := StrToInt(Value);
          // 'v': Globals.glsource := StrToInt(Value);
          'V': AppSettings.SampeFileName := Value;
          'h': begin usage(); halt(1); end;
          'n': KlattSynth.nfcascade := StrToInt(Value);
          'F': KlattSynth.f0_flutter := StrToInt(Value);
          'r': begin AppSettings.DoOutputRawSample := TRUE; AppSettings.OutputByteOrder := StrToInt(Value); end;
        end;
      end
    );

    // NOTE(review): the divisor is 2000, which yields half the number of
    // samples that "milliseconds per frame" would suggest (1000) - kept as
    // is, confirm whether this is intentional.
    KlattSynth.SamplesPerFrame := Round((KlattSynth.SampleRateHz * AppSettings.MsPerFrame) / 2000);

    {
    if SampleFileName <> '' then
    begin
      AssignFile(InFile, SampeFileName);
      Reset(InFile);
      read(InFile, Globals.num_samples);
      read(InFile, Globals.SAMPLE_FACTOR);
      SetLength(Globals.natural_samples, length(natural_samples));
      for I := 0 to Globals.num_samples - 1 do
        read(InFile, Globals.natural_samples[I]);
      CloseFile(InFile);
    end;
    }

    if AppSettings.InFileName = '' then
    begin
      Writeln('Error: No inputfile given');
      Halt(2);
    end;

    HasOutFile := AppSettings.OutFileName <> '';

    // Raw samples are written to OutFile only; without -o there is no open
    // file to write to.
    if AppSettings.DoOutputRawSample and (not HasOutFile) then
    begin
      Writeln('Error: raw output (-r) requires an output file (-o)');
      Halt(2);
    end;

    if HasOutFile then
    begin
      AssignFile(OutFile, AppSettings.OutFileName);
      Rewrite(OutFile);
    end
    else
      KlattSynth.Quiet := True;

    try
      KlattSynth.InitParWave;
      WriteLn('Reading ',AppSettings.InFileName,' ...');
      KlattSynth.LoadFromFile(AppSettings.InFileName);

      WriteLn(length(KlattSynth.Frames),' frames -> Rendering to ',length(KlattSynth.Frames)*KlattSynth.SamplesPerFrame, ' samples ...');
      WriteLn('Saving ', AppSettings.OutFileName,' ...');
      for FrameIndex := 0 to High(KlattSynth.Frames) do
      begin
        KlattSynth.RenderParWave(KlattSynth.Frames[FrameIndex], FrameSamples);

        for FrameSampleIndex := 0 to KlattSynth.SamplesPerFrame-1 do
        begin
          // NOTE(review): Sample is unsigned 16 bit while the rendered value
          // can be negative, and the raw conversion below looks suspicious -
          // kept unchanged, verify against the intended output format.
          Sample := Round(FrameSamples[FrameSampleIndex]);
          if AppSettings.DoOutputRawSample then
          begin
            Sample := Round((Sample / 256) + 32768) and $FFFF;
            Write(OutFile, Sample);
          end
          else
            Writeln(format('%d', [round(FrameSamples[FrameSampleIndex])]));
        end;
      end;
    finally
      if HasOutFile then CloseFile(OutFile);
    end;

    if (not KlattSynth.Quiet) then Writeln('Done');
    WriteLn('Done.');
  finally
    KlattSynth.Free;
  end;
end;

begin
  try
    Main;
  except
    on E: Exception do
      Writeln(E.ClassName, ': ', E.Message);
  end;
end.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# delphi-klatt-synth
Klatt Speech Synthesizer

This can be used to do text to speech, or generate singing sounds.

It's a command-line tool that takes a Klatt file as an input, and generates a sample as output.

The code is based on the Klatt Synthesizer, which was designed by Dennis H. Klatt in 1980.
DECtalk, which was Stephen Hawking's voice, was largely based on this.
--------------------------------------------------------------------------------