├── Klatt.ParWave.pas
├── Klatt.dpr
└── README.md

/Klatt.ParWave.pas:
--------------------------------------------------------------------------------
unit Klatt.ParWave;

{
  Description : Klatt synthesizer
  Author      : Wouter van Nifterick
}

interface

uses
  System.SysUtils, Math, System.Generics.Collections;

const
  /// Maximum sample rate, in Hz.
  cMaxSampleRateHz = 20000;

  /// 100-entry table of integer waveform samples.
  /// NOTE(review): presumably one period of a recorded glottal waveform for the
  /// "Sampled" voicing source; nothing in the visible code reads it - confirm.
  natural_samples: array [0 .. 99] of Integer = (
     -310,  -400,   530,   356,   224,    89,    23,   -10,   -58,   -16,
      461,   599,   536,   701,   770,   605,   497,   461,   560,   404,
      110,   224,   131,   104,   -97,   155,   278,  -154, -1165,  -598,
      737,   125,  -592,    41,    11,  -247,   -10,    65,    92,    80,
     -304,    71,   167,    -1,   122,   233,   161,   -43,   278,   479,
      485,   407,   266,   650,   134,    80,   236,    68,   260,   269,
      179,    53,   140,   275,   293,   296,   104,   257,   152,   311,
      182,   263,   245,   125,   314,   140,    44,   203,   230,  -235,
     -286,    23,   107,    92,   -91,    38,   464,   443,   176,    98,
     -784, -2449, -1891, -1045, -1600, -1462, -1384, -1261,  -949,  -730
  );

type
  /// Kind of glottal (voicing) source.
  TVoicingSource = (
    Impulsive = 1,
    Natural   = 2,
    Sampled   = 3
  );

  /// Vocal tract topology: cascade + parallel branches, or parallel only.
  TSynthesisModel = (
    CascadeParallel = 1,
    AllParallel     = 2
  );

  /// Second-order digital filter section: three coefficients plus two delay taps.
  TResonator = record
    a: Double;
    b: Double;
    c: Double;
    /// p[1] is the most recent stored sample, p[2] the one before it.
    p: array [1 .. 2] of Double;
    /// Recursive form: the taps hold previous outputs.
    function Resonate(input: Single): Single;
    /// Non-recursive form: the taps hold previous inputs.
    function AntiResonate(aInput: Single): Single;
  end;

  /// Selects which internal signal RenderParWave writes to the output buffer
  /// instead of the normal synthesized sample.
  TOutputChannel = (
    OutputNone = 0,
    OutputVoice,
    OutputAspiration,
    OutputFrics,
    OutputGlotout,
    OutputPar_glotout,
    OutputOutbypas,
    OutputSourc
  );

type
  /// One frame of synthesis control parameters.
  /// The field order is significant: LoadFromFile fills a frame by writing
  /// 40 consecutive Integer values starting at the first field.
  TKlattFrame = record
  public
    /// Voicing fundamental frequency, in tenths of Hz (InitFrame divides by 10)
    F0Hz10: Integer;
    /// Amplitude of voicing in dB, 0 to 70
    AVdb: Integer;
    /// First formant frequency in Hz, 200 to 1300
    F1Hz: Integer;
    /// First formant bandwidth in Hz, 40 to 1000
    B1Hz: Integer;
    /// Second formant frequency in Hz, 550 to 3000
    F2Hz: Integer;
    /// Second formant bandwidth in Hz, 40 to 1000
    B2hz: Integer;
    /// Third formant frequency in Hz, 1200 to 4999
    F3hz: Integer;
    /// Third formant bandwidth in Hz, 40 to 1000
    B3hz: Integer;
    /// Fourth formant frequency in Hz, 1200 to 4999
    F4hz: Integer;
    /// Fourth formant bandwidth in Hz, 40 to 1000
    B4hz: Integer;
    /// Fifth formant frequency in Hz, 1200 to 4999
    F5hz: Integer;
    /// Fifth formant bandwidth in Hz, 40 to 1000
    B5hz: Integer;
    /// Sixth formant frequency in Hz, 1200 to 4999
    F6hz: Integer;
    /// Sixth formant bandwidth in Hz, 40 to 2000
    B6hz: Integer;
    /// Nasal zero frequency in Hz, 248 to 528
    NasalZeroFrequency: Integer;
    /// Nasal zero bandwidth in Hz, 40 to 1000
    BNZhz: Integer;
    /// Nasal pole frequency in Hz, 248 to 528
    FNPhz: Integer;
    /// Nasal pole bandwidth in Hz, 40 to 1000
    BNPhz: Integer;
    /// Amplitude of aspiration in dB, 0 to 70
    ASP: Integer;
    /// Number of samples in the open period, 10 to 65
    Kopen: Integer;
    /// Breathiness in voicing, 0 to 80
    Aturb: Integer;
    /// Voicing spectral tilt in dB, 0 to 24
    TLTdb: Integer;
    /// Amplitude of frication in dB, 0 to 80
    AF: Integer;
    /// Skewness of alternate periods, 0 to 40 in sample#/2
    Kskew: Integer;
    /// Amplitude of parallel 1st formant in dB, 0 to 80
    A1dB: Integer;
    /// Parallel 1st formant bandwidth in Hz, 40 to 1000
    B1phz: Integer;
    /// Amplitude of F2 frication in dB, 0 to 80
    A2dB: Integer;
    /// Parallel 2nd formant bandwidth in Hz, 40 to 1000
    B2phz: Integer;
    /// Amplitude of F3 frication in dB, 0 to 80
    A3dB: Integer;
    /// Parallel 3rd formant bandwidth in Hz, 40 to 1000
    B3phz: Integer;
    /// Amplitude of F4 frication in dB, 0 to 80
    A4dB: Integer;
    /// Parallel 4th formant bandwidth in Hz, 40 to 1000
    B4phz: Integer;
    /// Amplitude of F5 frication in dB, 0 to 80
    A5dB: Integer;
    /// Parallel 5th formant bandwidth in Hz, 40 to 1000
    B5phz: Integer;
    /// Amplitude of F6 (same as rp[6].a), 0 to 80
    A6dB: Integer;
    /// Parallel 6th formant bandwidth in Hz, 40 to 2000
    B6phz: Integer;
    /// Amplitude of parallel nasal pole in dB, 0 to 80
    ANPdB: Integer;
    /// Amplitude of bypass frication in dB, 0 to 80
    ByPassPathAmp: Integer;
    /// Amplitude of voicing for the parallel branch in dB, 0 to 70
    AVpdB: Integer;
    /// Overall gain, 60 dB is unity, 0 to 60
    Gain0dB: Integer;
  end;

  /// Synthesizer state ("Klatt globals") plus the rendering routines.
  /// NOTE(review): the bare `TArray` in RenderParWave/Render looks like a generic
  /// whose type argument was stripped when this listing was extracted
  /// (the element type must be a floating-point type) - confirm against the repository.
  TKlattSynth = class
  public
    /// Cascade-parallel or all-parallel
    SynthesisModel: TSynthesisModel;
    /// Output waveform selector
    OutputChannel: TOutputChannel;
    /// Number of output samples per second
    SampleRateHz: Integer;
    /// Frequency of glottal downsample low-pass filter
    FLPhz: Integer;
    /// Bandwidth of glottal downsample low-pass filter
    BLPhz: Integer;
    /// Number of formants in cascade vocal tract
    nfcascade: Integer;
    /// Type of glottal source
    VoicingSource: TVoicingSource;
    /// Percentage of f0 flutter, 0-100
    f0_flutter: Integer;
    /// When False, the (currently commented-out) warnings would be printed
    Quiet: Boolean;
    /// Number of samples per frame
    SamplesPerFrame: Integer;
    /// Counter for number of samples in a pitch period
    nper: Integer;
    /// Incremented once per rendered output sample
    CurrentSample: Integer;
    /// Fundamental period in output samples times 4
    T0: Integer;
    /// Number of samples in open phase of period
    nopen: Integer;
    /// Position in period to begin noise amplitude modulation
    nmod: Integer;
    /// Latest raw random value drawn by GenerateNoise
    nrand: Integer;
    /// Makes waveshape of glottal pulse when open
    pulse_shape_a: Double;
    /// Makes waveshape of glottal pulse when open
    pulse_shape_b: Double;
    /// -pi / SampleRateHz (set by InitParWave)
    minus_pi_t: Double;
    /// 2 * pi / SampleRateHz (set by InitParWave)
    two_pi_t: Double;
    /// 1 - Decay, weight of the current sample in the spectral-tilt filter
    onemd: Single;
    /// Weight of the previous sample in the spectral-tilt filter
    Decay: Single;
    /// ByPassPathAmp converted to linear gain
    amp_bypas: Single;
    /// AVdb converted to linear gain
    amp_voice: Single;
    /// AVpdB converted to linear gain
    amp_par_voice: Single;
    /// ASP converted to linear gain
    amp_aspir: Single;
    /// AF converted to linear gain
    amp_frica: Single;
    /// Aturb converted to linear gain
    amp_breth: Single;
    /// Gain0dB converted to linear gain
    amp_gain0: Single;
    /// Array of glottal samples (not assigned by the visible code)
    NaturalSamples: array of Integer;
    /// Original value of f0, not modified by flutter
    original_f0: Integer;

    // Internal storage for resonators
    /// Parallel nasal pole
    rnpp: TResonator;
    /// Parallel formants 1..6
    rp: array [1 .. 6] of TResonator;
    /// Cascade formants 1..8
    rc: array [1 .. 8] of TResonator;
    /// Cascade nasal pole
    rnpc: TResonator;
    /// Nasal zero (anti-resonator)
    rnz: TResonator;
    /// Glottal pulse shaping filter for the impulsive source
    rgl: TResonator;
    /// Low-pass filter applied before downsampling the voicing source
    rlp: TResonator;
    /// Output low-pass filter
    rout: TResonator;

    /// Last noise value
    nlast: Single;
    /// Frame counter driving the flutter sine waves
    TimeCount: Integer;

    /// Frames appended by LoadFromFile
    Frames: array of TKlattFrame;

    constructor Create;
    procedure InitParWave;

    function GenerateNoise(aNoise: Single): Single;

  var
    /// Current input sample of the impulsive source
    vwave: Single;
    function ImpulsiveSource: Single;

  var
    /// Running value of the natural-source waveform
    vwave2: Single;
    function NaturalSource: Single;

    procedure SetABC(
      aResFrequencyHz: Integer;   { Frequency of resonator in Hz }
      aResBandWidthHz: Integer;   { Bandwidth of resonator in Hz }
      var aResonator: TResonator);

    procedure SetZeroABC(
      aResFreqHz: Integer;        { Frequency of resonator in Hz }
      aResBandWidthHz: Integer;   { Bandwidth of resonator in Hz }
      var aResonator: TResonator);

  var
    // Signal state carried from one rendered sample to the next
    noise: Single;
    voice: Single;
    vlast: Single;
    glotlast: Single;
    sourc: Single;

    procedure RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
    function Render: TArray;

  var
    /// Signed skew added to T0; its sign flips every pitch period
    Skew: Integer;
    procedure InitFrame(var aFrame: TKlattFrame);
    procedure Flutter(var aFrame: TKlattFrame);
    procedure pitch_synch_par_reset(var aFrame: TKlattFrame);

    procedure LoadFromFile(const aFileName: String);
  end;

{ Structure for Klatt Parameters }

/// Table-driven dB to linear-amplitude conversion.
function DBtoLIN(db: Integer): Single;
/// Linear amplitude to dB (20 * log10), floored at -200.
function LINtoDB(n: Double): Double;

implementation

///
/// Random number generator. The active code draws a random integer of about
/// -4096..+4096 (the earlier -8191..+8191 draw is left commented out).
/// Noise spectrum is tilted down by soft low-pass filter having a pole near
///
/// the origin in the z-plane, i.e.
/// output = input + (0.75 * lastoutput)
///
function TKlattSynth.GenerateNoise(aNoise: Single): Single;
begin
  // The incoming aNoise value is never read; the filter state lives in nlast.
  // Raw draw is kept in nrand because the breathiness path uses the
  // unfiltered value. (Earlier form: random(2 * 8191) - 8191.)
  nrand := Round(((Random * 2) - 1) * 1024 * 4);

  // One-pole low-pass: new output = raw + 0.75 * previous output.
  Result := nrand + (0.75 * nlast);
  nlast := Result;
end;

/// Set the default configuration: all-parallel model, natural voicing source,
/// 11025 Hz, no cascade formants, no flutter, normal output.
constructor TKlattSynth.Create;
begin
  SampleRateHz   := 11025;
  SynthesisModel := TSynthesisModel.AllParallel;
  VoicingSource  := TVoicingSource.Natural;
  OutputChannel  := TOutputChannel.OutputNone;
  // NaturalSamples is intentionally left unassigned here
  // (the original had "natural_samples := natural_samples" commented out).
  nfcascade      := 0;
  f0_flutter     := 0;
  Skew           := 0;
  Quiet          := False;
end;

/// Convert a linear amplitude to decibels: 20 * log10(n).
/// Anything not greater than 1E-12 is reported as -200 dB.
function LINtoDB(n: double): double;
begin
  if (n > 1E-12) then
    Result := Log10(n) * 20
  else
    Result := -200;
end;

/// Convert dB (0..87) to a linear amplitude via a lookup table scaled by 0.001.
/// Values outside 0..87 yield 0; table entries 0..12 are 0 as well.
function DBtoLIN(db: Integer): Single;
const
  {
    Alternative floating-point table, kept for reference (not used):

    1.295291, 1.455383, 1.635262, 1.837373, 2.064464, 2.319622, 2.606317, 2.928446,
    3.290389, 3.697066, 4.154007, 4.667423, 5.244296, 5.892467, 6.620750, 7.439045,
    8.358477, 9.391547, 10.552300, 11.856517, 13.321929, 14.968460, 16.818494,
    18.897185, 21.232792, 23.857069, 26.805696, 30.118759, 33.841303, 38.023936,
    42.723523, 48.003959, 53.937032, 60.603407, 68.093716, 76.509793, 85.966059,
    96.591078, 108.529301, 121.943035, 137.014646, 153.949041, 172.976450, 194.355562,
    218.377036, 245.367456, 275.693771, 309.768282, 348.054249, 391.072190, 439.406955,
    493.715680, 554.736719, 623.299685, 700.336724, 786.895196, 884.151905, 993.429107,
    1116.212480, 1254.171326, 1409.181265, 1583.349736, 1779.044647, 1998.926570,
    2245.984910, 2523.578550, 2835.481517, 3185.934289, 3579.701448, 4022.136458,
    4519.254448, 5077.813986, 5705.408973, 6410.571880, 7202.889753, 8093.134554,
    9093.409611, 10217.314169, 11480.128280, 12899.020540, 14493.281505, 16284.585961,
    18297.287597, 20558.750108, 23099.719223, 25954.740700, 29162.630000, 32767
  }
  AmpTable: array [0 .. 87] of Single = (
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22,
    25, 28, 32, 35, 40, 45, 51, 57, 64, 71, 80,
    90, 101, 114, 128, 142, 159, 179, 202, 227, 256,
    284, 318, 359, 405, 455, 512, 568, 638, 719, 811,
    911, 1024, 1137, 1276, 1438, 1622, 1823, 2048, 2273,
    2552, 2875, 3244, 3645, 4096, 4547, 5104, 5751, 6488,
    7291, 8192, 9093, 10207, 11502, 12976, 14582, 16384,
    18350, 20644, 23429, 26214, 29491, 32767
  );
begin
  if (db >= 0) and (db <= 87) then
    Result := AmpTable[db] * 0.001
  else
    Result := 0;
end;


///
/// This function adds F0 flutter, as specified in:
///
/// "Analysis, synthesis and perception of voice quality variations among
/// female and male talkers" D.H. Klatt and L.C. Klatt JASA 87(2) February 1990.
///
/// Flutter is added by applying a quasi-random element constructed from three
/// slowly varying sine waves.
///
/// Modifies aFrame.F0Hz10 in place and advances TimeCount by one per call.
///
procedure TKlattSynth.Flutter(var aFrame: TKlattFrame);
var
  depth, baseF0, wave1, wave2, wave3, deltaF0: Double;
begin
  depth  := f0_flutter / 50;
  baseF0 := original_f0 / 100;
  wave1  := sin(2 * PI * 12.7 * TimeCount);
  wave2  := sin(2 * PI * 7.1 * TimeCount);
  wave3  := sin(2 * PI * 4.7 * TimeCount);

  // The factor 10 converts the offset to the tenths-of-Hz unit of F0Hz10.
  deltaF0 := depth * baseF0 * (wave1 + wave2 + wave3) * 10;
  Inc(aFrame.F0Hz10, Round(deltaF0));
  Inc(TimeCount);
end;


///
/// Convert formant frequencies and bandwidth into resonator difference
/// equation constants.
///
/// With T = 1 / SampleRateHz and r = exp(-pi * bw * T):
///   c = -r^2,  b = 2 * r * cos(2 * pi * f * T),  a = 1 - b - c.
/// Relies on minus_pi_t and two_pi_t having been set by InitParWave.
procedure TKlattSynth.SetABC(
  aResFrequencyHz: Integer; { Frequency of resonator in Hz }
  aResBandWidthHz: Integer; { Bandwidth of resonator in Hz }
  var aResonator: TResonator);
var
  poleRadius: Single;
  exponent, angle: Double;
begin
  // r = exp(-pi bw t)
  exponent   := minus_pi_t * aResBandWidthHz;
  poleRadius := exp(exponent);

  // b = r * 2*cos(2 pi f t)
  angle        := two_pi_t * aResFrequencyHz;
  aResonator.b := poleRadius * cos(angle) * 2;

  // c = -r**2
  aResonator.c := -(poleRadius * poleRadius);

  // a = 1 - b - c
  aResonator.a := 1 - aResonator.b - aResonator.c;
end;


/// Convert formant frequencies and bandwidth into anti-resonator difference equation constants.
/// The frequency is negated first; a resulting value that is not negative
/// (i.e. an input of zero or below) is replaced by -1.
procedure TKlattSynth.SetZeroABC(
  aResFreqHz      : Integer; { Frequency of resonator in Hz }
  aResBandWidthHz : Integer; { Bandwidth of resonator in Hz }
  var aResonator  : TResonator);
var
  mirroredHz: Integer;
begin
  mirroredHz := -aResFreqHz;
  if (mirroredHz >= 0) then
    mirroredHz := -1;

  // First compute ordinary resonator coefficients
  SetABC(mirroredHz, aResBandWidthHz, aResonator);

  // Now convert to antiresonator coefficients (a'=1/a, b'=-b/a, c'=-c/a).
  // a must be inverted first: the next two lines use the already inverted value.
  aResonator.a := 1 / aResonator.a;
  aResonator.c := aResonator.c * -aResonator.a;
  aResonator.b := aResonator.b * -aResonator.a;
end;

/// Initialises all parameters used in parwave, sets resonator internal memory to zero.
procedure TKlattSynth.InitParWave;
var
  i, j: Integer;
begin
  // Glottal down-sampling low-pass filter: f = 0.095 * rate, bw = 0.063 * rate
  FLPhz := Round((950 * SampleRateHz) / 10000);
  BLPhz := Round((630 * SampleRateHz) / 10000);

  // Per-sample angular constants used by SetABC / SetZeroABC.
  // SampleRateHz must be non-zero before this is called.
  minus_pi_t := -PI / SampleRateHz;
  two_pi_t   := -2 * minus_pi_t;
  SetABC(FLPhz, BLPhz, rlp);

  nper  := 0;
  T0    := 0;
  nopen := 0;
  nmod  := 0;

  // Clear both delay taps of every resonator
  for i := 1 to 2 do
  begin
    rnpp.p[i] := 0;
    for j := 1 to 6 do
      rp[j].p[i] := 0;
    for j := 1 to 8 do
      rc[j].p[i] := 0;

    rnpc.p[i] := 0;
    rnz.p[i]  := 0;
    rgl.p[i]  := 0;
    rlp.p[i]  := 0;
    rout.p[i] := 0;
  end;
end;

/// Append frames read from a whitespace-separated text file of integers.
/// Every 40 consecutive values form one TKlattFrame, in field declaration order.
/// Frames already present are kept; new frames are appended after them.
/// The file is always closed, also when reading raises an exception.
/// Trailing whitespace or newlines at the end of the file do not produce an
/// extra frame.
procedure TKlattSynth.LoadFromFile(const aFileName: String);
const
  /// Number of control parameters
  cNumberOfParameters = 40;
var
  InFile        : TextFile;
  FrameParamPtr : ^Integer;
  ParIndex      : Integer;
  Value         : Integer;
begin
  // The loop below fills a frame by walking it as an array of Integer;
  // that is only valid while the record is exactly 40 Integers large.
  Assert(SizeOf(TKlattFrame) = cNumberOfParameters * SizeOf(Integer),
    'TKlattFrame layout no longer matches cNumberOfParameters');

  AssignFile(InFile, aFileName);
  Reset(InFile);
  try
    // SeekEof skips blanks, tabs and line ends before testing for end of file,
    // so a final newline is not mistaken for the start of another frame.
    while not SeekEof(InFile) do
    begin
      SetLength(Frames, Length(Frames) + 1);
      FrameParamPtr := @Frames[High(Frames)];
      for ParIndex := 1 to cNumberOfParameters do
      begin
        Read(InFile, Value);
        FrameParamPtr^ := Value;
        Inc(FrameParamPtr);
      end;
    end;
  finally
    CloseFile(InFile);
  end;
end;

/// Use parameters from the input frame to set up resonator coefficients.
procedure TKlattSynth.InitFrame(var aFrame: TKlattFrame);

  // Configure one parallel-branch resonator and fold its linear output
  // amplitude into the gain coefficient a.
  procedure SetParallel(aFreqHz, aBandWidthHz: Integer;
    var aRes: TResonator; aLinearAmp: Single);
  begin
    SetABC(aFreqHz, aBandWidthHz, aRes);
    aRes.a := aRes.a * aLinearAmp;
  end;

begin
  // Remember the unfluttered f0 in whole Hz (F0Hz10 is in tenths of Hz)
  original_f0 := Round(aFrame.F0Hz10 / 10);

  // Note: AVdb and Gain0dB are adjusted in the frame itself, so a frame
  // passed in twice is adjusted twice.
  aFrame.AVdb := Max(aFrame.AVdb - 7, 0);

  aFrame.Gain0dB := aFrame.Gain0dB - 3;
  if (aFrame.Gain0dB <= 0) then
    aFrame.Gain0dB := 57;

  // Source amplitudes, dB -> linear
  amp_aspir     := DBtoLIN(aFrame.ASP) * 0.05;
  amp_frica     := DBtoLIN(aFrame.AF) * 0.25;
  amp_par_voice := DBtoLIN(aFrame.AVpdB);
  amp_bypas     := DBtoLIN(aFrame.ByPassPathAmp) * 0.05;
  amp_gain0     := DBtoLIN(aFrame.Gain0dB);

  // Cascade resonators 1..4 are always configured, 5..8 only when enabled
  SetABC(aFrame.F1Hz, aFrame.B1Hz, rc[1]);
  SetABC(aFrame.F2Hz, aFrame.B2hz, rc[2]);
  SetABC(aFrame.F3hz, aFrame.B3hz, rc[3]);
  SetABC(aFrame.F4hz, aFrame.B4hz, rc[4]);
  if (nfcascade >= 5) then SetABC(aFrame.F5hz, aFrame.B5hz, rc[5]);
  if (nfcascade >= 6) then SetABC(aFrame.F6hz, aFrame.B6hz, rc[6]);
  if (nfcascade >= 7) then SetABC(6500, 500, rc[7]);
  if (nfcascade >= 8) then SetABC(7500, 600, rc[8]);

  // Nasal resonator and nasal zero antiresonator of the cascade branch
  SetABC(aFrame.FNPhz, aFrame.BNPhz, rnpc);
  SetZeroABC(aFrame.NasalZeroFrequency, aFrame.BNZhz, rnz);

  // Parallel resonators with their output amplitudes.
  // The per-formant scale factors of the reference implementation
  // (0.4, 0.60, 0.150, 0.060, 0.040, 0.022, 0.030) are disabled here.
  SetParallel(aFrame.F1Hz,  aFrame.B1phz, rp[1], DBtoLIN(aFrame.A1dB));
  SetParallel(aFrame.FNPhz, aFrame.BNPhz, rnpp,  DBtoLIN(aFrame.ANPdB));
  SetParallel(aFrame.F2Hz,  aFrame.B2phz, rp[2], DBtoLIN(aFrame.A2dB));
  SetParallel(aFrame.F3hz,  aFrame.B3phz, rp[3], DBtoLIN(aFrame.A3dB));
  SetParallel(aFrame.F4hz,  aFrame.B4phz, rp[4], DBtoLIN(aFrame.A4dB));
  SetParallel(aFrame.F5hz,  aFrame.B5phz, rp[5], DBtoLIN(aFrame.A5dB));
  SetParallel(aFrame.F6hz,  aFrame.B6phz, rp[6], DBtoLIN(aFrame.A6dB));

  // Output low-pass filter: f = 0, bw = half the sample rate
  SetABC(0, Round(SampleRateHz / 2), rout);
end;

///
/// Generate a low pass filtered train of impulses as an approximation of
/// a natural excitation waveform. Low-pass filter the differentiated impulse
/// with a critically-damped second-order filter, time constant proportional
/// to Kopen.
///
function TKlattSynth.ImpulsiveSource: Single;
const
  // Input for the first three samples of each period; zero afterwards
  doublet: array [0 .. 2] of Single = (0, 13000000, -13000000);
begin
  vwave := 0;
  if (nper < 3) then
    vwave := doublet[nper];

  Result := rgl.Resonate(vwave);
end;

///
/// Vwave is the differentiated glottal flow waveform, there is a weak
/// spectral zero around 800 Hz, magic constants a,b reset pitch synchronously.
///
function TKlattSynth.NaturalSource: Single;
begin
  // Closed phase: no excitation, and the running sum starts over
  if (nper >= nopen) then
  begin
    vwave2 := 0;
    Exit(0);
  end;

  // Open phase: the slope shrinks by b each sample and is accumulated
  pulse_shape_a := pulse_shape_a - pulse_shape_b;
  vwave2 := vwave2 + pulse_shape_a;
  Result := vwave2 * 0.028;
end;


///
/// function PITCH_SYNC_PAR_RESET
///
/// Reset selected parameters pitch-synchronously.
///
///
/// Constant B0 controls shape of glottal pulse as a function
/// of desired duration of open phase N0
/// (Note that N0 is specified in terms of 40,000 samples/sec of speech)
///
/// Assume voicing waveform V(t) has form: k1 t**2 - k2 t**3
///
/// If the radiation characteristic, a temporal derivative
/// is folded in, and we go from continuous time to discrete
/// integers n: dV/dt = vwave[n]
/// = sum over i=1,2,...,n of { a - (i * b) }
/// = a n - b/2 n**2
///
/// where the constants a and b control the detailed shape
/// and amplitude of the voicing waveform over the open
/// portion of the voicing cycle "nopen".
///
/// Let integral of dV/dt have no net dc flow --> a = (b * nopen) / 3
///
/// Let maximum of dUg(n)/dn be constant --> b = gain / (nopen * nopen)
/// meaning as nopen gets bigger, V has bigger peak proportional to n
///
/// Thus, to generate the table below for 40 <= nopen <= 263:
///
/// B0[nopen - 40] = 1920000 / (nopen * nopen)
///
/// For nopen beyond the table (possible with a non-impulsive source, where
/// nopen is only limited by T0) the same formula is evaluated directly
/// instead of reading past the end of the table.
///
/// May modify aFrame.Kskew (truncated to the closed phase of the period).
///
procedure TKlattSynth.pitch_synch_par_reset(var aFrame: TKlattFrame);
var
  temp : Integer;
  temp1: Single;
const
  B0: array [0 .. 223] of uint16 = (1200, 1142, 1088, 1038, 991, 948, 907, 869, 833, 799, 768, 738, 710, 683, 658, 634, 612, 590, 570, 551, 533, 515, 499, 483, 468, 454, 440, 427, 415, 403, 391, 380, 370, 360, 350, 341, 332, 323, 315, 307, 300, 292, 285,
    278, 272, 265, 259, 253, 247, 242, 237, 231, 226, 221, 217, 212, 208, 204, 199, 195, 192, 188, 184, 180, 177, 174, 170, 167, 164, 161, 158, 155, 153, 150, 147, 145, 142, 140, 137, 135, 133, 131, 128, 126, 124, 122, 120, 119, 117, 115, 113, 111, 110,
    108, 106, 105, 103, 102, 100, 99, 97, 96, 95, 93, 92, 91, 90, 88, 87, 86, 85, 84, 83, 82, 80, 79, 78, 77, 76, 75, 75, 74, 73, 72, 71, 70, 69, 68, 68, 67, 66, 65, 64, 64, 63, 62, 61, 61, 60, 59, 59, 58, 57, 57, 56, 56, 55, 55, 54, 54, 53, 53, 52, 52,
    51, 51, 50, 50, 49, 49, 48, 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 41, 41, 40, 40, 39, 39, 38, 38, 38, 38, 37, 37, 36, 36, 36, 36, 35, 35, 35, 35, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29,
    29, 28, 28, 28, 28, 27, 27);
  // Numerator of the B0 formula documented above
  cB0Gain = 1920000;
begin
  if (aFrame.F0Hz10 > 0) then
  begin
    // T0 is 4* the number of samples in one pitch period
    T0 := Round((40 * SampleRateHz) / aFrame.F0Hz10);

    amp_voice := DBtoLIN(aFrame.AVdb);

    // Duration of period before amplitude modulation
    nmod := T0;
    if (aFrame.AVdb > 0) then
      nmod := nmod shr 1;

    // Breathiness of voicing waveform
    amp_breth := DBtoLIN(aFrame.Aturb) * 0.1;

    // Set open phase of glottal period where 40 <= open phase <= 263
    nopen := 4 * aFrame.Kopen;

    if ((VoicingSource = Impulsive) and (nopen > 263)) then
      nopen := 263;

    if (nopen >= (T0 - 1)) then
    begin
      // Glottal open period cannot exceed T0, truncated
      // (the warning for this case is disabled)
      nopen := T0 - 2;
    end;

    if (nopen < 40) then
    begin
      // F0 max = 1000 Hz; minimum glottal open period is 10 samples
      // (the warning for this case is disabled)
      nopen := 40;
    end;

    // Reset a & b, which determine shape of "natural" glottal waveform.
    // nopen >= 40 is guaranteed above; only the upper bound needs checking.
    if (nopen - 40 <= High(B0)) then
      pulse_shape_b := B0[nopen - 40]
    else
      // Int64 keeps nopen * nopen from overflowing for very long periods
      pulse_shape_b := cB0Gain div (Int64(nopen) * nopen);
    pulse_shape_a := (pulse_shape_b * nopen) * 0.333;

    // Reset width of "impulsive" glottal pulse
    temp := Round(SampleRateHz / nopen);

    SetABC(0, temp, rgl);

    // Make gain at F1 about constant
    temp1 := nopen * 0.00833;
    rgl.a := rgl.a * temp1 * temp1;

    // Truncate skewness so as not to exceed duration of closed phase of glottal period.
    // (the warning for this case is disabled)
    temp := T0 - nopen;
    if (aFrame.Kskew > temp) then
      aFrame.Kskew := temp;

    // Keep the sign Skew currently has, take the magnitude from the frame
    if (Skew >= 0) then
      Skew := aFrame.Kskew
    else
      Skew := -aFrame.Kskew;

    // Add skewness to closed portion of voicing period,
    // then flip the sign so the next period is skewed the other way
    T0 := T0 + Skew;
    Skew := -Skew;
  end
  else
  begin
    T0 := 4; // Default for f0 undefined
    amp_voice := 0;
    nmod := T0;
    amp_breth := 0;
    pulse_shape_a := 0;
    pulse_shape_b := 0;
  end;

  // Reset these pars pitch synchronously or at update rate if f0=0
  // NOTE(review): RenderParWave increments CurrentSample before this routine
  // can be reached and never resets it, so "CurrentSample = 0" looks
  // unreachable from there - confirm the intended per-frame counter.
  if ((T0 <> 4) or (CurrentSample = 0)) then
  begin
    // Set one-pole low-pass filter that tilts glottal source
    Decay := (0.033 * aFrame.TLTdb);

    if (Decay > 0) then
      onemd := 1 - Decay
    else
      onemd := 1;
  end;
end;

// NOTE(review): Render is an unfinished stub. It sizes a local buffer, loops
// over Frames with an empty body and never assigns Result. The bare `TArray`
// looks like a generic whose type argument was lost when this listing was
// extracted - confirm against the repository.
function TKlattSynth.Render: TArray;
var f:TKlattFrame; s:TArray;
begin
  SetLength(s, cMaxSampleRateHz);
  for f in Frames do
    // (no loop body yet)
end;

///
/// Converts synthesis parameters to a waveform.
///
/// Renders SamplesPerFrame samples for aFrame into
/// aOutput[0 .. SamplesPerFrame - 1]. aOutput is grown when it is shorter than
/// SamplesPerFrame; it is never shrunk. aFrame is modified by InitFrame,
/// Flutter and pitch_synch_par_reset.
///
/// NOTE(review): the bare `TArray` looks like a generic whose type argument was
/// lost when this listing was extracted - confirm against the repository.
///
procedure TKlattSynth.RenderParWave(var aFrame: TKlattFrame; var aOutput: TArray);
var
  i                         : Integer;
  temp, outbypas            : Single;
  n4                        : Integer;
  frics, glotout, aspiration: Single;
  casc_next_in, par_glotout : Single;
begin
  InitFrame(aFrame); // get parameters for next frame of speech
  Flutter(aFrame);   // add f0 flutter

  // Never write past the end of the caller's buffer
  if (Length(aOutput) < SamplesPerFrame) then
    SetLength(aOutput, SamplesPerFrame);

  // MAIN LOOP, for each output sample of current frame:
  for i := 0 to SamplesPerFrame - 1 do
  begin
    Inc(CurrentSample);

    // Get low-passed random number for aspiration and frication noise
    noise := GenerateNoise(noise);

    // Amplitude modulate noise (reduce noise amplitude during
    // second half of glottal period) if voicing simultaneously present.
    if (nper > nmod) then
      noise := noise * 0.5;

    // Compute frication noise
    frics := amp_frica * noise;

    // Compute voicing waveform. Run glottal source simulation at 4
    // times normal sample rate to minimize quantization noise in
    // period of female voice.
    for n4 := 0 to 3 do
    begin
      // For any other source (Sampled is not implemented) voice keeps its
      // previous value.
      case (VoicingSource) of
        Impulsive : voice := ImpulsiveSource;
        Natural   : voice := NaturalSource;
      end;

      // Reset period when counter 'nper' reaches T0
      if (nper >= T0) then
      begin
        nper := 0;
        pitch_synch_par_reset(aFrame);
      end;

      // Low-pass filter voicing waveform before downsampling from 4*samrate
      // to samrate samples/sec. Resonator f=.09*samrate, bw=.06*samrate
      voice := rlp.Resonate(voice);

      // Increment counter that keeps track of 4*samrate samples per sec
      Inc(nper);
    end;

    // Tilt spectrum of voicing source down by soft low-pass filtering,
    // amount of tilt determined by TLTdb
    voice := (voice * onemd) + (vlast * Decay);
    vlast := voice;

    // Add breathiness during glottal open phase. Amount of breathiness
    // determined by parameter Aturb. Use nrand rather than noise because
    // noise is low-passed.
    if (nper < nopen) then
      voice := voice + (amp_breth * nrand);

    // Set amplitude of voicing
    glotout     := amp_voice * voice;
    par_glotout := amp_par_voice * voice;

    // Compute aspiration amplitude and add to voicing source
    aspiration  := amp_aspir * noise;
    glotout     := glotout + aspiration;
    par_glotout := par_glotout + aspiration;

    // Cascade vocal tract, excited by laryngeal sources.
    // Nasal antiresonator, then formants FNP, F5, F4, F3, F2, F1
    if (SynthesisModel <> TSynthesisModel.AllParallel) then
    begin
      casc_next_in := rnz.AntiResonate(glotout);
      casc_next_in := rnpc.Resonate(casc_next_in);
      // Do not use unless sample rate >= 16000
      if (nfcascade >= 8) then casc_next_in := rc[8].Resonate(casc_next_in);
      // Do not use unless sample rate >= 16000
      if (nfcascade >= 7) then casc_next_in := rc[7].Resonate(casc_next_in);
      // Do not use unless long vocal tract or sample rate increased
      if (nfcascade >= 6) then casc_next_in := rc[6].Resonate(casc_next_in);
      if (nfcascade >= 5) then casc_next_in := rc[5].Resonate(casc_next_in);
      if (nfcascade >= 4) then casc_next_in := rc[4].Resonate(casc_next_in);
      if (nfcascade >= 3) then casc_next_in := rc[3].Resonate(casc_next_in);
      if (nfcascade >= 2) then casc_next_in := rc[2].Resonate(casc_next_in);
      if (nfcascade >= 1) then
        aOutput[i] := rc[1].Resonate(casc_next_in)
      else
        // No cascade formants configured: start from silence instead of
        // accumulating onto whatever the buffer held before.
        aOutput[i] := 0;
    end
    else
    begin
      // we are not using the cascade tract, set out to zero
      aOutput[i] := 0;
    end;

    // Excite parallel F1 and FNP by voicing waveform
    sourc := par_glotout; // Source is voicing plus aspiration

    // Standard parallel vocal tract Formants F6,F5,F4,F3,F2,
    // outputs added with alternating sign. Sound source for other
    // parallel resonators is frication plus first difference of
    // voicing waveform.
    aOutput[i] := aOutput[i] + rp[1].Resonate(sourc);
    aOutput[i] := aOutput[i] + rnpp.Resonate(sourc);

    sourc    := frics + par_glotout - glotlast;
    glotlast := par_glotout;

    aOutput[i] := rp[6].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[5].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[4].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[3].Resonate(sourc) - aOutput[i];
    aOutput[i] := rp[2].Resonate(sourc) - aOutput[i];

    outbypas   := amp_bypas * sourc;
    aOutput[i] := outbypas - aOutput[i];
    aOutput[i] := aOutput[i] / 10;

    // NOTE(review): the output low-pass filter, overall gain and clipping
    // below run only when a diagnostic OutputChannel is selected; with
    // OutputNone the sample leaves as computed above. This is kept as found -
    // confirm whether the filter/gain/clip stage was meant to run always.
    if (OutputChannel <> OutputNone) then
    begin
      case OutputChannel of
        OutputNone       : ;
        OutputVoice      : aOutput[i] := voice;
        OutputAspiration : aOutput[i] := aspiration;
        OutputFrics      : aOutput[i] := frics;
        OutputGlotout    : aOutput[i] := glotout;
        OutputPar_glotout: aOutput[i] := par_glotout;
        OutputOutbypas   : aOutput[i] := outbypas;
        OutputSourc      : aOutput[i] := sourc;
      end;

      aOutput[i] := rout.Resonate(aOutput[i]);

      temp := aOutput[i] * amp_gain0/1000;

      // Clip to the 16-bit signed range
      if (temp < -32768) then
        temp := -32768;

      if (temp > 32767) then
        temp := 32767;

      aOutput[i] := temp;
    end;
  end;
end;

{ TResonator }

/// Anti-resonator (zero pair): y = a*x + b*x[n-1] + c*x[n-2].
/// The delay taps hold previous INPUT samples.
function TResonator.AntiResonate(aInput: Single): Single;
begin
  Result := a * aInput +
            b * p[1] +
            c * p[2];

  p[2] := p[1];
  p[1] := aInput;
end;

/// Generic resonator (pole pair): y = a*x + b*y[n-1] + c*y[n-2].
/// The delay taps hold previous OUTPUT samples, stored after rounding to Single.
function TResonator.Resonate(input: Single): Single;
begin
  Result := (a * input + b * p[1] + c * p[2]);
  p[2] := p[1];
  p[1] := Result;
end;

end.
// File: Klatt.dpr - command-line front end for the Klatt.ParWave synthesizer.
program Klatt;

{
  Description : Klatt synthesizer
  Author      : Wouter van Nifterick
}

{$APPTYPE CONSOLE}

{$R *.res}

uses
  SysUtils,
  WvN.Util.CmdLine,
  Klatt.ParWave in 'Klatt.ParWave.pas';

const
  /// Value of the -r option that selects "high byte first" raw output.
  /// Any other value writes the low byte first.
  cByteOrderHighFirst = 1;

type
  /// Where and how the rendered samples are written.
  TOutputMode = (
    omStdOutText,  // one decimal sample per line on standard output
    omFileText,    // one decimal sample per line in the -o file
    omFileRaw      // 16 bit signed samples, two bytes each, in the -o file
  );

  /// Options that belong to this program rather than to the synthesizer.
  TAppSettings = record
    InFileName        ,
    OutFileName       ,
    SampleFileName    : string;
    MsPerFrame        : Integer;
    DoOutputRawSample : Boolean;
    OutputByteOrder   : Byte;
    procedure Init;
  end;

/// Prints the command line help to standard output.
/// Note: -v is listed here but is not handled by Main, and the file given with
/// -V is stored but not loaded (see the disabled code in Main).
procedure Usage;
begin
  Writeln('Options...');
  Writeln('-h Displays this message');
  Writeln('-i sets input filename');
  Writeln('-o sets output filename');
  Writeln(' If output filename not specified, stdout is used');
  Writeln('-q quiet - print no messages');
  Writeln('-t select output waveform');
  Writeln('-c select cascade-parallel configuration');
  Writeln(' Parallel configuration is default');
  Writeln('-n Number of formants in cascade branch.');
  Writeln(' Default is 5');
  Writeln('-s set sample rate');
  Writeln('-f set number of milliseconds per frame, default 10');
  Writeln('-v Specifies voicing source.');
  Writeln(' 1=impulse train, 2=natural simulation, 3=sampled natural');
  Writeln(' Default is a simulation of natural voicing');
  Writeln('-V Input file of samples for natural voicing.');
  Writeln('-F percentage of f0 flutter');
  Writeln(' Default is 0');
  Writeln('-r output 16 bit signed integers rather than ASCII');
  Writeln(' integers. Value 1 gives high byte first, value 2 gives');
  Writeln(' low byte first.');
end;

/// Resets every field to its zero value and then applies the one non-zero
/// default: 10 ms per frame.
procedure TAppSettings.Init;
begin
  Self := Default(TAppSettings);
  MsPerFrame := 10;
end;

/// Rounds Value and limits it to the range of a 16 bit signed sample.
function ClampToInt16(const Value: Double): SmallInt;
begin
  if Value <= -32768 then
    Result := -32768
  else if Value >= 32767 then
    Result := 32767
  else
    Result := SmallInt(Round(Value));
end;

/// Parses the command line, loads the input file, renders every frame and
/// writes the samples either as text (stdout or the -o file) or as raw 16 bit
/// values (the -o file only).
/// Sets ExitCode to 2 and returns when the options are unusable.
procedure Main;
var
  RawOut       : File of Byte;
  TextOut      : TextFile;
  KlattSynth   : TKlattSynth;
  // NOTE(review): the element type was lost in the listing this was recovered
  // from ("TArray;"). Single is assumed because TResonator.Resonate works on
  // Single - confirm against the declaration of RenderParWave.
  FrameSamples : TArray<Single>;
  AppSettings  : TAppSettings;
  OutputMode   : TOutputMode;
  FrameIndex   : Integer;
  SampleIndex  : Integer;
  SampleBits   : Word;
  HighByte     : Byte;
  LowByte      : Byte;
begin
  if ParamCount = 0 then
  begin
    Usage;
    Halt(1);
  end;

  AppSettings.Init;

  KlattSynth := TKlattSynth.Create;
  try
    SetLength(FrameSamples, cMaxSampleRateHz);

    CommandLine.ProcessKeys( procedure(const key:char; const name,value:string )
      begin
        case Key of
          'i': AppSettings.InFileName    := Value;
          'o': AppSettings.OutFileName   := Value;
          'q': KlattSynth.Quiet          := TRUE;
          't': KlattSynth.OutputChannel  := TOutputChannel(StrToInt(Value));
          'c': begin KlattSynth.SynthesisModel := CascadeParallel; KlattSynth.nfcascade := 5; end;
          's': KlattSynth.SampleRateHz   := StrToInt(Value);
          'f': AppSettings.MsPerFrame    := StrToInt(Value);
       // 'v': Globals.glsource          := StrToInt(Value);
          'V': AppSettings.SampleFileName := Value;
          'h': begin Usage; Halt(1); end;
          'n': KlattSynth.nfcascade      := StrToInt(Value);
          'F': KlattSynth.f0_flutter     := StrToInt(Value);
          'r': begin AppSettings.DoOutputRawSample := TRUE; AppSettings.OutputByteOrder := StrToInt(Value); end;
        end;
      end
    );

    // NOTE(review): samples per frame would normally be rate * ms / 1000; the
    // divisor 2000 halves that. Left unchanged - confirm whether it is intended.
    KlattSynth.SamplesPerFrame := Round((KlattSynth.SampleRateHz * AppSettings.MsPerFrame) / 2000);

    // The buffer must hold at least one full frame.
    if KlattSynth.SamplesPerFrame > Length(FrameSamples) then
      SetLength(FrameSamples, KlattSynth.SamplesPerFrame);

    {
      TODO: loading of the -V sample file is disabled.

      if SampleFileName <> '' then
      begin
        AssignFile(InFile, SampleFileName);
        Reset(InFile);
        read(InFile, Globals.num_samples);
        read(InFile, Globals.SAMPLE_FACTOR);
        SetLength(Globals.natural_samples, length(natural_samples));
        for I := 0 to Globals.num_samples - 1 do
          read(InFile, Globals.natural_samples[I]);
        CloseFile(InFile);
      end;
    }

    if AppSettings.InFileName = '' then
    begin
      Writeln('Error: No inputfile given');
      ExitCode := 2;
      Exit;
    end;

    if AppSettings.OutFileName = '' then
    begin
      if AppSettings.DoOutputRawSample then
      begin
        // Raw bytes are only written through the typed output file.
        Writeln('Error: raw output (-r) requires an output file (-o)');
        ExitCode := 2;
        Exit;
      end;
      OutputMode := omStdOutText;
      // Samples go to stdout, so no progress text may be mixed into them.
      KlattSynth.Quiet := True;
    end
    else if AppSettings.DoOutputRawSample then
      OutputMode := omFileRaw
    else
      OutputMode := omFileText;

    case OutputMode of
      omFileRaw:
        begin
          AssignFile(RawOut, AppSettings.OutFileName);
          Rewrite(RawOut);
        end;
      omFileText:
        begin
          AssignFile(TextOut, AppSettings.OutFileName);
          Rewrite(TextOut);
        end;
    end;

    try
      KlattSynth.InitParWave;
      if not KlattSynth.Quiet then
        WriteLn('Reading ',AppSettings.InFileName,' ...');
      KlattSynth.LoadFromFile(AppSettings.InFileName);

      if not KlattSynth.Quiet then
      begin
        WriteLn(length(KlattSynth.Frames),' frames -> Rendering to ',length(KlattSynth.Frames)*KlattSynth.SamplesPerFrame, ' samples ...');
        WriteLn('Saving ', AppSettings.OutFileName,' ...');
      end;

      for FrameIndex := 0 to High(KlattSynth.Frames) do
      begin
        KlattSynth.RenderParWave(KlattSynth.Frames[FrameIndex], FrameSamples);

        for SampleIndex := 0 to KlattSynth.SamplesPerFrame - 1 do
          case OutputMode of
            omFileRaw:
              begin
                // Two's complement bit pattern of the signed 16 bit sample.
                SampleBits := Word(Integer(ClampToInt16(FrameSamples[SampleIndex])) and $FFFF);
                HighByte   := SampleBits shr 8;
                LowByte    := SampleBits and $FF;
                if AppSettings.OutputByteOrder = cByteOrderHighFirst then
                  Write(RawOut, HighByte, LowByte)
                else
                  Write(RawOut, LowByte, HighByte);
              end;
            omFileText:
              Writeln(TextOut, Format('%d', [Round(FrameSamples[SampleIndex])]));
            omStdOutText:
              Writeln(Format('%d', [Round(FrameSamples[SampleIndex])]));
          end;
      end;
    finally
      case OutputMode of
        omFileRaw:  CloseFile(RawOut);
        omFileText: CloseFile(TextOut);
      end;
    end;

    if not KlattSynth.Quiet then
      WriteLn('Done.');
  finally
    KlattSynth.Free;
  end;
end;

begin
  try
    Main;
  except
    on E: Exception do
    begin
      Writeln(E.ClassName, ': ', E.Message);
      // Report the failure to the calling shell as well.
      ExitCode := 1;
    end;
  end;
end.
170 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # delphi-klatt-synth 2 | Klatt Speech Synthesizer 3 | 4 | This can be used to do text-to-speech, or to generate singing sounds. 5 | 6 | It's a command-line tool that takes a Klatt file as input and generates a sample as output. 7 | 8 | The code is based on the Klatt Synthesizer, which was designed by Dennis H. Klatt in 1980. 9 | DECtalk, which was Stephen Hawking's voice, was largely based on this. 10 | --------------------------------------------------------------------------------