├── .gitattributes ├── .gitignore ├── README.md ├── __init__.py ├── example_workflows └── musetalk_vid2vid_example.json ├── musetalk └── whisper │ ├── __init__.py │ ├── __main__.py │ ├── assets │ ├── gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── mel_filters.npz │ └── multilingual │ │ ├── added_tokens.json │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── audio.py │ ├── checkpoints │ └── place_whisper_model_here.txt │ ├── decoding.py │ ├── model.py │ ├── normalizers │ ├── __init__.py │ ├── basic.py │ ├── english.json │ └── english.py │ ├── tokenizer.py │ ├── transcribe.py │ └── utils.py ├── nodes.py └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pretrained_models/ 2 | example_data/ 3 | results/ 4 | *.zip 5 | .vscode/ 6 | .hypothesis/ 7 | *.pt 8 | __pycache__ 9 | *.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI nodes to use MuseTalk 2 | 3 | Native (as much as possible) implementation of MuseTalk in ComfyUI. 
4 | 5 | ![image](https://github.com/kijai/ComfyUI-MuseTalk-KJ/assets/40791699/0d586490-ef1d-4076-931d-8b701e63d8de) 6 | 7 | 8 | 9 | Original repo: 10 | https://github.com/TMElyralab/MuseTalk 11 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 2 | 3 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /example_workflows/musetalk_vid2vid_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 127, 3 | "last_link_id": 295, 4 | "nodes": [ 5 | { 6 | "id": 72, 7 | "type": "DWPreprocessor", 8 | "pos": [ 9 | -1370, 10 | 290 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 198 15 | }, 16 | "flags": {}, 17 | "order": 13, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 164 24 | }, 25 | { 26 | "name": "resolution", 27 | "type": "INT", 28 | "link": 216, 29 | "widget": { 30 | "name": "resolution" 31 | } 32 | } 33 | ], 34 | "outputs": [ 35 | { 36 | "name": "IMAGE", 37 | "type": "IMAGE", 38 | "links": [ 39 | 167 40 | ], 41 | "shape": 3, 42 | "slot_index": 0 43 | }, 44 | { 45 | "name": "POSE_KEYPOINT", 46 | "type": "POSE_KEYPOINT", 47 | "links": [], 48 | "shape": 3, 49 | "slot_index": 1 50 | } 51 | ], 52 | "properties": { 53 | "Node name for S&R": "DWPreprocessor" 54 | }, 55 | "widgets_values": [ 56 | "disable", 57 | "disable", 58 | "enable", 59 | 512, 60 | "yolox_l.torchscript.pt", 61 | "dw-ll_ucoco_384_bs5.torchscript.pt" 62 | ] 63 | }, 64 | { 65 | "id": 76, 66 | "type": "ImageToMask", 67 | "pos": [ 68 | -1380, 69 | 540 70 | ], 71 | "size": { 72 | "0": 315, 73 | "1": 58 74 | }, 75 | "flags": {}, 76 | "order": 14, 77 | "mode": 0, 78 | "inputs": [ 79 | { 80 | "name": "image", 81 | 
"type": "IMAGE", 82 | "link": 167 83 | } 84 | ], 85 | "outputs": [ 86 | { 87 | "name": "MASK", 88 | "type": "MASK", 89 | "links": [ 90 | 207 91 | ], 92 | "shape": 3, 93 | "slot_index": 0 94 | } 95 | ], 96 | "properties": { 97 | "Node name for S&R": "ImageToMask" 98 | }, 99 | "widgets_values": [ 100 | "red" 101 | ] 102 | }, 103 | { 104 | "id": 47, 105 | "type": "ImageResize+", 106 | "pos": [ 107 | -1796, 108 | 292 109 | ], 110 | "size": { 111 | "0": 315, 112 | "1": 218 113 | }, 114 | "flags": {}, 115 | "order": 10, 116 | "mode": 0, 117 | "inputs": [ 118 | { 119 | "name": "image", 120 | "type": "IMAGE", 121 | "link": 245 122 | } 123 | ], 124 | "outputs": [ 125 | { 126 | "name": "IMAGE", 127 | "type": "IMAGE", 128 | "links": [ 129 | 164, 130 | 214, 131 | 215 132 | ], 133 | "shape": 3, 134 | "slot_index": 0 135 | }, 136 | { 137 | "name": "width", 138 | "type": "INT", 139 | "links": [ 140 | 217 141 | ], 142 | "shape": 3 143 | }, 144 | { 145 | "name": "height", 146 | "type": "INT", 147 | "links": [ 148 | 218 149 | ], 150 | "shape": 3 151 | } 152 | ], 153 | "properties": { 154 | "Node name for S&R": "ImageResize+" 155 | }, 156 | "widgets_values": [ 157 | 1024, 158 | 1024, 159 | "nearest", 160 | true, 161 | "always", 162 | 2 163 | ] 164 | }, 165 | { 166 | "id": 94, 167 | "type": "PixelPerfectResolution", 168 | "pos": [ 169 | -1521, 170 | 221 171 | ], 172 | "size": { 173 | "0": 393, 174 | "1": 106 175 | }, 176 | "flags": { 177 | "collapsed": true 178 | }, 179 | "order": 12, 180 | "mode": 0, 181 | "inputs": [ 182 | { 183 | "name": "original_image", 184 | "type": "IMAGE", 185 | "link": 215 186 | }, 187 | { 188 | "name": "image_gen_width", 189 | "type": "INT", 190 | "link": 217, 191 | "widget": { 192 | "name": "image_gen_width" 193 | }, 194 | "slot_index": 1 195 | }, 196 | { 197 | "name": "image_gen_height", 198 | "type": "INT", 199 | "link": 218, 200 | "widget": { 201 | "name": "image_gen_height" 202 | }, 203 | "slot_index": 2 204 | } 205 | ], 206 | "outputs": [ 207 | { 208 | 
"name": "RESOLUTION (INT)", 209 | "type": "INT", 210 | "links": [ 211 | 216 212 | ], 213 | "shape": 3, 214 | "slot_index": 0 215 | } 216 | ], 217 | "properties": { 218 | "Node name for S&R": "PixelPerfectResolution" 219 | }, 220 | "widgets_values": [ 221 | 512, 222 | 512, 223 | "Just Resize" 224 | ] 225 | }, 226 | { 227 | "id": 101, 228 | "type": "MaskToImage", 229 | "pos": [ 230 | -649, 231 | 316 232 | ], 233 | "size": { 234 | "0": 210, 235 | "1": 26 236 | }, 237 | "flags": {}, 238 | "order": 17, 239 | "mode": 0, 240 | "inputs": [ 241 | { 242 | "name": "mask", 243 | "type": "MASK", 244 | "link": 234 245 | } 246 | ], 247 | "outputs": [ 248 | { 249 | "name": "IMAGE", 250 | "type": "IMAGE", 251 | "links": [ 252 | 235 253 | ], 254 | "shape": 3, 255 | "slot_index": 0 256 | } 257 | ], 258 | "properties": { 259 | "Node name for S&R": "MaskToImage" 260 | } 261 | }, 262 | { 263 | "id": 92, 264 | "type": "GrowMaskWithBlur", 265 | "pos": [ 266 | -1010, 267 | 310 268 | ], 269 | "size": { 270 | "0": 315, 271 | "1": 246 272 | }, 273 | "flags": {}, 274 | "order": 15, 275 | "mode": 0, 276 | "inputs": [ 277 | { 278 | "name": "mask", 279 | "type": "MASK", 280 | "link": 207 281 | } 282 | ], 283 | "outputs": [ 284 | { 285 | "name": "mask", 286 | "type": "MASK", 287 | "links": [ 288 | 221, 289 | 234 290 | ], 291 | "shape": 3, 292 | "slot_index": 0 293 | }, 294 | { 295 | "name": "mask_inverted", 296 | "type": "MASK", 297 | "links": null, 298 | "shape": 3 299 | } 300 | ], 301 | "properties": { 302 | "Node name for S&R": "GrowMaskWithBlur" 303 | }, 304 | "widgets_values": [ 305 | 15, 306 | 0, 307 | true, 308 | false, 309 | 0, 310 | 1, 311 | 1, 312 | true 313 | ] 314 | }, 315 | { 316 | "id": 97, 317 | "type": "Display Any (rgthree)", 318 | "pos": [ 319 | -1738, 320 | 582 321 | ], 322 | "size": { 323 | "0": 226.42002868652344, 324 | "1": 116.54998779296875 325 | }, 326 | "flags": {}, 327 | "order": 11, 328 | "mode": 0, 329 | "inputs": [ 330 | { 331 | "name": "source", 332 | "type": "*", 
333 | "link": 251, 334 | "dir": 3 335 | } 336 | ], 337 | "properties": { 338 | "Node name for S&R": "Display Any (rgthree)" 339 | }, 340 | "widgets_values": [ 341 | "" 342 | ] 343 | }, 344 | { 345 | "id": 121, 346 | "type": "UNETLoader_MuseTalk", 347 | "pos": [ 348 | 450, 349 | -360 350 | ], 351 | "size": { 352 | "0": 214.1832275390625, 353 | "1": 26 354 | }, 355 | "flags": {}, 356 | "order": 0, 357 | "mode": 0, 358 | "outputs": [ 359 | { 360 | "name": "MODEL", 361 | "type": "MODEL", 362 | "links": [ 363 | 280 364 | ], 365 | "shape": 3, 366 | "slot_index": 0 367 | } 368 | ], 369 | "properties": { 370 | "Node name for S&R": "UNETLoader_MuseTalk" 371 | } 372 | }, 373 | { 374 | "id": 4, 375 | "type": "VAELoader", 376 | "pos": [ 377 | 290, 378 | -280 379 | ], 380 | "size": { 381 | "0": 379.3569641113281, 382 | "1": 58.21699523925781 383 | }, 384 | "flags": {}, 385 | "order": 1, 386 | "mode": 0, 387 | "outputs": [ 388 | { 389 | "name": "VAE", 390 | "type": "VAE", 391 | "links": [ 392 | 279 393 | ], 394 | "shape": 3, 395 | "slot_index": 0 396 | } 397 | ], 398 | "properties": { 399 | "Node name for S&R": "VAELoader" 400 | }, 401 | "widgets_values": [ 402 | "vae-ft-mse-840000-ema-pruned.safetensors" 403 | ] 404 | }, 405 | { 406 | "id": 27, 407 | "type": "vhs_audio_to_audio_tensor", 408 | "pos": [ 409 | -1720, 410 | -170 411 | ], 412 | "size": { 413 | "0": 315, 414 | "1": 102 415 | }, 416 | "flags": {}, 417 | "order": 6, 418 | "mode": 0, 419 | "inputs": [ 420 | { 421 | "name": "vhs_audio", 422 | "type": "VHS_AUDIO", 423 | "link": 45, 424 | "slot_index": 0 425 | } 426 | ], 427 | "outputs": [ 428 | { 429 | "name": "audio_tensor", 430 | "type": "VCAUDIOTENSOR", 431 | "links": [ 432 | 67 433 | ], 434 | "shape": 3, 435 | "slot_index": 0 436 | }, 437 | { 438 | "name": "audio_dur", 439 | "type": "INT", 440 | "links": null, 441 | "shape": 3 442 | } 443 | ], 444 | "properties": { 445 | "Node name for S&R": "vhs_audio_to_audio_tensor" 446 | }, 447 | "widgets_values": [ 448 | 16000, 
449 | 1 450 | ] 451 | }, 452 | { 453 | "id": 28, 454 | "type": "VHS_LoadAudio", 455 | "pos": [ 456 | -2120, 457 | -270 458 | ], 459 | "size": { 460 | "0": 315, 461 | "1": 82 462 | }, 463 | "flags": {}, 464 | "order": 2, 465 | "mode": 0, 466 | "outputs": [ 467 | { 468 | "name": "audio", 469 | "type": "VHS_AUDIO", 470 | "links": [ 471 | 45, 472 | 287 473 | ], 474 | "shape": 3, 475 | "slot_index": 0 476 | } 477 | ], 478 | "properties": { 479 | "Node name for S&R": "VHS_LoadAudio" 480 | }, 481 | "widgets_values": { 482 | "audio_file": "input/yongen.wav", 483 | "seek_seconds": 0 484 | } 485 | }, 486 | { 487 | "id": 124, 488 | "type": "SetNode", 489 | "pos": [ 490 | -1750, 491 | -340 492 | ], 493 | "size": { 494 | "0": 210, 495 | "1": 58 496 | }, 497 | "flags": { 498 | "collapsed": true 499 | }, 500 | "order": 7, 501 | "mode": 0, 502 | "inputs": [ 503 | { 504 | "name": "VHS_AUDIO", 505 | "type": "VHS_AUDIO", 506 | "link": 287 507 | } 508 | ], 509 | "outputs": [ 510 | { 511 | "name": "*", 512 | "type": "*", 513 | "links": null 514 | } 515 | ], 516 | "title": "Set_OriginaAudioVHS", 517 | "properties": { 518 | "previousName": "OriginaAudioVHS" 519 | }, 520 | "widgets_values": [ 521 | "OriginaAudioVHS" 522 | ] 523 | }, 524 | { 525 | "id": 89, 526 | "type": "VHS_LoadVideo", 527 | "pos": [ 528 | -2129, 529 | 279 530 | ], 531 | "size": [ 532 | 235.1999969482422, 533 | 377.04999828338623 534 | ], 535 | "flags": {}, 536 | "order": 9, 537 | "mode": 0, 538 | "inputs": [ 539 | { 540 | "name": "batch_manager", 541 | "type": "VHS_BatchManager", 542 | "link": null 543 | }, 544 | { 545 | "name": "frame_load_cap", 546 | "type": "INT", 547 | "link": 288, 548 | "widget": { 549 | "name": "frame_load_cap" 550 | } 551 | } 552 | ], 553 | "outputs": [ 554 | { 555 | "name": "IMAGE", 556 | "type": "IMAGE", 557 | "links": [ 558 | 245 559 | ], 560 | "shape": 3, 561 | "slot_index": 0 562 | }, 563 | { 564 | "name": "frame_count", 565 | "type": "INT", 566 | "links": [ 567 | 251 568 | ], 569 | "shape": 
3, 570 | "slot_index": 1 571 | }, 572 | { 573 | "name": "audio", 574 | "type": "VHS_AUDIO", 575 | "links": null, 576 | "shape": 3 577 | } 578 | ], 579 | "properties": { 580 | "Node name for S&R": "VHS_LoadVideo" 581 | }, 582 | "widgets_values": { 583 | "video": "istockphoto-1139641392-640_adpp_is_yongen.mp4", 584 | "force_rate": 0, 585 | "force_size": "Disabled", 586 | "custom_width": 512, 587 | "custom_height": 512, 588 | "frame_load_cap": 194, 589 | "skip_first_frames": 0, 590 | "select_every_nth": 1, 591 | "choose video to upload": "image", 592 | "videopreview": { 593 | "hidden": false, 594 | "paused": false, 595 | "params": { 596 | "frame_load_cap": 194, 597 | "skip_first_frames": 0, 598 | "force_rate": 0, 599 | "filename": "istockphoto-1139641392-640_adpp_is_yongen.mp4", 600 | "type": "input", 601 | "format": "video/mp4", 602 | "select_every_nth": 1 603 | } 604 | } 605 | } 606 | }, 607 | { 608 | "id": 36, 609 | "type": "whisper_to_features", 610 | "pos": [ 611 | -1340, 612 | -160 613 | ], 614 | "size": { 615 | "0": 342.5999755859375, 616 | "1": 78 617 | }, 618 | "flags": {}, 619 | "order": 8, 620 | "mode": 0, 621 | "inputs": [ 622 | { 623 | "name": "audio_tensor", 624 | "type": "VCAUDIOTENSOR", 625 | "link": 67, 626 | "slot_index": 0 627 | } 628 | ], 629 | "outputs": [ 630 | { 631 | "name": "whisper_chunks", 632 | "type": "WHISPERFEAT", 633 | "links": [ 634 | 281 635 | ], 636 | "shape": 3, 637 | "slot_index": 0 638 | }, 639 | { 640 | "name": "frame_count", 641 | "type": "INT", 642 | "links": [ 643 | 288 644 | ], 645 | "shape": 3, 646 | "slot_index": 1 647 | } 648 | ], 649 | "properties": { 650 | "Node name for S&R": "whisper_to_features" 651 | }, 652 | "widgets_values": [ 653 | 24 654 | ] 655 | }, 656 | { 657 | "id": 125, 658 | "type": "GetNode", 659 | "pos": [ 660 | 1180, 661 | -540 662 | ], 663 | "size": { 664 | "0": 210, 665 | "1": 58 666 | }, 667 | "flags": { 668 | "collapsed": true 669 | }, 670 | "order": 3, 671 | "mode": 0, 672 | "outputs": [ 673 | { 674 
| "name": "VHS_AUDIO", 675 | "type": "VHS_AUDIO", 676 | "links": [ 677 | 289 678 | ], 679 | "slot_index": 0 680 | } 681 | ], 682 | "title": "Get_OriginaAudioVHS", 683 | "properties": {}, 684 | "widgets_values": [ 685 | "OriginaAudioVHS" 686 | ] 687 | }, 688 | { 689 | "id": 99, 690 | "type": "VHS_VideoCombine", 691 | "pos": [ 692 | 460, 693 | 300 694 | ], 695 | "size": [ 696 | 437.7622375488281, 697 | 721.7622375488281 698 | ], 699 | "flags": {}, 700 | "order": 22, 701 | "mode": 0, 702 | "inputs": [ 703 | { 704 | "name": "images", 705 | "type": "IMAGE", 706 | "link": 230, 707 | "slot_index": 0 708 | }, 709 | { 710 | "name": "audio", 711 | "type": "VHS_AUDIO", 712 | "link": null 713 | }, 714 | { 715 | "name": "batch_manager", 716 | "type": "VHS_BatchManager", 717 | "link": null 718 | } 719 | ], 720 | "outputs": [ 721 | { 722 | "name": "Filenames", 723 | "type": "VHS_FILENAMES", 724 | "links": null, 725 | "shape": 3 726 | } 727 | ], 728 | "properties": { 729 | "Node name for S&R": "VHS_VideoCombine" 730 | }, 731 | "widgets_values": { 732 | "frame_rate": 25, 733 | "loop_count": 0, 734 | "filename_prefix": "MuseTalkCrop", 735 | "format": "video/h264-mp4", 736 | "pix_fmt": "yuv420p", 737 | "crf": 19, 738 | "save_metadata": true, 739 | "pingpong": false, 740 | "save_output": false, 741 | "videopreview": { 742 | "hidden": false, 743 | "paused": false, 744 | "params": { 745 | "filename": "MuseTalkCrop_00002.mp4", 746 | "subfolder": "", 747 | "type": "temp", 748 | "format": "video/h264-mp4" 749 | } 750 | } 751 | } 752 | }, 753 | { 754 | "id": 16, 755 | "type": "ImageCompositeMasked", 756 | "pos": [ 757 | 490, 758 | 70 759 | ], 760 | "size": { 761 | "0": 315, 762 | "1": 146 763 | }, 764 | "flags": {}, 765 | "order": 20, 766 | "mode": 0, 767 | "inputs": [ 768 | { 769 | "name": "destination", 770 | "type": "IMAGE", 771 | "link": 56 772 | }, 773 | { 774 | "name": "source", 775 | "type": "IMAGE", 776 | "link": 18 777 | }, 778 | { 779 | "name": "mask", 780 | "type": "MASK", 781 | 
"link": null 782 | } 783 | ], 784 | "outputs": [ 785 | { 786 | "name": "IMAGE", 787 | "type": "IMAGE", 788 | "links": [ 789 | 230, 790 | 285 791 | ], 792 | "shape": 3, 793 | "slot_index": 0 794 | } 795 | ], 796 | "properties": { 797 | "Node name for S&R": "ImageCompositeMasked" 798 | }, 799 | "widgets_values": [ 800 | 0, 801 | 128, 802 | false 803 | ] 804 | }, 805 | { 806 | "id": 15, 807 | "type": "EmptyImage", 808 | "pos": [ 809 | 500, 810 | 20 811 | ], 812 | "size": { 813 | "0": 315, 814 | "1": 130 815 | }, 816 | "flags": { 817 | "collapsed": true 818 | }, 819 | "order": 4, 820 | "mode": 0, 821 | "outputs": [ 822 | { 823 | "name": "IMAGE", 824 | "type": "IMAGE", 825 | "links": [ 826 | 18 827 | ], 828 | "shape": 3, 829 | "slot_index": 0 830 | } 831 | ], 832 | "properties": { 833 | "Node name for S&R": "EmptyImage" 834 | }, 835 | "widgets_values": [ 836 | 256, 837 | 256, 838 | 1, 839 | 0 840 | ] 841 | }, 842 | { 843 | "id": 98, 844 | "type": "VHS_VideoCombine", 845 | "pos": [ 846 | -10, 847 | 300 848 | ], 849 | "size": [ 850 | 437.7622375488281, 851 | 721.7622375488281 852 | ], 853 | "flags": {}, 854 | "order": 21, 855 | "mode": 0, 856 | "inputs": [ 857 | { 858 | "name": "images", 859 | "type": "IMAGE", 860 | "link": 233, 861 | "slot_index": 0 862 | }, 863 | { 864 | "name": "audio", 865 | "type": "VHS_AUDIO", 866 | "link": null 867 | }, 868 | { 869 | "name": "batch_manager", 870 | "type": "VHS_BatchManager", 871 | "link": null 872 | } 873 | ], 874 | "outputs": [ 875 | { 876 | "name": "Filenames", 877 | "type": "VHS_FILENAMES", 878 | "links": null, 879 | "shape": 3 880 | } 881 | ], 882 | "properties": { 883 | "Node name for S&R": "VHS_VideoCombine" 884 | }, 885 | "widgets_values": { 886 | "frame_rate": 25, 887 | "loop_count": 0, 888 | "filename_prefix": "MuseTalkCrop", 889 | "format": "video/h264-mp4", 890 | "pix_fmt": "yuv420p", 891 | "crf": 19, 892 | "save_metadata": true, 893 | "pingpong": false, 894 | "save_output": false, 895 | "videopreview": { 896 | "hidden": 
false, 897 | "paused": false, 898 | "params": { 899 | "filename": "MuseTalkCrop_00001.mp4", 900 | "subfolder": "", 901 | "type": "temp", 902 | "format": "video/h264-mp4" 903 | } 904 | } 905 | } 906 | }, 907 | { 908 | "id": 100, 909 | "type": "VHS_VideoCombine", 910 | "pos": [ 911 | -539, 912 | 412 913 | ], 914 | "size": [ 915 | 437.7622375488281, 916 | 466.9912586212158 917 | ], 918 | "flags": {}, 919 | "order": 19, 920 | "mode": 0, 921 | "inputs": [ 922 | { 923 | "name": "images", 924 | "type": "IMAGE", 925 | "link": 235, 926 | "slot_index": 0 927 | }, 928 | { 929 | "name": "audio", 930 | "type": "VHS_AUDIO", 931 | "link": null 932 | }, 933 | { 934 | "name": "batch_manager", 935 | "type": "VHS_BatchManager", 936 | "link": null 937 | } 938 | ], 939 | "outputs": [ 940 | { 941 | "name": "Filenames", 942 | "type": "VHS_FILENAMES", 943 | "links": null, 944 | "shape": 3 945 | } 946 | ], 947 | "properties": { 948 | "Node name for S&R": "VHS_VideoCombine" 949 | }, 950 | "widgets_values": { 951 | "frame_rate": 25, 952 | "loop_count": 0, 953 | "filename_prefix": "Masks", 954 | "format": "image/webp", 955 | "pingpong": false, 956 | "save_output": false, 957 | "videopreview": { 958 | "hidden": false, 959 | "paused": false, 960 | "params": { 961 | "filename": "Masks_00001.webp", 962 | "subfolder": "", 963 | "type": "temp", 964 | "format": "image/webp" 965 | } 966 | } 967 | } 968 | }, 969 | { 970 | "id": 122, 971 | "type": "muse_talk_sampler", 972 | "pos": [ 973 | 770, 974 | -200 975 | ], 976 | "size": { 977 | "0": 315, 978 | "1": 162 979 | }, 980 | "flags": {}, 981 | "order": 23, 982 | "mode": 0, 983 | "inputs": [ 984 | { 985 | "name": "model", 986 | "type": "MODEL", 987 | "link": 280 988 | }, 989 | { 990 | "name": "vae", 991 | "type": "VAE", 992 | "link": 279, 993 | "slot_index": 1 994 | }, 995 | { 996 | "name": "whisper_features", 997 | "type": "WHISPERFEAT", 998 | "link": 281, 999 | "slot_index": 2 1000 | }, 1001 | { 1002 | "name": "images", 1003 | "type": "IMAGE", 1004 | 
"link": 284, 1005 | "slot_index": 3 1006 | }, 1007 | { 1008 | "name": "masked_images", 1009 | "type": "IMAGE", 1010 | "link": 285, 1011 | "slot_index": 4 1012 | } 1013 | ], 1014 | "outputs": [ 1015 | { 1016 | "name": "image", 1017 | "type": "IMAGE", 1018 | "links": [ 1019 | 282, 1020 | 295 1021 | ], 1022 | "shape": 3, 1023 | "slot_index": 0 1024 | } 1025 | ], 1026 | "properties": { 1027 | "Node name for S&R": "muse_talk_sampler" 1028 | }, 1029 | "widgets_values": [ 1030 | 16, 1031 | 0 1032 | ] 1033 | }, 1034 | { 1035 | "id": 30, 1036 | "type": "VHS_VideoCombine", 1037 | "pos": [ 1038 | 1390, 1039 | -600 1040 | ], 1041 | "size": [ 1042 | 421.8526815820319, 1043 | 705.8526815820319 1044 | ], 1045 | "flags": {}, 1046 | "order": 24, 1047 | "mode": 0, 1048 | "inputs": [ 1049 | { 1050 | "name": "images", 1051 | "type": "IMAGE", 1052 | "link": 282, 1053 | "slot_index": 0 1054 | }, 1055 | { 1056 | "name": "audio", 1057 | "type": "VHS_AUDIO", 1058 | "link": 289 1059 | }, 1060 | { 1061 | "name": "batch_manager", 1062 | "type": "VHS_BatchManager", 1063 | "link": null 1064 | } 1065 | ], 1066 | "outputs": [ 1067 | { 1068 | "name": "Filenames", 1069 | "type": "VHS_FILENAMES", 1070 | "links": null, 1071 | "shape": 3 1072 | } 1073 | ], 1074 | "properties": { 1075 | "Node name for S&R": "VHS_VideoCombine" 1076 | }, 1077 | "widgets_values": { 1078 | "frame_rate": 25, 1079 | "loop_count": 0, 1080 | "filename_prefix": "MuseTalkCrop", 1081 | "format": "video/h264-mp4", 1082 | "pix_fmt": "yuv420p", 1083 | "crf": 19, 1084 | "save_metadata": true, 1085 | "pingpong": false, 1086 | "save_output": false, 1087 | "videopreview": { 1088 | "hidden": false, 1089 | "paused": false, 1090 | "params": { 1091 | "filename": "MuseTalkCrop_00003-audio.mp4", 1092 | "subfolder": "", 1093 | "type": "temp", 1094 | "format": "video/h264-mp4" 1095 | } 1096 | } 1097 | } 1098 | }, 1099 | { 1100 | "id": 126, 1101 | "type": "GetNode", 1102 | "pos": [ 1103 | 1164, 1104 | 459 1105 | ], 1106 | "size": { 1107 | "0": 
210, 1108 | "1": 58 1109 | }, 1110 | "flags": { 1111 | "collapsed": true 1112 | }, 1113 | "order": 5, 1114 | "mode": 0, 1115 | "outputs": [ 1116 | { 1117 | "name": "VHS_AUDIO", 1118 | "type": "VHS_AUDIO", 1119 | "links": [ 1120 | 290 1121 | ], 1122 | "slot_index": 0 1123 | } 1124 | ], 1125 | "title": "Get_OriginaAudioVHS", 1126 | "properties": {}, 1127 | "widgets_values": [ 1128 | "OriginaAudioVHS" 1129 | ] 1130 | }, 1131 | { 1132 | "id": 96, 1133 | "type": "VHS_VideoCombine", 1134 | "pos": [ 1135 | 1512, 1136 | 422 1137 | ], 1138 | "size": [ 1139 | 830.9005747743759, 1140 | 788.3825163935942 1141 | ], 1142 | "flags": {}, 1143 | "order": 26, 1144 | "mode": 0, 1145 | "inputs": [ 1146 | { 1147 | "name": "images", 1148 | "type": "IMAGE", 1149 | "link": 225, 1150 | "slot_index": 0 1151 | }, 1152 | { 1153 | "name": "audio", 1154 | "type": "VHS_AUDIO", 1155 | "link": 290, 1156 | "slot_index": 1 1157 | }, 1158 | { 1159 | "name": "batch_manager", 1160 | "type": "VHS_BatchManager", 1161 | "link": null 1162 | } 1163 | ], 1164 | "outputs": [ 1165 | { 1166 | "name": "Filenames", 1167 | "type": "VHS_FILENAMES", 1168 | "links": null, 1169 | "shape": 3 1170 | } 1171 | ], 1172 | "properties": { 1173 | "Node name for S&R": "VHS_VideoCombine" 1174 | }, 1175 | "widgets_values": { 1176 | "frame_rate": 25, 1177 | "loop_count": 0, 1178 | "filename_prefix": "MuseTalk", 1179 | "format": "video/h264-mp4", 1180 | "pix_fmt": "yuv420p", 1181 | "crf": 19, 1182 | "save_metadata": true, 1183 | "pingpong": false, 1184 | "save_output": false, 1185 | "videopreview": { 1186 | "hidden": false, 1187 | "paused": false, 1188 | "params": { 1189 | "filename": "MuseTalk_00001-audio.mp4", 1190 | "subfolder": "", 1191 | "type": "temp", 1192 | "format": "video/h264-mp4" 1193 | } 1194 | } 1195 | } 1196 | }, 1197 | { 1198 | "id": 31, 1199 | "type": "ImageResize+", 1200 | "pos": [ 1201 | 60, 1202 | 10 1203 | ], 1204 | "size": { 1205 | "0": 315, 1206 | "1": 218 1207 | }, 1208 | "flags": {}, 1209 | "order": 18, 
1210 | "mode": 0, 1211 | "inputs": [ 1212 | { 1213 | "name": "image", 1214 | "type": "IMAGE", 1215 | "link": 220, 1216 | "slot_index": 0 1217 | } 1218 | ], 1219 | "outputs": [ 1220 | { 1221 | "name": "IMAGE", 1222 | "type": "IMAGE", 1223 | "links": [ 1224 | 56, 1225 | 233, 1226 | 284 1227 | ], 1228 | "shape": 3, 1229 | "slot_index": 0 1230 | }, 1231 | { 1232 | "name": "width", 1233 | "type": "INT", 1234 | "links": null, 1235 | "shape": 3 1236 | }, 1237 | { 1238 | "name": "height", 1239 | "type": "INT", 1240 | "links": null, 1241 | "shape": 3 1242 | } 1243 | ], 1244 | "properties": { 1245 | "Node name for S&R": "ImageResize+" 1246 | }, 1247 | "widgets_values": [ 1248 | 256, 1249 | 256, 1250 | "nearest", 1251 | false, 1252 | "always", 1253 | 0 1254 | ] 1255 | }, 1256 | { 1257 | "id": 95, 1258 | "type": "BatchUncrop", 1259 | "pos": [ 1260 | 1020, 1261 | 1120 1262 | ], 1263 | "size": { 1264 | "0": 210, 1265 | "1": 218 1266 | }, 1267 | "flags": {}, 1268 | "order": 25, 1269 | "mode": 0, 1270 | "inputs": [ 1271 | { 1272 | "name": "original_images", 1273 | "type": "IMAGE", 1274 | "link": 294 1275 | }, 1276 | { 1277 | "name": "cropped_images", 1278 | "type": "IMAGE", 1279 | "link": 295 1280 | }, 1281 | { 1282 | "name": "bboxes", 1283 | "type": "BBOX", 1284 | "link": 224 1285 | } 1286 | ], 1287 | "outputs": [ 1288 | { 1289 | "name": "IMAGE", 1290 | "type": "IMAGE", 1291 | "links": [ 1292 | 225 1293 | ], 1294 | "shape": 3, 1295 | "slot_index": 0 1296 | } 1297 | ], 1298 | "properties": { 1299 | "Node name for S&R": "BatchUncrop" 1300 | }, 1301 | "widgets_values": [ 1302 | 0.25, 1303 | 1, 1304 | true, 1305 | true, 1306 | true, 1307 | true 1308 | ] 1309 | }, 1310 | { 1311 | "id": 86, 1312 | "type": "BatchCropFromMask", 1313 | "pos": [ 1314 | -900, 1315 | 1140 1316 | ], 1317 | "size": { 1318 | "0": 393, 1319 | "1": 162 1320 | }, 1321 | "flags": {}, 1322 | "order": 16, 1323 | "mode": 0, 1324 | "inputs": [ 1325 | { 1326 | "name": "original_images", 1327 | "type": "IMAGE", 1328 | 
"link": 214 1329 | }, 1330 | { 1331 | "name": "masks", 1332 | "type": "MASK", 1333 | "link": 221 1334 | } 1335 | ], 1336 | "outputs": [ 1337 | { 1338 | "name": "original_images", 1339 | "type": "IMAGE", 1340 | "links": [ 1341 | 294 1342 | ], 1343 | "shape": 3, 1344 | "slot_index": 0 1345 | }, 1346 | { 1347 | "name": "cropped_images", 1348 | "type": "IMAGE", 1349 | "links": [ 1350 | 220 1351 | ], 1352 | "shape": 3, 1353 | "slot_index": 1 1354 | }, 1355 | { 1356 | "name": "bboxes", 1357 | "type": "BBOX", 1358 | "links": [ 1359 | 224 1360 | ], 1361 | "shape": 3, 1362 | "slot_index": 2 1363 | }, 1364 | { 1365 | "name": "width", 1366 | "type": "INT", 1367 | "links": null, 1368 | "shape": 3 1369 | }, 1370 | { 1371 | "name": "height", 1372 | "type": "INT", 1373 | "links": null, 1374 | "shape": 3 1375 | } 1376 | ], 1377 | "properties": { 1378 | "Node name for S&R": "BatchCropFromMask" 1379 | }, 1380 | "widgets_values": [ 1381 | 1.801, 1382 | 0.5 1383 | ] 1384 | } 1385 | ], 1386 | "links": [ 1387 | [ 1388 | 18, 1389 | 15, 1390 | 0, 1391 | 16, 1392 | 1, 1393 | "IMAGE" 1394 | ], 1395 | [ 1396 | 45, 1397 | 28, 1398 | 0, 1399 | 27, 1400 | 0, 1401 | "VHS_AUDIO" 1402 | ], 1403 | [ 1404 | 56, 1405 | 31, 1406 | 0, 1407 | 16, 1408 | 0, 1409 | "IMAGE" 1410 | ], 1411 | [ 1412 | 67, 1413 | 27, 1414 | 0, 1415 | 36, 1416 | 0, 1417 | "VCAUDIOTENSOR" 1418 | ], 1419 | [ 1420 | 164, 1421 | 47, 1422 | 0, 1423 | 72, 1424 | 0, 1425 | "IMAGE" 1426 | ], 1427 | [ 1428 | 167, 1429 | 72, 1430 | 0, 1431 | 76, 1432 | 0, 1433 | "IMAGE" 1434 | ], 1435 | [ 1436 | 207, 1437 | 76, 1438 | 0, 1439 | 92, 1440 | 0, 1441 | "MASK" 1442 | ], 1443 | [ 1444 | 214, 1445 | 47, 1446 | 0, 1447 | 86, 1448 | 0, 1449 | "IMAGE" 1450 | ], 1451 | [ 1452 | 215, 1453 | 47, 1454 | 0, 1455 | 94, 1456 | 0, 1457 | "IMAGE" 1458 | ], 1459 | [ 1460 | 216, 1461 | 94, 1462 | 0, 1463 | 72, 1464 | 1, 1465 | "INT" 1466 | ], 1467 | [ 1468 | 217, 1469 | 47, 1470 | 1, 1471 | 94, 1472 | 1, 1473 | "INT" 1474 | ], 1475 | [ 1476 | 218, 1477 | 
47, 1478 | 2, 1479 | 94, 1480 | 2, 1481 | "INT" 1482 | ], 1483 | [ 1484 | 220, 1485 | 86, 1486 | 1, 1487 | 31, 1488 | 0, 1489 | "IMAGE" 1490 | ], 1491 | [ 1492 | 221, 1493 | 92, 1494 | 0, 1495 | 86, 1496 | 1, 1497 | "MASK" 1498 | ], 1499 | [ 1500 | 224, 1501 | 86, 1502 | 2, 1503 | 95, 1504 | 2, 1505 | "BBOX" 1506 | ], 1507 | [ 1508 | 225, 1509 | 95, 1510 | 0, 1511 | 96, 1512 | 0, 1513 | "IMAGE" 1514 | ], 1515 | [ 1516 | 230, 1517 | 16, 1518 | 0, 1519 | 99, 1520 | 0, 1521 | "IMAGE" 1522 | ], 1523 | [ 1524 | 233, 1525 | 31, 1526 | 0, 1527 | 98, 1528 | 0, 1529 | "IMAGE" 1530 | ], 1531 | [ 1532 | 234, 1533 | 92, 1534 | 0, 1535 | 101, 1536 | 0, 1537 | "MASK" 1538 | ], 1539 | [ 1540 | 235, 1541 | 101, 1542 | 0, 1543 | 100, 1544 | 0, 1545 | "IMAGE" 1546 | ], 1547 | [ 1548 | 245, 1549 | 89, 1550 | 0, 1551 | 47, 1552 | 0, 1553 | "IMAGE" 1554 | ], 1555 | [ 1556 | 251, 1557 | 89, 1558 | 1, 1559 | 97, 1560 | 0, 1561 | "*" 1562 | ], 1563 | [ 1564 | 279, 1565 | 4, 1566 | 0, 1567 | 122, 1568 | 1, 1569 | "VAE" 1570 | ], 1571 | [ 1572 | 280, 1573 | 121, 1574 | 0, 1575 | 122, 1576 | 0, 1577 | "MODEL" 1578 | ], 1579 | [ 1580 | 281, 1581 | 36, 1582 | 0, 1583 | 122, 1584 | 2, 1585 | "WHISPERFEAT" 1586 | ], 1587 | [ 1588 | 282, 1589 | 122, 1590 | 0, 1591 | 30, 1592 | 0, 1593 | "IMAGE" 1594 | ], 1595 | [ 1596 | 284, 1597 | 31, 1598 | 0, 1599 | 122, 1600 | 3, 1601 | "IMAGE" 1602 | ], 1603 | [ 1604 | 285, 1605 | 16, 1606 | 0, 1607 | 122, 1608 | 4, 1609 | "IMAGE" 1610 | ], 1611 | [ 1612 | 287, 1613 | 28, 1614 | 0, 1615 | 124, 1616 | 0, 1617 | "*" 1618 | ], 1619 | [ 1620 | 288, 1621 | 36, 1622 | 1, 1623 | 89, 1624 | 1, 1625 | "INT" 1626 | ], 1627 | [ 1628 | 289, 1629 | 125, 1630 | 0, 1631 | 30, 1632 | 1, 1633 | "VHS_AUDIO" 1634 | ], 1635 | [ 1636 | 290, 1637 | 126, 1638 | 0, 1639 | 96, 1640 | 1, 1641 | "VHS_AUDIO" 1642 | ], 1643 | [ 1644 | 294, 1645 | 86, 1646 | 0, 1647 | 95, 1648 | 0, 1649 | "IMAGE" 1650 | ], 1651 | [ 1652 | 295, 1653 | 122, 1654 | 0, 1655 | 95, 1656 | 1, 1657 | "IMAGE" 
def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
    """Download a model checkpoint from `url` into `root`, verifying its SHA256.

    The expected checksum is taken from the second-to-last path component of
    `url` (the convention used by the OpenAI checkpoint URLs in `_MODELS`).

    Parameters
    ----------
    url : str
        download URL; its second-to-last path segment is the hex SHA256 of
        the file contents
    root : str
        directory to cache the downloaded file in (created if missing)
    in_memory : bool
        if True return the raw checkpoint bytes, otherwise the local path

    Returns
    -------
    Union[bytes, str]
        the checkpoint bytes (`in_memory=True`) or the path to the cached file

    Raises
    ------
    RuntimeError
        if the target path exists but is not a regular file, or if the
        downloaded file fails checksum verification
    """
    os.makedirs(root, exist_ok=True)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, os.path.basename(url))

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        # Reuse the cached file when its checksum still matches; otherwise
        # fall through and re-download it.
        with open(download_target, "rb") as f:
            model_bytes = f.read()
        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
            return model_bytes if in_memory else download_target
        else:
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    # Verify the freshly downloaded file before handing it back.
    with open(download_target, "rb") as f:
        model_bytes = f.read()
    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model.")

    return model_bytes if in_memory else download_target
Please retry loading the model.") 62 | 63 | return model_bytes if in_memory else download_target 64 | 65 | 66 | def available_models() -> List[str]: 67 | """Returns the names of available models""" 68 | return list(_MODELS.keys()) 69 | 70 | 71 | def load_model(name: str, device: Optional[Union[str, torch.device]] = None, download_root: str = None, in_memory: bool = False) -> Whisper: 72 | """ 73 | Load a Whisper ASR model 74 | 75 | Parameters 76 | ---------- 77 | name : str 78 | one of the official model names listed by `whisper.available_models()`, or 79 | path to a model checkpoint containing the model dimensions and the model state_dict. 80 | device : Union[str, torch.device] 81 | the PyTorch device to put the model into 82 | download_root: str 83 | path to download the model files; by default, it uses "~/.cache/whisper" 84 | in_memory: bool 85 | whether to preload the model weights into host memory 86 | 87 | Returns 88 | ------- 89 | model : Whisper 90 | The Whisper ASR model instance 91 | """ 92 | 93 | if device is None: 94 | device = "cuda" if torch.cuda.is_available() else "cpu" 95 | if download_root is None: 96 | download_root = os.getenv( 97 | "XDG_CACHE_HOME", 98 | os.path.join(os.path.expanduser("~"), ".cache", "whisper") 99 | ) 100 | 101 | if name in _MODELS: 102 | checkpoint_file = _download(_MODELS[name], download_root, in_memory) 103 | elif os.path.isfile(name): 104 | checkpoint_file = open(name, "rb").read() if in_memory else name 105 | else: 106 | raise RuntimeError(f"Model {name} not found; available models = {available_models()}") 107 | 108 | with (io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")) as fp: 109 | checkpoint = torch.load(fp, map_location=device) 110 | del checkpoint_file 111 | 112 | dims = ModelDimensions(**checkpoint["dims"]) 113 | model = Whisper(dims) 114 | model.load_state_dict(checkpoint["model_state_dict"]) 115 | 116 | return model.to(device) 117 | 
-------------------------------------------------------------------------------- /musetalk/whisper/__main__.py: -------------------------------------------------------------------------------- 1 | from .transcribe import cli 2 | 3 | 4 | cli() 5 | -------------------------------------------------------------------------------- /musetalk/whisper/assets/gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /musetalk/whisper/assets/gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- /musetalk/whisper/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-MuseTalk-KJ/fe908e93ea08d3b3303362c3e19928351f53234b/musetalk/whisper/assets/mel_filters.npz -------------------------------------------------------------------------------- /musetalk/whisper/assets/multilingual/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|endoftext|>": 50257} 2 | -------------------------------------------------------------------------------- /musetalk/whisper/assets/multilingual/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} 
import os
import subprocess
from functools import lru_cache
from typing import Union

import numpy as np
import torch
import torch.nn.functional as F

from .utils import exact_div

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000: number of frames in a mel spectrogram input


def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    # fix: the original body used the `ffmpeg-python` package whose import was
    # commented out, so calling this function raised NameError. Invoke the same
    # ffmpeg CLI command directly via the standard library instead; the flags
    # (mono, 16-bit little-endian PCM, resampled to `sr`) are unchanged.
    cmd = [
        "ffmpeg", "-nostdin",
        "-threads", "0",
        "-i", file,
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
        "-",
    ]
    try:
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    Accepts either a torch Tensor or a NumPy array; padding is zeros on the right.
    """
    if torch.is_tensor(array):
        if array.shape[axis] > length:
            # fix: the index tensor must live on the same device as `array`,
            # otherwise index_select fails for CUDA tensors
            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            # F.pad takes (last-dim-left, last-dim-right, ...) — hence the reversal
            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
    else:
        if array.shape[axis] > length:
            array = array.take(indices=range(length), axis=axis)

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = np.pad(array, pad_widths)

    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
    with np.load(os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS):
    """
    Compute the log-Mel spectrogram of the given audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)

    # drop the last frame and use `...` so batched (…, freq, time) input also works
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    # dynamic-range compression: clip to 8 dB below the peak, then rescale
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical

from .audio import CHUNK_LENGTH
from .tokenizer import Tokenizer, get_tokenizer
from .utils import compression_ratio

if TYPE_CHECKING:
    from .model import Whisper


@torch.no_grad()
def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]:
    """
    Detect the spoken language in the audio, and return them as list of strings, along with the ids
    of the most probable language tokens and the probability distribution over all language tokens.
    This is performed outside the main decode loop in order to not interfere with kv-caching.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appears after the startoftranscript token.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all languages.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer(model.is_multilingual)
    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
        # fix: was an f-string with no placeholder
        raise ValueError("This model doesn't have language tokens so it can't perform lang id")

    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # skip encoder forward pass if already-encoded audio features were given
    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
        mel = model.encoder(mel)

    # forward pass using a single token, startoftranscript
    n_audio = mel.shape[0]
    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
    logits = model.logits(x, mel)[:, 0]

    # collect detected languages; suppress all non-language tokens
    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
    mask[list(tokenizer.all_language_tokens)] = False
    logits[:, mask] = -np.inf
    language_tokens = logits.argmax(dim=-1)
    language_token_probs = logits.softmax(dim=-1).cpu()
    language_probs = [
        {
            c: language_token_probs[i, j].item()
            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs


@dataclass(frozen=True)
class DecodingOptions:
    task: str = "transcribe"  # whether to perform X->X "transcribe" or X->English "translate"
    language: Optional[str] = None  # language that the audio is in; uses detected language if None

    # sampling-related options
    temperature: float = 0.0
    sample_len: Optional[int] = None  # maximum number of tokens to sample
    best_of: Optional[int] = None  # number of independent samples to collect, when t > 0
    beam_size: Optional[int] = None  # number of beams in beam search, when t == 0
    patience: Optional[float] = None  # patience in beam search (https://arxiv.org/abs/2204.05424)

    # options for ranking generations (either beams or best-of-N samples)
    length_penalty: Optional[float] = None  # "alpha" in Google NMT, None defaults to length norm

    # prompt, prefix, and token suppression
    prompt: Optional[Union[str, List[int]]] = None  # text or tokens for the previous context
    prefix: Optional[Union[str, List[int]]] = None  # text or tokens to prefix the current context
    suppress_blank: bool = True  # this will suppress blank outputs

    # list of tokens ids (or comma-separated token ids) to suppress
    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"

    # timestamp sampling options
    without_timestamps: bool = False  # use <|notimestamps|> to sample text tokens only
    max_initial_timestamp: Optional[float] = 1.0  # the initial timestamp cannot be later than this

    # implementation details
    fp16: bool = True  # use fp16 for most of the calculation


@dataclass(frozen=True)
class DecodingResult:
    audio_features: Tensor
    language: str
    # fix: these need defaults — the "lang_id" path in DecodingTask.run constructs
    # a DecodingResult without embeddings, which previously raised TypeError
    encoder_embeddings: Optional[np.ndarray] = None
    decoder_embeddings: Optional[np.ndarray] = None
    language_probs: Optional[Dict[str, float]] = None
    tokens: List[int] = field(default_factory=list)
    text: str = ""
    avg_logprob: float = np.nan
    no_speech_prob: float = np.nan
    temperature: float = np.nan
    compression_ratio: float = np.nan


class Inference:
    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Perform a forward pass on the decoder and return per-token logits"""
        raise NotImplementedError

    def rearrange_kv_cache(self, source_indices) -> None:
        """Update the key-value cache according to the updated beams"""
        raise NotImplementedError

    def cleanup_caching(self) -> None:
        """Clean up any resources or hooks after decoding is finished"""
        pass


class PyTorchInference(Inference):
    def __init__(self, model: "Whisper", initial_token_length: int):
        self.model: "Whisper" = model
        self.initial_token_length = initial_token_length
        self.kv_cache = {}
        self.hooks = []

    def logits(self, tokens: Tensor, audio_features: Tensor, include_embeddings=False) -> Tensor:
        if not self.kv_cache:
            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()

        if tokens.shape[-1] > self.initial_token_length:
            # only need to use the last token except in the first forward pass
            tokens = tokens[:, -1:]

        return_val = self.model.decoder(tokens, audio_features,
                                        kv_cache=self.kv_cache, include_embeddings=include_embeddings)
        return return_val

    def cleanup_caching(self):
        for hook in self.hooks:
            hook.remove()

        self.kv_cache = {}
        self.hooks = []

    def rearrange_kv_cache(self, source_indices):
        for module, tensor in self.kv_cache.items():
            # update the key/value cache to contain the selected sequences
            self.kv_cache[module] = tensor[source_indices].detach()


class SequenceRanker:
    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]:
        """
        Given a list of groups of samples and their cumulative log probabilities,
        return the indices of the samples in each group to select as the final result
        """
        raise NotImplementedError


class MaximumLikelihoodRanker(SequenceRanker):
    """
    Select the sample with the highest log probabilities, penalized using either
    a simple length normalization or Google NMT paper's length penalty
    """

    def __init__(self, length_penalty: Optional[float]):
        self.length_penalty = length_penalty

    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
        def scores(logprobs, lengths):
            result = []
            for logprob, length in zip(logprobs, lengths):
                if self.length_penalty is None:
                    penalty = length
                else:
                    # from the Google NMT paper
                    penalty = ((5 + length) / 6) ** self.length_penalty
                result.append(logprob / penalty)
            return result

        # get the sequence with the highest score
        lengths = [[len(t) for t in s] for s in tokens]
        return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]


class TokenDecoder:
    def reset(self):
        """Initialize any stateful variables for decoding a new sequence"""

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        """Specify how to select the next token, based on the current trace and logits

        Parameters
        ----------
        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        sum_logprobs : Tensor, shape = (n_batch)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
            the tokens, appended with the selected next token

        completed : bool
            True if all sequences has reached the end of text

        """
        raise NotImplementedError

    def finalize(
        self, tokens: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
        """Finalize search and return the final candidate sequences

        Parameters
        ----------
        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence

        sum_logprobs : Tensor, shape = (n_audio, n_group)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Sequence[Sequence[Tensor]], length = n_audio
            sequence of Tensors containing candidate token sequences, for each audio input

        sum_logprobs : List[List[float]], length = n_audio
            sequence of cumulative log probabilities corresponding to the above

        """
        raise NotImplementedError


class GreedyDecoder(TokenDecoder):
    def __init__(self, temperature: float, eot: int):
        self.temperature = temperature
        self.eot = eot

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        temperature = self.temperature
        if temperature == 0:
            next_tokens = logits.argmax(dim=-1)
        else:
            next_tokens = Categorical(logits=logits / temperature).sample()

        logprobs = F.log_softmax(logits.float(), dim=-1)
        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
        # sequences that already emitted EOT stop accumulating probability mass
        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)

        next_tokens[tokens[:, -1] == self.eot] = self.eot
        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)

        completed = (tokens[:, -1] == self.eot).all()
        return tokens, completed

    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
        # make sure each sequence has at least one EOT token at the end
        tokens = F.pad(tokens, (0, 1), value=self.eot)
        return tokens, sum_logprobs.tolist()


class BeamSearchDecoder(TokenDecoder):
    def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None):
        self.beam_size = beam_size
        self.eot = eot
        self.inference = inference
        self.patience = patience or 1.0
        self.max_candidates: int = round(beam_size * self.patience)
        self.finished_sequences = None

        assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})"

    def reset(self):
        self.finished_sequences = None

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        if tokens.shape[0] % self.beam_size != 0:
            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")

        n_audio = tokens.shape[0] // self.beam_size
        if self.finished_sequences is None:  # for the first update
            self.finished_sequences = [{} for _ in range(n_audio)]

        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []
        for i in range(n_audio):
            scores, sources, finished = {}, {}, {}

            # STEP 1: calculate the cumulative log probabilities for possible candidates
            for j in range(self.beam_size):
                idx = i * self.beam_size + j
                prefix = tokens[idx].tolist()
                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
                    new_logprob = (sum_logprobs[idx] + logprob).item()
                    sequence = tuple(prefix + [token.item()])
                    scores[sequence] = new_logprob
                    sources[sequence] = idx

            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
            saved = 0
            for sequence in sorted(scores, key=scores.get, reverse=True):
                if sequence[-1] == self.eot:
                    finished[sequence] = scores[sequence]
                else:
                    sum_logprobs[len(next_tokens)] = scores[sequence]
                    next_tokens.append(sequence)
                    source_indices.append(sources[sequence])

                    saved += 1
                    if saved == self.beam_size:
                        break

            finished_sequences.append(finished)

        tokens = torch.tensor(next_tokens, device=tokens.device)
        self.inference.rearrange_kv_cache(source_indices)

        # add newly finished sequences to self.finished_sequences
        assert len(self.finished_sequences) == len(finished_sequences)
        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates:
                    break  # the candidate list is full
                previously_finished[seq] = newly_finished[seq]

        # mark as completed if all audio has enough number of samples
        completed = all(
            len(sequences) >= self.max_candidates for sequences in self.finished_sequences
        )
        return tokens, completed

    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
        # collect all finished sequences, including patience, and add unfinished ones if not enough
        sum_logprobs = sum_logprobs.cpu()
        for i, sequences in enumerate(self.finished_sequences):
            if len(sequences) < self.beam_size:  # when not enough sequences are finished
                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
                    if len(sequences) >= self.beam_size:
                        break

        tokens: List[List[Tensor]] = [
            [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences
        ]
        sum_logprobs: List[List[float]] = [
            list(sequences.values()) for sequences in self.finished_sequences
        ]
        return tokens, sum_logprobs


class LogitFilter:
    def apply(self, logits: Tensor, tokens: Tensor) -> None:
        """Apply any filtering or masking to logits in-place

        Parameters
        ----------
        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        """
        raise NotImplementedError


class SuppressBlank(LogitFilter):
    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin

    def apply(self, logits: Tensor, tokens: Tensor):
        # only suppress a blank/EOT as the very first sampled token
        if tokens.shape[1] == self.sample_begin:
            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf


class SuppressTokens(LogitFilter):
    def __init__(self, suppress_tokens: Sequence[int]):
        self.suppress_tokens = list(suppress_tokens)

    def apply(self, logits: Tensor, tokens: Tensor):
        logits[:, self.suppress_tokens] = -np.inf


class ApplyTimestampRules(LogitFilter):
    def __init__(
        self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int]
    ):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin
        self.max_initial_timestamp_index = max_initial_timestamp_index

    def apply(self, logits: Tensor, tokens: Tensor):
        # suppress <|notimestamps|> which is handled by without_timestamps
        if self.tokenizer.no_timestamps is not None:
            logits[:, self.tokenizer.no_timestamps] = -np.inf

        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
        for k in range(tokens.shape[0]):
            seq = [t for t in tokens[k, self.sample_begin:].tolist()]
            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin

            if last_was_timestamp:
                if penultimate_was_timestamp:  # has to be non-timestamp
                    logits[k, self.tokenizer.timestamp_begin:] = -np.inf
                else:  # cannot be normal text tokens
                    logits[k, : self.tokenizer.eot] = -np.inf

        # apply the `max_initial_timestamp` option
        if tokens.shape[1] == self.sample_begin and self.max_initial_timestamp_index is not None:
            last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
            logits[:, last_allowed + 1:] = -np.inf

        # if sum of probability over timestamps is above any other token, sample timestamp
        logprobs = F.log_softmax(logits.float(), dim=-1)
        for k in range(tokens.shape[0]):
            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin:].logsumexp(dim=-1)
            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
            if timestamp_logprob > max_text_token_logprob:
                logits[k, : self.tokenizer.timestamp_begin] = -np.inf


class DecodingTask:
    inference: Inference
    sequence_ranker: SequenceRanker
    decoder: TokenDecoder
    logit_filters: List[LogitFilter]

    def __init__(self, model: "Whisper", options: DecodingOptions):
        self.model = model

        language = options.language or "en"
        tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task)
        self.tokenizer: Tokenizer = tokenizer
        self.options: DecodingOptions = self._verify_options(options)

        self.n_group: int = options.beam_size or options.best_of or 1
        self.n_ctx: int = model.dims.n_text_ctx
        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2

        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
        if self.options.without_timestamps:
            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps

        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
        self.sample_begin: int = len(self.initial_tokens)
        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)

        # inference: implements the forward pass through the decoder, including kv caching
        self.inference = PyTorchInference(model, len(self.initial_tokens))

        # sequence ranker: implements how to rank a group of sampled sequences
        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)

        # decoder: implements how to select the next tokens, given the autoregressive distribution
        if options.beam_size is not None:
            self.decoder = BeamSearchDecoder(
                options.beam_size, tokenizer.eot, self.inference, options.patience
            )
        else:
            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)

        # logit filters: applies various rules to suppress or penalize certain tokens
        self.logit_filters = []
        if self.options.suppress_blank:
            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
        if self.options.suppress_tokens:
            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
        if not options.without_timestamps:
            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
            max_initial_timestamp_index = None
            if options.max_initial_timestamp:
                max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision)
            self.logit_filters.append(
                ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index)
            )

    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
        """Reject mutually-incompatible option combinations early."""
        if options.beam_size is not None and options.best_of is not None:
            raise ValueError("beam_size and best_of can't be given together")
        if options.temperature == 0:
            if options.best_of is not None:
                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
        if options.patience is not None and options.beam_size is None:
            raise ValueError("patience requires beam_size to be given")
        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")

        return options

    def _get_initial_tokens(self) -> Tuple[int]:
        """Build the initial token sequence: [sot_prev + prompt] + sot_sequence + prefix."""
        tokens = list(self.sot_sequence)
        prefix = self.options.prefix
        prompt = self.options.prompt

        if prefix:
            prefix_tokens = (
                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
            )
            if self.sample_len is not None:
                # leave room for sampling within the text-context window
                max_prefix_len = self.n_ctx // 2 - self.sample_len
                prefix_tokens = prefix_tokens[-max_prefix_len:]
            tokens = tokens + prefix_tokens

        if prompt:
            prompt_tokens = (
                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
            )
            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1):] + tokens

        return tuple(tokens)

    def _get_suppress_tokens(self) -> Tuple[int]:
        suppress_tokens = self.options.suppress_tokens

        if isinstance(suppress_tokens, str):
            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]

        # fix: check for None/empty BEFORE membership testing — `-1 in None`
        # raised TypeError in the original ordering
        if suppress_tokens is None or len(list(suppress_tokens)) == 0:
            suppress_tokens = []  # interpret empty string as an empty list
        elif -1 in suppress_tokens:
            suppress_tokens = [t for t in suppress_tokens if t >= 0]
            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
        else:
            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"

        suppress_tokens.extend(
            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
        )
        if self.tokenizer.no_speech is not None:
            # no-speech probability is collected separately
            suppress_tokens.append(self.tokenizer.no_speech)

        return tuple(sorted(set(suppress_tokens)))

    def _get_audio_features(self, mel: Tensor, include_embeddings: bool = False):
        if self.options.fp16:
            mel = mel.half()

        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
            # encoded audio features are given; skip audio encoding
            audio_features = mel
        else:
            result = self.model.encoder(mel, include_embeddings)
            if include_embeddings:
                audio_features, embeddings = result
            else:
                audio_features = result

        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
            # fix: the original did `return TypeError(...)`, silently handing the
            # exception object back to the caller instead of raising it
            raise TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")

        if include_embeddings:
            return audio_features, embeddings
        else:
            return audio_features

    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
        languages = [self.options.language] * audio_features.shape[0]
        lang_probs = None

        if self.options.language is None or self.options.task == "lang_id":
            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
            languages = [max(probs, key=probs.get) for probs in lang_probs]
            if self.options.language is None:
                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens

        return languages, lang_probs

    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
        assert audio_features.shape[0] == tokens.shape[0]
        n_batch = tokens.shape[0]
        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
        no_speech_probs = [np.nan] * n_batch

        # fix: `completed` must be defined before the try-block; an exception
        # raised before the first decoder.update would otherwise trigger an
        # UnboundLocalError inside `finally`, masking the original error
        completed = False
        try:
            embeddings = []
            for i in range(self.sample_len):
                logits, token_embeddings = self.inference.logits(tokens, audio_features, include_embeddings=True)

                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()

                # now we need to consider the logits at the last token only
                logits = logits[:, -1]
                token_embeddings = token_embeddings[:, :, -1]

                # Append embeddings together
                embeddings.append(token_embeddings)

                # apply the logit filters, e.g. for suppressing or applying penalty to
                for logit_filter in self.logit_filters:
                    logit_filter.apply(logits, tokens)

                # expand the tokens tensor with the selected next tokens
                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)

                if completed or tokens.shape[-1] > self.n_ctx:
                    break
        finally:
            if completed:
                # drop the embedding of the step that only emitted EOT, so the
                # stacked embeddings line up with the sampled tokens
                # NOTE(review): indentation of the stack below is ambiguous in the
                # original dump; stacking unconditionally matches the non-completed
                # exit path also producing usable embeddings — confirm against upstream
                embeddings = embeddings[:-1]
            embeddings = np.stack(embeddings, 2)
            self.inference.cleanup_caching()

        return tokens, sum_logprobs, no_speech_probs, embeddings

    @torch.no_grad()
    def run(self, mel: Tensor) -> List[DecodingResult]:
        self.decoder.reset()
        tokenizer: Tokenizer = self.tokenizer
        n_audio: int = mel.shape[0]

        # encoder forward pass
        forward_pass: Tuple[Tensor, np.ndarray] = self._get_audio_features(mel, include_embeddings=True)
        audio_features, encoder_embeddings = forward_pass
        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)

        # detect language if requested, overwriting the language token
        languages, language_probs = self._detect_language(audio_features, tokens)
        if self.options.task == "lang_id":
            return [
                DecodingResult(audio_features=features, language=language, language_probs=probs)
                for features, language, probs in zip(audio_features, languages, language_probs)
            ]

        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
        audio_features = audio_features.repeat_interleave(self.n_group, dim=0)
        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)

        # call the main sampling loop
        tokens, sum_logprobs, no_speech_probs, decoder_embeddings = self._main_loop(audio_features, tokens)

        # reshape the tensors to have (n_audio, n_group) as the first two
dimensions 657 | audio_features = audio_features[:: self.n_group] 658 | no_speech_probs = no_speech_probs[:: self.n_group] 659 | assert audio_features.shape[0] == len(no_speech_probs) == n_audio 660 | 661 | tokens = tokens.reshape(n_audio, self.n_group, -1) 662 | sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) 663 | 664 | # get the final candidates for each group, and slice between the first sampled token and EOT 665 | tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) 666 | tokens: List[List[Tensor]] = [ 667 | [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens 668 | ] 669 | 670 | # select the top-ranked sample in each group 671 | selected = self.sequence_ranker.rank(tokens, sum_logprobs) 672 | tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] 673 | texts: List[str] = [tokenizer.decode(t).strip() for t in tokens] 674 | 675 | sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)] 676 | avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)] 677 | 678 | fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs) 679 | if len(set(map(len, fields))) != 1: 680 | raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") 681 | 682 | return [ 683 | DecodingResult( 684 | audio_features=features, 685 | language=language, 686 | tokens=tokens, 687 | text=text, 688 | avg_logprob=avg_logprob, 689 | no_speech_prob=no_speech_prob, 690 | temperature=self.options.temperature, 691 | compression_ratio=compression_ratio(text), 692 | encoder_embeddings=encoder_embeddings, 693 | decoder_embeddings=decoder_embeddings 694 | ) 695 | for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields) 696 | ] 697 | 698 | 699 | @torch.no_grad() 700 | def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]: 701 | """ 702 
| Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 703 | 704 | Parameters 705 | ---------- 706 | model: Whisper 707 | the Whisper model instance 708 | 709 | mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) 710 | A tensor containing the Mel spectrogram(s) 711 | 712 | options: DecodingOptions 713 | A dataclass that contains all necessary options for decoding 30-second segments 714 | 715 | Returns 716 | ------- 717 | result: Union[DecodingResult, List[DecodingResult]] 718 | The result(s) of decoding contained in `DecodingResult` dataclass instance(s) 719 | """ 720 | single = mel.ndim == 2 721 | if single: 722 | mel = mel.unsqueeze(0) 723 | 724 | result = DecodingTask(model, options).run(mel) 725 | 726 | if single: 727 | result = result[0] 728 | 729 | return result 730 | -------------------------------------------------------------------------------- /musetalk/whisper/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict 3 | from typing import Iterable, Optional 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import Tensor 9 | from torch import nn 10 | 11 | from .transcribe import transcribe as transcribe_function 12 | from .decoding import detect_language as detect_language_function, decode as decode_function 13 | 14 | 15 | @dataclass 16 | class ModelDimensions: 17 | n_mels: int 18 | n_audio_ctx: int 19 | n_audio_state: int 20 | n_audio_head: int 21 | n_audio_layer: int 22 | n_vocab: int 23 | n_text_ctx: int 24 | n_text_state: int 25 | n_text_head: int 26 | n_text_layer: int 27 | 28 | 29 | class LayerNorm(nn.LayerNorm): 30 | def forward(self, x: Tensor) -> Tensor: 31 | return super().forward(x.float()).type(x.dtype) 32 | 33 | 34 | class Linear(nn.Linear): 35 | def forward(self, x: Tensor) -> Tensor: 36 | return F.linear( 37 | x, self.weight.to(x.dtype), None if self.bias is None else 
self.bias.to(x.dtype) 38 | ) 39 | 40 | 41 | class Conv1d(nn.Conv1d): 42 | def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: 43 | return super()._conv_forward( 44 | x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) 45 | ) 46 | 47 | 48 | def sinusoids(length, channels, max_timescale=10000): 49 | """Returns sinusoids for positional embedding""" 50 | assert channels % 2 == 0 51 | log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) 52 | inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) 53 | scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] 54 | return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) 55 | 56 | 57 | class MultiHeadAttention(nn.Module): 58 | def __init__(self, n_state: int, n_head: int): 59 | super().__init__() 60 | self.n_head = n_head 61 | self.query = Linear(n_state, n_state) 62 | self.key = Linear(n_state, n_state, bias=False) 63 | self.value = Linear(n_state, n_state) 64 | self.out = Linear(n_state, n_state) 65 | 66 | def forward( 67 | self, 68 | x: Tensor, 69 | xa: Optional[Tensor] = None, 70 | mask: Optional[Tensor] = None, 71 | kv_cache: Optional[dict] = None, 72 | ): 73 | q = self.query(x) 74 | 75 | if kv_cache is None or xa is None: 76 | # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; 77 | # otherwise, perform key/value projections for self- or cross-attention as usual. 78 | k = self.key(x if xa is None else xa) 79 | v = self.value(x if xa is None else xa) 80 | else: 81 | # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
82 | k = kv_cache.get(self.key, self.key(xa)) 83 | v = kv_cache.get(self.value, self.value(xa)) 84 | 85 | wv = self.qkv_attention(q, k, v, mask) 86 | return self.out(wv) 87 | 88 | def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None): 89 | n_batch, n_ctx, n_state = q.shape 90 | scale = (n_state // self.n_head) ** -0.25 91 | q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale 92 | k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale 93 | v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) 94 | 95 | qk = q @ k 96 | if mask is not None: 97 | qk = qk + mask[:n_ctx, :n_ctx] 98 | 99 | w = F.softmax(qk.float(), dim=-1).to(q.dtype) 100 | return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) 101 | 102 | 103 | class ResidualAttentionBlock(nn.Module): 104 | def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): 105 | super().__init__() 106 | 107 | self.attn = MultiHeadAttention(n_state, n_head) 108 | self.attn_ln = LayerNorm(n_state) 109 | 110 | self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None 111 | self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None 112 | 113 | n_mlp = n_state * 4 114 | self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) 115 | self.mlp_ln = LayerNorm(n_state) 116 | 117 | def forward( 118 | self, 119 | x: Tensor, 120 | xa: Optional[Tensor] = None, 121 | mask: Optional[Tensor] = None, 122 | kv_cache: Optional[dict] = None, 123 | ): 124 | x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) 125 | if self.cross_attn: 126 | x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) 127 | x = x + self.mlp(self.mlp_ln(x)) 128 | return x 129 | 130 | 131 | class AudioEncoder(nn.Module): 132 | def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): 133 | super().__init__() 134 | self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, 
padding=1) 135 | self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) 136 | self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) 137 | 138 | self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( 139 | [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)] 140 | ) 141 | self.ln_post = LayerNorm(n_state) 142 | 143 | def forward(self, x: Tensor, include_embeddings: bool = False): 144 | """ 145 | x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) 146 | the mel spectrogram of the audio 147 | include_embeddings: bool 148 | whether to include intermediate steps in the output 149 | """ 150 | x = F.gelu(self.conv1(x)) 151 | x = F.gelu(self.conv2(x)) 152 | x = x.permute(0, 2, 1) 153 | 154 | assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" 155 | x = (x + self.positional_embedding).to(x.dtype) 156 | 157 | if include_embeddings: 158 | embeddings = [x.cpu().detach().numpy()] 159 | 160 | for block in self.blocks: 161 | x = block(x) 162 | if include_embeddings: 163 | embeddings.append(x.cpu().detach().numpy()) 164 | 165 | x = self.ln_post(x) 166 | 167 | if include_embeddings: 168 | embeddings = np.stack(embeddings, axis=1) 169 | return x, embeddings 170 | else: 171 | return x 172 | 173 | 174 | class TextDecoder(nn.Module): 175 | def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): 176 | super().__init__() 177 | 178 | self.token_embedding = nn.Embedding(n_vocab, n_state) 179 | self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) 180 | 181 | self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( 182 | [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)] 183 | ) 184 | self.ln = LayerNorm(n_state) 185 | 186 | mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) 187 | self.register_buffer("mask", mask, persistent=False) 188 | 189 | def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = 
None, include_embeddings: bool = False): 190 | """ 191 | x : torch.LongTensor, shape = (batch_size, <= n_ctx) 192 | the text tokens 193 | xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) 194 | the encoded audio features to be attended on 195 | include_embeddings : bool 196 | Whether to include intermediate values in the output to this function 197 | """ 198 | offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 199 | x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]] 200 | x = x.to(xa.dtype) 201 | 202 | if include_embeddings: 203 | embeddings = [x.cpu().detach().numpy()] 204 | 205 | for block in self.blocks: 206 | x = block(x, xa, mask=self.mask, kv_cache=kv_cache) 207 | if include_embeddings: 208 | embeddings.append(x.cpu().detach().numpy()) 209 | 210 | x = self.ln(x) 211 | logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float() 212 | 213 | if include_embeddings: 214 | embeddings = np.stack(embeddings, axis=1) 215 | return logits, embeddings 216 | else: 217 | return logits 218 | 219 | 220 | class Whisper(nn.Module): 221 | def __init__(self, dims: ModelDimensions): 222 | super().__init__() 223 | self.dims = dims 224 | self.encoder = AudioEncoder( 225 | self.dims.n_mels, 226 | self.dims.n_audio_ctx, 227 | self.dims.n_audio_state, 228 | self.dims.n_audio_head, 229 | self.dims.n_audio_layer, 230 | ) 231 | self.decoder = TextDecoder( 232 | self.dims.n_vocab, 233 | self.dims.n_text_ctx, 234 | self.dims.n_text_state, 235 | self.dims.n_text_head, 236 | self.dims.n_text_layer, 237 | ) 238 | 239 | def embed_audio(self, mel: torch.Tensor): 240 | return self.encoder.forward(mel) 241 | 242 | def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor): 243 | return self.decoder.forward(tokens, audio_features) 244 | 245 | def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]: 246 | return self.decoder(tokens, self.encoder(mel)) 247 | 248 | 
@property 249 | def device(self): 250 | return next(self.parameters()).device 251 | 252 | @property 253 | def is_multilingual(self): 254 | return self.dims.n_vocab == 51865 255 | 256 | def install_kv_cache_hooks(self, cache: Optional[dict] = None): 257 | """ 258 | The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value 259 | tensors calculated for the previous positions. This method returns a dictionary that stores 260 | all caches, and the necessary hooks for the key and value projection modules that save the 261 | intermediate tensors to be reused during later calculations. 262 | 263 | Returns 264 | ------- 265 | cache : Dict[nn.Module, torch.Tensor] 266 | A dictionary object mapping the key/value projection modules to its cache 267 | hooks : List[RemovableHandle] 268 | List of PyTorch RemovableHandle objects to stop the hooks to be called 269 | """ 270 | cache = {**cache} if cache is not None else {} 271 | hooks = [] 272 | 273 | def save_to_cache(module, _, output): 274 | if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]: 275 | cache[module] = output # save as-is, for the first token or cross attention 276 | else: 277 | cache[module] = torch.cat([cache[module], output], dim=1).detach() 278 | return cache[module] 279 | 280 | def install_hooks(layer: nn.Module): 281 | if isinstance(layer, MultiHeadAttention): 282 | hooks.append(layer.key.register_forward_hook(save_to_cache)) 283 | hooks.append(layer.value.register_forward_hook(save_to_cache)) 284 | 285 | self.decoder.apply(install_hooks) 286 | return cache, hooks 287 | 288 | detect_language = detect_language_function 289 | transcribe = transcribe_function 290 | decode = decode_function 291 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import BasicTextNormalizer 2 | 
from .english import EnglishTextNormalizer 3 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | c 34 | if c in keep 35 | else ADDITIONAL_DIACRITICS[c] 36 | if c in ADDITIONAL_DIACRITICS 37 | else "" 38 | if unicodedata.category(c) == "Mn" 39 | else " " 40 | if unicodedata.category(c)[0] in "MSP" 41 | else c 42 | for c in unicodedata.normalize("NFKD", s) 43 | ) 44 | 45 | 46 | def remove_symbols(s: str): 47 | """ 48 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 49 | """ 50 | return "".join( 51 | " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) 52 | ) 53 | 54 | 55 | class BasicTextNormalizer: 56 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 57 | self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols 58 | self.split_letters = split_letters 59 | 60 | def __call__(self, s: str): 61 | s = s.lower() 62 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 63 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 64 | s = self.clean(s).lower() 65 | 66 | if self.split_letters: 67 | s = " 
".join(regex.findall(r"\X", s, regex.U)) 68 | 69 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 70 | 71 | return s 72 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/english.json: -------------------------------------------------------------------------------- 1 | { 2 | "accessorise": "accessorize", 3 | "accessorised": "accessorized", 4 | "accessorises": "accessorizes", 5 | "accessorising": "accessorizing", 6 | "acclimatisation": "acclimatization", 7 | "acclimatise": "acclimatize", 8 | "acclimatised": "acclimatized", 9 | "acclimatises": "acclimatizes", 10 | "acclimatising": "acclimatizing", 11 | "accoutrements": "accouterments", 12 | "aeon": "eon", 13 | "aeons": "eons", 14 | "aerogramme": "aerogram", 15 | "aerogrammes": "aerograms", 16 | "aeroplane": "airplane", 17 | "aeroplanes": "airplanes", 18 | "aesthete": "esthete", 19 | "aesthetes": "esthetes", 20 | "aesthetic": "esthetic", 21 | "aesthetically": "esthetically", 22 | "aesthetics": "esthetics", 23 | "aetiology": "etiology", 24 | "ageing": "aging", 25 | "aggrandisement": "aggrandizement", 26 | "agonise": "agonize", 27 | "agonised": "agonized", 28 | "agonises": "agonizes", 29 | "agonising": "agonizing", 30 | "agonisingly": "agonizingly", 31 | "almanack": "almanac", 32 | "almanacks": "almanacs", 33 | "aluminium": "aluminum", 34 | "amortisable": "amortizable", 35 | "amortisation": "amortization", 36 | "amortisations": "amortizations", 37 | "amortise": "amortize", 38 | "amortised": "amortized", 39 | "amortises": "amortizes", 40 | "amortising": "amortizing", 41 | "amphitheatre": "amphitheater", 42 | "amphitheatres": "amphitheaters", 43 | "anaemia": "anemia", 44 | "anaemic": "anemic", 45 | "anaesthesia": "anesthesia", 46 | "anaesthetic": "anesthetic", 47 | "anaesthetics": "anesthetics", 48 | "anaesthetise": "anesthetize", 49 | "anaesthetised": "anesthetized", 50 | "anaesthetises": "anesthetizes", 51 | 
"anaesthetising": "anesthetizing", 52 | "anaesthetist": "anesthetist", 53 | "anaesthetists": "anesthetists", 54 | "anaesthetize": "anesthetize", 55 | "anaesthetized": "anesthetized", 56 | "anaesthetizes": "anesthetizes", 57 | "anaesthetizing": "anesthetizing", 58 | "analogue": "analog", 59 | "analogues": "analogs", 60 | "analyse": "analyze", 61 | "analysed": "analyzed", 62 | "analyses": "analyzes", 63 | "analysing": "analyzing", 64 | "anglicise": "anglicize", 65 | "anglicised": "anglicized", 66 | "anglicises": "anglicizes", 67 | "anglicising": "anglicizing", 68 | "annualised": "annualized", 69 | "antagonise": "antagonize", 70 | "antagonised": "antagonized", 71 | "antagonises": "antagonizes", 72 | "antagonising": "antagonizing", 73 | "apologise": "apologize", 74 | "apologised": "apologized", 75 | "apologises": "apologizes", 76 | "apologising": "apologizing", 77 | "appal": "appall", 78 | "appals": "appalls", 79 | "appetiser": "appetizer", 80 | "appetisers": "appetizers", 81 | "appetising": "appetizing", 82 | "appetisingly": "appetizingly", 83 | "arbour": "arbor", 84 | "arbours": "arbors", 85 | "archeological": "archaeological", 86 | "archaeologically": "archeologically", 87 | "archaeologist": "archeologist", 88 | "archaeologists": "archeologists", 89 | "archaeology": "archeology", 90 | "ardour": "ardor", 91 | "armour": "armor", 92 | "armoured": "armored", 93 | "armourer": "armorer", 94 | "armourers": "armorers", 95 | "armouries": "armories", 96 | "armoury": "armory", 97 | "artefact": "artifact", 98 | "artefacts": "artifacts", 99 | "authorise": "authorize", 100 | "authorised": "authorized", 101 | "authorises": "authorizes", 102 | "authorising": "authorizing", 103 | "axe": "ax", 104 | "backpedalled": "backpedaled", 105 | "backpedalling": "backpedaling", 106 | "bannister": "banister", 107 | "bannisters": "banisters", 108 | "baptise": "baptize", 109 | "baptised": "baptized", 110 | "baptises": "baptizes", 111 | "baptising": "baptizing", 112 | "bastardise": "bastardize", 
113 | "bastardised": "bastardized", 114 | "bastardises": "bastardizes", 115 | "bastardising": "bastardizing", 116 | "battleax": "battleaxe", 117 | "baulk": "balk", 118 | "baulked": "balked", 119 | "baulking": "balking", 120 | "baulks": "balks", 121 | "bedevilled": "bedeviled", 122 | "bedevilling": "bedeviling", 123 | "behaviour": "behavior", 124 | "behavioural": "behavioral", 125 | "behaviourism": "behaviorism", 126 | "behaviourist": "behaviorist", 127 | "behaviourists": "behaviorists", 128 | "behaviours": "behaviors", 129 | "behove": "behoove", 130 | "behoved": "behooved", 131 | "behoves": "behooves", 132 | "bejewelled": "bejeweled", 133 | "belabour": "belabor", 134 | "belaboured": "belabored", 135 | "belabouring": "belaboring", 136 | "belabours": "belabors", 137 | "bevelled": "beveled", 138 | "bevvies": "bevies", 139 | "bevvy": "bevy", 140 | "biassed": "biased", 141 | "biassing": "biasing", 142 | "bingeing": "binging", 143 | "bougainvillaea": "bougainvillea", 144 | "bougainvillaeas": "bougainvilleas", 145 | "bowdlerise": "bowdlerize", 146 | "bowdlerised": "bowdlerized", 147 | "bowdlerises": "bowdlerizes", 148 | "bowdlerising": "bowdlerizing", 149 | "breathalyse": "breathalyze", 150 | "breathalysed": "breathalyzed", 151 | "breathalyser": "breathalyzer", 152 | "breathalysers": "breathalyzers", 153 | "breathalyses": "breathalyzes", 154 | "breathalysing": "breathalyzing", 155 | "brutalise": "brutalize", 156 | "brutalised": "brutalized", 157 | "brutalises": "brutalizes", 158 | "brutalising": "brutalizing", 159 | "busses": "buses", 160 | "bussing": "busing", 161 | "caesarean": "cesarean", 162 | "caesareans": "cesareans", 163 | "calibre": "caliber", 164 | "calibres": "calibers", 165 | "calliper": "caliper", 166 | "callipers": "calipers", 167 | "callisthenics": "calisthenics", 168 | "canalise": "canalize", 169 | "canalised": "canalized", 170 | "canalises": "canalizes", 171 | "canalising": "canalizing", 172 | "cancelation": "cancellation", 173 | "cancelations": 
"cancellations", 174 | "cancelled": "canceled", 175 | "cancelling": "canceling", 176 | "candour": "candor", 177 | "cannibalise": "cannibalize", 178 | "cannibalised": "cannibalized", 179 | "cannibalises": "cannibalizes", 180 | "cannibalising": "cannibalizing", 181 | "canonise": "canonize", 182 | "canonised": "canonized", 183 | "canonises": "canonizes", 184 | "canonising": "canonizing", 185 | "capitalise": "capitalize", 186 | "capitalised": "capitalized", 187 | "capitalises": "capitalizes", 188 | "capitalising": "capitalizing", 189 | "caramelise": "caramelize", 190 | "caramelised": "caramelized", 191 | "caramelises": "caramelizes", 192 | "caramelising": "caramelizing", 193 | "carbonise": "carbonize", 194 | "carbonised": "carbonized", 195 | "carbonises": "carbonizes", 196 | "carbonising": "carbonizing", 197 | "carolled": "caroled", 198 | "carolling": "caroling", 199 | "catalogue": "catalog", 200 | "catalogued": "cataloged", 201 | "catalogues": "catalogs", 202 | "cataloguing": "cataloging", 203 | "catalyse": "catalyze", 204 | "catalysed": "catalyzed", 205 | "catalyses": "catalyzes", 206 | "catalysing": "catalyzing", 207 | "categorise": "categorize", 208 | "categorised": "categorized", 209 | "categorises": "categorizes", 210 | "categorising": "categorizing", 211 | "cauterise": "cauterize", 212 | "cauterised": "cauterized", 213 | "cauterises": "cauterizes", 214 | "cauterising": "cauterizing", 215 | "cavilled": "caviled", 216 | "cavilling": "caviling", 217 | "centigramme": "centigram", 218 | "centigrammes": "centigrams", 219 | "centilitre": "centiliter", 220 | "centilitres": "centiliters", 221 | "centimetre": "centimeter", 222 | "centimetres": "centimeters", 223 | "centralise": "centralize", 224 | "centralised": "centralized", 225 | "centralises": "centralizes", 226 | "centralising": "centralizing", 227 | "centre": "center", 228 | "centred": "centered", 229 | "centrefold": "centerfold", 230 | "centrefolds": "centerfolds", 231 | "centrepiece": "centerpiece", 232 | 
"centrepieces": "centerpieces", 233 | "centres": "centers", 234 | "channelled": "channeled", 235 | "channelling": "channeling", 236 | "characterise": "characterize", 237 | "characterised": "characterized", 238 | "characterises": "characterizes", 239 | "characterising": "characterizing", 240 | "cheque": "check", 241 | "chequebook": "checkbook", 242 | "chequebooks": "checkbooks", 243 | "chequered": "checkered", 244 | "cheques": "checks", 245 | "chilli": "chili", 246 | "chimaera": "chimera", 247 | "chimaeras": "chimeras", 248 | "chiselled": "chiseled", 249 | "chiselling": "chiseling", 250 | "circularise": "circularize", 251 | "circularised": "circularized", 252 | "circularises": "circularizes", 253 | "circularising": "circularizing", 254 | "civilise": "civilize", 255 | "civilised": "civilized", 256 | "civilises": "civilizes", 257 | "civilising": "civilizing", 258 | "clamour": "clamor", 259 | "clamoured": "clamored", 260 | "clamouring": "clamoring", 261 | "clamours": "clamors", 262 | "clangour": "clangor", 263 | "clarinettist": "clarinetist", 264 | "clarinettists": "clarinetists", 265 | "collectivise": "collectivize", 266 | "collectivised": "collectivized", 267 | "collectivises": "collectivizes", 268 | "collectivising": "collectivizing", 269 | "colonisation": "colonization", 270 | "colonise": "colonize", 271 | "colonised": "colonized", 272 | "coloniser": "colonizer", 273 | "colonisers": "colonizers", 274 | "colonises": "colonizes", 275 | "colonising": "colonizing", 276 | "colour": "color", 277 | "colourant": "colorant", 278 | "colourants": "colorants", 279 | "coloured": "colored", 280 | "coloureds": "coloreds", 281 | "colourful": "colorful", 282 | "colourfully": "colorfully", 283 | "colouring": "coloring", 284 | "colourize": "colorize", 285 | "colourized": "colorized", 286 | "colourizes": "colorizes", 287 | "colourizing": "colorizing", 288 | "colourless": "colorless", 289 | "colours": "colors", 290 | "commercialise": "commercialize", 291 | "commercialised": 
"commercialized", 292 | "commercialises": "commercializes", 293 | "commercialising": "commercializing", 294 | "compartmentalise": "compartmentalize", 295 | "compartmentalised": "compartmentalized", 296 | "compartmentalises": "compartmentalizes", 297 | "compartmentalising": "compartmentalizing", 298 | "computerise": "computerize", 299 | "computerised": "computerized", 300 | "computerises": "computerizes", 301 | "computerising": "computerizing", 302 | "conceptualise": "conceptualize", 303 | "conceptualised": "conceptualized", 304 | "conceptualises": "conceptualizes", 305 | "conceptualising": "conceptualizing", 306 | "connexion": "connection", 307 | "connexions": "connections", 308 | "contextualise": "contextualize", 309 | "contextualised": "contextualized", 310 | "contextualises": "contextualizes", 311 | "contextualising": "contextualizing", 312 | "cosier": "cozier", 313 | "cosies": "cozies", 314 | "cosiest": "coziest", 315 | "cosily": "cozily", 316 | "cosiness": "coziness", 317 | "cosy": "cozy", 318 | "councillor": "councilor", 319 | "councillors": "councilors", 320 | "counselled": "counseled", 321 | "counselling": "counseling", 322 | "counsellor": "counselor", 323 | "counsellors": "counselors", 324 | "crenelated": "crenellated", 325 | "criminalise": "criminalize", 326 | "criminalised": "criminalized", 327 | "criminalises": "criminalizes", 328 | "criminalising": "criminalizing", 329 | "criticise": "criticize", 330 | "criticised": "criticized", 331 | "criticises": "criticizes", 332 | "criticising": "criticizing", 333 | "crueller": "crueler", 334 | "cruellest": "cruelest", 335 | "crystallisation": "crystallization", 336 | "crystallise": "crystallize", 337 | "crystallised": "crystallized", 338 | "crystallises": "crystallizes", 339 | "crystallising": "crystallizing", 340 | "cudgelled": "cudgeled", 341 | "cudgelling": "cudgeling", 342 | "customise": "customize", 343 | "customised": "customized", 344 | "customises": "customizes", 345 | "customising": "customizing", 346 | 
"cypher": "cipher", 347 | "cyphers": "ciphers", 348 | "decentralisation": "decentralization", 349 | "decentralise": "decentralize", 350 | "decentralised": "decentralized", 351 | "decentralises": "decentralizes", 352 | "decentralising": "decentralizing", 353 | "decriminalisation": "decriminalization", 354 | "decriminalise": "decriminalize", 355 | "decriminalised": "decriminalized", 356 | "decriminalises": "decriminalizes", 357 | "decriminalising": "decriminalizing", 358 | "defence": "defense", 359 | "defenceless": "defenseless", 360 | "defences": "defenses", 361 | "dehumanisation": "dehumanization", 362 | "dehumanise": "dehumanize", 363 | "dehumanised": "dehumanized", 364 | "dehumanises": "dehumanizes", 365 | "dehumanising": "dehumanizing", 366 | "demeanour": "demeanor", 367 | "demilitarisation": "demilitarization", 368 | "demilitarise": "demilitarize", 369 | "demilitarised": "demilitarized", 370 | "demilitarises": "demilitarizes", 371 | "demilitarising": "demilitarizing", 372 | "demobilisation": "demobilization", 373 | "demobilise": "demobilize", 374 | "demobilised": "demobilized", 375 | "demobilises": "demobilizes", 376 | "demobilising": "demobilizing", 377 | "democratisation": "democratization", 378 | "democratise": "democratize", 379 | "democratised": "democratized", 380 | "democratises": "democratizes", 381 | "democratising": "democratizing", 382 | "demonise": "demonize", 383 | "demonised": "demonized", 384 | "demonises": "demonizes", 385 | "demonising": "demonizing", 386 | "demoralisation": "demoralization", 387 | "demoralise": "demoralize", 388 | "demoralised": "demoralized", 389 | "demoralises": "demoralizes", 390 | "demoralising": "demoralizing", 391 | "denationalisation": "denationalization", 392 | "denationalise": "denationalize", 393 | "denationalised": "denationalized", 394 | "denationalises": "denationalizes", 395 | "denationalising": "denationalizing", 396 | "deodorise": "deodorize", 397 | "deodorised": "deodorized", 398 | "deodorises": "deodorizes", 
399 | "deodorising": "deodorizing", 400 | "depersonalise": "depersonalize", 401 | "depersonalised": "depersonalized", 402 | "depersonalises": "depersonalizes", 403 | "depersonalising": "depersonalizing", 404 | "deputise": "deputize", 405 | "deputised": "deputized", 406 | "deputises": "deputizes", 407 | "deputising": "deputizing", 408 | "desensitisation": "desensitization", 409 | "desensitise": "desensitize", 410 | "desensitised": "desensitized", 411 | "desensitises": "desensitizes", 412 | "desensitising": "desensitizing", 413 | "destabilisation": "destabilization", 414 | "destabilise": "destabilize", 415 | "destabilised": "destabilized", 416 | "destabilises": "destabilizes", 417 | "destabilising": "destabilizing", 418 | "dialled": "dialed", 419 | "dialling": "dialing", 420 | "dialogue": "dialog", 421 | "dialogues": "dialogs", 422 | "diarrhoea": "diarrhea", 423 | "digitise": "digitize", 424 | "digitised": "digitized", 425 | "digitises": "digitizes", 426 | "digitising": "digitizing", 427 | "disc": "disk", 428 | "discolour": "discolor", 429 | "discoloured": "discolored", 430 | "discolouring": "discoloring", 431 | "discolours": "discolors", 432 | "discs": "disks", 433 | "disembowelled": "disemboweled", 434 | "disembowelling": "disemboweling", 435 | "disfavour": "disfavor", 436 | "dishevelled": "disheveled", 437 | "dishonour": "dishonor", 438 | "dishonourable": "dishonorable", 439 | "dishonourably": "dishonorably", 440 | "dishonoured": "dishonored", 441 | "dishonouring": "dishonoring", 442 | "dishonours": "dishonors", 443 | "disorganisation": "disorganization", 444 | "disorganised": "disorganized", 445 | "distil": "distill", 446 | "distils": "distills", 447 | "dramatisation": "dramatization", 448 | "dramatisations": "dramatizations", 449 | "dramatise": "dramatize", 450 | "dramatised": "dramatized", 451 | "dramatises": "dramatizes", 452 | "dramatising": "dramatizing", 453 | "draught": "draft", 454 | "draughtboard": "draftboard", 455 | "draughtboards": "draftboards", 456 
| "draughtier": "draftier", 457 | "draughtiest": "draftiest", 458 | "draughts": "drafts", 459 | "draughtsman": "draftsman", 460 | "draughtsmanship": "draftsmanship", 461 | "draughtsmen": "draftsmen", 462 | "draughtswoman": "draftswoman", 463 | "draughtswomen": "draftswomen", 464 | "draughty": "drafty", 465 | "drivelled": "driveled", 466 | "drivelling": "driveling", 467 | "duelled": "dueled", 468 | "duelling": "dueling", 469 | "economise": "economize", 470 | "economised": "economized", 471 | "economises": "economizes", 472 | "economising": "economizing", 473 | "edoema": "edema", 474 | "editorialise": "editorialize", 475 | "editorialised": "editorialized", 476 | "editorialises": "editorializes", 477 | "editorialising": "editorializing", 478 | "empathise": "empathize", 479 | "empathised": "empathized", 480 | "empathises": "empathizes", 481 | "empathising": "empathizing", 482 | "emphasise": "emphasize", 483 | "emphasised": "emphasized", 484 | "emphasises": "emphasizes", 485 | "emphasising": "emphasizing", 486 | "enamelled": "enameled", 487 | "enamelling": "enameling", 488 | "enamoured": "enamored", 489 | "encyclopaedia": "encyclopedia", 490 | "encyclopaedias": "encyclopedias", 491 | "encyclopaedic": "encyclopedic", 492 | "endeavour": "endeavor", 493 | "endeavoured": "endeavored", 494 | "endeavouring": "endeavoring", 495 | "endeavours": "endeavors", 496 | "energise": "energize", 497 | "energised": "energized", 498 | "energises": "energizes", 499 | "energising": "energizing", 500 | "enrol": "enroll", 501 | "enrols": "enrolls", 502 | "enthral": "enthrall", 503 | "enthrals": "enthralls", 504 | "epaulette": "epaulet", 505 | "epaulettes": "epaulets", 506 | "epicentre": "epicenter", 507 | "epicentres": "epicenters", 508 | "epilogue": "epilog", 509 | "epilogues": "epilogs", 510 | "epitomise": "epitomize", 511 | "epitomised": "epitomized", 512 | "epitomises": "epitomizes", 513 | "epitomising": "epitomizing", 514 | "equalisation": "equalization", 515 | "equalise": "equalize", 
516 | "equalised": "equalized", 517 | "equaliser": "equalizer", 518 | "equalisers": "equalizers", 519 | "equalises": "equalizes", 520 | "equalising": "equalizing", 521 | "eulogise": "eulogize", 522 | "eulogised": "eulogized", 523 | "eulogises": "eulogizes", 524 | "eulogising": "eulogizing", 525 | "evangelise": "evangelize", 526 | "evangelised": "evangelized", 527 | "evangelises": "evangelizes", 528 | "evangelising": "evangelizing", 529 | "exorcise": "exorcize", 530 | "exorcised": "exorcized", 531 | "exorcises": "exorcizes", 532 | "exorcising": "exorcizing", 533 | "extemporisation": "extemporization", 534 | "extemporise": "extemporize", 535 | "extemporised": "extemporized", 536 | "extemporises": "extemporizes", 537 | "extemporising": "extemporizing", 538 | "externalisation": "externalization", 539 | "externalisations": "externalizations", 540 | "externalise": "externalize", 541 | "externalised": "externalized", 542 | "externalises": "externalizes", 543 | "externalising": "externalizing", 544 | "factorise": "factorize", 545 | "factorised": "factorized", 546 | "factorises": "factorizes", 547 | "factorising": "factorizing", 548 | "faecal": "fecal", 549 | "faeces": "feces", 550 | "familiarisation": "familiarization", 551 | "familiarise": "familiarize", 552 | "familiarised": "familiarized", 553 | "familiarises": "familiarizes", 554 | "familiarising": "familiarizing", 555 | "fantasise": "fantasize", 556 | "fantasised": "fantasized", 557 | "fantasises": "fantasizes", 558 | "fantasising": "fantasizing", 559 | "favour": "favor", 560 | "favourable": "favorable", 561 | "favourably": "favorably", 562 | "favoured": "favored", 563 | "favouring": "favoring", 564 | "favourite": "favorite", 565 | "favourites": "favorites", 566 | "favouritism": "favoritism", 567 | "favours": "favors", 568 | "feminise": "feminize", 569 | "feminised": "feminized", 570 | "feminises": "feminizes", 571 | "feminising": "feminizing", 572 | "fertilisation": "fertilization", 573 | "fertilise": "fertilize", 
574 | "fertilised": "fertilized", 575 | "fertiliser": "fertilizer", 576 | "fertilisers": "fertilizers", 577 | "fertilises": "fertilizes", 578 | "fertilising": "fertilizing", 579 | "fervour": "fervor", 580 | "fibre": "fiber", 581 | "fibreglass": "fiberglass", 582 | "fibres": "fibers", 583 | "fictionalisation": "fictionalization", 584 | "fictionalisations": "fictionalizations", 585 | "fictionalise": "fictionalize", 586 | "fictionalised": "fictionalized", 587 | "fictionalises": "fictionalizes", 588 | "fictionalising": "fictionalizing", 589 | "fillet": "filet", 590 | "filleted": "fileted", 591 | "filleting": "fileting", 592 | "fillets": "filets", 593 | "finalisation": "finalization", 594 | "finalise": "finalize", 595 | "finalised": "finalized", 596 | "finalises": "finalizes", 597 | "finalising": "finalizing", 598 | "flautist": "flutist", 599 | "flautists": "flutists", 600 | "flavour": "flavor", 601 | "flavoured": "flavored", 602 | "flavouring": "flavoring", 603 | "flavourings": "flavorings", 604 | "flavourless": "flavorless", 605 | "flavours": "flavors", 606 | "flavoursome": "flavorsome", 607 | "flyer / flier": "flier / flyer", 608 | "foetal": "fetal", 609 | "foetid": "fetid", 610 | "foetus": "fetus", 611 | "foetuses": "fetuses", 612 | "formalisation": "formalization", 613 | "formalise": "formalize", 614 | "formalised": "formalized", 615 | "formalises": "formalizes", 616 | "formalising": "formalizing", 617 | "fossilisation": "fossilization", 618 | "fossilise": "fossilize", 619 | "fossilised": "fossilized", 620 | "fossilises": "fossilizes", 621 | "fossilising": "fossilizing", 622 | "fraternisation": "fraternization", 623 | "fraternise": "fraternize", 624 | "fraternised": "fraternized", 625 | "fraternises": "fraternizes", 626 | "fraternising": "fraternizing", 627 | "fulfil": "fulfill", 628 | "fulfilment": "fulfillment", 629 | "fulfils": "fulfills", 630 | "funnelled": "funneled", 631 | "funnelling": "funneling", 632 | "galvanise": "galvanize", 633 | "galvanised": 
"galvanized", 634 | "galvanises": "galvanizes", 635 | "galvanising": "galvanizing", 636 | "gambolled": "gamboled", 637 | "gambolling": "gamboling", 638 | "gaol": "jail", 639 | "gaolbird": "jailbird", 640 | "gaolbirds": "jailbirds", 641 | "gaolbreak": "jailbreak", 642 | "gaolbreaks": "jailbreaks", 643 | "gaoled": "jailed", 644 | "gaoler": "jailer", 645 | "gaolers": "jailers", 646 | "gaoling": "jailing", 647 | "gaols": "jails", 648 | "gasses": "gases", 649 | "gage": "gauge", 650 | "gaged": "gauged", 651 | "gages": "gauges", 652 | "gaging": "gauging", 653 | "generalisation": "generalization", 654 | "generalisations": "generalizations", 655 | "generalise": "generalize", 656 | "generalised": "generalized", 657 | "generalises": "generalizes", 658 | "generalising": "generalizing", 659 | "ghettoise": "ghettoize", 660 | "ghettoised": "ghettoized", 661 | "ghettoises": "ghettoizes", 662 | "ghettoising": "ghettoizing", 663 | "gipsies": "gypsies", 664 | "glamorise": "glamorize", 665 | "glamorised": "glamorized", 666 | "glamorises": "glamorizes", 667 | "glamorising": "glamorizing", 668 | "glamor": "glamour", 669 | "globalisation": "globalization", 670 | "globalise": "globalize", 671 | "globalised": "globalized", 672 | "globalises": "globalizes", 673 | "globalising": "globalizing", 674 | "glueing": "gluing", 675 | "goitre": "goiter", 676 | "goitres": "goiters", 677 | "gonorrhoea": "gonorrhea", 678 | "gramme": "gram", 679 | "grammes": "grams", 680 | "gravelled": "graveled", 681 | "grey": "gray", 682 | "greyed": "grayed", 683 | "greying": "graying", 684 | "greyish": "grayish", 685 | "greyness": "grayness", 686 | "greys": "grays", 687 | "grovelled": "groveled", 688 | "grovelling": "groveling", 689 | "groyne": "groin", 690 | "groynes": "groins", 691 | "gruelling": "grueling", 692 | "gruellingly": "gruelingly", 693 | "gryphon": "griffin", 694 | "gryphons": "griffins", 695 | "gynaecological": "gynecological", 696 | "gynaecologist": "gynecologist", 697 | "gynaecologists": 
"gynecologists", 698 | "gynaecology": "gynecology", 699 | "haematological": "hematological", 700 | "haematologist": "hematologist", 701 | "haematologists": "hematologists", 702 | "haematology": "hematology", 703 | "haemoglobin": "hemoglobin", 704 | "haemophilia": "hemophilia", 705 | "haemophiliac": "hemophiliac", 706 | "haemophiliacs": "hemophiliacs", 707 | "haemorrhage": "hemorrhage", 708 | "haemorrhaged": "hemorrhaged", 709 | "haemorrhages": "hemorrhages", 710 | "haemorrhaging": "hemorrhaging", 711 | "haemorrhoids": "hemorrhoids", 712 | "harbour": "harbor", 713 | "harboured": "harbored", 714 | "harbouring": "harboring", 715 | "harbours": "harbors", 716 | "harmonisation": "harmonization", 717 | "harmonise": "harmonize", 718 | "harmonised": "harmonized", 719 | "harmonises": "harmonizes", 720 | "harmonising": "harmonizing", 721 | "homoeopath": "homeopath", 722 | "homoeopathic": "homeopathic", 723 | "homoeopaths": "homeopaths", 724 | "homoeopathy": "homeopathy", 725 | "homogenise": "homogenize", 726 | "homogenised": "homogenized", 727 | "homogenises": "homogenizes", 728 | "homogenising": "homogenizing", 729 | "honour": "honor", 730 | "honourable": "honorable", 731 | "honourably": "honorably", 732 | "honoured": "honored", 733 | "honouring": "honoring", 734 | "honours": "honors", 735 | "hospitalisation": "hospitalization", 736 | "hospitalise": "hospitalize", 737 | "hospitalised": "hospitalized", 738 | "hospitalises": "hospitalizes", 739 | "hospitalising": "hospitalizing", 740 | "humanise": "humanize", 741 | "humanised": "humanized", 742 | "humanises": "humanizes", 743 | "humanising": "humanizing", 744 | "humour": "humor", 745 | "humoured": "humored", 746 | "humouring": "humoring", 747 | "humourless": "humorless", 748 | "humours": "humors", 749 | "hybridise": "hybridize", 750 | "hybridised": "hybridized", 751 | "hybridises": "hybridizes", 752 | "hybridising": "hybridizing", 753 | "hypnotise": "hypnotize", 754 | "hypnotised": "hypnotized", 755 | "hypnotises": 
"hypnotizes", 756 | "hypnotising": "hypnotizing", 757 | "hypothesise": "hypothesize", 758 | "hypothesised": "hypothesized", 759 | "hypothesises": "hypothesizes", 760 | "hypothesising": "hypothesizing", 761 | "idealisation": "idealization", 762 | "idealise": "idealize", 763 | "idealised": "idealized", 764 | "idealises": "idealizes", 765 | "idealising": "idealizing", 766 | "idolise": "idolize", 767 | "idolised": "idolized", 768 | "idolises": "idolizes", 769 | "idolising": "idolizing", 770 | "immobilisation": "immobilization", 771 | "immobilise": "immobilize", 772 | "immobilised": "immobilized", 773 | "immobiliser": "immobilizer", 774 | "immobilisers": "immobilizers", 775 | "immobilises": "immobilizes", 776 | "immobilising": "immobilizing", 777 | "immortalise": "immortalize", 778 | "immortalised": "immortalized", 779 | "immortalises": "immortalizes", 780 | "immortalising": "immortalizing", 781 | "immunisation": "immunization", 782 | "immunise": "immunize", 783 | "immunised": "immunized", 784 | "immunises": "immunizes", 785 | "immunising": "immunizing", 786 | "impanelled": "impaneled", 787 | "impanelling": "impaneling", 788 | "imperilled": "imperiled", 789 | "imperilling": "imperiling", 790 | "individualise": "individualize", 791 | "individualised": "individualized", 792 | "individualises": "individualizes", 793 | "individualising": "individualizing", 794 | "industrialise": "industrialize", 795 | "industrialised": "industrialized", 796 | "industrialises": "industrializes", 797 | "industrialising": "industrializing", 798 | "inflexion": "inflection", 799 | "inflexions": "inflections", 800 | "initialise": "initialize", 801 | "initialised": "initialized", 802 | "initialises": "initializes", 803 | "initialising": "initializing", 804 | "initialled": "initialed", 805 | "initialling": "initialing", 806 | "instal": "install", 807 | "instalment": "installment", 808 | "instalments": "installments", 809 | "instals": "installs", 810 | "instil": "instill", 811 | "instils": 
"instills", 812 | "institutionalisation": "institutionalization", 813 | "institutionalise": "institutionalize", 814 | "institutionalised": "institutionalized", 815 | "institutionalises": "institutionalizes", 816 | "institutionalising": "institutionalizing", 817 | "intellectualise": "intellectualize", 818 | "intellectualised": "intellectualized", 819 | "intellectualises": "intellectualizes", 820 | "intellectualising": "intellectualizing", 821 | "internalisation": "internalization", 822 | "internalise": "internalize", 823 | "internalised": "internalized", 824 | "internalises": "internalizes", 825 | "internalising": "internalizing", 826 | "internationalisation": "internationalization", 827 | "internationalise": "internationalize", 828 | "internationalised": "internationalized", 829 | "internationalises": "internationalizes", 830 | "internationalising": "internationalizing", 831 | "ionisation": "ionization", 832 | "ionise": "ionize", 833 | "ionised": "ionized", 834 | "ioniser": "ionizer", 835 | "ionisers": "ionizers", 836 | "ionises": "ionizes", 837 | "ionising": "ionizing", 838 | "italicise": "italicize", 839 | "italicised": "italicized", 840 | "italicises": "italicizes", 841 | "italicising": "italicizing", 842 | "itemise": "itemize", 843 | "itemised": "itemized", 844 | "itemises": "itemizes", 845 | "itemising": "itemizing", 846 | "jeopardise": "jeopardize", 847 | "jeopardised": "jeopardized", 848 | "jeopardises": "jeopardizes", 849 | "jeopardising": "jeopardizing", 850 | "jewelled": "jeweled", 851 | "jeweller": "jeweler", 852 | "jewellers": "jewelers", 853 | "jewellery": "jewelry", 854 | "judgement": "judgment", 855 | "kilogramme": "kilogram", 856 | "kilogrammes": "kilograms", 857 | "kilometre": "kilometer", 858 | "kilometres": "kilometers", 859 | "labelled": "labeled", 860 | "labelling": "labeling", 861 | "labour": "labor", 862 | "laboured": "labored", 863 | "labourer": "laborer", 864 | "labourers": "laborers", 865 | "labouring": "laboring", 866 | "labours": 
"labors", 867 | "lacklustre": "lackluster", 868 | "legalisation": "legalization", 869 | "legalise": "legalize", 870 | "legalised": "legalized", 871 | "legalises": "legalizes", 872 | "legalising": "legalizing", 873 | "legitimise": "legitimize", 874 | "legitimised": "legitimized", 875 | "legitimises": "legitimizes", 876 | "legitimising": "legitimizing", 877 | "leukaemia": "leukemia", 878 | "levelled": "leveled", 879 | "leveller": "leveler", 880 | "levellers": "levelers", 881 | "levelling": "leveling", 882 | "libelled": "libeled", 883 | "libelling": "libeling", 884 | "libellous": "libelous", 885 | "liberalisation": "liberalization", 886 | "liberalise": "liberalize", 887 | "liberalised": "liberalized", 888 | "liberalises": "liberalizes", 889 | "liberalising": "liberalizing", 890 | "licence": "license", 891 | "licenced": "licensed", 892 | "licences": "licenses", 893 | "licencing": "licensing", 894 | "likeable": "likable", 895 | "lionisation": "lionization", 896 | "lionise": "lionize", 897 | "lionised": "lionized", 898 | "lionises": "lionizes", 899 | "lionising": "lionizing", 900 | "liquidise": "liquidize", 901 | "liquidised": "liquidized", 902 | "liquidiser": "liquidizer", 903 | "liquidisers": "liquidizers", 904 | "liquidises": "liquidizes", 905 | "liquidising": "liquidizing", 906 | "litre": "liter", 907 | "litres": "liters", 908 | "localise": "localize", 909 | "localised": "localized", 910 | "localises": "localizes", 911 | "localising": "localizing", 912 | "louvre": "louver", 913 | "louvred": "louvered", 914 | "louvres": "louvers", 915 | "lustre": "luster", 916 | "magnetise": "magnetize", 917 | "magnetised": "magnetized", 918 | "magnetises": "magnetizes", 919 | "magnetising": "magnetizing", 920 | "manoeuvrability": "maneuverability", 921 | "manoeuvrable": "maneuverable", 922 | "manoeuvre": "maneuver", 923 | "manoeuvred": "maneuvered", 924 | "manoeuvres": "maneuvers", 925 | "manoeuvring": "maneuvering", 926 | "manoeuvrings": "maneuverings", 927 | "marginalisation": 
"marginalization", 928 | "marginalise": "marginalize", 929 | "marginalised": "marginalized", 930 | "marginalises": "marginalizes", 931 | "marginalising": "marginalizing", 932 | "marshalled": "marshaled", 933 | "marshalling": "marshaling", 934 | "marvelled": "marveled", 935 | "marvelling": "marveling", 936 | "marvellous": "marvelous", 937 | "marvellously": "marvelously", 938 | "materialisation": "materialization", 939 | "materialise": "materialize", 940 | "materialised": "materialized", 941 | "materialises": "materializes", 942 | "materialising": "materializing", 943 | "maximisation": "maximization", 944 | "maximise": "maximize", 945 | "maximised": "maximized", 946 | "maximises": "maximizes", 947 | "maximising": "maximizing", 948 | "meagre": "meager", 949 | "mechanisation": "mechanization", 950 | "mechanise": "mechanize", 951 | "mechanised": "mechanized", 952 | "mechanises": "mechanizes", 953 | "mechanising": "mechanizing", 954 | "mediaeval": "medieval", 955 | "memorialise": "memorialize", 956 | "memorialised": "memorialized", 957 | "memorialises": "memorializes", 958 | "memorialising": "memorializing", 959 | "memorise": "memorize", 960 | "memorised": "memorized", 961 | "memorises": "memorizes", 962 | "memorising": "memorizing", 963 | "mesmerise": "mesmerize", 964 | "mesmerised": "mesmerized", 965 | "mesmerises": "mesmerizes", 966 | "mesmerising": "mesmerizing", 967 | "metabolise": "metabolize", 968 | "metabolised": "metabolized", 969 | "metabolises": "metabolizes", 970 | "metabolising": "metabolizing", 971 | "metre": "meter", 972 | "metres": "meters", 973 | "micrometre": "micrometer", 974 | "micrometres": "micrometers", 975 | "militarise": "militarize", 976 | "militarised": "militarized", 977 | "militarises": "militarizes", 978 | "militarising": "militarizing", 979 | "milligramme": "milligram", 980 | "milligrammes": "milligrams", 981 | "millilitre": "milliliter", 982 | "millilitres": "milliliters", 983 | "millimetre": "millimeter", 984 | "millimetres": 
"millimeters", 985 | "miniaturisation": "miniaturization", 986 | "miniaturise": "miniaturize", 987 | "miniaturised": "miniaturized", 988 | "miniaturises": "miniaturizes", 989 | "miniaturising": "miniaturizing", 990 | "minibusses": "minibuses", 991 | "minimise": "minimize", 992 | "minimised": "minimized", 993 | "minimises": "minimizes", 994 | "minimising": "minimizing", 995 | "misbehaviour": "misbehavior", 996 | "misdemeanour": "misdemeanor", 997 | "misdemeanours": "misdemeanors", 998 | "misspelt": "misspelled", 999 | "mitre": "miter", 1000 | "mitres": "miters", 1001 | "mobilisation": "mobilization", 1002 | "mobilise": "mobilize", 1003 | "mobilised": "mobilized", 1004 | "mobilises": "mobilizes", 1005 | "mobilising": "mobilizing", 1006 | "modelled": "modeled", 1007 | "modeller": "modeler", 1008 | "modellers": "modelers", 1009 | "modelling": "modeling", 1010 | "modernise": "modernize", 1011 | "modernised": "modernized", 1012 | "modernises": "modernizes", 1013 | "modernising": "modernizing", 1014 | "moisturise": "moisturize", 1015 | "moisturised": "moisturized", 1016 | "moisturiser": "moisturizer", 1017 | "moisturisers": "moisturizers", 1018 | "moisturises": "moisturizes", 1019 | "moisturising": "moisturizing", 1020 | "monologue": "monolog", 1021 | "monologues": "monologs", 1022 | "monopolisation": "monopolization", 1023 | "monopolise": "monopolize", 1024 | "monopolised": "monopolized", 1025 | "monopolises": "monopolizes", 1026 | "monopolising": "monopolizing", 1027 | "moralise": "moralize", 1028 | "moralised": "moralized", 1029 | "moralises": "moralizes", 1030 | "moralising": "moralizing", 1031 | "motorised": "motorized", 1032 | "mould": "mold", 1033 | "moulded": "molded", 1034 | "moulder": "molder", 1035 | "mouldered": "moldered", 1036 | "mouldering": "moldering", 1037 | "moulders": "molders", 1038 | "mouldier": "moldier", 1039 | "mouldiest": "moldiest", 1040 | "moulding": "molding", 1041 | "mouldings": "moldings", 1042 | "moulds": "molds", 1043 | "mouldy": "moldy", 
1044 | "moult": "molt", 1045 | "moulted": "molted", 1046 | "moulting": "molting", 1047 | "moults": "molts", 1048 | "moustache": "mustache", 1049 | "moustached": "mustached", 1050 | "moustaches": "mustaches", 1051 | "moustachioed": "mustachioed", 1052 | "multicoloured": "multicolored", 1053 | "nationalisation": "nationalization", 1054 | "nationalisations": "nationalizations", 1055 | "nationalise": "nationalize", 1056 | "nationalised": "nationalized", 1057 | "nationalises": "nationalizes", 1058 | "nationalising": "nationalizing", 1059 | "naturalisation": "naturalization", 1060 | "naturalise": "naturalize", 1061 | "naturalised": "naturalized", 1062 | "naturalises": "naturalizes", 1063 | "naturalising": "naturalizing", 1064 | "neighbour": "neighbor", 1065 | "neighbourhood": "neighborhood", 1066 | "neighbourhoods": "neighborhoods", 1067 | "neighbouring": "neighboring", 1068 | "neighbourliness": "neighborliness", 1069 | "neighbourly": "neighborly", 1070 | "neighbours": "neighbors", 1071 | "neutralisation": "neutralization", 1072 | "neutralise": "neutralize", 1073 | "neutralised": "neutralized", 1074 | "neutralises": "neutralizes", 1075 | "neutralising": "neutralizing", 1076 | "normalisation": "normalization", 1077 | "normalise": "normalize", 1078 | "normalised": "normalized", 1079 | "normalises": "normalizes", 1080 | "normalising": "normalizing", 1081 | "odour": "odor", 1082 | "odourless": "odorless", 1083 | "odours": "odors", 1084 | "oesophagus": "esophagus", 1085 | "oesophaguses": "esophaguses", 1086 | "oestrogen": "estrogen", 1087 | "offence": "offense", 1088 | "offences": "offenses", 1089 | "omelette": "omelet", 1090 | "omelettes": "omelets", 1091 | "optimise": "optimize", 1092 | "optimised": "optimized", 1093 | "optimises": "optimizes", 1094 | "optimising": "optimizing", 1095 | "organisation": "organization", 1096 | "organisational": "organizational", 1097 | "organisations": "organizations", 1098 | "organise": "organize", 1099 | "organised": "organized", 1100 | 
"organiser": "organizer", 1101 | "organisers": "organizers", 1102 | "organises": "organizes", 1103 | "organising": "organizing", 1104 | "orthopaedic": "orthopedic", 1105 | "orthopaedics": "orthopedics", 1106 | "ostracise": "ostracize", 1107 | "ostracised": "ostracized", 1108 | "ostracises": "ostracizes", 1109 | "ostracising": "ostracizing", 1110 | "outmanoeuvre": "outmaneuver", 1111 | "outmanoeuvred": "outmaneuvered", 1112 | "outmanoeuvres": "outmaneuvers", 1113 | "outmanoeuvring": "outmaneuvering", 1114 | "overemphasise": "overemphasize", 1115 | "overemphasised": "overemphasized", 1116 | "overemphasises": "overemphasizes", 1117 | "overemphasising": "overemphasizing", 1118 | "oxidisation": "oxidization", 1119 | "oxidise": "oxidize", 1120 | "oxidised": "oxidized", 1121 | "oxidises": "oxidizes", 1122 | "oxidising": "oxidizing", 1123 | "paederast": "pederast", 1124 | "paederasts": "pederasts", 1125 | "paediatric": "pediatric", 1126 | "paediatrician": "pediatrician", 1127 | "paediatricians": "pediatricians", 1128 | "paediatrics": "pediatrics", 1129 | "paedophile": "pedophile", 1130 | "paedophiles": "pedophiles", 1131 | "paedophilia": "pedophilia", 1132 | "palaeolithic": "paleolithic", 1133 | "palaeontologist": "paleontologist", 1134 | "palaeontologists": "paleontologists", 1135 | "palaeontology": "paleontology", 1136 | "panelled": "paneled", 1137 | "panelling": "paneling", 1138 | "panellist": "panelist", 1139 | "panellists": "panelists", 1140 | "paralyse": "paralyze", 1141 | "paralysed": "paralyzed", 1142 | "paralyses": "paralyzes", 1143 | "paralysing": "paralyzing", 1144 | "parcelled": "parceled", 1145 | "parcelling": "parceling", 1146 | "parlour": "parlor", 1147 | "parlours": "parlors", 1148 | "particularise": "particularize", 1149 | "particularised": "particularized", 1150 | "particularises": "particularizes", 1151 | "particularising": "particularizing", 1152 | "passivisation": "passivization", 1153 | "passivise": "passivize", 1154 | "passivised": "passivized", 1155 
| "passivises": "passivizes", 1156 | "passivising": "passivizing", 1157 | "pasteurisation": "pasteurization", 1158 | "pasteurise": "pasteurize", 1159 | "pasteurised": "pasteurized", 1160 | "pasteurises": "pasteurizes", 1161 | "pasteurising": "pasteurizing", 1162 | "patronise": "patronize", 1163 | "patronised": "patronized", 1164 | "patronises": "patronizes", 1165 | "patronising": "patronizing", 1166 | "patronisingly": "patronizingly", 1167 | "pedalled": "pedaled", 1168 | "pedalling": "pedaling", 1169 | "pedestrianisation": "pedestrianization", 1170 | "pedestrianise": "pedestrianize", 1171 | "pedestrianised": "pedestrianized", 1172 | "pedestrianises": "pedestrianizes", 1173 | "pedestrianising": "pedestrianizing", 1174 | "penalise": "penalize", 1175 | "penalised": "penalized", 1176 | "penalises": "penalizes", 1177 | "penalising": "penalizing", 1178 | "pencilled": "penciled", 1179 | "pencilling": "penciling", 1180 | "personalise": "personalize", 1181 | "personalised": "personalized", 1182 | "personalises": "personalizes", 1183 | "personalising": "personalizing", 1184 | "pharmacopoeia": "pharmacopeia", 1185 | "pharmacopoeias": "pharmacopeias", 1186 | "philosophise": "philosophize", 1187 | "philosophised": "philosophized", 1188 | "philosophises": "philosophizes", 1189 | "philosophising": "philosophizing", 1190 | "philtre": "filter", 1191 | "philtres": "filters", 1192 | "phoney": "phony", 1193 | "plagiarise": "plagiarize", 1194 | "plagiarised": "plagiarized", 1195 | "plagiarises": "plagiarizes", 1196 | "plagiarising": "plagiarizing", 1197 | "plough": "plow", 1198 | "ploughed": "plowed", 1199 | "ploughing": "plowing", 1200 | "ploughman": "plowman", 1201 | "ploughmen": "plowmen", 1202 | "ploughs": "plows", 1203 | "ploughshare": "plowshare", 1204 | "ploughshares": "plowshares", 1205 | "polarisation": "polarization", 1206 | "polarise": "polarize", 1207 | "polarised": "polarized", 1208 | "polarises": "polarizes", 1209 | "polarising": "polarizing", 1210 | "politicisation": 
"politicization", 1211 | "politicise": "politicize", 1212 | "politicised": "politicized", 1213 | "politicises": "politicizes", 1214 | "politicising": "politicizing", 1215 | "popularisation": "popularization", 1216 | "popularise": "popularize", 1217 | "popularised": "popularized", 1218 | "popularises": "popularizes", 1219 | "popularising": "popularizing", 1220 | "pouffe": "pouf", 1221 | "pouffes": "poufs", 1222 | "practise": "practice", 1223 | "practised": "practiced", 1224 | "practises": "practices", 1225 | "practising": "practicing", 1226 | "praesidium": "presidium", 1227 | "praesidiums": "presidiums", 1228 | "pressurisation": "pressurization", 1229 | "pressurise": "pressurize", 1230 | "pressurised": "pressurized", 1231 | "pressurises": "pressurizes", 1232 | "pressurising": "pressurizing", 1233 | "pretence": "pretense", 1234 | "pretences": "pretenses", 1235 | "primaeval": "primeval", 1236 | "prioritisation": "prioritization", 1237 | "prioritise": "prioritize", 1238 | "prioritised": "prioritized", 1239 | "prioritises": "prioritizes", 1240 | "prioritising": "prioritizing", 1241 | "privatisation": "privatization", 1242 | "privatisations": "privatizations", 1243 | "privatise": "privatize", 1244 | "privatised": "privatized", 1245 | "privatises": "privatizes", 1246 | "privatising": "privatizing", 1247 | "professionalisation": "professionalization", 1248 | "professionalise": "professionalize", 1249 | "professionalised": "professionalized", 1250 | "professionalises": "professionalizes", 1251 | "professionalising": "professionalizing", 1252 | "programme": "program", 1253 | "programmes": "programs", 1254 | "prologue": "prolog", 1255 | "prologues": "prologs", 1256 | "propagandise": "propagandize", 1257 | "propagandised": "propagandized", 1258 | "propagandises": "propagandizes", 1259 | "propagandising": "propagandizing", 1260 | "proselytise": "proselytize", 1261 | "proselytised": "proselytized", 1262 | "proselytiser": "proselytizer", 1263 | "proselytisers": "proselytizers", 
1264 | "proselytises": "proselytizes", 1265 | "proselytising": "proselytizing", 1266 | "psychoanalyse": "psychoanalyze", 1267 | "psychoanalysed": "psychoanalyzed", 1268 | "psychoanalyses": "psychoanalyzes", 1269 | "psychoanalysing": "psychoanalyzing", 1270 | "publicise": "publicize", 1271 | "publicised": "publicized", 1272 | "publicises": "publicizes", 1273 | "publicising": "publicizing", 1274 | "pulverisation": "pulverization", 1275 | "pulverise": "pulverize", 1276 | "pulverised": "pulverized", 1277 | "pulverises": "pulverizes", 1278 | "pulverising": "pulverizing", 1279 | "pummelled": "pummeled", 1280 | "pummelling": "pummeling", 1281 | "pyjama": "pajama", 1282 | "pyjamas": "pajamas", 1283 | "pzazz": "pizzazz", 1284 | "quarrelled": "quarreled", 1285 | "quarrelling": "quarreling", 1286 | "radicalise": "radicalize", 1287 | "radicalised": "radicalized", 1288 | "radicalises": "radicalizes", 1289 | "radicalising": "radicalizing", 1290 | "rancour": "rancor", 1291 | "randomise": "randomize", 1292 | "randomised": "randomized", 1293 | "randomises": "randomizes", 1294 | "randomising": "randomizing", 1295 | "rationalisation": "rationalization", 1296 | "rationalisations": "rationalizations", 1297 | "rationalise": "rationalize", 1298 | "rationalised": "rationalized", 1299 | "rationalises": "rationalizes", 1300 | "rationalising": "rationalizing", 1301 | "ravelled": "raveled", 1302 | "ravelling": "raveling", 1303 | "realisable": "realizable", 1304 | "realisation": "realization", 1305 | "realisations": "realizations", 1306 | "realise": "realize", 1307 | "realised": "realized", 1308 | "realises": "realizes", 1309 | "realising": "realizing", 1310 | "recognisable": "recognizable", 1311 | "recognisably": "recognizably", 1312 | "recognisance": "recognizance", 1313 | "recognise": "recognize", 1314 | "recognised": "recognized", 1315 | "recognises": "recognizes", 1316 | "recognising": "recognizing", 1317 | "reconnoitre": "reconnoiter", 1318 | "reconnoitred": "reconnoitered", 1319 | 
"reconnoitres": "reconnoiters", 1320 | "reconnoitring": "reconnoitering", 1321 | "refuelled": "refueled", 1322 | "refuelling": "refueling", 1323 | "regularisation": "regularization", 1324 | "regularise": "regularize", 1325 | "regularised": "regularized", 1326 | "regularises": "regularizes", 1327 | "regularising": "regularizing", 1328 | "remodelled": "remodeled", 1329 | "remodelling": "remodeling", 1330 | "remould": "remold", 1331 | "remoulded": "remolded", 1332 | "remoulding": "remolding", 1333 | "remoulds": "remolds", 1334 | "reorganisation": "reorganization", 1335 | "reorganisations": "reorganizations", 1336 | "reorganise": "reorganize", 1337 | "reorganised": "reorganized", 1338 | "reorganises": "reorganizes", 1339 | "reorganising": "reorganizing", 1340 | "revelled": "reveled", 1341 | "reveller": "reveler", 1342 | "revellers": "revelers", 1343 | "revelling": "reveling", 1344 | "revitalise": "revitalize", 1345 | "revitalised": "revitalized", 1346 | "revitalises": "revitalizes", 1347 | "revitalising": "revitalizing", 1348 | "revolutionise": "revolutionize", 1349 | "revolutionised": "revolutionized", 1350 | "revolutionises": "revolutionizes", 1351 | "revolutionising": "revolutionizing", 1352 | "rhapsodise": "rhapsodize", 1353 | "rhapsodised": "rhapsodized", 1354 | "rhapsodises": "rhapsodizes", 1355 | "rhapsodising": "rhapsodizing", 1356 | "rigour": "rigor", 1357 | "rigours": "rigors", 1358 | "ritualised": "ritualized", 1359 | "rivalled": "rivaled", 1360 | "rivalling": "rivaling", 1361 | "romanticise": "romanticize", 1362 | "romanticised": "romanticized", 1363 | "romanticises": "romanticizes", 1364 | "romanticising": "romanticizing", 1365 | "rumour": "rumor", 1366 | "rumoured": "rumored", 1367 | "rumours": "rumors", 1368 | "sabre": "saber", 1369 | "sabres": "sabers", 1370 | "saltpetre": "saltpeter", 1371 | "sanitise": "sanitize", 1372 | "sanitised": "sanitized", 1373 | "sanitises": "sanitizes", 1374 | "sanitising": "sanitizing", 1375 | "satirise": "satirize", 1376 | 
"satirised": "satirized", 1377 | "satirises": "satirizes", 1378 | "satirising": "satirizing", 1379 | "saviour": "savior", 1380 | "saviours": "saviors", 1381 | "savour": "savor", 1382 | "savoured": "savored", 1383 | "savouries": "savories", 1384 | "savouring": "savoring", 1385 | "savours": "savors", 1386 | "savoury": "savory", 1387 | "scandalise": "scandalize", 1388 | "scandalised": "scandalized", 1389 | "scandalises": "scandalizes", 1390 | "scandalising": "scandalizing", 1391 | "sceptic": "skeptic", 1392 | "sceptical": "skeptical", 1393 | "sceptically": "skeptically", 1394 | "scepticism": "skepticism", 1395 | "sceptics": "skeptics", 1396 | "sceptre": "scepter", 1397 | "sceptres": "scepters", 1398 | "scrutinise": "scrutinize", 1399 | "scrutinised": "scrutinized", 1400 | "scrutinises": "scrutinizes", 1401 | "scrutinising": "scrutinizing", 1402 | "secularisation": "secularization", 1403 | "secularise": "secularize", 1404 | "secularised": "secularized", 1405 | "secularises": "secularizes", 1406 | "secularising": "secularizing", 1407 | "sensationalise": "sensationalize", 1408 | "sensationalised": "sensationalized", 1409 | "sensationalises": "sensationalizes", 1410 | "sensationalising": "sensationalizing", 1411 | "sensitise": "sensitize", 1412 | "sensitised": "sensitized", 1413 | "sensitises": "sensitizes", 1414 | "sensitising": "sensitizing", 1415 | "sentimentalise": "sentimentalize", 1416 | "sentimentalised": "sentimentalized", 1417 | "sentimentalises": "sentimentalizes", 1418 | "sentimentalising": "sentimentalizing", 1419 | "sepulchre": "sepulcher", 1420 | "sepulchres": "sepulchers", 1421 | "serialisation": "serialization", 1422 | "serialisations": "serializations", 1423 | "serialise": "serialize", 1424 | "serialised": "serialized", 1425 | "serialises": "serializes", 1426 | "serialising": "serializing", 1427 | "sermonise": "sermonize", 1428 | "sermonised": "sermonized", 1429 | "sermonises": "sermonizes", 1430 | "sermonising": "sermonizing", 1431 | "sheikh": "sheik", 
1432 | "shovelled": "shoveled", 1433 | "shovelling": "shoveling", 1434 | "shrivelled": "shriveled", 1435 | "shrivelling": "shriveling", 1436 | "signalise": "signalize", 1437 | "signalised": "signalized", 1438 | "signalises": "signalizes", 1439 | "signalising": "signalizing", 1440 | "signalled": "signaled", 1441 | "signalling": "signaling", 1442 | "smoulder": "smolder", 1443 | "smouldered": "smoldered", 1444 | "smouldering": "smoldering", 1445 | "smoulders": "smolders", 1446 | "snivelled": "sniveled", 1447 | "snivelling": "sniveling", 1448 | "snorkelled": "snorkeled", 1449 | "snorkelling": "snorkeling", 1450 | "snowplough": "snowplow", 1451 | "snowploughs": "snowplow", 1452 | "socialisation": "socialization", 1453 | "socialise": "socialize", 1454 | "socialised": "socialized", 1455 | "socialises": "socializes", 1456 | "socialising": "socializing", 1457 | "sodomise": "sodomize", 1458 | "sodomised": "sodomized", 1459 | "sodomises": "sodomizes", 1460 | "sodomising": "sodomizing", 1461 | "solemnise": "solemnize", 1462 | "solemnised": "solemnized", 1463 | "solemnises": "solemnizes", 1464 | "solemnising": "solemnizing", 1465 | "sombre": "somber", 1466 | "specialisation": "specialization", 1467 | "specialisations": "specializations", 1468 | "specialise": "specialize", 1469 | "specialised": "specialized", 1470 | "specialises": "specializes", 1471 | "specialising": "specializing", 1472 | "spectre": "specter", 1473 | "spectres": "specters", 1474 | "spiralled": "spiraled", 1475 | "spiralling": "spiraling", 1476 | "splendour": "splendor", 1477 | "splendours": "splendors", 1478 | "squirrelled": "squirreled", 1479 | "squirrelling": "squirreling", 1480 | "stabilisation": "stabilization", 1481 | "stabilise": "stabilize", 1482 | "stabilised": "stabilized", 1483 | "stabiliser": "stabilizer", 1484 | "stabilisers": "stabilizers", 1485 | "stabilises": "stabilizes", 1486 | "stabilising": "stabilizing", 1487 | "standardisation": "standardization", 1488 | "standardise": "standardize", 1489 
| "standardised": "standardized", 1490 | "standardises": "standardizes", 1491 | "standardising": "standardizing", 1492 | "stencilled": "stenciled", 1493 | "stencilling": "stenciling", 1494 | "sterilisation": "sterilization", 1495 | "sterilisations": "sterilizations", 1496 | "sterilise": "sterilize", 1497 | "sterilised": "sterilized", 1498 | "steriliser": "sterilizer", 1499 | "sterilisers": "sterilizers", 1500 | "sterilises": "sterilizes", 1501 | "sterilising": "sterilizing", 1502 | "stigmatisation": "stigmatization", 1503 | "stigmatise": "stigmatize", 1504 | "stigmatised": "stigmatized", 1505 | "stigmatises": "stigmatizes", 1506 | "stigmatising": "stigmatizing", 1507 | "storey": "story", 1508 | "storeys": "stories", 1509 | "subsidisation": "subsidization", 1510 | "subsidise": "subsidize", 1511 | "subsidised": "subsidized", 1512 | "subsidiser": "subsidizer", 1513 | "subsidisers": "subsidizers", 1514 | "subsidises": "subsidizes", 1515 | "subsidising": "subsidizing", 1516 | "succour": "succor", 1517 | "succoured": "succored", 1518 | "succouring": "succoring", 1519 | "succours": "succors", 1520 | "sulphate": "sulfate", 1521 | "sulphates": "sulfates", 1522 | "sulphide": "sulfide", 1523 | "sulphides": "sulfides", 1524 | "sulphur": "sulfur", 1525 | "sulphurous": "sulfurous", 1526 | "summarise": "summarize", 1527 | "summarised": "summarized", 1528 | "summarises": "summarizes", 1529 | "summarising": "summarizing", 1530 | "swivelled": "swiveled", 1531 | "swivelling": "swiveling", 1532 | "symbolise": "symbolize", 1533 | "symbolised": "symbolized", 1534 | "symbolises": "symbolizes", 1535 | "symbolising": "symbolizing", 1536 | "sympathise": "sympathize", 1537 | "sympathised": "sympathized", 1538 | "sympathiser": "sympathizer", 1539 | "sympathisers": "sympathizers", 1540 | "sympathises": "sympathizes", 1541 | "sympathising": "sympathizing", 1542 | "synchronisation": "synchronization", 1543 | "synchronise": "synchronize", 1544 | "synchronised": "synchronized", 1545 | 
"synchronises": "synchronizes", 1546 | "synchronising": "synchronizing", 1547 | "synthesise": "synthesize", 1548 | "synthesised": "synthesized", 1549 | "synthesiser": "synthesizer", 1550 | "synthesisers": "synthesizers", 1551 | "synthesises": "synthesizes", 1552 | "synthesising": "synthesizing", 1553 | "syphon": "siphon", 1554 | "syphoned": "siphoned", 1555 | "syphoning": "siphoning", 1556 | "syphons": "siphons", 1557 | "systematisation": "systematization", 1558 | "systematise": "systematize", 1559 | "systematised": "systematized", 1560 | "systematises": "systematizes", 1561 | "systematising": "systematizing", 1562 | "tantalise": "tantalize", 1563 | "tantalised": "tantalized", 1564 | "tantalises": "tantalizes", 1565 | "tantalising": "tantalizing", 1566 | "tantalisingly": "tantalizingly", 1567 | "tasselled": "tasseled", 1568 | "technicolour": "technicolor", 1569 | "temporise": "temporize", 1570 | "temporised": "temporized", 1571 | "temporises": "temporizes", 1572 | "temporising": "temporizing", 1573 | "tenderise": "tenderize", 1574 | "tenderised": "tenderized", 1575 | "tenderises": "tenderizes", 1576 | "tenderising": "tenderizing", 1577 | "terrorise": "terrorize", 1578 | "terrorised": "terrorized", 1579 | "terrorises": "terrorizes", 1580 | "terrorising": "terrorizing", 1581 | "theatre": "theater", 1582 | "theatregoer": "theatergoer", 1583 | "theatregoers": "theatergoers", 1584 | "theatres": "theaters", 1585 | "theorise": "theorize", 1586 | "theorised": "theorized", 1587 | "theorises": "theorizes", 1588 | "theorising": "theorizing", 1589 | "tonne": "ton", 1590 | "tonnes": "tons", 1591 | "towelled": "toweled", 1592 | "towelling": "toweling", 1593 | "toxaemia": "toxemia", 1594 | "tranquillise": "tranquilize", 1595 | "tranquillised": "tranquilized", 1596 | "tranquilliser": "tranquilizer", 1597 | "tranquillisers": "tranquilizers", 1598 | "tranquillises": "tranquilizes", 1599 | "tranquillising": "tranquilizing", 1600 | "tranquillity": "tranquility", 1601 | "tranquillize": 
"tranquilize", 1602 | "tranquillized": "tranquilized", 1603 | "tranquillizer": "tranquilizer", 1604 | "tranquillizers": "tranquilizers", 1605 | "tranquillizes": "tranquilizes", 1606 | "tranquillizing": "tranquilizing", 1607 | "tranquilly": "tranquility", 1608 | "transistorised": "transistorized", 1609 | "traumatise": "traumatize", 1610 | "traumatised": "traumatized", 1611 | "traumatises": "traumatizes", 1612 | "traumatising": "traumatizing", 1613 | "travelled": "traveled", 1614 | "traveller": "traveler", 1615 | "travellers": "travelers", 1616 | "travelling": "traveling", 1617 | "travelog": "travelogue", 1618 | "travelogs": "travelogues", 1619 | "trialled": "trialed", 1620 | "trialling": "trialing", 1621 | "tricolour": "tricolor", 1622 | "tricolours": "tricolors", 1623 | "trivialise": "trivialize", 1624 | "trivialised": "trivialized", 1625 | "trivialises": "trivializes", 1626 | "trivialising": "trivializing", 1627 | "tumour": "tumor", 1628 | "tumours": "tumors", 1629 | "tunnelled": "tunneled", 1630 | "tunnelling": "tunneling", 1631 | "tyrannise": "tyrannize", 1632 | "tyrannised": "tyrannized", 1633 | "tyrannises": "tyrannizes", 1634 | "tyrannising": "tyrannizing", 1635 | "tyre": "tire", 1636 | "tyres": "tires", 1637 | "unauthorised": "unauthorized", 1638 | "uncivilised": "uncivilized", 1639 | "underutilised": "underutilized", 1640 | "unequalled": "unequaled", 1641 | "unfavourable": "unfavorable", 1642 | "unfavourably": "unfavorably", 1643 | "unionisation": "unionization", 1644 | "unionise": "unionize", 1645 | "unionised": "unionized", 1646 | "unionises": "unionizes", 1647 | "unionising": "unionizing", 1648 | "unorganised": "unorganized", 1649 | "unravelled": "unraveled", 1650 | "unravelling": "unraveling", 1651 | "unrecognisable": "unrecognizable", 1652 | "unrecognised": "unrecognized", 1653 | "unrivalled": "unrivaled", 1654 | "unsavoury": "unsavory", 1655 | "untrammelled": "untrammeled", 1656 | "urbanisation": "urbanization", 1657 | "urbanise": "urbanize", 1658 | 
"urbanised": "urbanized", 1659 | "urbanises": "urbanizes", 1660 | "urbanising": "urbanizing", 1661 | "utilisable": "utilizable", 1662 | "utilisation": "utilization", 1663 | "utilise": "utilize", 1664 | "utilised": "utilized", 1665 | "utilises": "utilizes", 1666 | "utilising": "utilizing", 1667 | "valour": "valor", 1668 | "vandalise": "vandalize", 1669 | "vandalised": "vandalized", 1670 | "vandalises": "vandalizes", 1671 | "vandalising": "vandalizing", 1672 | "vaporisation": "vaporization", 1673 | "vaporise": "vaporize", 1674 | "vaporised": "vaporized", 1675 | "vaporises": "vaporizes", 1676 | "vaporising": "vaporizing", 1677 | "vapour": "vapor", 1678 | "vapours": "vapors", 1679 | "verbalise": "verbalize", 1680 | "verbalised": "verbalized", 1681 | "verbalises": "verbalizes", 1682 | "verbalising": "verbalizing", 1683 | "victimisation": "victimization", 1684 | "victimise": "victimize", 1685 | "victimised": "victimized", 1686 | "victimises": "victimizes", 1687 | "victimising": "victimizing", 1688 | "videodisc": "videodisk", 1689 | "videodiscs": "videodisks", 1690 | "vigour": "vigor", 1691 | "visualisation": "visualization", 1692 | "visualisations": "visualizations", 1693 | "visualise": "visualize", 1694 | "visualised": "visualized", 1695 | "visualises": "visualizes", 1696 | "visualising": "visualizing", 1697 | "vocalisation": "vocalization", 1698 | "vocalisations": "vocalizations", 1699 | "vocalise": "vocalize", 1700 | "vocalised": "vocalized", 1701 | "vocalises": "vocalizes", 1702 | "vocalising": "vocalizing", 1703 | "vulcanised": "vulcanized", 1704 | "vulgarisation": "vulgarization", 1705 | "vulgarise": "vulgarize", 1706 | "vulgarised": "vulgarized", 1707 | "vulgarises": "vulgarizes", 1708 | "vulgarising": "vulgarizing", 1709 | "waggon": "wagon", 1710 | "waggons": "wagons", 1711 | "watercolour": "watercolor", 1712 | "watercolours": "watercolors", 1713 | "weaselled": "weaseled", 1714 | "weaselling": "weaseling", 1715 | "westernisation": "westernization", 1716 | 
"westernise": "westernize", 1717 | "westernised": "westernized", 1718 | "westernises": "westernizes", 1719 | "westernising": "westernizing", 1720 | "womanise": "womanize", 1721 | "womanised": "womanized", 1722 | "womaniser": "womanizer", 1723 | "womanisers": "womanizers", 1724 | "womanises": "womanizes", 1725 | "womanising": "womanizing", 1726 | "woollen": "woolen", 1727 | "woollens": "woolens", 1728 | "woollies": "woolies", 1729 | "woolly": "wooly", 1730 | "worshipped": "worshiped", 1731 | "worshipping": "worshiping", 1732 | "worshipper": "worshiper", 1733 | "yodelled": "yodeled", 1734 | "yodelling": "yodeling", 1735 | "yoghourt": "yogurt", 1736 | "yoghourts": "yogurts", 1737 | "yoghurt": "yogurt", 1738 | "yoghurts": "yogurts", 1739 | "mhm": "hmm", 1740 | "mm": "hmm", 1741 | "mmm": "hmm" 1742 | } -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/english.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from fractions import Fraction 5 | from typing import Iterator, List, Match, Optional, Union 6 | 7 | from more_itertools import windowed 8 | 9 | from .basic import remove_symbols_and_diacritics 10 | 11 | 12 | class EnglishNumberNormalizer: 13 | """ 14 | Convert any spelled-out numbers into arabic numbers, while handling: 15 | 16 | - remove any commas 17 | - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc. 18 | - spell out currency symbols after the number. e.g. 
    def __init__(self) -> None:
        """Build the lookup tables used by `process_words`.

        All tables map a spelled-out token to either an int value or a
        `(value, suffix)` pair; `process_words` consults them to convert
        spelled-out numbers into arabic numerals.
        """
        super().__init__()

        # Words that denote a zero digit ("o" / "oh" as in "one oh one").
        self.zeros = {"o", "oh", "zero"}
        # "one" .. "nineteen" -> 1 .. 19
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one",
                    "two",
                    "three",
                    "four",
                    "five",
                    "six",
                    "seven",
                    "eight",
                    "nine",
                    "ten",
                    "eleven",
                    "twelve",
                    "thirteen",
                    "fourteen",
                    "fifteen",
                    "sixteen",
                    "seventeen",
                    "eighteen",
                    "nineteen",
                ],
                start=1,
            )
        }
        # Plural forms keep an "s" suffix; "six" pluralizes irregularly to "sixes".
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        # Ordinal forms; irregular ordinals listed explicitly, the rest derived
        # ("eight" -> "eighth" because it already ends in "t", others get "th").
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            "twelfth": (12, "th"),
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        # "twenty" .. "ninety" -> 20 .. 90
        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        # "twenties" -> (20, "s"), "twentieth" -> (20, "th"), etc.
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        # Scale words; powers of ten up to 10**33.
        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
        # Tokens that may follow "point" in a decimal reading.
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        # Sign words that precede a number ("minus three" -> "-3").
        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        # Currency words that follow a number but are emitted as a prefix symbol
        # ("three dollars" -> "$3").
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        # All symbols that may appear glued to the front of a numeric token.
        self.prefixes = set(
            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
        )
        # Suffix words; "per" only becomes "%" when followed by "cent".
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        # Words with special parsing rules in process_words.
        self.specials = {"and", "double", "triple", "point"}

        # Union of every token the normalizer understands.
        self.words = set(
            [
                key
                for mapping in [
                    self.zeros,
                    self.ones,
                    self.ones_suffixed,
                    self.tens,
                    self.tens_suffixed,
                    self.multipliers,
                    self.multipliers_suffixed,
                    self.preceding_prefixers,
                    self.following_prefixers,
                    self.suffixers,
                    self.specials,
                ]
                for key in mapping
            ]
        )
        # "one"/"ones" are kept literal in some contexts (see postprocess).
        self.literal_words = {"one", "ones"}
    def process_words(self, words: List[str]) -> Iterator[str]:
        """Scan `words` left to right, merging spelled-out number tokens.

        Yields output tokens one at a time; non-number words pass through
        unchanged. `value` accumulates the number being built — it is an int
        while pure arithmetic composition is possible and becomes a str once
        digits must be concatenated (e.g. "one oh one"). `prefix` holds a
        pending sign/currency symbol to glue onto the next emitted number.
        """
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False  # set when the current token also consumed the next one

        def to_fraction(s: str):
            # Parse a numeric string; None when it is not a valid fraction.
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # Emit `result` with any pending prefix attached and reset state.
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        if len(words) == 0:
            return

        # Slide a (prev, current, next) window over the padded word list.
        # NOTE: `next` shadows the builtin here (kept as-is from the original).
        for prev, current, next in windowed([None] + words + [None], 3):
            if skip:
                skip = False
                continue

            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words: flush any pending number, pass word through
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                # append a literal "0" digit (string concatenation mode)
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    # add when the hundreds place is clean, else concatenate
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        # e.g. "0.5 million" scales cleanly to an int
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # scale only the sub-thousand residual: "one hundred
                    # thousand" -> 100 * 1000, not (value * multiplier)
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if next in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        # two-word suffix such as "per cent": consume `next` too
                        if next in suffix:
                            yield output(str(value) + suffix[next])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    # "double three" -> "33", "triple oh" -> "000"
                    if next in self.ones or next in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    # start the fractional part of a decimal number
                    if next in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        # flush any number still being accumulated when the input ends
        if value is not None:
            yield output(value)

    def preprocess(self, s: str) -> str:
        """Rewrite phrasings that `process_words` cannot parse token-by-token."""
        # replace " and a half" with " point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        for i, segment in enumerate(segments):
            if len(segment.strip()) == 0:
                continue
            if i == len(segments) - 1:
                results.append(segment)
            else:
                results.append(segment)
                # only rewrite when the preceding word is numeric
                # ("two and a half" -> "two point five"); otherwise keep as-is
                last_word = segment.rsplit(maxsplit=2)[-1]
                if last_word in self.decimals or last_word in self.multipliers:
                    results.append("point five")
                else:
                    results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str) -> str:
        """Clean up currency notation and restore literal "one"/"ones"."""

        def combine_cents(m: Match):
            # "$2 and ¢7" -> "$2.07"; fall back to the untouched string on error
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                return m.string

        def extract_cents(m: Match):
            # "$0.79" -> "¢79"
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.string

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str) -> str:
        """Run the full normalize pipeline: preprocess -> per-word -> postprocess."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self):
        # Load the British->American word mapping shipped alongside this module.
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # Fix: the original `json.load(open(mapping_path))` never closed the
        # file handle; use a context manager, and pin the encoding so the JSON
        # parses identically regardless of the platform's default encoding.
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)

    def __call__(self, s: str) -> str:
        # Replace each whitespace-separated word that has a mapping entry;
        # unknown words pass through unchanged.
        return " ".join(self.mapping.get(word, word) for word in s.split())
be any past participles, but it's harder.. 500 | r"'d been\b": " had been", 501 | r"'s been\b": " has been", 502 | r"'d gone\b": " had gone", 503 | r"'s gone\b": " has gone", 504 | r"'d done\b": " had done", # "'s done" is ambiguous 505 | r"'s got\b": " has got", 506 | # general contractions 507 | r"n't\b": " not", 508 | r"'re\b": " are", 509 | r"'s\b": " is", 510 | r"'d\b": " would", 511 | r"'ll\b": " will", 512 | r"'t\b": " not", 513 | r"'ve\b": " have", 514 | r"'m\b": " am", 515 | } 516 | self.standardize_numbers = EnglishNumberNormalizer() 517 | self.standardize_spellings = EnglishSpellingNormalizer() 518 | 519 | def __call__(self, s: str): 520 | s = s.lower() 521 | 522 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 523 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 524 | s = re.sub(self.ignore_patterns, "", s) 525 | s = re.sub(r"\s+'", "'", s) # standardize when there's a space before an apostrophe 526 | 527 | for pattern, replacement in self.replacers.items(): 528 | s = re.sub(pattern, replacement, s) 529 | 530 | s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits 531 | s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers 532 | s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep some symbols for numerics 533 | 534 | s = self.standardize_numbers(s) 535 | s = self.standardize_spellings(s) 536 | 537 | # now remove prefix/suffix symbols that are not preceded/followed by numbers 538 | s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s) 539 | s = re.sub(r"([^0-9])%", r"\1 ", s) 540 | 541 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 542 | 543 | return s 544 | -------------------------------------------------------------------------------- /musetalk/whisper/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from functools import 
# The languages Whisper's multilingual tokenizer carries a dedicated
# `<|xx|>` language token for, keyed by (mostly ISO 639-1) code.
# NOTE(review): "iw" is the legacy code for Hebrew and "jw" for Javanese —
# kept as-is because the tokenizer's special tokens use these codes.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# language code lookup by name, with a few language aliases
# (built by inverting LANGUAGES, then adding alternate English names that
# should resolve to the same code, e.g. "burmese" -> "my" for myanmar).
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
147 | """ 148 | outputs = [[]] 149 | for token in tokens: 150 | if token >= self.timestamp_begin: 151 | timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" 152 | outputs.append(timestamp) 153 | outputs.append([]) 154 | else: 155 | outputs[-1].append(token) 156 | outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] 157 | return "".join(outputs) 158 | 159 | @property 160 | @lru_cache() 161 | def eot(self) -> int: 162 | return self.tokenizer.eos_token_id 163 | 164 | @property 165 | @lru_cache() 166 | def sot(self) -> int: 167 | return self._get_single_token_id("<|startoftranscript|>") 168 | 169 | @property 170 | @lru_cache() 171 | def sot_lm(self) -> int: 172 | return self._get_single_token_id("<|startoflm|>") 173 | 174 | @property 175 | @lru_cache() 176 | def sot_prev(self) -> int: 177 | return self._get_single_token_id("<|startofprev|>") 178 | 179 | @property 180 | @lru_cache() 181 | def no_speech(self) -> int: 182 | return self._get_single_token_id("<|nospeech|>") 183 | 184 | @property 185 | @lru_cache() 186 | def no_timestamps(self) -> int: 187 | return self._get_single_token_id("<|notimestamps|>") 188 | 189 | @property 190 | @lru_cache() 191 | def timestamp_begin(self) -> int: 192 | return self.tokenizer.all_special_ids[-1] + 1 193 | 194 | @property 195 | @lru_cache() 196 | def language_token(self) -> int: 197 | """Returns the token id corresponding to the value of the `language` field""" 198 | if self.language is None: 199 | raise ValueError(f"This tokenizer does not have language token configured") 200 | 201 | additional_tokens = dict( 202 | zip( 203 | self.tokenizer.additional_special_tokens, 204 | self.tokenizer.additional_special_tokens_ids, 205 | ) 206 | ) 207 | candidate = f"<|{self.language}|>" 208 | if candidate in additional_tokens: 209 | return additional_tokens[candidate] 210 | 211 | raise KeyError(f"Language {self.language} not found in tokenizer.") 212 | 213 | @property 214 | @lru_cache() 215 | def 
all_language_tokens(self) -> Tuple[int]: 216 | result = [] 217 | for token, token_id in zip( 218 | self.tokenizer.additional_special_tokens, 219 | self.tokenizer.additional_special_tokens_ids, 220 | ): 221 | if token.strip("<|>") in LANGUAGES: 222 | result.append(token_id) 223 | return tuple(result) 224 | 225 | @property 226 | @lru_cache() 227 | def all_language_codes(self) -> Tuple[str]: 228 | return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens) 229 | 230 | @property 231 | @lru_cache() 232 | def sot_sequence_including_notimestamps(self) -> Tuple[int]: 233 | return tuple(list(self.sot_sequence) + [self.no_timestamps]) 234 | 235 | @property 236 | @lru_cache() 237 | def non_speech_tokens(self) -> Tuple[int]: 238 | """ 239 | Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech 240 | annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. 241 | 242 | - ♪♪♪ 243 | - ( SPEAKING FOREIGN LANGUAGE ) 244 | - [DAVID] Hey there, 245 | 246 | keeping basic punctuations like commas, periods, question marks, exclamation points, etc. 247 | """ 248 | symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』") 249 | symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() 250 | 251 | # symbols that may be a single token or multiple tokens depending on the tokenizer. 252 | # In case they're multiple tokens, suppress the first token, which is safe because: 253 | # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress 254 | # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. 
255 | miscellaneous = set("♩♪♫♬♭♮♯") 256 | assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) 257 | 258 | # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word 259 | result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]} 260 | for symbol in symbols + list(miscellaneous): 261 | for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]: 262 | if len(tokens) == 1 or symbol in miscellaneous: 263 | result.add(tokens[0]) 264 | 265 | return tuple(sorted(result)) 266 | 267 | def _get_single_token_id(self, text) -> int: 268 | tokens = self.tokenizer.encode(text) 269 | assert len(tokens) == 1, f"{text} is not encoded as a single token" 270 | return tokens[0] 271 | 272 | 273 | @lru_cache(maxsize=None) 274 | def build_tokenizer(name: str = "gpt2"): 275 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 276 | path = os.path.join(os.path.dirname(__file__), "assets", name) 277 | tokenizer = GPT2TokenizerFast.from_pretrained(path) 278 | 279 | specials = [ 280 | "<|startoftranscript|>", 281 | *[f"<|{lang}|>" for lang in LANGUAGES.keys()], 282 | "<|translate|>", 283 | "<|transcribe|>", 284 | "<|startoflm|>", 285 | "<|startofprev|>", 286 | "<|nospeech|>", 287 | "<|notimestamps|>", 288 | ] 289 | 290 | tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) 291 | return tokenizer 292 | 293 | 294 | @lru_cache(maxsize=None) 295 | def get_tokenizer( 296 | multilingual: bool, 297 | *, 298 | task: Optional[str] = None, # Literal["transcribe", "translate", None] 299 | language: Optional[str] = None, 300 | ) -> Tokenizer: 301 | if language is not None: 302 | language = language.lower() 303 | if language not in LANGUAGES: 304 | if language in TO_LANGUAGE_CODE: 305 | language = TO_LANGUAGE_CODE[language] 306 | else: 307 | raise ValueError(f"Unsupported language: {language}") 308 | 309 | if multilingual: 310 | tokenizer_name = "multilingual" 311 | task = task or "transcribe" 312 | language = 
def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    verbose: Optional[bool] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    no_speech_threshold: Optional[float] = 0.6,
    condition_on_previous_text: bool = True,
    force_extraction: bool = False,
    **decode_options,
):
    """
    MuseTalk variant of Whisper transcription: runs only the audio ENCODER over
    30-second windows and collects the per-window encoder embeddings. No text
    is decoded.

    Parameters
    ----------
    model: Whisper
        The Whisper model instance.

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform.

    verbose: bool
        If not False, a progress bar over mel frames is shown.

    decode_options: dict
        Only "fp16" is consulted here; the remaining keyword arguments (and the
        threshold parameters above) are accepted for signature compatibility
        with upstream Whisper but are unused in this embedding-only variant.

    Returns
    -------
    dict with a single key "segments": a list of
    {"start": int, "end": int, "encoder_embeddings": ...} entries, where
    start/end are mel-frame indices of the window.
    """
    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
    if model.device == torch.device("cpu"):
        if torch.cuda.is_available():
            warnings.warn("Performing inference on CPU when CUDA is available")
        if dtype == torch.float16:
            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
            dtype = torch.float32

    if dtype == torch.float32:
        decode_options["fp16"] = False

    mel = log_mel_spectrogram(audio)

    segments = []
    num_frames = mel.shape[-1]
    seek = 0  # current window start, in mel frames
    window = 3000  # mel frames per encoder window (30 s at 100 frames/s)
    with tqdm.tqdm(total=num_frames, unit='frames', disable=verbose is not False) as pbar:
        while seek < num_frames:
            window_end = min(seek + window, num_frames)
            chunk = pad_or_trim(mel[:, seek:seek + window], N_FRAMES).to(model.device).to(dtype)

            # The encoder expects a batch dimension.
            if chunk.ndim == 2:
                chunk = chunk.unsqueeze(0)
            if dtype == torch.float16:
                chunk = chunk.half()
            _audio_features, embeddings = model.encoder(chunk, include_embeddings=True)

            segments.append(
                {
                    "start": seek,
                    "end": window_end,
                    "encoder_embeddings": embeddings,
                }
            )
            seek += window

    return dict(segments=segments)
def cli():
    """Command-line entry point: transcribe audio files and write TXT/VTT/SRT.

    NOTE(review): this file's `transcribe()` has been modified to return only
    encoder embeddings per segment (no 'text' key), while the writers below
    expect `segment['text']` — so this CLI appears vestigial here; verify
    before relying on it.
    """
    from . import available_models

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
    # Typo fix: "supercedes" -> "supersedes"
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    # English-only checkpoints (*.en) ignore any other requested language.
    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            # Typo fix: "receipted" -> "received"
            warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"

    # Expand a single temperature into a fallback schedule when requested.
    temperature = args.pop("temperature")
    temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
    if temperature_increment_on_fallback is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
    else:
        temperature = [temperature]

    threads = args.pop("threads")
    if threads > 0:
        torch.set_num_threads(threads)

    from . import load_model
    model = load_model(model_name, device=device, download_root=model_dir)

    for audio_path in args.pop("audio"):
        result = transcribe(model, audio_path, temperature=temperature, **args)

        audio_basename = os.path.basename(audio_path)

        # save TXT
        with open(os.path.join(output_dir, audio_basename + ".txt"), "w", encoding="utf-8") as txt:
            write_txt(result["segments"], file=txt)

        # save VTT
        with open(os.path.join(output_dir, audio_basename + ".vtt"), "w", encoding="utf-8") as vtt:
            write_vtt(result["segments"], file=vtt)

        # save SRT
        with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)


if __name__ == '__main__':
    cli()
def exact_div(x, y):
    """Integer-divide x by y, asserting that the division is exact."""
    assert x % y == 0
    return x // y


def str2bool(string):
    """Parse the literal strings "True"/"False" into booleans (argparse type)."""
    str2val = {"True": True, "False": False}
    try:
        return str2val[string]
    except KeyError:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


def optional_int(string):
    """argparse type: "None" -> None, anything else parsed as int."""
    return None if string == "None" else int(string)


def optional_float(string):
    """argparse type: "None" -> None, anything else parsed as float."""
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    """Ratio of raw UTF-8 length to zlib-compressed length (high = repetitive)."""
    raw = text.encode("utf-8")
    return len(raw) / len(zlib.compress(raw))


def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
    """Render seconds as [HH:]MM:SS<marker>mmm (hours shown if nonzero or forced)."""
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    seconds, milliseconds = divmod(milliseconds, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"


def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write one stripped segment text per line."""
    for segment in transcript:
        print(segment['text'].strip(), file=file, flush=True)


def write_vtt(transcript: Iterator[dict], file: TextIO):
    """Write segments as a WebVTT cue list."""
    print("WEBVTT\n", file=file)
    for segment in transcript:
        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{segment['text'].strip().replace('-->', '->')}\n",
            file=file,
            flush=True,
        )


def write_srt(transcript: Iterator[dict], file: TextIO):
    """
    Write a transcript to a file in SRT format.

    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt

        result = transcribe(model, audio_path, temperature=temperature, **args)

        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for i, segment in enumerate(transcript, start=1):
        # SRT cues are 1-indexed and use comma decimal markers.
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
            f"{segment['text'].strip().replace('-->', '->')}\n",
            file=file,
            flush=True,
        )
class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding, added to the input sequence."""

    def __init__(self, d_model=384, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Precompute the (1, max_len, d_model) sin/cos table once.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(position * div_term)  # even dims: sin
        table[:, 1::2] = torch.cos(position * div_term)  # odd dims: cos
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        # x: (batch, seq_len, d_model); broadcast the table over the batch.
        _, seq_len, _ = x.size()
        return x + self.pe[:, :seq_len, :].to(x.device)


class MuseModelConfig:
    """Minimal ComfyUI model config describing the MuseTalk UNet.

    Matches SD1.5 latent space, but with 8 input channels (masked latents
    concatenated with reference latents) and a 384-dim whisper context.
    """

    def __init__(self):
        unet_dtype = mm.unet_dtype()
        self.unet_config = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
                            'dtype': unet_dtype, 'in_channels': 8, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
                            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 384, 'num_heads': 8,
                            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                            'use_temporal_attention': False, 'use_temporal_resblock': False}
        self.latent_format = comfy.latent_formats.SD15
        self.manual_cast_dtype = None
        self.sampling_settings = {}


class UNETLoader_MuseTalk:
    """ComfyUI node: download (if needed) and load the MuseTalk UNet weights."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
        }}
    RETURN_TYPES = ("MODEL",)
    FUNCTION = "load_unet"

    CATEGORY = "MuseTalk"

    def load_unet(self):
        model_path = os.path.join(folder_paths.models_dir, 'musetalk')

        # Fetch the checkpoint from the hub on first use.
        if not os.path.exists(model_path):
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="TMElyralab/MuseTalk", local_dir=model_path, local_dir_use_symlinks=False)

        unet_weight_path = os.path.join(model_path, "musetalk", "pytorch_model.bin")
        sd = load_torch_file(unet_weight_path)

        # The shipped weights use diffusers key names; remap to ComfyUI's UNet keys.
        model_config = MuseModelConfig()
        diffusers_keys = unet_to_diffusers(model_config.unet_config)
        remapped = {}
        for k in diffusers_keys:
            if k in sd:
                remapped[diffusers_keys[k]] = sd.pop(k)

        model = BaseModel(model_config)
        model.diffusion_model.load_state_dict(remapped, strict=False)
        return (model,)
class muse_talk_sampler:
    """ComfyUI node: run the MuseTalk UNet over whisper features + face latents."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "model": ("MODEL",),
            "vae": ("VAE",),
            "whisper_features": ("WHISPERFEAT",),
            "images": ("IMAGE",),
            "masked_images": ("IMAGE",),
            "batch_size": ("INT", {"default": 8, "min": 1, "max": 4096, "step": 1}),
            "delay_frame": ("INT", {"default": 0, "min": 0, "max": 4096, "step": 1}),
        },
        }

    RETURN_TYPES = ("IMAGE", )
    RETURN_NAMES = ("image", )
    FUNCTION = "process"
    CATEGORY = "MuseTalk"

    def process(self, model, vae, whisper_features, images, masked_images, batch_size, delay_frame):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        dtype = mm.unet_dtype()
        vae_scale_factor = 0.18215  # SD1.5 latent scaling
        mm.unload_all_models()
        mm.soft_empty_cache()

        images = images.to(dtype).to(device)
        masked_images = masked_images.to(dtype).to(device)

        autocast_condition = (dtype != torch.float32) and not mm.is_device_mps(device)
        with torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocast_condition else nullcontext():
            # MuseTalk is a single-step model: timestep is always 0.
            timesteps = torch.tensor([0], device=device)
            vae.first_stage_model.to(device)

            # Encode each (masked, reference) pair into an 8-channel UNet input.
            input_latent_list = []
            for image, masked_image in zip(images, masked_images):
                ref_latent = vae.encode(image.unsqueeze(0)).to(dtype).to(device) * vae_scale_factor
                masked_latent = vae.encode(masked_image.unsqueeze(0)).to(dtype).to(device) * vae_scale_factor
                input_latent_list.append(torch.cat([masked_latent, ref_latent], dim=1))

            # Ping-pong the frames so the cycle loops smoothly.
            input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
            video_num = len(whisper_features)
            gen = self.datagen(whisper_features, input_latent_list_cycle, batch_size, delay_frame)
            total = int(np.ceil(float(video_num) / batch_size))

            out_frame_list = []
            pbar = ProgressBar(total)
            model.diffusion_model.to(device)
            for whisper_batch, latent_batch in tqdm(gen, total=total):
                # Whisper features arrive as numpy arrays; stack into (B, 5*N, 384).
                audio_feature_batch = torch.stack(
                    [torch.FloatTensor(arr) for arr in whisper_batch]
                ).to(device)
                audio_feature_batch = PositionalEncoding(d_model=384)(audio_feature_batch)

                pred_latents = model.diffusion_model(latent_batch, timesteps, context=audio_feature_batch)

                pred_latents = (1 / vae_scale_factor) * pred_latents
                decoded = vae.decode(pred_latents)

                out_frame_list.extend(decoded)
                pbar.update(1)

            out = torch.stack(out_frame_list, dim=0).float().cpu()
            model.diffusion_model.to(offload_device)
            vae.first_stage_model.to(offload_device)
            return (out,)

    def datagen(self, whisper_chunks, vae_encode_latents, batch_size, delay_frame):
        """Yield (whisper_batch, latent_batch) pairs, cycling latents with an
        optional frame delay to offset audio relative to video."""
        whisper_batch, latent_batch = [], []
        for i, w in enumerate(whisper_chunks):
            idx = (i + delay_frame) % len(vae_encode_latents)
            whisper_batch.append(w)
            latent_batch.append(vae_encode_latents[idx])

            if len(latent_batch) >= batch_size:
                yield np.asarray(whisper_batch), torch.cat(latent_batch, dim=0)
                whisper_batch, latent_batch = [], []

        # The last batch may be smaller than batch_size.
        if len(latent_batch) > 0:
            yield np.asarray(whisper_batch), torch.cat(latent_batch, dim=0)
class vhs_audio_to_audio_tensor:
    """ComfyUI node: decode a VHS audio byte stream into a resampled tensor."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "vhs_audio": ("VHS_AUDIO",),
            "target_sample_rate": ("INT", {"default": 16000, "min": 0, "max": 48000}),
            "target_channels": ("INT", {"default": 1, "min": 1, "max": 2}),
        },

        }

    RETURN_TYPES = ("VCAUDIOTENSOR", "INT",)
    RETURN_NAMES = ("audio_tensor", "audio_dur",)
    FUNCTION = "process"
    CATEGORY = "VoiceCraft"

    def process(self, vhs_audio, target_sample_rate, target_channels):
        """Decode, remix to the requested channel count, resample, and return
        (tensor at target_sample_rate, duration in seconds)."""
        import io
        # vhs_audio is a callable returning the encoded audio bytes.
        audio_bytes = vhs_audio()
        audio_buffer = io.BytesIO(audio_bytes)
        audio_tensor, sample_rate = torchaudio.load(audio_buffer)
        assert audio_tensor.shape[0] in [1, 2], "Audio must be mono or stereo."
        if target_channels == 1:
            # Downmix to mono by averaging channels.
            audio_tensor = audio_tensor.mean(0, keepdim=True)
        elif target_channels == 2:
            *shape, _, length = audio_tensor.shape
            audio_tensor = audio_tensor.expand(*shape, target_channels, length)
        elif audio_tensor.shape[0] == 1:
            # NOTE(review): unreachable while target_channels is constrained to 1..2 above.
            audio_tensor = audio_tensor.expand(target_channels, -1)
        resampled_audio_tensor = torchaudio.functional.resample(audio_tensor, sample_rate, target_sample_rate)
        # Bug fix: duration was previously computed as
        # audio_tensor.shape[1] / target_sample_rate, i.e. the ORIGINAL-rate
        # sample count divided by the TARGET rate, which mis-reports the
        # duration whenever sample_rate != target_sample_rate. Use the
        # resampled tensor so the ratio is consistent.
        audio_dur = resampled_audio_tensor.shape[1] / target_sample_rate

        return (resampled_audio_tensor, audio_dur,)
class whisper_to_features:
    """ComfyUI node: run Whisper-tiny's encoder over audio and slice the
    embeddings into per-video-frame feature chunks for MuseTalk."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "audio_tensor": ("VCAUDIOTENSOR",),
                "fps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
            }
        }

    RETURN_TYPES = ("WHISPERFEAT", "INT",)
    RETURN_NAMES = ("whisper_chunks", "frame_count",)
    FUNCTION = "whispertranscribe"
    CATEGORY = "VoiceCraft"

    def whispertranscribe(self, audio_tensor, fps):
        from .musetalk.whisper.model import Whisper, ModelDimensions
        device = mm.get_torch_device()
        model_path = os.path.join(script_directory, "musetalk", "whisper", "checkpoints", "tiny.pt")

        if not os.path.exists(model_path):
            print(f"Downloading whisper tiny model (72MB) to {model_path}")
            import requests
            url = "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
            response = requests.get(url)
            if response.status_code == 200:
                with open(model_path, 'wb') as file:
                    file.write(response.content)
            else:
                # Bug fix: previously only printed the failure and fell through
                # to torch.load on a missing file, producing a confusing
                # FileNotFoundError. Fail fast with a clear error instead.
                raise RuntimeError(f"Failed to download {url} to {model_path}, status code: {response.status_code}")
        whisper_sd = torch.load(model_path, map_location=device)
        dims = ModelDimensions(**whisper_sd["dims"])
        model = Whisper(dims)
        model.load_state_dict(whisper_sd["model_state_dict"])
        del whisper_sd
        result = model.transcribe(audio_tensor.squeeze(0))

        # Collect encoder embeddings; each segment covers a 30 s window, of
        # which only the first half corresponds to real (unpadded) audio.
        embed_list = []
        for emb in result['segments']:
            encoder_embeddings = emb['encoder_embeddings']
            encoder_embeddings = encoder_embeddings.transpose(0, 2, 1, 3)
            encoder_embeddings = encoder_embeddings.squeeze(0)
            start_idx = int(emb['start'])
            end_idx = int(emb['end'])
            emb_end_idx = int((end_idx - start_idx) / 2)
            embed_list.append(encoder_embeddings[:emb_end_idx])
        whisper_feature = np.concatenate(embed_list, axis=0)

        audio_feat_length = [2, 2]
        whisper_chunks = []
        whisper_idx_multiplier = 50. / fps  # whisper features are at 50 FPS
        i = 0
        print(f"video in {fps} FPS, audio idx in 50FPS")
        # NOTE(review): intentionally kept the original loop shape (append
        # before the break test), so one chunk past the feature length is
        # produced — downstream frame counts depend on this.
        while 1:
            start_idx = int(i * whisper_idx_multiplier)
            selected_feature, selected_idx = self.get_sliced_feature(feature_array=whisper_feature, vid_idx=i, audio_feat_length=audio_feat_length, fps=fps)
            whisper_chunks.append(selected_feature)
            i += 1
            if start_idx > len(whisper_feature):
                break
        print(f"Whisper chunks: {len(whisper_chunks)}")
        return (whisper_chunks, len(whisper_chunks),)

    def get_sliced_feature(self, feature_array, vid_idx, audio_feat_length=(2, 2), fps=25):
        """
        Get sliced features based on a given video frame index.

        :param feature_array: (T, 384) whisper encoder features at 50 FPS
        :param vid_idx: video frame index to center the slice on
        :param audio_feat_length: (left, right) context in video frames
            (default changed from a mutable list [2, 2] to a tuple — same values)
        :return: (features reshaped to (-1, 384), list of source indices used)
        """
        length = len(feature_array)
        selected_feature = []
        selected_idx = []

        # Two whisper frames per video frame at 25 FPS.
        center_idx = int(vid_idx * 50 / fps)
        left_idx = center_idx - audio_feat_length[0] * 2
        right_idx = center_idx + (audio_feat_length[1] + 1) * 2

        for idx in range(left_idx, right_idx):
            # Clamp out-of-range indices to the valid span (repeats edge frames).
            idx = max(0, idx)
            idx = min(length - 1, idx)
            x = feature_array[idx]
            selected_feature.append(x)
            selected_idx.append(idx)

        selected_feature = np.concatenate(selected_feature, axis=0)
        selected_feature = selected_feature.reshape(-1, 384)  # 50*384
        return selected_feature, selected_idx
# Registry consumed by ComfyUI: internal node name -> implementing class.
NODE_CLASS_MAPPINGS = {
    "whisper_to_features": whisper_to_features,
    "vhs_audio_to_audio_tensor": vhs_audio_to_audio_tensor,
    "muse_talk_sampler": muse_talk_sampler,
    "UNETLoader_MuseTalk": UNETLoader_MuseTalk,
}
# Human-readable titles shown in the ComfyUI node picker.
NODE_DISPLAY_NAME_MAPPINGS = {
    "whisper_to_features": "Whisper To Features",
    "vhs_audio_to_audio_tensor": "VHS Audio To Audio Tensor",
    "muse_talk_sampler": "MuseTalk Sampler",
    "UNETLoader_MuseTalk": "UNETLoader_MuseTalk",
}