├── .gitattributes ├── .gitignore ├── README.md ├── __init__.py ├── example_workflows └── musetalk_vid2vid_example.json ├── musetalk └── whisper │ ├── __init__.py │ ├── __main__.py │ ├── assets │ ├── gpt2 │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── mel_filters.npz │ └── multilingual │ │ ├── added_tokens.json │ │ ├── merges.txt │ │ ├── special_tokens_map.json │ │ ├── tokenizer_config.json │ │ └── vocab.json │ ├── audio.py │ ├── checkpoints │ └── place_whisper_model_here.txt │ ├── decoding.py │ ├── model.py │ ├── normalizers │ ├── __init__.py │ ├── basic.py │ ├── english.json │ └── english.py │ ├── tokenizer.py │ ├── transcribe.py │ └── utils.py ├── nodes.py └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pretrained_models/ 2 | example_data/ 3 | results/ 4 | *.zip 5 | .vscode/ 6 | .hypothesis/ 7 | *.pt 8 | __pycache__ 9 | *.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ComfyUI nodes to use MuseTalk 2 | 3 | Native (as much as possible) implementation of MuseTalk in ComfyUI. 
4 | 5 | ![image](https://github.com/kijai/ComfyUI-MuseTalk-KJ/assets/40791699/0d586490-ef1d-4076-931d-8b701e63d8de) 6 | 7 | 8 | 9 | Original repo: 10 | https://github.com/TMElyralab/MuseTalk 11 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS 2 | 3 | __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] -------------------------------------------------------------------------------- /example_workflows/musetalk_vid2vid_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "last_node_id": 127, 3 | "last_link_id": 295, 4 | "nodes": [ 5 | { 6 | "id": 72, 7 | "type": "DWPreprocessor", 8 | "pos": [ 9 | -1370, 10 | 290 11 | ], 12 | "size": { 13 | "0": 315, 14 | "1": 198 15 | }, 16 | "flags": {}, 17 | "order": 13, 18 | "mode": 0, 19 | "inputs": [ 20 | { 21 | "name": "image", 22 | "type": "IMAGE", 23 | "link": 164 24 | }, 25 | { 26 | "name": "resolution", 27 | "type": "INT", 28 | "link": 216, 29 | "widget": { 30 | "name": "resolution" 31 | } 32 | } 33 | ], 34 | "outputs": [ 35 | { 36 | "name": "IMAGE", 37 | "type": "IMAGE", 38 | "links": [ 39 | 167 40 | ], 41 | "shape": 3, 42 | "slot_index": 0 43 | }, 44 | { 45 | "name": "POSE_KEYPOINT", 46 | "type": "POSE_KEYPOINT", 47 | "links": [], 48 | "shape": 3, 49 | "slot_index": 1 50 | } 51 | ], 52 | "properties": { 53 | "Node name for S&R": "DWPreprocessor" 54 | }, 55 | "widgets_values": [ 56 | "disable", 57 | "disable", 58 | "enable", 59 | 512, 60 | "yolox_l.torchscript.pt", 61 | "dw-ll_ucoco_384_bs5.torchscript.pt" 62 | ] 63 | }, 64 | { 65 | "id": 76, 66 | "type": "ImageToMask", 67 | "pos": [ 68 | -1380, 69 | 540 70 | ], 71 | "size": { 72 | "0": 315, 73 | "1": 58 74 | }, 75 | "flags": {}, 76 | "order": 14, 77 | "mode": 0, 78 | "inputs": [ 79 | { 80 | "name": "image", 81 | 
"type": "IMAGE", 82 | "link": 167 83 | } 84 | ], 85 | "outputs": [ 86 | { 87 | "name": "MASK", 88 | "type": "MASK", 89 | "links": [ 90 | 207 91 | ], 92 | "shape": 3, 93 | "slot_index": 0 94 | } 95 | ], 96 | "properties": { 97 | "Node name for S&R": "ImageToMask" 98 | }, 99 | "widgets_values": [ 100 | "red" 101 | ] 102 | }, 103 | { 104 | "id": 47, 105 | "type": "ImageResize+", 106 | "pos": [ 107 | -1796, 108 | 292 109 | ], 110 | "size": { 111 | "0": 315, 112 | "1": 218 113 | }, 114 | "flags": {}, 115 | "order": 10, 116 | "mode": 0, 117 | "inputs": [ 118 | { 119 | "name": "image", 120 | "type": "IMAGE", 121 | "link": 245 122 | } 123 | ], 124 | "outputs": [ 125 | { 126 | "name": "IMAGE", 127 | "type": "IMAGE", 128 | "links": [ 129 | 164, 130 | 214, 131 | 215 132 | ], 133 | "shape": 3, 134 | "slot_index": 0 135 | }, 136 | { 137 | "name": "width", 138 | "type": "INT", 139 | "links": [ 140 | 217 141 | ], 142 | "shape": 3 143 | }, 144 | { 145 | "name": "height", 146 | "type": "INT", 147 | "links": [ 148 | 218 149 | ], 150 | "shape": 3 151 | } 152 | ], 153 | "properties": { 154 | "Node name for S&R": "ImageResize+" 155 | }, 156 | "widgets_values": [ 157 | 1024, 158 | 1024, 159 | "nearest", 160 | true, 161 | "always", 162 | 2 163 | ] 164 | }, 165 | { 166 | "id": 94, 167 | "type": "PixelPerfectResolution", 168 | "pos": [ 169 | -1521, 170 | 221 171 | ], 172 | "size": { 173 | "0": 393, 174 | "1": 106 175 | }, 176 | "flags": { 177 | "collapsed": true 178 | }, 179 | "order": 12, 180 | "mode": 0, 181 | "inputs": [ 182 | { 183 | "name": "original_image", 184 | "type": "IMAGE", 185 | "link": 215 186 | }, 187 | { 188 | "name": "image_gen_width", 189 | "type": "INT", 190 | "link": 217, 191 | "widget": { 192 | "name": "image_gen_width" 193 | }, 194 | "slot_index": 1 195 | }, 196 | { 197 | "name": "image_gen_height", 198 | "type": "INT", 199 | "link": 218, 200 | "widget": { 201 | "name": "image_gen_height" 202 | }, 203 | "slot_index": 2 204 | } 205 | ], 206 | "outputs": [ 207 | { 208 | 
"name": "RESOLUTION (INT)", 209 | "type": "INT", 210 | "links": [ 211 | 216 212 | ], 213 | "shape": 3, 214 | "slot_index": 0 215 | } 216 | ], 217 | "properties": { 218 | "Node name for S&R": "PixelPerfectResolution" 219 | }, 220 | "widgets_values": [ 221 | 512, 222 | 512, 223 | "Just Resize" 224 | ] 225 | }, 226 | { 227 | "id": 101, 228 | "type": "MaskToImage", 229 | "pos": [ 230 | -649, 231 | 316 232 | ], 233 | "size": { 234 | "0": 210, 235 | "1": 26 236 | }, 237 | "flags": {}, 238 | "order": 17, 239 | "mode": 0, 240 | "inputs": [ 241 | { 242 | "name": "mask", 243 | "type": "MASK", 244 | "link": 234 245 | } 246 | ], 247 | "outputs": [ 248 | { 249 | "name": "IMAGE", 250 | "type": "IMAGE", 251 | "links": [ 252 | 235 253 | ], 254 | "shape": 3, 255 | "slot_index": 0 256 | } 257 | ], 258 | "properties": { 259 | "Node name for S&R": "MaskToImage" 260 | } 261 | }, 262 | { 263 | "id": 92, 264 | "type": "GrowMaskWithBlur", 265 | "pos": [ 266 | -1010, 267 | 310 268 | ], 269 | "size": { 270 | "0": 315, 271 | "1": 246 272 | }, 273 | "flags": {}, 274 | "order": 15, 275 | "mode": 0, 276 | "inputs": [ 277 | { 278 | "name": "mask", 279 | "type": "MASK", 280 | "link": 207 281 | } 282 | ], 283 | "outputs": [ 284 | { 285 | "name": "mask", 286 | "type": "MASK", 287 | "links": [ 288 | 221, 289 | 234 290 | ], 291 | "shape": 3, 292 | "slot_index": 0 293 | }, 294 | { 295 | "name": "mask_inverted", 296 | "type": "MASK", 297 | "links": null, 298 | "shape": 3 299 | } 300 | ], 301 | "properties": { 302 | "Node name for S&R": "GrowMaskWithBlur" 303 | }, 304 | "widgets_values": [ 305 | 15, 306 | 0, 307 | true, 308 | false, 309 | 0, 310 | 1, 311 | 1, 312 | true 313 | ] 314 | }, 315 | { 316 | "id": 97, 317 | "type": "Display Any (rgthree)", 318 | "pos": [ 319 | -1738, 320 | 582 321 | ], 322 | "size": { 323 | "0": 226.42002868652344, 324 | "1": 116.54998779296875 325 | }, 326 | "flags": {}, 327 | "order": 11, 328 | "mode": 0, 329 | "inputs": [ 330 | { 331 | "name": "source", 332 | "type": "*", 
333 | "link": 251, 334 | "dir": 3 335 | } 336 | ], 337 | "properties": { 338 | "Node name for S&R": "Display Any (rgthree)" 339 | }, 340 | "widgets_values": [ 341 | "" 342 | ] 343 | }, 344 | { 345 | "id": 121, 346 | "type": "UNETLoader_MuseTalk", 347 | "pos": [ 348 | 450, 349 | -360 350 | ], 351 | "size": { 352 | "0": 214.1832275390625, 353 | "1": 26 354 | }, 355 | "flags": {}, 356 | "order": 0, 357 | "mode": 0, 358 | "outputs": [ 359 | { 360 | "name": "MODEL", 361 | "type": "MODEL", 362 | "links": [ 363 | 280 364 | ], 365 | "shape": 3, 366 | "slot_index": 0 367 | } 368 | ], 369 | "properties": { 370 | "Node name for S&R": "UNETLoader_MuseTalk" 371 | } 372 | }, 373 | { 374 | "id": 4, 375 | "type": "VAELoader", 376 | "pos": [ 377 | 290, 378 | -280 379 | ], 380 | "size": { 381 | "0": 379.3569641113281, 382 | "1": 58.21699523925781 383 | }, 384 | "flags": {}, 385 | "order": 1, 386 | "mode": 0, 387 | "outputs": [ 388 | { 389 | "name": "VAE", 390 | "type": "VAE", 391 | "links": [ 392 | 279 393 | ], 394 | "shape": 3, 395 | "slot_index": 0 396 | } 397 | ], 398 | "properties": { 399 | "Node name for S&R": "VAELoader" 400 | }, 401 | "widgets_values": [ 402 | "vae-ft-mse-840000-ema-pruned.safetensors" 403 | ] 404 | }, 405 | { 406 | "id": 27, 407 | "type": "vhs_audio_to_audio_tensor", 408 | "pos": [ 409 | -1720, 410 | -170 411 | ], 412 | "size": { 413 | "0": 315, 414 | "1": 102 415 | }, 416 | "flags": {}, 417 | "order": 6, 418 | "mode": 0, 419 | "inputs": [ 420 | { 421 | "name": "vhs_audio", 422 | "type": "VHS_AUDIO", 423 | "link": 45, 424 | "slot_index": 0 425 | } 426 | ], 427 | "outputs": [ 428 | { 429 | "name": "audio_tensor", 430 | "type": "VCAUDIOTENSOR", 431 | "links": [ 432 | 67 433 | ], 434 | "shape": 3, 435 | "slot_index": 0 436 | }, 437 | { 438 | "name": "audio_dur", 439 | "type": "INT", 440 | "links": null, 441 | "shape": 3 442 | } 443 | ], 444 | "properties": { 445 | "Node name for S&R": "vhs_audio_to_audio_tensor" 446 | }, 447 | "widgets_values": [ 448 | 16000, 
449 | 1 450 | ] 451 | }, 452 | { 453 | "id": 28, 454 | "type": "VHS_LoadAudio", 455 | "pos": [ 456 | -2120, 457 | -270 458 | ], 459 | "size": { 460 | "0": 315, 461 | "1": 82 462 | }, 463 | "flags": {}, 464 | "order": 2, 465 | "mode": 0, 466 | "outputs": [ 467 | { 468 | "name": "audio", 469 | "type": "VHS_AUDIO", 470 | "links": [ 471 | 45, 472 | 287 473 | ], 474 | "shape": 3, 475 | "slot_index": 0 476 | } 477 | ], 478 | "properties": { 479 | "Node name for S&R": "VHS_LoadAudio" 480 | }, 481 | "widgets_values": { 482 | "audio_file": "input/yongen.wav", 483 | "seek_seconds": 0 484 | } 485 | }, 486 | { 487 | "id": 124, 488 | "type": "SetNode", 489 | "pos": [ 490 | -1750, 491 | -340 492 | ], 493 | "size": { 494 | "0": 210, 495 | "1": 58 496 | }, 497 | "flags": { 498 | "collapsed": true 499 | }, 500 | "order": 7, 501 | "mode": 0, 502 | "inputs": [ 503 | { 504 | "name": "VHS_AUDIO", 505 | "type": "VHS_AUDIO", 506 | "link": 287 507 | } 508 | ], 509 | "outputs": [ 510 | { 511 | "name": "*", 512 | "type": "*", 513 | "links": null 514 | } 515 | ], 516 | "title": "Set_OriginaAudioVHS", 517 | "properties": { 518 | "previousName": "OriginaAudioVHS" 519 | }, 520 | "widgets_values": [ 521 | "OriginaAudioVHS" 522 | ] 523 | }, 524 | { 525 | "id": 89, 526 | "type": "VHS_LoadVideo", 527 | "pos": [ 528 | -2129, 529 | 279 530 | ], 531 | "size": [ 532 | 235.1999969482422, 533 | 377.04999828338623 534 | ], 535 | "flags": {}, 536 | "order": 9, 537 | "mode": 0, 538 | "inputs": [ 539 | { 540 | "name": "batch_manager", 541 | "type": "VHS_BatchManager", 542 | "link": null 543 | }, 544 | { 545 | "name": "frame_load_cap", 546 | "type": "INT", 547 | "link": 288, 548 | "widget": { 549 | "name": "frame_load_cap" 550 | } 551 | } 552 | ], 553 | "outputs": [ 554 | { 555 | "name": "IMAGE", 556 | "type": "IMAGE", 557 | "links": [ 558 | 245 559 | ], 560 | "shape": 3, 561 | "slot_index": 0 562 | }, 563 | { 564 | "name": "frame_count", 565 | "type": "INT", 566 | "links": [ 567 | 251 568 | ], 569 | "shape": 
3, 570 | "slot_index": 1 571 | }, 572 | { 573 | "name": "audio", 574 | "type": "VHS_AUDIO", 575 | "links": null, 576 | "shape": 3 577 | } 578 | ], 579 | "properties": { 580 | "Node name for S&R": "VHS_LoadVideo" 581 | }, 582 | "widgets_values": { 583 | "video": "istockphoto-1139641392-640_adpp_is_yongen.mp4", 584 | "force_rate": 0, 585 | "force_size": "Disabled", 586 | "custom_width": 512, 587 | "custom_height": 512, 588 | "frame_load_cap": 194, 589 | "skip_first_frames": 0, 590 | "select_every_nth": 1, 591 | "choose video to upload": "image", 592 | "videopreview": { 593 | "hidden": false, 594 | "paused": false, 595 | "params": { 596 | "frame_load_cap": 194, 597 | "skip_first_frames": 0, 598 | "force_rate": 0, 599 | "filename": "istockphoto-1139641392-640_adpp_is_yongen.mp4", 600 | "type": "input", 601 | "format": "video/mp4", 602 | "select_every_nth": 1 603 | } 604 | } 605 | } 606 | }, 607 | { 608 | "id": 36, 609 | "type": "whisper_to_features", 610 | "pos": [ 611 | -1340, 612 | -160 613 | ], 614 | "size": { 615 | "0": 342.5999755859375, 616 | "1": 78 617 | }, 618 | "flags": {}, 619 | "order": 8, 620 | "mode": 0, 621 | "inputs": [ 622 | { 623 | "name": "audio_tensor", 624 | "type": "VCAUDIOTENSOR", 625 | "link": 67, 626 | "slot_index": 0 627 | } 628 | ], 629 | "outputs": [ 630 | { 631 | "name": "whisper_chunks", 632 | "type": "WHISPERFEAT", 633 | "links": [ 634 | 281 635 | ], 636 | "shape": 3, 637 | "slot_index": 0 638 | }, 639 | { 640 | "name": "frame_count", 641 | "type": "INT", 642 | "links": [ 643 | 288 644 | ], 645 | "shape": 3, 646 | "slot_index": 1 647 | } 648 | ], 649 | "properties": { 650 | "Node name for S&R": "whisper_to_features" 651 | }, 652 | "widgets_values": [ 653 | 24 654 | ] 655 | }, 656 | { 657 | "id": 125, 658 | "type": "GetNode", 659 | "pos": [ 660 | 1180, 661 | -540 662 | ], 663 | "size": { 664 | "0": 210, 665 | "1": 58 666 | }, 667 | "flags": { 668 | "collapsed": true 669 | }, 670 | "order": 3, 671 | "mode": 0, 672 | "outputs": [ 673 | { 674 
| "name": "VHS_AUDIO", 675 | "type": "VHS_AUDIO", 676 | "links": [ 677 | 289 678 | ], 679 | "slot_index": 0 680 | } 681 | ], 682 | "title": "Get_OriginaAudioVHS", 683 | "properties": {}, 684 | "widgets_values": [ 685 | "OriginaAudioVHS" 686 | ] 687 | }, 688 | { 689 | "id": 99, 690 | "type": "VHS_VideoCombine", 691 | "pos": [ 692 | 460, 693 | 300 694 | ], 695 | "size": [ 696 | 437.7622375488281, 697 | 721.7622375488281 698 | ], 699 | "flags": {}, 700 | "order": 22, 701 | "mode": 0, 702 | "inputs": [ 703 | { 704 | "name": "images", 705 | "type": "IMAGE", 706 | "link": 230, 707 | "slot_index": 0 708 | }, 709 | { 710 | "name": "audio", 711 | "type": "VHS_AUDIO", 712 | "link": null 713 | }, 714 | { 715 | "name": "batch_manager", 716 | "type": "VHS_BatchManager", 717 | "link": null 718 | } 719 | ], 720 | "outputs": [ 721 | { 722 | "name": "Filenames", 723 | "type": "VHS_FILENAMES", 724 | "links": null, 725 | "shape": 3 726 | } 727 | ], 728 | "properties": { 729 | "Node name for S&R": "VHS_VideoCombine" 730 | }, 731 | "widgets_values": { 732 | "frame_rate": 25, 733 | "loop_count": 0, 734 | "filename_prefix": "MuseTalkCrop", 735 | "format": "video/h264-mp4", 736 | "pix_fmt": "yuv420p", 737 | "crf": 19, 738 | "save_metadata": true, 739 | "pingpong": false, 740 | "save_output": false, 741 | "videopreview": { 742 | "hidden": false, 743 | "paused": false, 744 | "params": { 745 | "filename": "MuseTalkCrop_00002.mp4", 746 | "subfolder": "", 747 | "type": "temp", 748 | "format": "video/h264-mp4" 749 | } 750 | } 751 | } 752 | }, 753 | { 754 | "id": 16, 755 | "type": "ImageCompositeMasked", 756 | "pos": [ 757 | 490, 758 | 70 759 | ], 760 | "size": { 761 | "0": 315, 762 | "1": 146 763 | }, 764 | "flags": {}, 765 | "order": 20, 766 | "mode": 0, 767 | "inputs": [ 768 | { 769 | "name": "destination", 770 | "type": "IMAGE", 771 | "link": 56 772 | }, 773 | { 774 | "name": "source", 775 | "type": "IMAGE", 776 | "link": 18 777 | }, 778 | { 779 | "name": "mask", 780 | "type": "MASK", 781 | 
"link": null 782 | } 783 | ], 784 | "outputs": [ 785 | { 786 | "name": "IMAGE", 787 | "type": "IMAGE", 788 | "links": [ 789 | 230, 790 | 285 791 | ], 792 | "shape": 3, 793 | "slot_index": 0 794 | } 795 | ], 796 | "properties": { 797 | "Node name for S&R": "ImageCompositeMasked" 798 | }, 799 | "widgets_values": [ 800 | 0, 801 | 128, 802 | false 803 | ] 804 | }, 805 | { 806 | "id": 15, 807 | "type": "EmptyImage", 808 | "pos": [ 809 | 500, 810 | 20 811 | ], 812 | "size": { 813 | "0": 315, 814 | "1": 130 815 | }, 816 | "flags": { 817 | "collapsed": true 818 | }, 819 | "order": 4, 820 | "mode": 0, 821 | "outputs": [ 822 | { 823 | "name": "IMAGE", 824 | "type": "IMAGE", 825 | "links": [ 826 | 18 827 | ], 828 | "shape": 3, 829 | "slot_index": 0 830 | } 831 | ], 832 | "properties": { 833 | "Node name for S&R": "EmptyImage" 834 | }, 835 | "widgets_values": [ 836 | 256, 837 | 256, 838 | 1, 839 | 0 840 | ] 841 | }, 842 | { 843 | "id": 98, 844 | "type": "VHS_VideoCombine", 845 | "pos": [ 846 | -10, 847 | 300 848 | ], 849 | "size": [ 850 | 437.7622375488281, 851 | 721.7622375488281 852 | ], 853 | "flags": {}, 854 | "order": 21, 855 | "mode": 0, 856 | "inputs": [ 857 | { 858 | "name": "images", 859 | "type": "IMAGE", 860 | "link": 233, 861 | "slot_index": 0 862 | }, 863 | { 864 | "name": "audio", 865 | "type": "VHS_AUDIO", 866 | "link": null 867 | }, 868 | { 869 | "name": "batch_manager", 870 | "type": "VHS_BatchManager", 871 | "link": null 872 | } 873 | ], 874 | "outputs": [ 875 | { 876 | "name": "Filenames", 877 | "type": "VHS_FILENAMES", 878 | "links": null, 879 | "shape": 3 880 | } 881 | ], 882 | "properties": { 883 | "Node name for S&R": "VHS_VideoCombine" 884 | }, 885 | "widgets_values": { 886 | "frame_rate": 25, 887 | "loop_count": 0, 888 | "filename_prefix": "MuseTalkCrop", 889 | "format": "video/h264-mp4", 890 | "pix_fmt": "yuv420p", 891 | "crf": 19, 892 | "save_metadata": true, 893 | "pingpong": false, 894 | "save_output": false, 895 | "videopreview": { 896 | "hidden": 
false, 897 | "paused": false, 898 | "params": { 899 | "filename": "MuseTalkCrop_00001.mp4", 900 | "subfolder": "", 901 | "type": "temp", 902 | "format": "video/h264-mp4" 903 | } 904 | } 905 | } 906 | }, 907 | { 908 | "id": 100, 909 | "type": "VHS_VideoCombine", 910 | "pos": [ 911 | -539, 912 | 412 913 | ], 914 | "size": [ 915 | 437.7622375488281, 916 | 466.9912586212158 917 | ], 918 | "flags": {}, 919 | "order": 19, 920 | "mode": 0, 921 | "inputs": [ 922 | { 923 | "name": "images", 924 | "type": "IMAGE", 925 | "link": 235, 926 | "slot_index": 0 927 | }, 928 | { 929 | "name": "audio", 930 | "type": "VHS_AUDIO", 931 | "link": null 932 | }, 933 | { 934 | "name": "batch_manager", 935 | "type": "VHS_BatchManager", 936 | "link": null 937 | } 938 | ], 939 | "outputs": [ 940 | { 941 | "name": "Filenames", 942 | "type": "VHS_FILENAMES", 943 | "links": null, 944 | "shape": 3 945 | } 946 | ], 947 | "properties": { 948 | "Node name for S&R": "VHS_VideoCombine" 949 | }, 950 | "widgets_values": { 951 | "frame_rate": 25, 952 | "loop_count": 0, 953 | "filename_prefix": "Masks", 954 | "format": "image/webp", 955 | "pingpong": false, 956 | "save_output": false, 957 | "videopreview": { 958 | "hidden": false, 959 | "paused": false, 960 | "params": { 961 | "filename": "Masks_00001.webp", 962 | "subfolder": "", 963 | "type": "temp", 964 | "format": "image/webp" 965 | } 966 | } 967 | } 968 | }, 969 | { 970 | "id": 122, 971 | "type": "muse_talk_sampler", 972 | "pos": [ 973 | 770, 974 | -200 975 | ], 976 | "size": { 977 | "0": 315, 978 | "1": 162 979 | }, 980 | "flags": {}, 981 | "order": 23, 982 | "mode": 0, 983 | "inputs": [ 984 | { 985 | "name": "model", 986 | "type": "MODEL", 987 | "link": 280 988 | }, 989 | { 990 | "name": "vae", 991 | "type": "VAE", 992 | "link": 279, 993 | "slot_index": 1 994 | }, 995 | { 996 | "name": "whisper_features", 997 | "type": "WHISPERFEAT", 998 | "link": 281, 999 | "slot_index": 2 1000 | }, 1001 | { 1002 | "name": "images", 1003 | "type": "IMAGE", 1004 | 
"link": 284, 1005 | "slot_index": 3 1006 | }, 1007 | { 1008 | "name": "masked_images", 1009 | "type": "IMAGE", 1010 | "link": 285, 1011 | "slot_index": 4 1012 | } 1013 | ], 1014 | "outputs": [ 1015 | { 1016 | "name": "image", 1017 | "type": "IMAGE", 1018 | "links": [ 1019 | 282, 1020 | 295 1021 | ], 1022 | "shape": 3, 1023 | "slot_index": 0 1024 | } 1025 | ], 1026 | "properties": { 1027 | "Node name for S&R": "muse_talk_sampler" 1028 | }, 1029 | "widgets_values": [ 1030 | 16, 1031 | 0 1032 | ] 1033 | }, 1034 | { 1035 | "id": 30, 1036 | "type": "VHS_VideoCombine", 1037 | "pos": [ 1038 | 1390, 1039 | -600 1040 | ], 1041 | "size": [ 1042 | 421.8526815820319, 1043 | 705.8526815820319 1044 | ], 1045 | "flags": {}, 1046 | "order": 24, 1047 | "mode": 0, 1048 | "inputs": [ 1049 | { 1050 | "name": "images", 1051 | "type": "IMAGE", 1052 | "link": 282, 1053 | "slot_index": 0 1054 | }, 1055 | { 1056 | "name": "audio", 1057 | "type": "VHS_AUDIO", 1058 | "link": 289 1059 | }, 1060 | { 1061 | "name": "batch_manager", 1062 | "type": "VHS_BatchManager", 1063 | "link": null 1064 | } 1065 | ], 1066 | "outputs": [ 1067 | { 1068 | "name": "Filenames", 1069 | "type": "VHS_FILENAMES", 1070 | "links": null, 1071 | "shape": 3 1072 | } 1073 | ], 1074 | "properties": { 1075 | "Node name for S&R": "VHS_VideoCombine" 1076 | }, 1077 | "widgets_values": { 1078 | "frame_rate": 25, 1079 | "loop_count": 0, 1080 | "filename_prefix": "MuseTalkCrop", 1081 | "format": "video/h264-mp4", 1082 | "pix_fmt": "yuv420p", 1083 | "crf": 19, 1084 | "save_metadata": true, 1085 | "pingpong": false, 1086 | "save_output": false, 1087 | "videopreview": { 1088 | "hidden": false, 1089 | "paused": false, 1090 | "params": { 1091 | "filename": "MuseTalkCrop_00003-audio.mp4", 1092 | "subfolder": "", 1093 | "type": "temp", 1094 | "format": "video/h264-mp4" 1095 | } 1096 | } 1097 | } 1098 | }, 1099 | { 1100 | "id": 126, 1101 | "type": "GetNode", 1102 | "pos": [ 1103 | 1164, 1104 | 459 1105 | ], 1106 | "size": { 1107 | "0": 
210, 1108 | "1": 58 1109 | }, 1110 | "flags": { 1111 | "collapsed": true 1112 | }, 1113 | "order": 5, 1114 | "mode": 0, 1115 | "outputs": [ 1116 | { 1117 | "name": "VHS_AUDIO", 1118 | "type": "VHS_AUDIO", 1119 | "links": [ 1120 | 290 1121 | ], 1122 | "slot_index": 0 1123 | } 1124 | ], 1125 | "title": "Get_OriginaAudioVHS", 1126 | "properties": {}, 1127 | "widgets_values": [ 1128 | "OriginaAudioVHS" 1129 | ] 1130 | }, 1131 | { 1132 | "id": 96, 1133 | "type": "VHS_VideoCombine", 1134 | "pos": [ 1135 | 1512, 1136 | 422 1137 | ], 1138 | "size": [ 1139 | 830.9005747743759, 1140 | 788.3825163935942 1141 | ], 1142 | "flags": {}, 1143 | "order": 26, 1144 | "mode": 0, 1145 | "inputs": [ 1146 | { 1147 | "name": "images", 1148 | "type": "IMAGE", 1149 | "link": 225, 1150 | "slot_index": 0 1151 | }, 1152 | { 1153 | "name": "audio", 1154 | "type": "VHS_AUDIO", 1155 | "link": 290, 1156 | "slot_index": 1 1157 | }, 1158 | { 1159 | "name": "batch_manager", 1160 | "type": "VHS_BatchManager", 1161 | "link": null 1162 | } 1163 | ], 1164 | "outputs": [ 1165 | { 1166 | "name": "Filenames", 1167 | "type": "VHS_FILENAMES", 1168 | "links": null, 1169 | "shape": 3 1170 | } 1171 | ], 1172 | "properties": { 1173 | "Node name for S&R": "VHS_VideoCombine" 1174 | }, 1175 | "widgets_values": { 1176 | "frame_rate": 25, 1177 | "loop_count": 0, 1178 | "filename_prefix": "MuseTalk", 1179 | "format": "video/h264-mp4", 1180 | "pix_fmt": "yuv420p", 1181 | "crf": 19, 1182 | "save_metadata": true, 1183 | "pingpong": false, 1184 | "save_output": false, 1185 | "videopreview": { 1186 | "hidden": false, 1187 | "paused": false, 1188 | "params": { 1189 | "filename": "MuseTalk_00001-audio.mp4", 1190 | "subfolder": "", 1191 | "type": "temp", 1192 | "format": "video/h264-mp4" 1193 | } 1194 | } 1195 | } 1196 | }, 1197 | { 1198 | "id": 31, 1199 | "type": "ImageResize+", 1200 | "pos": [ 1201 | 60, 1202 | 10 1203 | ], 1204 | "size": { 1205 | "0": 315, 1206 | "1": 218 1207 | }, 1208 | "flags": {}, 1209 | "order": 18, 
1210 | "mode": 0, 1211 | "inputs": [ 1212 | { 1213 | "name": "image", 1214 | "type": "IMAGE", 1215 | "link": 220, 1216 | "slot_index": 0 1217 | } 1218 | ], 1219 | "outputs": [ 1220 | { 1221 | "name": "IMAGE", 1222 | "type": "IMAGE", 1223 | "links": [ 1224 | 56, 1225 | 233, 1226 | 284 1227 | ], 1228 | "shape": 3, 1229 | "slot_index": 0 1230 | }, 1231 | { 1232 | "name": "width", 1233 | "type": "INT", 1234 | "links": null, 1235 | "shape": 3 1236 | }, 1237 | { 1238 | "name": "height", 1239 | "type": "INT", 1240 | "links": null, 1241 | "shape": 3 1242 | } 1243 | ], 1244 | "properties": { 1245 | "Node name for S&R": "ImageResize+" 1246 | }, 1247 | "widgets_values": [ 1248 | 256, 1249 | 256, 1250 | "nearest", 1251 | false, 1252 | "always", 1253 | 0 1254 | ] 1255 | }, 1256 | { 1257 | "id": 95, 1258 | "type": "BatchUncrop", 1259 | "pos": [ 1260 | 1020, 1261 | 1120 1262 | ], 1263 | "size": { 1264 | "0": 210, 1265 | "1": 218 1266 | }, 1267 | "flags": {}, 1268 | "order": 25, 1269 | "mode": 0, 1270 | "inputs": [ 1271 | { 1272 | "name": "original_images", 1273 | "type": "IMAGE", 1274 | "link": 294 1275 | }, 1276 | { 1277 | "name": "cropped_images", 1278 | "type": "IMAGE", 1279 | "link": 295 1280 | }, 1281 | { 1282 | "name": "bboxes", 1283 | "type": "BBOX", 1284 | "link": 224 1285 | } 1286 | ], 1287 | "outputs": [ 1288 | { 1289 | "name": "IMAGE", 1290 | "type": "IMAGE", 1291 | "links": [ 1292 | 225 1293 | ], 1294 | "shape": 3, 1295 | "slot_index": 0 1296 | } 1297 | ], 1298 | "properties": { 1299 | "Node name for S&R": "BatchUncrop" 1300 | }, 1301 | "widgets_values": [ 1302 | 0.25, 1303 | 1, 1304 | true, 1305 | true, 1306 | true, 1307 | true 1308 | ] 1309 | }, 1310 | { 1311 | "id": 86, 1312 | "type": "BatchCropFromMask", 1313 | "pos": [ 1314 | -900, 1315 | 1140 1316 | ], 1317 | "size": { 1318 | "0": 393, 1319 | "1": 162 1320 | }, 1321 | "flags": {}, 1322 | "order": 16, 1323 | "mode": 0, 1324 | "inputs": [ 1325 | { 1326 | "name": "original_images", 1327 | "type": "IMAGE", 1328 | 
"link": 214 1329 | }, 1330 | { 1331 | "name": "masks", 1332 | "type": "MASK", 1333 | "link": 221 1334 | } 1335 | ], 1336 | "outputs": [ 1337 | { 1338 | "name": "original_images", 1339 | "type": "IMAGE", 1340 | "links": [ 1341 | 294 1342 | ], 1343 | "shape": 3, 1344 | "slot_index": 0 1345 | }, 1346 | { 1347 | "name": "cropped_images", 1348 | "type": "IMAGE", 1349 | "links": [ 1350 | 220 1351 | ], 1352 | "shape": 3, 1353 | "slot_index": 1 1354 | }, 1355 | { 1356 | "name": "bboxes", 1357 | "type": "BBOX", 1358 | "links": [ 1359 | 224 1360 | ], 1361 | "shape": 3, 1362 | "slot_index": 2 1363 | }, 1364 | { 1365 | "name": "width", 1366 | "type": "INT", 1367 | "links": null, 1368 | "shape": 3 1369 | }, 1370 | { 1371 | "name": "height", 1372 | "type": "INT", 1373 | "links": null, 1374 | "shape": 3 1375 | } 1376 | ], 1377 | "properties": { 1378 | "Node name for S&R": "BatchCropFromMask" 1379 | }, 1380 | "widgets_values": [ 1381 | 1.801, 1382 | 0.5 1383 | ] 1384 | } 1385 | ], 1386 | "links": [ 1387 | [ 1388 | 18, 1389 | 15, 1390 | 0, 1391 | 16, 1392 | 1, 1393 | "IMAGE" 1394 | ], 1395 | [ 1396 | 45, 1397 | 28, 1398 | 0, 1399 | 27, 1400 | 0, 1401 | "VHS_AUDIO" 1402 | ], 1403 | [ 1404 | 56, 1405 | 31, 1406 | 0, 1407 | 16, 1408 | 0, 1409 | "IMAGE" 1410 | ], 1411 | [ 1412 | 67, 1413 | 27, 1414 | 0, 1415 | 36, 1416 | 0, 1417 | "VCAUDIOTENSOR" 1418 | ], 1419 | [ 1420 | 164, 1421 | 47, 1422 | 0, 1423 | 72, 1424 | 0, 1425 | "IMAGE" 1426 | ], 1427 | [ 1428 | 167, 1429 | 72, 1430 | 0, 1431 | 76, 1432 | 0, 1433 | "IMAGE" 1434 | ], 1435 | [ 1436 | 207, 1437 | 76, 1438 | 0, 1439 | 92, 1440 | 0, 1441 | "MASK" 1442 | ], 1443 | [ 1444 | 214, 1445 | 47, 1446 | 0, 1447 | 86, 1448 | 0, 1449 | "IMAGE" 1450 | ], 1451 | [ 1452 | 215, 1453 | 47, 1454 | 0, 1455 | 94, 1456 | 0, 1457 | "IMAGE" 1458 | ], 1459 | [ 1460 | 216, 1461 | 94, 1462 | 0, 1463 | 72, 1464 | 1, 1465 | "INT" 1466 | ], 1467 | [ 1468 | 217, 1469 | 47, 1470 | 1, 1471 | 94, 1472 | 1, 1473 | "INT" 1474 | ], 1475 | [ 1476 | 218, 1477 | 
47, 1478 | 2, 1479 | 94, 1480 | 2, 1481 | "INT" 1482 | ], 1483 | [ 1484 | 220, 1485 | 86, 1486 | 1, 1487 | 31, 1488 | 0, 1489 | "IMAGE" 1490 | ], 1491 | [ 1492 | 221, 1493 | 92, 1494 | 0, 1495 | 86, 1496 | 1, 1497 | "MASK" 1498 | ], 1499 | [ 1500 | 224, 1501 | 86, 1502 | 2, 1503 | 95, 1504 | 2, 1505 | "BBOX" 1506 | ], 1507 | [ 1508 | 225, 1509 | 95, 1510 | 0, 1511 | 96, 1512 | 0, 1513 | "IMAGE" 1514 | ], 1515 | [ 1516 | 230, 1517 | 16, 1518 | 0, 1519 | 99, 1520 | 0, 1521 | "IMAGE" 1522 | ], 1523 | [ 1524 | 233, 1525 | 31, 1526 | 0, 1527 | 98, 1528 | 0, 1529 | "IMAGE" 1530 | ], 1531 | [ 1532 | 234, 1533 | 92, 1534 | 0, 1535 | 101, 1536 | 0, 1537 | "MASK" 1538 | ], 1539 | [ 1540 | 235, 1541 | 101, 1542 | 0, 1543 | 100, 1544 | 0, 1545 | "IMAGE" 1546 | ], 1547 | [ 1548 | 245, 1549 | 89, 1550 | 0, 1551 | 47, 1552 | 0, 1553 | "IMAGE" 1554 | ], 1555 | [ 1556 | 251, 1557 | 89, 1558 | 1, 1559 | 97, 1560 | 0, 1561 | "*" 1562 | ], 1563 | [ 1564 | 279, 1565 | 4, 1566 | 0, 1567 | 122, 1568 | 1, 1569 | "VAE" 1570 | ], 1571 | [ 1572 | 280, 1573 | 121, 1574 | 0, 1575 | 122, 1576 | 0, 1577 | "MODEL" 1578 | ], 1579 | [ 1580 | 281, 1581 | 36, 1582 | 0, 1583 | 122, 1584 | 2, 1585 | "WHISPERFEAT" 1586 | ], 1587 | [ 1588 | 282, 1589 | 122, 1590 | 0, 1591 | 30, 1592 | 0, 1593 | "IMAGE" 1594 | ], 1595 | [ 1596 | 284, 1597 | 31, 1598 | 0, 1599 | 122, 1600 | 3, 1601 | "IMAGE" 1602 | ], 1603 | [ 1604 | 285, 1605 | 16, 1606 | 0, 1607 | 122, 1608 | 4, 1609 | "IMAGE" 1610 | ], 1611 | [ 1612 | 287, 1613 | 28, 1614 | 0, 1615 | 124, 1616 | 0, 1617 | "*" 1618 | ], 1619 | [ 1620 | 288, 1621 | 36, 1622 | 1, 1623 | 89, 1624 | 1, 1625 | "INT" 1626 | ], 1627 | [ 1628 | 289, 1629 | 125, 1630 | 0, 1631 | 30, 1632 | 1, 1633 | "VHS_AUDIO" 1634 | ], 1635 | [ 1636 | 290, 1637 | 126, 1638 | 0, 1639 | 96, 1640 | 1, 1641 | "VHS_AUDIO" 1642 | ], 1643 | [ 1644 | 294, 1645 | 86, 1646 | 0, 1647 | 95, 1648 | 0, 1649 | "IMAGE" 1650 | ], 1651 | [ 1652 | 295, 1653 | 122, 1654 | 0, 1655 | 95, 1656 | 1, 1657 | "IMAGE" 
def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
    """Download a model checkpoint from `url` into `root`, verifying its SHA256.

    The expected checksum is taken from the second-to-last path component of
    `url` (the convention used by the OpenAI checkpoint URLs in `_MODELS`).

    Parameters
    ----------
    url : str
        download URL; its second-to-last path segment is the hex SHA256 of
        the file contents
    root : str
        directory to cache the downloaded file in (created if missing)
    in_memory : bool
        if True return the raw checkpoint bytes, otherwise the local path

    Returns
    -------
    Union[bytes, str]
        the checkpoint bytes (`in_memory=True`) or the path to the cached file

    Raises
    ------
    RuntimeError
        if the target path exists but is not a regular file, or if the
        downloaded file fails checksum verification
    """
    os.makedirs(root, exist_ok=True)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, os.path.basename(url))

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        # Reuse the cached file when its checksum still matches; otherwise
        # fall through and re-download it.
        with open(download_target, "rb") as f:
            model_bytes = f.read()
        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
            return model_bytes if in_memory else download_target
        else:
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    # Verify the freshly downloaded file before handing it back.
    with open(download_target, "rb") as f:
        model_bytes = f.read()
    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model.")

    return model_bytes if in_memory else download_target
Please retry loading the model.") 62 | 63 | return model_bytes if in_memory else download_target 64 | 65 | 66 | def available_models() -> List[str]: 67 | """Returns the names of available models""" 68 | return list(_MODELS.keys()) 69 | 70 | 71 | def load_model(name: str, device: Optional[Union[str, torch.device]] = None, download_root: str = None, in_memory: bool = False) -> Whisper: 72 | """ 73 | Load a Whisper ASR model 74 | 75 | Parameters 76 | ---------- 77 | name : str 78 | one of the official model names listed by `whisper.available_models()`, or 79 | path to a model checkpoint containing the model dimensions and the model state_dict. 80 | device : Union[str, torch.device] 81 | the PyTorch device to put the model into 82 | download_root: str 83 | path to download the model files; by default, it uses "~/.cache/whisper" 84 | in_memory: bool 85 | whether to preload the model weights into host memory 86 | 87 | Returns 88 | ------- 89 | model : Whisper 90 | The Whisper ASR model instance 91 | """ 92 | 93 | if device is None: 94 | device = "cuda" if torch.cuda.is_available() else "cpu" 95 | if download_root is None: 96 | download_root = os.getenv( 97 | "XDG_CACHE_HOME", 98 | os.path.join(os.path.expanduser("~"), ".cache", "whisper") 99 | ) 100 | 101 | if name in _MODELS: 102 | checkpoint_file = _download(_MODELS[name], download_root, in_memory) 103 | elif os.path.isfile(name): 104 | checkpoint_file = open(name, "rb").read() if in_memory else name 105 | else: 106 | raise RuntimeError(f"Model {name} not found; available models = {available_models()}") 107 | 108 | with (io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")) as fp: 109 | checkpoint = torch.load(fp, map_location=device) 110 | del checkpoint_file 111 | 112 | dims = ModelDimensions(**checkpoint["dims"]) 113 | model = Whisper(dims) 114 | model.load_state_dict(checkpoint["model_state_dict"]) 115 | 116 | return model.to(device) 117 | 
-------------------------------------------------------------------------------- /musetalk/whisper/__main__.py: -------------------------------------------------------------------------------- 1 | from .transcribe import cli 2 | 3 | 4 | cli() 5 | -------------------------------------------------------------------------------- /musetalk/whisper/assets/gpt2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} -------------------------------------------------------------------------------- /musetalk/whisper/assets/gpt2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"} -------------------------------------------------------------------------------- /musetalk/whisper/assets/mel_filters.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kijai/ComfyUI-MuseTalk-KJ/fe908e93ea08d3b3303362c3e19928351f53234b/musetalk/whisper/assets/mel_filters.npz -------------------------------------------------------------------------------- /musetalk/whisper/assets/multilingual/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|endoftext|>": 50257} 2 | -------------------------------------------------------------------------------- /musetalk/whisper/assets/multilingual/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"} 
import os
import subprocess
from functools import lru_cache
from typing import Union

import numpy as np
import torch
import torch.nn.functional as F

from .utils import exact_div

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000: number of frames in a mel spectrogram input


def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    # fix: the original body used the `ffmpeg-python` package whose import was
    # commented out, so calling this function raised NameError. Invoke the same
    # ffmpeg CLI command directly via the standard library instead; the flags
    # (mono, 16-bit little-endian PCM, resampled to `sr`) are unchanged.
    cmd = [
        "ffmpeg", "-nostdin",
        "-threads", "0",
        "-i", file,
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
        "-",
    ]
    try:
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    Accepts either a torch Tensor or a NumPy array; padding is zeros on the right.
    """
    if torch.is_tensor(array):
        if array.shape[axis] > length:
            # fix: the index tensor must live on the same device as `array`,
            # otherwise index_select fails for CUDA tensors
            array = array.index_select(dim=axis, index=torch.arange(length, device=array.device))

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            # F.pad takes (last-dim-left, last-dim-right, ...) — hence the reversal
            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
    else:
        if array.shape[axis] > length:
            array = array.take(indices=range(length), axis=axis)

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = np.pad(array, pad_widths)

    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
    with np.load(os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS):
    """
    Compute the log-Mel spectrogram of the given audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)

    # drop the last frame and use `...` so batched (…, freq, time) input also works
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    # dynamic-range compression: clip to 8 dB below the peak, then rescale
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical

from .audio import CHUNK_LENGTH
from .tokenizer import Tokenizer, get_tokenizer
from .utils import compression_ratio

if TYPE_CHECKING:
    from .model import Whisper


@torch.no_grad()
def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]:
    """
    Detect the spoken language in the audio, and return them as list of strings, along with the ids
    of the most probable language tokens and the probability distribution over all language tokens.
    This is performed outside the main decode loop in order to not interfere with kv-caching.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appears after the startoftranscript token.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all languages.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer(model.is_multilingual)
    if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
        # fix: was an f-string with no placeholder
        raise ValueError("This model doesn't have language tokens so it can't perform lang id")

    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # skip encoder forward pass if already-encoded audio features were given
    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
        mel = model.encoder(mel)

    # forward pass using a single token, startoftranscript
    n_audio = mel.shape[0]
    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
    logits = model.logits(x, mel)[:, 0]

    # collect detected languages; suppress all non-language tokens
    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
    mask[list(tokenizer.all_language_tokens)] = False
    logits[:, mask] = -np.inf
    language_tokens = logits.argmax(dim=-1)
    language_token_probs = logits.softmax(dim=-1).cpu()
    language_probs = [
        {
            c: language_token_probs[i, j].item()
            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs


@dataclass(frozen=True)
class DecodingOptions:
    task: str = "transcribe"  # whether to perform X->X "transcribe" or X->English "translate"
    language: Optional[str] = None  # language that the audio is in; uses detected language if None

    # sampling-related options
    temperature: float = 0.0
    sample_len: Optional[int] = None  # maximum number of tokens to sample
    best_of: Optional[int] = None  # number of independent samples to collect, when t > 0
    beam_size: Optional[int] = None  # number of beams in beam search, when t == 0
    patience: Optional[float] = None  # patience in beam search (https://arxiv.org/abs/2204.05424)

    # options for ranking generations (either beams or best-of-N samples)
    length_penalty: Optional[float] = None  # "alpha" in Google NMT, None defaults to length norm

    # prompt, prefix, and token suppression
    prompt: Optional[Union[str, List[int]]] = None  # text or tokens for the previous context
    prefix: Optional[Union[str, List[int]]] = None  # text or tokens to prefix the current context
    suppress_blank: bool = True  # this will suppress blank outputs

    # list of tokens ids (or comma-separated token ids) to suppress
    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"

    # timestamp sampling options
    without_timestamps: bool = False  # use <|notimestamps|> to sample text tokens only
    max_initial_timestamp: Optional[float] = 1.0  # the initial timestamp cannot be later than this

    # implementation details
    fp16: bool = True  # use fp16 for most of the calculation


@dataclass(frozen=True)
class DecodingResult:
    audio_features: Tensor
    language: str
    # fix: these need defaults — the "lang_id" path in DecodingTask.run constructs
    # a DecodingResult without embeddings, which previously raised TypeError
    encoder_embeddings: Optional[np.ndarray] = None
    decoder_embeddings: Optional[np.ndarray] = None
    language_probs: Optional[Dict[str, float]] = None
    tokens: List[int] = field(default_factory=list)
    text: str = ""
    avg_logprob: float = np.nan
    no_speech_prob: float = np.nan
    temperature: float = np.nan
    compression_ratio: float = np.nan


class Inference:
    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
        """Perform a forward pass on the decoder and return per-token logits"""
        raise NotImplementedError

    def rearrange_kv_cache(self, source_indices) -> None:
        """Update the key-value cache according to the updated beams"""
        raise NotImplementedError

    def cleanup_caching(self) -> None:
        """Clean up any resources or hooks after decoding is finished"""
        pass


class PyTorchInference(Inference):
    def __init__(self, model: "Whisper", initial_token_length: int):
        self.model: "Whisper" = model
        self.initial_token_length = initial_token_length
        self.kv_cache = {}
        self.hooks = []

    def logits(self, tokens: Tensor, audio_features: Tensor, include_embeddings=False) -> Tensor:
        if not self.kv_cache:
            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()

        if tokens.shape[-1] > self.initial_token_length:
            # only need to use the last token except in the first forward pass
            tokens = tokens[:, -1:]

        return_val = self.model.decoder(tokens, audio_features,
                                        kv_cache=self.kv_cache, include_embeddings=include_embeddings)
        return return_val

    def cleanup_caching(self):
        for hook in self.hooks:
            hook.remove()

        self.kv_cache = {}
        self.hooks = []

    def rearrange_kv_cache(self, source_indices):
        for module, tensor in self.kv_cache.items():
            # update the key/value cache to contain the selected sequences
            self.kv_cache[module] = tensor[source_indices].detach()


class SequenceRanker:
    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]:
        """
        Given a list of groups of samples and their cumulative log probabilities,
        return the indices of the samples in each group to select as the final result
        """
        raise NotImplementedError


class MaximumLikelihoodRanker(SequenceRanker):
    """
    Select the sample with the highest log probabilities, penalized using either
    a simple length normalization or Google NMT paper's length penalty
    """

    def __init__(self, length_penalty: Optional[float]):
        self.length_penalty = length_penalty

    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
        def scores(logprobs, lengths):
            result = []
            for logprob, length in zip(logprobs, lengths):
                if self.length_penalty is None:
                    penalty = length
                else:
                    # from the Google NMT paper
                    penalty = ((5 + length) / 6) ** self.length_penalty
                result.append(logprob / penalty)
            return result

        # get the sequence with the highest score
        lengths = [[len(t) for t in s] for s in tokens]
        return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]


class TokenDecoder:
    def reset(self):
        """Initialize any stateful variables for decoding a new sequence"""

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        """Specify how to select the next token, based on the current trace and logits

        Parameters
        ----------
        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        sum_logprobs : Tensor, shape = (n_batch)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
            the tokens, appended with the selected next token

        completed : bool
            True if all sequences has reached the end of text

        """
        raise NotImplementedError

    def finalize(
        self, tokens: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
        """Finalize search and return the final candidate sequences

        Parameters
        ----------
        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence

        sum_logprobs : Tensor, shape = (n_audio, n_group)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Sequence[Sequence[Tensor]], length = n_audio
            sequence of Tensors containing candidate token sequences, for each audio input

        sum_logprobs : List[List[float]], length = n_audio
            sequence of cumulative log probabilities corresponding to the above

        """
        raise NotImplementedError


class GreedyDecoder(TokenDecoder):
    def __init__(self, temperature: float, eot: int):
        self.temperature = temperature
        self.eot = eot

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        temperature = self.temperature
        if temperature == 0:
            next_tokens = logits.argmax(dim=-1)
        else:
            next_tokens = Categorical(logits=logits / temperature).sample()

        logprobs = F.log_softmax(logits.float(), dim=-1)
        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
        # sequences that already emitted EOT stop accumulating probability mass
        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)

        next_tokens[tokens[:, -1] == self.eot] = self.eot
        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)

        completed = (tokens[:, -1] == self.eot).all()
        return tokens, completed

    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
        # make sure each sequence has at least one EOT token at the end
        tokens = F.pad(tokens, (0, 1), value=self.eot)
        return tokens, sum_logprobs.tolist()


class BeamSearchDecoder(TokenDecoder):
    def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None):
        self.beam_size = beam_size
        self.eot = eot
        self.inference = inference
        self.patience = patience or 1.0
        self.max_candidates: int = round(beam_size * self.patience)
        self.finished_sequences = None

        assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})"

    def reset(self):
        self.finished_sequences = None

    def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]:
        if tokens.shape[0] % self.beam_size != 0:
            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")

        n_audio = tokens.shape[0] // self.beam_size
        if self.finished_sequences is None:  # for the first update
            self.finished_sequences = [{} for _ in range(n_audio)]

        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []
        for i in range(n_audio):
            scores, sources, finished = {}, {}, {}

            # STEP 1: calculate the cumulative log probabilities for possible candidates
            for j in range(self.beam_size):
                idx = i * self.beam_size + j
                prefix = tokens[idx].tolist()
                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
                    new_logprob = (sum_logprobs[idx] + logprob).item()
                    sequence = tuple(prefix + [token.item()])
                    scores[sequence] = new_logprob
                    sources[sequence] = idx

            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
            saved = 0
            for sequence in sorted(scores, key=scores.get, reverse=True):
                if sequence[-1] == self.eot:
                    finished[sequence] = scores[sequence]
                else:
                    sum_logprobs[len(next_tokens)] = scores[sequence]
                    next_tokens.append(sequence)
                    source_indices.append(sources[sequence])

                    saved += 1
                    if saved == self.beam_size:
                        break

            finished_sequences.append(finished)

        tokens = torch.tensor(next_tokens, device=tokens.device)
        self.inference.rearrange_kv_cache(source_indices)

        # add newly finished sequences to self.finished_sequences
        assert len(self.finished_sequences) == len(finished_sequences)
        for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates:
                    break  # the candidate list is full
                previously_finished[seq] = newly_finished[seq]

        # mark as completed if all audio has enough number of samples
        completed = all(
            len(sequences) >= self.max_candidates for sequences in self.finished_sequences
        )
        return tokens, completed

    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
        # collect all finished sequences, including patience, and add unfinished ones if not enough
        sum_logprobs = sum_logprobs.cpu()
        for i, sequences in enumerate(self.finished_sequences):
            if len(sequences) < self.beam_size:  # when not enough sequences are finished
                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
                    if len(sequences) >= self.beam_size:
                        break

        tokens: List[List[Tensor]] = [
            [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences
        ]
        sum_logprobs: List[List[float]] = [
            list(sequences.values()) for sequences in self.finished_sequences
        ]
        return tokens, sum_logprobs


class LogitFilter:
    def apply(self, logits: Tensor, tokens: Tensor) -> None:
        """Apply any filtering or masking to logits in-place

        Parameters
        ----------
        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        """
        raise NotImplementedError


class SuppressBlank(LogitFilter):
    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin

    def apply(self, logits: Tensor, tokens: Tensor):
        # only suppress a blank/EOT as the very first sampled token
        if tokens.shape[1] == self.sample_begin:
            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf


class SuppressTokens(LogitFilter):
    def __init__(self, suppress_tokens: Sequence[int]):
        self.suppress_tokens = list(suppress_tokens)

    def apply(self, logits: Tensor, tokens: Tensor):
        logits[:, self.suppress_tokens] = -np.inf


class ApplyTimestampRules(LogitFilter):
    def __init__(
        self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int]
    ):
        self.tokenizer = tokenizer
        self.sample_begin = sample_begin
        self.max_initial_timestamp_index = max_initial_timestamp_index

    def apply(self, logits: Tensor, tokens: Tensor):
        # suppress <|notimestamps|> which is handled by without_timestamps
        if self.tokenizer.no_timestamps is not None:
            logits[:, self.tokenizer.no_timestamps] = -np.inf

        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
        for k in range(tokens.shape[0]):
            seq = [t for t in tokens[k, self.sample_begin:].tolist()]
            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin

            if last_was_timestamp:
                if penultimate_was_timestamp:  # has to be non-timestamp
                    logits[k, self.tokenizer.timestamp_begin:] = -np.inf
                else:  # cannot be normal text tokens
                    logits[k, : self.tokenizer.eot] = -np.inf

        # apply the `max_initial_timestamp` option
        if tokens.shape[1] == self.sample_begin and self.max_initial_timestamp_index is not None:
            last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
            logits[:, last_allowed + 1:] = -np.inf

        # if sum of probability over timestamps is above any other token, sample timestamp
        logprobs = F.log_softmax(logits.float(), dim=-1)
        for k in range(tokens.shape[0]):
            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin:].logsumexp(dim=-1)
            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
            if timestamp_logprob > max_text_token_logprob:
                logits[k, : self.tokenizer.timestamp_begin] = -np.inf


class DecodingTask:
    inference: Inference
    sequence_ranker: SequenceRanker
    decoder: TokenDecoder
    logit_filters: List[LogitFilter]

    def __init__(self, model: "Whisper", options: DecodingOptions):
        self.model = model

        language = options.language or "en"
        tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task)
        self.tokenizer: Tokenizer = tokenizer
        self.options: DecodingOptions = self._verify_options(options)

        self.n_group: int = options.beam_size or options.best_of or 1
        self.n_ctx: int = model.dims.n_text_ctx
        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2

        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
        if self.options.without_timestamps:
            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps

        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
        self.sample_begin: int = len(self.initial_tokens)
        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)

        # inference: implements the forward pass through the decoder, including kv caching
        self.inference = PyTorchInference(model, len(self.initial_tokens))

        # sequence ranker: implements how to rank a group of sampled sequences
        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)

        # decoder: implements how to select the next tokens, given the autoregressive distribution
        if options.beam_size is not None:
            self.decoder = BeamSearchDecoder(
                options.beam_size, tokenizer.eot, self.inference, options.patience
            )
        else:
            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)

        # logit filters: applies various rules to suppress or penalize certain tokens
        self.logit_filters = []
        if self.options.suppress_blank:
            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
        if self.options.suppress_tokens:
            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
        if not options.without_timestamps:
            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
            max_initial_timestamp_index = None
            if options.max_initial_timestamp:
                max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision)
            self.logit_filters.append(
                ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index)
            )

    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
        """Reject mutually-incompatible option combinations early."""
        if options.beam_size is not None and options.best_of is not None:
            raise ValueError("beam_size and best_of can't be given together")
        if options.temperature == 0:
            if options.best_of is not None:
                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
        if options.patience is not None and options.beam_size is None:
            raise ValueError("patience requires beam_size to be given")
        if options.length_penalty is not None and not (0 <= options.length_penalty <= 1):
            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")

        return options

    def _get_initial_tokens(self) -> Tuple[int]:
        """Build the initial token sequence: [sot_prev + prompt] + sot_sequence + prefix."""
        tokens = list(self.sot_sequence)
        prefix = self.options.prefix
        prompt = self.options.prompt

        if prefix:
            prefix_tokens = (
                self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix
            )
            if self.sample_len is not None:
                # leave room for sampling within the text-context window
                max_prefix_len = self.n_ctx // 2 - self.sample_len
                prefix_tokens = prefix_tokens[-max_prefix_len:]
            tokens = tokens + prefix_tokens

        if prompt:
            prompt_tokens = (
                self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt
            )
            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1):] + tokens

        return tuple(tokens)

    def _get_suppress_tokens(self) -> Tuple[int]:
        suppress_tokens = self.options.suppress_tokens

        if isinstance(suppress_tokens, str):
            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]

        # fix: check for None/empty BEFORE membership testing — `-1 in None`
        # raised TypeError in the original ordering
        if suppress_tokens is None or len(list(suppress_tokens)) == 0:
            suppress_tokens = []  # interpret empty string as an empty list
        elif -1 in suppress_tokens:
            suppress_tokens = [t for t in suppress_tokens if t >= 0]
            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
        else:
            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"

        suppress_tokens.extend(
            [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm]
        )
        if self.tokenizer.no_speech is not None:
            # no-speech probability is collected separately
            suppress_tokens.append(self.tokenizer.no_speech)

        return tuple(sorted(set(suppress_tokens)))

    def _get_audio_features(self, mel: Tensor, include_embeddings: bool = False):
        if self.options.fp16:
            mel = mel.half()

        if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state):
            # encoded audio features are given; skip audio encoding
            audio_features = mel
        else:
            result = self.model.encoder(mel, include_embeddings)
            if include_embeddings:
                audio_features, embeddings = result
            else:
                audio_features = result

        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
            # fix: the original did `return TypeError(...)`, silently handing the
            # exception object back to the caller instead of raising it
            raise TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")

        if include_embeddings:
            return audio_features, embeddings
        else:
            return audio_features

    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
        languages = [self.options.language] * audio_features.shape[0]
        lang_probs = None

        if self.options.language is None or self.options.task == "lang_id":
            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
            languages = [max(probs, key=probs.get) for probs in lang_probs]
            if self.options.language is None:
                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens

        return languages, lang_probs

    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
        assert audio_features.shape[0] == tokens.shape[0]
        n_batch = tokens.shape[0]
        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
        no_speech_probs = [np.nan] * n_batch

        # fix: `completed` must be defined before the try-block; an exception
        # raised before the first decoder.update would otherwise trigger an
        # UnboundLocalError inside `finally`, masking the original error
        completed = False
        try:
            embeddings = []
            for i in range(self.sample_len):
                logits, token_embeddings = self.inference.logits(tokens, audio_features, include_embeddings=True)

                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()

                # now we need to consider the logits at the last token only
                logits = logits[:, -1]
                token_embeddings = token_embeddings[:, :, -1]

                # Append embeddings together
                embeddings.append(token_embeddings)

                # apply the logit filters, e.g. for suppressing or applying penalty to
                for logit_filter in self.logit_filters:
                    logit_filter.apply(logits, tokens)

                # expand the tokens tensor with the selected next tokens
                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)

                if completed or tokens.shape[-1] > self.n_ctx:
                    break
        finally:
            if completed:
                # drop the embedding of the step that only emitted EOT, so the
                # stacked embeddings line up with the sampled tokens
                # NOTE(review): indentation of the stack below is ambiguous in the
                # original dump; stacking unconditionally matches the non-completed
                # exit path also producing usable embeddings — confirm against upstream
                embeddings = embeddings[:-1]
            embeddings = np.stack(embeddings, 2)
            self.inference.cleanup_caching()

        return tokens, sum_logprobs, no_speech_probs, embeddings

    @torch.no_grad()
    def run(self, mel: Tensor) -> List[DecodingResult]:
        self.decoder.reset()
        tokenizer: Tokenizer = self.tokenizer
        n_audio: int = mel.shape[0]

        # encoder forward pass
        forward_pass: Tuple[Tensor, np.ndarray] = self._get_audio_features(mel, include_embeddings=True)
        audio_features, encoder_embeddings = forward_pass
        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)

        # detect language if requested, overwriting the language token
        languages, language_probs = self._detect_language(audio_features, tokens)
        if self.options.task == "lang_id":
            return [
                DecodingResult(audio_features=features, language=language, language_probs=probs)
                for features, language, probs in zip(audio_features, languages, language_probs)
            ]

        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
        audio_features = audio_features.repeat_interleave(self.n_group, dim=0)
        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)

        # call the main sampling loop
        tokens, sum_logprobs, no_speech_probs, decoder_embeddings = self._main_loop(audio_features, tokens)

        # reshape the tensors to have (n_audio, n_group) as the first two
dimensions 657 | audio_features = audio_features[:: self.n_group] 658 | no_speech_probs = no_speech_probs[:: self.n_group] 659 | assert audio_features.shape[0] == len(no_speech_probs) == n_audio 660 | 661 | tokens = tokens.reshape(n_audio, self.n_group, -1) 662 | sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) 663 | 664 | # get the final candidates for each group, and slice between the first sampled token and EOT 665 | tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) 666 | tokens: List[List[Tensor]] = [ 667 | [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens 668 | ] 669 | 670 | # select the top-ranked sample in each group 671 | selected = self.sequence_ranker.rank(tokens, sum_logprobs) 672 | tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] 673 | texts: List[str] = [tokenizer.decode(t).strip() for t in tokens] 674 | 675 | sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)] 676 | avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)] 677 | 678 | fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs) 679 | if len(set(map(len, fields))) != 1: 680 | raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") 681 | 682 | return [ 683 | DecodingResult( 684 | audio_features=features, 685 | language=language, 686 | tokens=tokens, 687 | text=text, 688 | avg_logprob=avg_logprob, 689 | no_speech_prob=no_speech_prob, 690 | temperature=self.options.temperature, 691 | compression_ratio=compression_ratio(text), 692 | encoder_embeddings=encoder_embeddings, 693 | decoder_embeddings=decoder_embeddings 694 | ) 695 | for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields) 696 | ] 697 | 698 | 699 | @torch.no_grad() 700 | def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]: 701 | """ 702 
| Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 703 | 704 | Parameters 705 | ---------- 706 | model: Whisper 707 | the Whisper model instance 708 | 709 | mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) 710 | A tensor containing the Mel spectrogram(s) 711 | 712 | options: DecodingOptions 713 | A dataclass that contains all necessary options for decoding 30-second segments 714 | 715 | Returns 716 | ------- 717 | result: Union[DecodingResult, List[DecodingResult]] 718 | The result(s) of decoding contained in `DecodingResult` dataclass instance(s) 719 | """ 720 | single = mel.ndim == 2 721 | if single: 722 | mel = mel.unsqueeze(0) 723 | 724 | result = DecodingTask(model, options).run(mel) 725 | 726 | if single: 727 | result = result[0] 728 | 729 | return result 730 | -------------------------------------------------------------------------------- /musetalk/whisper/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Dict 3 | from typing import Iterable, Optional 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import Tensor 9 | from torch import nn 10 | 11 | from .transcribe import transcribe as transcribe_function 12 | from .decoding import detect_language as detect_language_function, decode as decode_function 13 | 14 | 15 | @dataclass 16 | class ModelDimensions: 17 | n_mels: int 18 | n_audio_ctx: int 19 | n_audio_state: int 20 | n_audio_head: int 21 | n_audio_layer: int 22 | n_vocab: int 23 | n_text_ctx: int 24 | n_text_state: int 25 | n_text_head: int 26 | n_text_layer: int 27 | 28 | 29 | class LayerNorm(nn.LayerNorm): 30 | def forward(self, x: Tensor) -> Tensor: 31 | return super().forward(x.float()).type(x.dtype) 32 | 33 | 34 | class Linear(nn.Linear): 35 | def forward(self, x: Tensor) -> Tensor: 36 | return F.linear( 37 | x, self.weight.to(x.dtype), None if self.bias is None else 
self.bias.to(x.dtype) 38 | ) 39 | 40 | 41 | class Conv1d(nn.Conv1d): 42 | def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: 43 | return super()._conv_forward( 44 | x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) 45 | ) 46 | 47 | 48 | def sinusoids(length, channels, max_timescale=10000): 49 | """Returns sinusoids for positional embedding""" 50 | assert channels % 2 == 0 51 | log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) 52 | inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) 53 | scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] 54 | return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) 55 | 56 | 57 | class MultiHeadAttention(nn.Module): 58 | def __init__(self, n_state: int, n_head: int): 59 | super().__init__() 60 | self.n_head = n_head 61 | self.query = Linear(n_state, n_state) 62 | self.key = Linear(n_state, n_state, bias=False) 63 | self.value = Linear(n_state, n_state) 64 | self.out = Linear(n_state, n_state) 65 | 66 | def forward( 67 | self, 68 | x: Tensor, 69 | xa: Optional[Tensor] = None, 70 | mask: Optional[Tensor] = None, 71 | kv_cache: Optional[dict] = None, 72 | ): 73 | q = self.query(x) 74 | 75 | if kv_cache is None or xa is None: 76 | # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; 77 | # otherwise, perform key/value projections for self- or cross-attention as usual. 78 | k = self.key(x if xa is None else xa) 79 | v = self.value(x if xa is None else xa) 80 | else: 81 | # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
82 | k = kv_cache.get(self.key, self.key(xa)) 83 | v = kv_cache.get(self.value, self.value(xa)) 84 | 85 | wv = self.qkv_attention(q, k, v, mask) 86 | return self.out(wv) 87 | 88 | def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None): 89 | n_batch, n_ctx, n_state = q.shape 90 | scale = (n_state // self.n_head) ** -0.25 91 | q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale 92 | k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale 93 | v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) 94 | 95 | qk = q @ k 96 | if mask is not None: 97 | qk = qk + mask[:n_ctx, :n_ctx] 98 | 99 | w = F.softmax(qk.float(), dim=-1).to(q.dtype) 100 | return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) 101 | 102 | 103 | class ResidualAttentionBlock(nn.Module): 104 | def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): 105 | super().__init__() 106 | 107 | self.attn = MultiHeadAttention(n_state, n_head) 108 | self.attn_ln = LayerNorm(n_state) 109 | 110 | self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None 111 | self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None 112 | 113 | n_mlp = n_state * 4 114 | self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) 115 | self.mlp_ln = LayerNorm(n_state) 116 | 117 | def forward( 118 | self, 119 | x: Tensor, 120 | xa: Optional[Tensor] = None, 121 | mask: Optional[Tensor] = None, 122 | kv_cache: Optional[dict] = None, 123 | ): 124 | x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) 125 | if self.cross_attn: 126 | x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) 127 | x = x + self.mlp(self.mlp_ln(x)) 128 | return x 129 | 130 | 131 | class AudioEncoder(nn.Module): 132 | def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): 133 | super().__init__() 134 | self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, 
padding=1) 135 | self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) 136 | self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) 137 | 138 | self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( 139 | [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)] 140 | ) 141 | self.ln_post = LayerNorm(n_state) 142 | 143 | def forward(self, x: Tensor, include_embeddings: bool = False): 144 | """ 145 | x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) 146 | the mel spectrogram of the audio 147 | include_embeddings: bool 148 | whether to include intermediate steps in the output 149 | """ 150 | x = F.gelu(self.conv1(x)) 151 | x = F.gelu(self.conv2(x)) 152 | x = x.permute(0, 2, 1) 153 | 154 | assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" 155 | x = (x + self.positional_embedding).to(x.dtype) 156 | 157 | if include_embeddings: 158 | embeddings = [x.cpu().detach().numpy()] 159 | 160 | for block in self.blocks: 161 | x = block(x) 162 | if include_embeddings: 163 | embeddings.append(x.cpu().detach().numpy()) 164 | 165 | x = self.ln_post(x) 166 | 167 | if include_embeddings: 168 | embeddings = np.stack(embeddings, axis=1) 169 | return x, embeddings 170 | else: 171 | return x 172 | 173 | 174 | class TextDecoder(nn.Module): 175 | def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): 176 | super().__init__() 177 | 178 | self.token_embedding = nn.Embedding(n_vocab, n_state) 179 | self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) 180 | 181 | self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( 182 | [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)] 183 | ) 184 | self.ln = LayerNorm(n_state) 185 | 186 | mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) 187 | self.register_buffer("mask", mask, persistent=False) 188 | 189 | def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = 
None, include_embeddings: bool = False): 190 | """ 191 | x : torch.LongTensor, shape = (batch_size, <= n_ctx) 192 | the text tokens 193 | xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) 194 | the encoded audio features to be attended on 195 | include_embeddings : bool 196 | Whether to include intermediate values in the output to this function 197 | """ 198 | offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 199 | x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]] 200 | x = x.to(xa.dtype) 201 | 202 | if include_embeddings: 203 | embeddings = [x.cpu().detach().numpy()] 204 | 205 | for block in self.blocks: 206 | x = block(x, xa, mask=self.mask, kv_cache=kv_cache) 207 | if include_embeddings: 208 | embeddings.append(x.cpu().detach().numpy()) 209 | 210 | x = self.ln(x) 211 | logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float() 212 | 213 | if include_embeddings: 214 | embeddings = np.stack(embeddings, axis=1) 215 | return logits, embeddings 216 | else: 217 | return logits 218 | 219 | 220 | class Whisper(nn.Module): 221 | def __init__(self, dims: ModelDimensions): 222 | super().__init__() 223 | self.dims = dims 224 | self.encoder = AudioEncoder( 225 | self.dims.n_mels, 226 | self.dims.n_audio_ctx, 227 | self.dims.n_audio_state, 228 | self.dims.n_audio_head, 229 | self.dims.n_audio_layer, 230 | ) 231 | self.decoder = TextDecoder( 232 | self.dims.n_vocab, 233 | self.dims.n_text_ctx, 234 | self.dims.n_text_state, 235 | self.dims.n_text_head, 236 | self.dims.n_text_layer, 237 | ) 238 | 239 | def embed_audio(self, mel: torch.Tensor): 240 | return self.encoder.forward(mel) 241 | 242 | def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor): 243 | return self.decoder.forward(tokens, audio_features) 244 | 245 | def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]: 246 | return self.decoder(tokens, self.encoder(mel)) 247 | 248 | 
@property 249 | def device(self): 250 | return next(self.parameters()).device 251 | 252 | @property 253 | def is_multilingual(self): 254 | return self.dims.n_vocab == 51865 255 | 256 | def install_kv_cache_hooks(self, cache: Optional[dict] = None): 257 | """ 258 | The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value 259 | tensors calculated for the previous positions. This method returns a dictionary that stores 260 | all caches, and the necessary hooks for the key and value projection modules that save the 261 | intermediate tensors to be reused during later calculations. 262 | 263 | Returns 264 | ------- 265 | cache : Dict[nn.Module, torch.Tensor] 266 | A dictionary object mapping the key/value projection modules to its cache 267 | hooks : List[RemovableHandle] 268 | List of PyTorch RemovableHandle objects to stop the hooks to be called 269 | """ 270 | cache = {**cache} if cache is not None else {} 271 | hooks = [] 272 | 273 | def save_to_cache(module, _, output): 274 | if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]: 275 | cache[module] = output # save as-is, for the first token or cross attention 276 | else: 277 | cache[module] = torch.cat([cache[module], output], dim=1).detach() 278 | return cache[module] 279 | 280 | def install_hooks(layer: nn.Module): 281 | if isinstance(layer, MultiHeadAttention): 282 | hooks.append(layer.key.register_forward_hook(save_to_cache)) 283 | hooks.append(layer.value.register_forward_hook(save_to_cache)) 284 | 285 | self.decoder.apply(install_hooks) 286 | return cache, hooks 287 | 288 | detect_language = detect_language_function 289 | transcribe = transcribe_function 290 | decode = decode_function 291 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic import BasicTextNormalizer 2 | 
from .english import EnglishTextNormalizer 3 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/basic.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | import regex 5 | 6 | # non-ASCII letters that are not separated by "NFKD" normalization 7 | ADDITIONAL_DIACRITICS = { 8 | "œ": "oe", 9 | "Œ": "OE", 10 | "ø": "o", 11 | "Ø": "O", 12 | "æ": "ae", 13 | "Æ": "AE", 14 | "ß": "ss", 15 | "ẞ": "SS", 16 | "đ": "d", 17 | "Đ": "D", 18 | "ð": "d", 19 | "Ð": "D", 20 | "þ": "th", 21 | "Þ": "th", 22 | "ł": "l", 23 | "Ł": "L", 24 | } 25 | 26 | 27 | def remove_symbols_and_diacritics(s: str, keep=""): 28 | """ 29 | Replace any other markers, symbols, and punctuations with a space, 30 | and drop any diacritics (category 'Mn' and some manual mappings) 31 | """ 32 | return "".join( 33 | c 34 | if c in keep 35 | else ADDITIONAL_DIACRITICS[c] 36 | if c in ADDITIONAL_DIACRITICS 37 | else "" 38 | if unicodedata.category(c) == "Mn" 39 | else " " 40 | if unicodedata.category(c)[0] in "MSP" 41 | else c 42 | for c in unicodedata.normalize("NFKD", s) 43 | ) 44 | 45 | 46 | def remove_symbols(s: str): 47 | """ 48 | Replace any other markers, symbols, punctuations with a space, keeping diacritics 49 | """ 50 | return "".join( 51 | " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) 52 | ) 53 | 54 | 55 | class BasicTextNormalizer: 56 | def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): 57 | self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols 58 | self.split_letters = split_letters 59 | 60 | def __call__(self, s: str): 61 | s = s.lower() 62 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 63 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 64 | s = self.clean(s).lower() 65 | 66 | if self.split_letters: 67 | s = " 
".join(regex.findall(r"\X", s, regex.U)) 68 | 69 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 70 | 71 | return s 72 | -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/english.json: -------------------------------------------------------------------------------- 1 | { 2 | "accessorise": "accessorize", 3 | "accessorised": "accessorized", 4 | "accessorises": "accessorizes", 5 | "accessorising": "accessorizing", 6 | "acclimatisation": "acclimatization", 7 | "acclimatise": "acclimatize", 8 | "acclimatised": "acclimatized", 9 | "acclimatises": "acclimatizes", 10 | "acclimatising": "acclimatizing", 11 | "accoutrements": "accouterments", 12 | "aeon": "eon", 13 | "aeons": "eons", 14 | "aerogramme": "aerogram", 15 | "aerogrammes": "aerograms", 16 | "aeroplane": "airplane", 17 | "aeroplanes": "airplanes", 18 | "aesthete": "esthete", 19 | "aesthetes": "esthetes", 20 | "aesthetic": "esthetic", 21 | "aesthetically": "esthetically", 22 | "aesthetics": "esthetics", 23 | "aetiology": "etiology", 24 | "ageing": "aging", 25 | "aggrandisement": "aggrandizement", 26 | "agonise": "agonize", 27 | "agonised": "agonized", 28 | "agonises": "agonizes", 29 | "agonising": "agonizing", 30 | "agonisingly": "agonizingly", 31 | "almanack": "almanac", 32 | "almanacks": "almanacs", 33 | "aluminium": "aluminum", 34 | "amortisable": "amortizable", 35 | "amortisation": "amortization", 36 | "amortisations": "amortizations", 37 | "amortise": "amortize", 38 | "amortised": "amortized", 39 | "amortises": "amortizes", 40 | "amortising": "amortizing", 41 | "amphitheatre": "amphitheater", 42 | "amphitheatres": "amphitheaters", 43 | "anaemia": "anemia", 44 | "anaemic": "anemic", 45 | "anaesthesia": "anesthesia", 46 | "anaesthetic": "anesthetic", 47 | "anaesthetics": "anesthetics", 48 | "anaesthetise": "anesthetize", 49 | "anaesthetised": "anesthetized", 50 | "anaesthetises": "anesthetizes", 51 | 
"anaesthetising": "anesthetizing", 52 | "anaesthetist": "anesthetist", 53 | "anaesthetists": "anesthetists", 54 | "anaesthetize": "anesthetize", 55 | "anaesthetized": "anesthetized", 56 | "anaesthetizes": "anesthetizes", 57 | "anaesthetizing": "anesthetizing", 58 | "analogue": "analog", 59 | "analogues": "analogs", 60 | "analyse": "analyze", 61 | "analysed": "analyzed", 62 | "analyses": "analyzes", 63 | "analysing": "analyzing", 64 | "anglicise": "anglicize", 65 | "anglicised": "anglicized", 66 | "anglicises": "anglicizes", 67 | "anglicising": "anglicizing", 68 | "annualised": "annualized", 69 | "antagonise": "antagonize", 70 | "antagonised": "antagonized", 71 | "antagonises": "antagonizes", 72 | "antagonising": "antagonizing", 73 | "apologise": "apologize", 74 | "apologised": "apologized", 75 | "apologises": "apologizes", 76 | "apologising": "apologizing", 77 | "appal": "appall", 78 | "appals": "appalls", 79 | "appetiser": "appetizer", 80 | "appetisers": "appetizers", 81 | "appetising": "appetizing", 82 | "appetisingly": "appetizingly", 83 | "arbour": "arbor", 84 | "arbours": "arbors", 85 | "archeological": "archaeological", 86 | "archaeologically": "archeologically", 87 | "archaeologist": "archeologist", 88 | "archaeologists": "archeologists", 89 | "archaeology": "archeology", 90 | "ardour": "ardor", 91 | "armour": "armor", 92 | "armoured": "armored", 93 | "armourer": "armorer", 94 | "armourers": "armorers", 95 | "armouries": "armories", 96 | "armoury": "armory", 97 | "artefact": "artifact", 98 | "artefacts": "artifacts", 99 | "authorise": "authorize", 100 | "authorised": "authorized", 101 | "authorises": "authorizes", 102 | "authorising": "authorizing", 103 | "axe": "ax", 104 | "backpedalled": "backpedaled", 105 | "backpedalling": "backpedaling", 106 | "bannister": "banister", 107 | "bannisters": "banisters", 108 | "baptise": "baptize", 109 | "baptised": "baptized", 110 | "baptises": "baptizes", 111 | "baptising": "baptizing", 112 | "bastardise": "bastardize", 
113 | "bastardised": "bastardized", 114 | "bastardises": "bastardizes", 115 | "bastardising": "bastardizing", 116 | "battleax": "battleaxe", 117 | "baulk": "balk", 118 | "baulked": "balked", 119 | "baulking": "balking", 120 | "baulks": "balks", 121 | "bedevilled": "bedeviled", 122 | "bedevilling": "bedeviling", 123 | "behaviour": "behavior", 124 | "behavioural": "behavioral", 125 | "behaviourism": "behaviorism", 126 | "behaviourist": "behaviorist", 127 | "behaviourists": "behaviorists", 128 | "behaviours": "behaviors", 129 | "behove": "behoove", 130 | "behoved": "behooved", 131 | "behoves": "behooves", 132 | "bejewelled": "bejeweled", 133 | "belabour": "belabor", 134 | "belaboured": "belabored", 135 | "belabouring": "belaboring", 136 | "belabours": "belabors", 137 | "bevelled": "beveled", 138 | "bevvies": "bevies", 139 | "bevvy": "bevy", 140 | "biassed": "biased", 141 | "biassing": "biasing", 142 | "bingeing": "binging", 143 | "bougainvillaea": "bougainvillea", 144 | "bougainvillaeas": "bougainvilleas", 145 | "bowdlerise": "bowdlerize", 146 | "bowdlerised": "bowdlerized", 147 | "bowdlerises": "bowdlerizes", 148 | "bowdlerising": "bowdlerizing", 149 | "breathalyse": "breathalyze", 150 | "breathalysed": "breathalyzed", 151 | "breathalyser": "breathalyzer", 152 | "breathalysers": "breathalyzers", 153 | "breathalyses": "breathalyzes", 154 | "breathalysing": "breathalyzing", 155 | "brutalise": "brutalize", 156 | "brutalised": "brutalized", 157 | "brutalises": "brutalizes", 158 | "brutalising": "brutalizing", 159 | "busses": "buses", 160 | "bussing": "busing", 161 | "caesarean": "cesarean", 162 | "caesareans": "cesareans", 163 | "calibre": "caliber", 164 | "calibres": "calibers", 165 | "calliper": "caliper", 166 | "callipers": "calipers", 167 | "callisthenics": "calisthenics", 168 | "canalise": "canalize", 169 | "canalised": "canalized", 170 | "canalises": "canalizes", 171 | "canalising": "canalizing", 172 | "cancelation": "cancellation", 173 | "cancelations": 
"cancellations", 174 | "cancelled": "canceled", 175 | "cancelling": "canceling", 176 | "candour": "candor", 177 | "cannibalise": "cannibalize", 178 | "cannibalised": "cannibalized", 179 | "cannibalises": "cannibalizes", 180 | "cannibalising": "cannibalizing", 181 | "canonise": "canonize", 182 | "canonised": "canonized", 183 | "canonises": "canonizes", 184 | "canonising": "canonizing", 185 | "capitalise": "capitalize", 186 | "capitalised": "capitalized", 187 | "capitalises": "capitalizes", 188 | "capitalising": "capitalizing", 189 | "caramelise": "caramelize", 190 | "caramelised": "caramelized", 191 | "caramelises": "caramelizes", 192 | "caramelising": "caramelizing", 193 | "carbonise": "carbonize", 194 | "carbonised": "carbonized", 195 | "carbonises": "carbonizes", 196 | "carbonising": "carbonizing", 197 | "carolled": "caroled", 198 | "carolling": "caroling", 199 | "catalogue": "catalog", 200 | "catalogued": "cataloged", 201 | "catalogues": "catalogs", 202 | "cataloguing": "cataloging", 203 | "catalyse": "catalyze", 204 | "catalysed": "catalyzed", 205 | "catalyses": "catalyzes", 206 | "catalysing": "catalyzing", 207 | "categorise": "categorize", 208 | "categorised": "categorized", 209 | "categorises": "categorizes", 210 | "categorising": "categorizing", 211 | "cauterise": "cauterize", 212 | "cauterised": "cauterized", 213 | "cauterises": "cauterizes", 214 | "cauterising": "cauterizing", 215 | "cavilled": "caviled", 216 | "cavilling": "caviling", 217 | "centigramme": "centigram", 218 | "centigrammes": "centigrams", 219 | "centilitre": "centiliter", 220 | "centilitres": "centiliters", 221 | "centimetre": "centimeter", 222 | "centimetres": "centimeters", 223 | "centralise": "centralize", 224 | "centralised": "centralized", 225 | "centralises": "centralizes", 226 | "centralising": "centralizing", 227 | "centre": "center", 228 | "centred": "centered", 229 | "centrefold": "centerfold", 230 | "centrefolds": "centerfolds", 231 | "centrepiece": "centerpiece", 232 | 
"centrepieces": "centerpieces", 233 | "centres": "centers", 234 | "channelled": "channeled", 235 | "channelling": "channeling", 236 | "characterise": "characterize", 237 | "characterised": "characterized", 238 | "characterises": "characterizes", 239 | "characterising": "characterizing", 240 | "cheque": "check", 241 | "chequebook": "checkbook", 242 | "chequebooks": "checkbooks", 243 | "chequered": "checkered", 244 | "cheques": "checks", 245 | "chilli": "chili", 246 | "chimaera": "chimera", 247 | "chimaeras": "chimeras", 248 | "chiselled": "chiseled", 249 | "chiselling": "chiseling", 250 | "circularise": "circularize", 251 | "circularised": "circularized", 252 | "circularises": "circularizes", 253 | "circularising": "circularizing", 254 | "civilise": "civilize", 255 | "civilised": "civilized", 256 | "civilises": "civilizes", 257 | "civilising": "civilizing", 258 | "clamour": "clamor", 259 | "clamoured": "clamored", 260 | "clamouring": "clamoring", 261 | "clamours": "clamors", 262 | "clangour": "clangor", 263 | "clarinettist": "clarinetist", 264 | "clarinettists": "clarinetists", 265 | "collectivise": "collectivize", 266 | "collectivised": "collectivized", 267 | "collectivises": "collectivizes", 268 | "collectivising": "collectivizing", 269 | "colonisation": "colonization", 270 | "colonise": "colonize", 271 | "colonised": "colonized", 272 | "coloniser": "colonizer", 273 | "colonisers": "colonizers", 274 | "colonises": "colonizes", 275 | "colonising": "colonizing", 276 | "colour": "color", 277 | "colourant": "colorant", 278 | "colourants": "colorants", 279 | "coloured": "colored", 280 | "coloureds": "coloreds", 281 | "colourful": "colorful", 282 | "colourfully": "colorfully", 283 | "colouring": "coloring", 284 | "colourize": "colorize", 285 | "colourized": "colorized", 286 | "colourizes": "colorizes", 287 | "colourizing": "colorizing", 288 | "colourless": "colorless", 289 | "colours": "colors", 290 | "commercialise": "commercialize", 291 | "commercialised": 
"commercialized", 292 | "commercialises": "commercializes", 293 | "commercialising": "commercializing", 294 | "compartmentalise": "compartmentalize", 295 | "compartmentalised": "compartmentalized", 296 | "compartmentalises": "compartmentalizes", 297 | "compartmentalising": "compartmentalizing", 298 | "computerise": "computerize", 299 | "computerised": "computerized", 300 | "computerises": "computerizes", 301 | "computerising": "computerizing", 302 | "conceptualise": "conceptualize", 303 | "conceptualised": "conceptualized", 304 | "conceptualises": "conceptualizes", 305 | "conceptualising": "conceptualizing", 306 | "connexion": "connection", 307 | "connexions": "connections", 308 | "contextualise": "contextualize", 309 | "contextualised": "contextualized", 310 | "contextualises": "contextualizes", 311 | "contextualising": "contextualizing", 312 | "cosier": "cozier", 313 | "cosies": "cozies", 314 | "cosiest": "coziest", 315 | "cosily": "cozily", 316 | "cosiness": "coziness", 317 | "cosy": "cozy", 318 | "councillor": "councilor", 319 | "councillors": "councilors", 320 | "counselled": "counseled", 321 | "counselling": "counseling", 322 | "counsellor": "counselor", 323 | "counsellors": "counselors", 324 | "crenelated": "crenellated", 325 | "criminalise": "criminalize", 326 | "criminalised": "criminalized", 327 | "criminalises": "criminalizes", 328 | "criminalising": "criminalizing", 329 | "criticise": "criticize", 330 | "criticised": "criticized", 331 | "criticises": "criticizes", 332 | "criticising": "criticizing", 333 | "crueller": "crueler", 334 | "cruellest": "cruelest", 335 | "crystallisation": "crystallization", 336 | "crystallise": "crystallize", 337 | "crystallised": "crystallized", 338 | "crystallises": "crystallizes", 339 | "crystallising": "crystallizing", 340 | "cudgelled": "cudgeled", 341 | "cudgelling": "cudgeling", 342 | "customise": "customize", 343 | "customised": "customized", 344 | "customises": "customizes", 345 | "customising": "customizing", 346 | 
"cypher": "cipher", 347 | "cyphers": "ciphers", 348 | "decentralisation": "decentralization", 349 | "decentralise": "decentralize", 350 | "decentralised": "decentralized", 351 | "decentralises": "decentralizes", 352 | "decentralising": "decentralizing", 353 | "decriminalisation": "decriminalization", 354 | "decriminalise": "decriminalize", 355 | "decriminalised": "decriminalized", 356 | "decriminalises": "decriminalizes", 357 | "decriminalising": "decriminalizing", 358 | "defence": "defense", 359 | "defenceless": "defenseless", 360 | "defences": "defenses", 361 | "dehumanisation": "dehumanization", 362 | "dehumanise": "dehumanize", 363 | "dehumanised": "dehumanized", 364 | "dehumanises": "dehumanizes", 365 | "dehumanising": "dehumanizing", 366 | "demeanour": "demeanor", 367 | "demilitarisation": "demilitarization", 368 | "demilitarise": "demilitarize", 369 | "demilitarised": "demilitarized", 370 | "demilitarises": "demilitarizes", 371 | "demilitarising": "demilitarizing", 372 | "demobilisation": "demobilization", 373 | "demobilise": "demobilize", 374 | "demobilised": "demobilized", 375 | "demobilises": "demobilizes", 376 | "demobilising": "demobilizing", 377 | "democratisation": "democratization", 378 | "democratise": "democratize", 379 | "democratised": "democratized", 380 | "democratises": "democratizes", 381 | "democratising": "democratizing", 382 | "demonise": "demonize", 383 | "demonised": "demonized", 384 | "demonises": "demonizes", 385 | "demonising": "demonizing", 386 | "demoralisation": "demoralization", 387 | "demoralise": "demoralize", 388 | "demoralised": "demoralized", 389 | "demoralises": "demoralizes", 390 | "demoralising": "demoralizing", 391 | "denationalisation": "denationalization", 392 | "denationalise": "denationalize", 393 | "denationalised": "denationalized", 394 | "denationalises": "denationalizes", 395 | "denationalising": "denationalizing", 396 | "deodorise": "deodorize", 397 | "deodorised": "deodorized", 398 | "deodorises": "deodorizes", 
399 | "deodorising": "deodorizing", 400 | "depersonalise": "depersonalize", 401 | "depersonalised": "depersonalized", 402 | "depersonalises": "depersonalizes", 403 | "depersonalising": "depersonalizing", 404 | "deputise": "deputize", 405 | "deputised": "deputized", 406 | "deputises": "deputizes", 407 | "deputising": "deputizing", 408 | "desensitisation": "desensitization", 409 | "desensitise": "desensitize", 410 | "desensitised": "desensitized", 411 | "desensitises": "desensitizes", 412 | "desensitising": "desensitizing", 413 | "destabilisation": "destabilization", 414 | "destabilise": "destabilize", 415 | "destabilised": "destabilized", 416 | "destabilises": "destabilizes", 417 | "destabilising": "destabilizing", 418 | "dialled": "dialed", 419 | "dialling": "dialing", 420 | "dialogue": "dialog", 421 | "dialogues": "dialogs", 422 | "diarrhoea": "diarrhea", 423 | "digitise": "digitize", 424 | "digitised": "digitized", 425 | "digitises": "digitizes", 426 | "digitising": "digitizing", 427 | "disc": "disk", 428 | "discolour": "discolor", 429 | "discoloured": "discolored", 430 | "discolouring": "discoloring", 431 | "discolours": "discolors", 432 | "discs": "disks", 433 | "disembowelled": "disemboweled", 434 | "disembowelling": "disemboweling", 435 | "disfavour": "disfavor", 436 | "dishevelled": "disheveled", 437 | "dishonour": "dishonor", 438 | "dishonourable": "dishonorable", 439 | "dishonourably": "dishonorably", 440 | "dishonoured": "dishonored", 441 | "dishonouring": "dishonoring", 442 | "dishonours": "dishonors", 443 | "disorganisation": "disorganization", 444 | "disorganised": "disorganized", 445 | "distil": "distill", 446 | "distils": "distills", 447 | "dramatisation": "dramatization", 448 | "dramatisations": "dramatizations", 449 | "dramatise": "dramatize", 450 | "dramatised": "dramatized", 451 | "dramatises": "dramatizes", 452 | "dramatising": "dramatizing", 453 | "draught": "draft", 454 | "draughtboard": "draftboard", 455 | "draughtboards": "draftboards", 456 
| "draughtier": "draftier", 457 | "draughtiest": "draftiest", 458 | "draughts": "drafts", 459 | "draughtsman": "draftsman", 460 | "draughtsmanship": "draftsmanship", 461 | "draughtsmen": "draftsmen", 462 | "draughtswoman": "draftswoman", 463 | "draughtswomen": "draftswomen", 464 | "draughty": "drafty", 465 | "drivelled": "driveled", 466 | "drivelling": "driveling", 467 | "duelled": "dueled", 468 | "duelling": "dueling", 469 | "economise": "economize", 470 | "economised": "economized", 471 | "economises": "economizes", 472 | "economising": "economizing", 473 | "edoema": "edema", 474 | "editorialise": "editorialize", 475 | "editorialised": "editorialized", 476 | "editorialises": "editorializes", 477 | "editorialising": "editorializing", 478 | "empathise": "empathize", 479 | "empathised": "empathized", 480 | "empathises": "empathizes", 481 | "empathising": "empathizing", 482 | "emphasise": "emphasize", 483 | "emphasised": "emphasized", 484 | "emphasises": "emphasizes", 485 | "emphasising": "emphasizing", 486 | "enamelled": "enameled", 487 | "enamelling": "enameling", 488 | "enamoured": "enamored", 489 | "encyclopaedia": "encyclopedia", 490 | "encyclopaedias": "encyclopedias", 491 | "encyclopaedic": "encyclopedic", 492 | "endeavour": "endeavor", 493 | "endeavoured": "endeavored", 494 | "endeavouring": "endeavoring", 495 | "endeavours": "endeavors", 496 | "energise": "energize", 497 | "energised": "energized", 498 | "energises": "energizes", 499 | "energising": "energizing", 500 | "enrol": "enroll", 501 | "enrols": "enrolls", 502 | "enthral": "enthrall", 503 | "enthrals": "enthralls", 504 | "epaulette": "epaulet", 505 | "epaulettes": "epaulets", 506 | "epicentre": "epicenter", 507 | "epicentres": "epicenters", 508 | "epilogue": "epilog", 509 | "epilogues": "epilogs", 510 | "epitomise": "epitomize", 511 | "epitomised": "epitomized", 512 | "epitomises": "epitomizes", 513 | "epitomising": "epitomizing", 514 | "equalisation": "equalization", 515 | "equalise": "equalize", 
516 | "equalised": "equalized", 517 | "equaliser": "equalizer", 518 | "equalisers": "equalizers", 519 | "equalises": "equalizes", 520 | "equalising": "equalizing", 521 | "eulogise": "eulogize", 522 | "eulogised": "eulogized", 523 | "eulogises": "eulogizes", 524 | "eulogising": "eulogizing", 525 | "evangelise": "evangelize", 526 | "evangelised": "evangelized", 527 | "evangelises": "evangelizes", 528 | "evangelising": "evangelizing", 529 | "exorcise": "exorcize", 530 | "exorcised": "exorcized", 531 | "exorcises": "exorcizes", 532 | "exorcising": "exorcizing", 533 | "extemporisation": "extemporization", 534 | "extemporise": "extemporize", 535 | "extemporised": "extemporized", 536 | "extemporises": "extemporizes", 537 | "extemporising": "extemporizing", 538 | "externalisation": "externalization", 539 | "externalisations": "externalizations", 540 | "externalise": "externalize", 541 | "externalised": "externalized", 542 | "externalises": "externalizes", 543 | "externalising": "externalizing", 544 | "factorise": "factorize", 545 | "factorised": "factorized", 546 | "factorises": "factorizes", 547 | "factorising": "factorizing", 548 | "faecal": "fecal", 549 | "faeces": "feces", 550 | "familiarisation": "familiarization", 551 | "familiarise": "familiarize", 552 | "familiarised": "familiarized", 553 | "familiarises": "familiarizes", 554 | "familiarising": "familiarizing", 555 | "fantasise": "fantasize", 556 | "fantasised": "fantasized", 557 | "fantasises": "fantasizes", 558 | "fantasising": "fantasizing", 559 | "favour": "favor", 560 | "favourable": "favorable", 561 | "favourably": "favorably", 562 | "favoured": "favored", 563 | "favouring": "favoring", 564 | "favourite": "favorite", 565 | "favourites": "favorites", 566 | "favouritism": "favoritism", 567 | "favours": "favors", 568 | "feminise": "feminize", 569 | "feminised": "feminized", 570 | "feminises": "feminizes", 571 | "feminising": "feminizing", 572 | "fertilisation": "fertilization", 573 | "fertilise": "fertilize", 
574 | "fertilised": "fertilized", 575 | "fertiliser": "fertilizer", 576 | "fertilisers": "fertilizers", 577 | "fertilises": "fertilizes", 578 | "fertilising": "fertilizing", 579 | "fervour": "fervor", 580 | "fibre": "fiber", 581 | "fibreglass": "fiberglass", 582 | "fibres": "fibers", 583 | "fictionalisation": "fictionalization", 584 | "fictionalisations": "fictionalizations", 585 | "fictionalise": "fictionalize", 586 | "fictionalised": "fictionalized", 587 | "fictionalises": "fictionalizes", 588 | "fictionalising": "fictionalizing", 589 | "fillet": "filet", 590 | "filleted": "fileted", 591 | "filleting": "fileting", 592 | "fillets": "filets", 593 | "finalisation": "finalization", 594 | "finalise": "finalize", 595 | "finalised": "finalized", 596 | "finalises": "finalizes", 597 | "finalising": "finalizing", 598 | "flautist": "flutist", 599 | "flautists": "flutists", 600 | "flavour": "flavor", 601 | "flavoured": "flavored", 602 | "flavouring": "flavoring", 603 | "flavourings": "flavorings", 604 | "flavourless": "flavorless", 605 | "flavours": "flavors", 606 | "flavoursome": "flavorsome", 607 | "flyer / flier": "flier / flyer", 608 | "foetal": "fetal", 609 | "foetid": "fetid", 610 | "foetus": "fetus", 611 | "foetuses": "fetuses", 612 | "formalisation": "formalization", 613 | "formalise": "formalize", 614 | "formalised": "formalized", 615 | "formalises": "formalizes", 616 | "formalising": "formalizing", 617 | "fossilisation": "fossilization", 618 | "fossilise": "fossilize", 619 | "fossilised": "fossilized", 620 | "fossilises": "fossilizes", 621 | "fossilising": "fossilizing", 622 | "fraternisation": "fraternization", 623 | "fraternise": "fraternize", 624 | "fraternised": "fraternized", 625 | "fraternises": "fraternizes", 626 | "fraternising": "fraternizing", 627 | "fulfil": "fulfill", 628 | "fulfilment": "fulfillment", 629 | "fulfils": "fulfills", 630 | "funnelled": "funneled", 631 | "funnelling": "funneling", 632 | "galvanise": "galvanize", 633 | "galvanised": 
"galvanized", 634 | "galvanises": "galvanizes", 635 | "galvanising": "galvanizing", 636 | "gambolled": "gamboled", 637 | "gambolling": "gamboling", 638 | "gaol": "jail", 639 | "gaolbird": "jailbird", 640 | "gaolbirds": "jailbirds", 641 | "gaolbreak": "jailbreak", 642 | "gaolbreaks": "jailbreaks", 643 | "gaoled": "jailed", 644 | "gaoler": "jailer", 645 | "gaolers": "jailers", 646 | "gaoling": "jailing", 647 | "gaols": "jails", 648 | "gasses": "gases", 649 | "gage": "gauge", 650 | "gaged": "gauged", 651 | "gages": "gauges", 652 | "gaging": "gauging", 653 | "generalisation": "generalization", 654 | "generalisations": "generalizations", 655 | "generalise": "generalize", 656 | "generalised": "generalized", 657 | "generalises": "generalizes", 658 | "generalising": "generalizing", 659 | "ghettoise": "ghettoize", 660 | "ghettoised": "ghettoized", 661 | "ghettoises": "ghettoizes", 662 | "ghettoising": "ghettoizing", 663 | "gipsies": "gypsies", 664 | "glamorise": "glamorize", 665 | "glamorised": "glamorized", 666 | "glamorises": "glamorizes", 667 | "glamorising": "glamorizing", 668 | "glamor": "glamour", 669 | "globalisation": "globalization", 670 | "globalise": "globalize", 671 | "globalised": "globalized", 672 | "globalises": "globalizes", 673 | "globalising": "globalizing", 674 | "glueing": "gluing", 675 | "goitre": "goiter", 676 | "goitres": "goiters", 677 | "gonorrhoea": "gonorrhea", 678 | "gramme": "gram", 679 | "grammes": "grams", 680 | "gravelled": "graveled", 681 | "grey": "gray", 682 | "greyed": "grayed", 683 | "greying": "graying", 684 | "greyish": "grayish", 685 | "greyness": "grayness", 686 | "greys": "grays", 687 | "grovelled": "groveled", 688 | "grovelling": "groveling", 689 | "groyne": "groin", 690 | "groynes": "groins", 691 | "gruelling": "grueling", 692 | "gruellingly": "gruelingly", 693 | "gryphon": "griffin", 694 | "gryphons": "griffins", 695 | "gynaecological": "gynecological", 696 | "gynaecologist": "gynecologist", 697 | "gynaecologists": 
"gynecologists", 698 | "gynaecology": "gynecology", 699 | "haematological": "hematological", 700 | "haematologist": "hematologist", 701 | "haematologists": "hematologists", 702 | "haematology": "hematology", 703 | "haemoglobin": "hemoglobin", 704 | "haemophilia": "hemophilia", 705 | "haemophiliac": "hemophiliac", 706 | "haemophiliacs": "hemophiliacs", 707 | "haemorrhage": "hemorrhage", 708 | "haemorrhaged": "hemorrhaged", 709 | "haemorrhages": "hemorrhages", 710 | "haemorrhaging": "hemorrhaging", 711 | "haemorrhoids": "hemorrhoids", 712 | "harbour": "harbor", 713 | "harboured": "harbored", 714 | "harbouring": "harboring", 715 | "harbours": "harbors", 716 | "harmonisation": "harmonization", 717 | "harmonise": "harmonize", 718 | "harmonised": "harmonized", 719 | "harmonises": "harmonizes", 720 | "harmonising": "harmonizing", 721 | "homoeopath": "homeopath", 722 | "homoeopathic": "homeopathic", 723 | "homoeopaths": "homeopaths", 724 | "homoeopathy": "homeopathy", 725 | "homogenise": "homogenize", 726 | "homogenised": "homogenized", 727 | "homogenises": "homogenizes", 728 | "homogenising": "homogenizing", 729 | "honour": "honor", 730 | "honourable": "honorable", 731 | "honourably": "honorably", 732 | "honoured": "honored", 733 | "honouring": "honoring", 734 | "honours": "honors", 735 | "hospitalisation": "hospitalization", 736 | "hospitalise": "hospitalize", 737 | "hospitalised": "hospitalized", 738 | "hospitalises": "hospitalizes", 739 | "hospitalising": "hospitalizing", 740 | "humanise": "humanize", 741 | "humanised": "humanized", 742 | "humanises": "humanizes", 743 | "humanising": "humanizing", 744 | "humour": "humor", 745 | "humoured": "humored", 746 | "humouring": "humoring", 747 | "humourless": "humorless", 748 | "humours": "humors", 749 | "hybridise": "hybridize", 750 | "hybridised": "hybridized", 751 | "hybridises": "hybridizes", 752 | "hybridising": "hybridizing", 753 | "hypnotise": "hypnotize", 754 | "hypnotised": "hypnotized", 755 | "hypnotises": 
"hypnotizes", 756 | "hypnotising": "hypnotizing", 757 | "hypothesise": "hypothesize", 758 | "hypothesised": "hypothesized", 759 | "hypothesises": "hypothesizes", 760 | "hypothesising": "hypothesizing", 761 | "idealisation": "idealization", 762 | "idealise": "idealize", 763 | "idealised": "idealized", 764 | "idealises": "idealizes", 765 | "idealising": "idealizing", 766 | "idolise": "idolize", 767 | "idolised": "idolized", 768 | "idolises": "idolizes", 769 | "idolising": "idolizing", 770 | "immobilisation": "immobilization", 771 | "immobilise": "immobilize", 772 | "immobilised": "immobilized", 773 | "immobiliser": "immobilizer", 774 | "immobilisers": "immobilizers", 775 | "immobilises": "immobilizes", 776 | "immobilising": "immobilizing", 777 | "immortalise": "immortalize", 778 | "immortalised": "immortalized", 779 | "immortalises": "immortalizes", 780 | "immortalising": "immortalizing", 781 | "immunisation": "immunization", 782 | "immunise": "immunize", 783 | "immunised": "immunized", 784 | "immunises": "immunizes", 785 | "immunising": "immunizing", 786 | "impanelled": "impaneled", 787 | "impanelling": "impaneling", 788 | "imperilled": "imperiled", 789 | "imperilling": "imperiling", 790 | "individualise": "individualize", 791 | "individualised": "individualized", 792 | "individualises": "individualizes", 793 | "individualising": "individualizing", 794 | "industrialise": "industrialize", 795 | "industrialised": "industrialized", 796 | "industrialises": "industrializes", 797 | "industrialising": "industrializing", 798 | "inflexion": "inflection", 799 | "inflexions": "inflections", 800 | "initialise": "initialize", 801 | "initialised": "initialized", 802 | "initialises": "initializes", 803 | "initialising": "initializing", 804 | "initialled": "initialed", 805 | "initialling": "initialing", 806 | "instal": "install", 807 | "instalment": "installment", 808 | "instalments": "installments", 809 | "instals": "installs", 810 | "instil": "instill", 811 | "instils": 
"instills", 812 | "institutionalisation": "institutionalization", 813 | "institutionalise": "institutionalize", 814 | "institutionalised": "institutionalized", 815 | "institutionalises": "institutionalizes", 816 | "institutionalising": "institutionalizing", 817 | "intellectualise": "intellectualize", 818 | "intellectualised": "intellectualized", 819 | "intellectualises": "intellectualizes", 820 | "intellectualising": "intellectualizing", 821 | "internalisation": "internalization", 822 | "internalise": "internalize", 823 | "internalised": "internalized", 824 | "internalises": "internalizes", 825 | "internalising": "internalizing", 826 | "internationalisation": "internationalization", 827 | "internationalise": "internationalize", 828 | "internationalised": "internationalized", 829 | "internationalises": "internationalizes", 830 | "internationalising": "internationalizing", 831 | "ionisation": "ionization", 832 | "ionise": "ionize", 833 | "ionised": "ionized", 834 | "ioniser": "ionizer", 835 | "ionisers": "ionizers", 836 | "ionises": "ionizes", 837 | "ionising": "ionizing", 838 | "italicise": "italicize", 839 | "italicised": "italicized", 840 | "italicises": "italicizes", 841 | "italicising": "italicizing", 842 | "itemise": "itemize", 843 | "itemised": "itemized", 844 | "itemises": "itemizes", 845 | "itemising": "itemizing", 846 | "jeopardise": "jeopardize", 847 | "jeopardised": "jeopardized", 848 | "jeopardises": "jeopardizes", 849 | "jeopardising": "jeopardizing", 850 | "jewelled": "jeweled", 851 | "jeweller": "jeweler", 852 | "jewellers": "jewelers", 853 | "jewellery": "jewelry", 854 | "judgement": "judgment", 855 | "kilogramme": "kilogram", 856 | "kilogrammes": "kilograms", 857 | "kilometre": "kilometer", 858 | "kilometres": "kilometers", 859 | "labelled": "labeled", 860 | "labelling": "labeling", 861 | "labour": "labor", 862 | "laboured": "labored", 863 | "labourer": "laborer", 864 | "labourers": "laborers", 865 | "labouring": "laboring", 866 | "labours": 
"labors", 867 | "lacklustre": "lackluster", 868 | "legalisation": "legalization", 869 | "legalise": "legalize", 870 | "legalised": "legalized", 871 | "legalises": "legalizes", 872 | "legalising": "legalizing", 873 | "legitimise": "legitimize", 874 | "legitimised": "legitimized", 875 | "legitimises": "legitimizes", 876 | "legitimising": "legitimizing", 877 | "leukaemia": "leukemia", 878 | "levelled": "leveled", 879 | "leveller": "leveler", 880 | "levellers": "levelers", 881 | "levelling": "leveling", 882 | "libelled": "libeled", 883 | "libelling": "libeling", 884 | "libellous": "libelous", 885 | "liberalisation": "liberalization", 886 | "liberalise": "liberalize", 887 | "liberalised": "liberalized", 888 | "liberalises": "liberalizes", 889 | "liberalising": "liberalizing", 890 | "licence": "license", 891 | "licenced": "licensed", 892 | "licences": "licenses", 893 | "licencing": "licensing", 894 | "likeable": "likable", 895 | "lionisation": "lionization", 896 | "lionise": "lionize", 897 | "lionised": "lionized", 898 | "lionises": "lionizes", 899 | "lionising": "lionizing", 900 | "liquidise": "liquidize", 901 | "liquidised": "liquidized", 902 | "liquidiser": "liquidizer", 903 | "liquidisers": "liquidizers", 904 | "liquidises": "liquidizes", 905 | "liquidising": "liquidizing", 906 | "litre": "liter", 907 | "litres": "liters", 908 | "localise": "localize", 909 | "localised": "localized", 910 | "localises": "localizes", 911 | "localising": "localizing", 912 | "louvre": "louver", 913 | "louvred": "louvered", 914 | "louvres": "louvers", 915 | "lustre": "luster", 916 | "magnetise": "magnetize", 917 | "magnetised": "magnetized", 918 | "magnetises": "magnetizes", 919 | "magnetising": "magnetizing", 920 | "manoeuvrability": "maneuverability", 921 | "manoeuvrable": "maneuverable", 922 | "manoeuvre": "maneuver", 923 | "manoeuvred": "maneuvered", 924 | "manoeuvres": "maneuvers", 925 | "manoeuvring": "maneuvering", 926 | "manoeuvrings": "maneuverings", 927 | "marginalisation": 
"marginalization", 928 | "marginalise": "marginalize", 929 | "marginalised": "marginalized", 930 | "marginalises": "marginalizes", 931 | "marginalising": "marginalizing", 932 | "marshalled": "marshaled", 933 | "marshalling": "marshaling", 934 | "marvelled": "marveled", 935 | "marvelling": "marveling", 936 | "marvellous": "marvelous", 937 | "marvellously": "marvelously", 938 | "materialisation": "materialization", 939 | "materialise": "materialize", 940 | "materialised": "materialized", 941 | "materialises": "materializes", 942 | "materialising": "materializing", 943 | "maximisation": "maximization", 944 | "maximise": "maximize", 945 | "maximised": "maximized", 946 | "maximises": "maximizes", 947 | "maximising": "maximizing", 948 | "meagre": "meager", 949 | "mechanisation": "mechanization", 950 | "mechanise": "mechanize", 951 | "mechanised": "mechanized", 952 | "mechanises": "mechanizes", 953 | "mechanising": "mechanizing", 954 | "mediaeval": "medieval", 955 | "memorialise": "memorialize", 956 | "memorialised": "memorialized", 957 | "memorialises": "memorializes", 958 | "memorialising": "memorializing", 959 | "memorise": "memorize", 960 | "memorised": "memorized", 961 | "memorises": "memorizes", 962 | "memorising": "memorizing", 963 | "mesmerise": "mesmerize", 964 | "mesmerised": "mesmerized", 965 | "mesmerises": "mesmerizes", 966 | "mesmerising": "mesmerizing", 967 | "metabolise": "metabolize", 968 | "metabolised": "metabolized", 969 | "metabolises": "metabolizes", 970 | "metabolising": "metabolizing", 971 | "metre": "meter", 972 | "metres": "meters", 973 | "micrometre": "micrometer", 974 | "micrometres": "micrometers", 975 | "militarise": "militarize", 976 | "militarised": "militarized", 977 | "militarises": "militarizes", 978 | "militarising": "militarizing", 979 | "milligramme": "milligram", 980 | "milligrammes": "milligrams", 981 | "millilitre": "milliliter", 982 | "millilitres": "milliliters", 983 | "millimetre": "millimeter", 984 | "millimetres": 
"millimeters", 985 | "miniaturisation": "miniaturization", 986 | "miniaturise": "miniaturize", 987 | "miniaturised": "miniaturized", 988 | "miniaturises": "miniaturizes", 989 | "miniaturising": "miniaturizing", 990 | "minibusses": "minibuses", 991 | "minimise": "minimize", 992 | "minimised": "minimized", 993 | "minimises": "minimizes", 994 | "minimising": "minimizing", 995 | "misbehaviour": "misbehavior", 996 | "misdemeanour": "misdemeanor", 997 | "misdemeanours": "misdemeanors", 998 | "misspelt": "misspelled", 999 | "mitre": "miter", 1000 | "mitres": "miters", 1001 | "mobilisation": "mobilization", 1002 | "mobilise": "mobilize", 1003 | "mobilised": "mobilized", 1004 | "mobilises": "mobilizes", 1005 | "mobilising": "mobilizing", 1006 | "modelled": "modeled", 1007 | "modeller": "modeler", 1008 | "modellers": "modelers", 1009 | "modelling": "modeling", 1010 | "modernise": "modernize", 1011 | "modernised": "modernized", 1012 | "modernises": "modernizes", 1013 | "modernising": "modernizing", 1014 | "moisturise": "moisturize", 1015 | "moisturised": "moisturized", 1016 | "moisturiser": "moisturizer", 1017 | "moisturisers": "moisturizers", 1018 | "moisturises": "moisturizes", 1019 | "moisturising": "moisturizing", 1020 | "monologue": "monolog", 1021 | "monologues": "monologs", 1022 | "monopolisation": "monopolization", 1023 | "monopolise": "monopolize", 1024 | "monopolised": "monopolized", 1025 | "monopolises": "monopolizes", 1026 | "monopolising": "monopolizing", 1027 | "moralise": "moralize", 1028 | "moralised": "moralized", 1029 | "moralises": "moralizes", 1030 | "moralising": "moralizing", 1031 | "motorised": "motorized", 1032 | "mould": "mold", 1033 | "moulded": "molded", 1034 | "moulder": "molder", 1035 | "mouldered": "moldered", 1036 | "mouldering": "moldering", 1037 | "moulders": "molders", 1038 | "mouldier": "moldier", 1039 | "mouldiest": "moldiest", 1040 | "moulding": "molding", 1041 | "mouldings": "moldings", 1042 | "moulds": "molds", 1043 | "mouldy": "moldy", 
1044 | "moult": "molt", 1045 | "moulted": "molted", 1046 | "moulting": "molting", 1047 | "moults": "molts", 1048 | "moustache": "mustache", 1049 | "moustached": "mustached", 1050 | "moustaches": "mustaches", 1051 | "moustachioed": "mustachioed", 1052 | "multicoloured": "multicolored", 1053 | "nationalisation": "nationalization", 1054 | "nationalisations": "nationalizations", 1055 | "nationalise": "nationalize", 1056 | "nationalised": "nationalized", 1057 | "nationalises": "nationalizes", 1058 | "nationalising": "nationalizing", 1059 | "naturalisation": "naturalization", 1060 | "naturalise": "naturalize", 1061 | "naturalised": "naturalized", 1062 | "naturalises": "naturalizes", 1063 | "naturalising": "naturalizing", 1064 | "neighbour": "neighbor", 1065 | "neighbourhood": "neighborhood", 1066 | "neighbourhoods": "neighborhoods", 1067 | "neighbouring": "neighboring", 1068 | "neighbourliness": "neighborliness", 1069 | "neighbourly": "neighborly", 1070 | "neighbours": "neighbors", 1071 | "neutralisation": "neutralization", 1072 | "neutralise": "neutralize", 1073 | "neutralised": "neutralized", 1074 | "neutralises": "neutralizes", 1075 | "neutralising": "neutralizing", 1076 | "normalisation": "normalization", 1077 | "normalise": "normalize", 1078 | "normalised": "normalized", 1079 | "normalises": "normalizes", 1080 | "normalising": "normalizing", 1081 | "odour": "odor", 1082 | "odourless": "odorless", 1083 | "odours": "odors", 1084 | "oesophagus": "esophagus", 1085 | "oesophaguses": "esophaguses", 1086 | "oestrogen": "estrogen", 1087 | "offence": "offense", 1088 | "offences": "offenses", 1089 | "omelette": "omelet", 1090 | "omelettes": "omelets", 1091 | "optimise": "optimize", 1092 | "optimised": "optimized", 1093 | "optimises": "optimizes", 1094 | "optimising": "optimizing", 1095 | "organisation": "organization", 1096 | "organisational": "organizational", 1097 | "organisations": "organizations", 1098 | "organise": "organize", 1099 | "organised": "organized", 1100 | 
"organiser": "organizer", 1101 | "organisers": "organizers", 1102 | "organises": "organizes", 1103 | "organising": "organizing", 1104 | "orthopaedic": "orthopedic", 1105 | "orthopaedics": "orthopedics", 1106 | "ostracise": "ostracize", 1107 | "ostracised": "ostracized", 1108 | "ostracises": "ostracizes", 1109 | "ostracising": "ostracizing", 1110 | "outmanoeuvre": "outmaneuver", 1111 | "outmanoeuvred": "outmaneuvered", 1112 | "outmanoeuvres": "outmaneuvers", 1113 | "outmanoeuvring": "outmaneuvering", 1114 | "overemphasise": "overemphasize", 1115 | "overemphasised": "overemphasized", 1116 | "overemphasises": "overemphasizes", 1117 | "overemphasising": "overemphasizing", 1118 | "oxidisation": "oxidization", 1119 | "oxidise": "oxidize", 1120 | "oxidised": "oxidized", 1121 | "oxidises": "oxidizes", 1122 | "oxidising": "oxidizing", 1123 | "paederast": "pederast", 1124 | "paederasts": "pederasts", 1125 | "paediatric": "pediatric", 1126 | "paediatrician": "pediatrician", 1127 | "paediatricians": "pediatricians", 1128 | "paediatrics": "pediatrics", 1129 | "paedophile": "pedophile", 1130 | "paedophiles": "pedophiles", 1131 | "paedophilia": "pedophilia", 1132 | "palaeolithic": "paleolithic", 1133 | "palaeontologist": "paleontologist", 1134 | "palaeontologists": "paleontologists", 1135 | "palaeontology": "paleontology", 1136 | "panelled": "paneled", 1137 | "panelling": "paneling", 1138 | "panellist": "panelist", 1139 | "panellists": "panelists", 1140 | "paralyse": "paralyze", 1141 | "paralysed": "paralyzed", 1142 | "paralyses": "paralyzes", 1143 | "paralysing": "paralyzing", 1144 | "parcelled": "parceled", 1145 | "parcelling": "parceling", 1146 | "parlour": "parlor", 1147 | "parlours": "parlors", 1148 | "particularise": "particularize", 1149 | "particularised": "particularized", 1150 | "particularises": "particularizes", 1151 | "particularising": "particularizing", 1152 | "passivisation": "passivization", 1153 | "passivise": "passivize", 1154 | "passivised": "passivized", 1155 
| "passivises": "passivizes", 1156 | "passivising": "passivizing", 1157 | "pasteurisation": "pasteurization", 1158 | "pasteurise": "pasteurize", 1159 | "pasteurised": "pasteurized", 1160 | "pasteurises": "pasteurizes", 1161 | "pasteurising": "pasteurizing", 1162 | "patronise": "patronize", 1163 | "patronised": "patronized", 1164 | "patronises": "patronizes", 1165 | "patronising": "patronizing", 1166 | "patronisingly": "patronizingly", 1167 | "pedalled": "pedaled", 1168 | "pedalling": "pedaling", 1169 | "pedestrianisation": "pedestrianization", 1170 | "pedestrianise": "pedestrianize", 1171 | "pedestrianised": "pedestrianized", 1172 | "pedestrianises": "pedestrianizes", 1173 | "pedestrianising": "pedestrianizing", 1174 | "penalise": "penalize", 1175 | "penalised": "penalized", 1176 | "penalises": "penalizes", 1177 | "penalising": "penalizing", 1178 | "pencilled": "penciled", 1179 | "pencilling": "penciling", 1180 | "personalise": "personalize", 1181 | "personalised": "personalized", 1182 | "personalises": "personalizes", 1183 | "personalising": "personalizing", 1184 | "pharmacopoeia": "pharmacopeia", 1185 | "pharmacopoeias": "pharmacopeias", 1186 | "philosophise": "philosophize", 1187 | "philosophised": "philosophized", 1188 | "philosophises": "philosophizes", 1189 | "philosophising": "philosophizing", 1190 | "philtre": "filter", 1191 | "philtres": "filters", 1192 | "phoney": "phony", 1193 | "plagiarise": "plagiarize", 1194 | "plagiarised": "plagiarized", 1195 | "plagiarises": "plagiarizes", 1196 | "plagiarising": "plagiarizing", 1197 | "plough": "plow", 1198 | "ploughed": "plowed", 1199 | "ploughing": "plowing", 1200 | "ploughman": "plowman", 1201 | "ploughmen": "plowmen", 1202 | "ploughs": "plows", 1203 | "ploughshare": "plowshare", 1204 | "ploughshares": "plowshares", 1205 | "polarisation": "polarization", 1206 | "polarise": "polarize", 1207 | "polarised": "polarized", 1208 | "polarises": "polarizes", 1209 | "polarising": "polarizing", 1210 | "politicisation": 
"politicization", 1211 | "politicise": "politicize", 1212 | "politicised": "politicized", 1213 | "politicises": "politicizes", 1214 | "politicising": "politicizing", 1215 | "popularisation": "popularization", 1216 | "popularise": "popularize", 1217 | "popularised": "popularized", 1218 | "popularises": "popularizes", 1219 | "popularising": "popularizing", 1220 | "pouffe": "pouf", 1221 | "pouffes": "poufs", 1222 | "practise": "practice", 1223 | "practised": "practiced", 1224 | "practises": "practices", 1225 | "practising": "practicing", 1226 | "praesidium": "presidium", 1227 | "praesidiums": "presidiums", 1228 | "pressurisation": "pressurization", 1229 | "pressurise": "pressurize", 1230 | "pressurised": "pressurized", 1231 | "pressurises": "pressurizes", 1232 | "pressurising": "pressurizing", 1233 | "pretence": "pretense", 1234 | "pretences": "pretenses", 1235 | "primaeval": "primeval", 1236 | "prioritisation": "prioritization", 1237 | "prioritise": "prioritize", 1238 | "prioritised": "prioritized", 1239 | "prioritises": "prioritizes", 1240 | "prioritising": "prioritizing", 1241 | "privatisation": "privatization", 1242 | "privatisations": "privatizations", 1243 | "privatise": "privatize", 1244 | "privatised": "privatized", 1245 | "privatises": "privatizes", 1246 | "privatising": "privatizing", 1247 | "professionalisation": "professionalization", 1248 | "professionalise": "professionalize", 1249 | "professionalised": "professionalized", 1250 | "professionalises": "professionalizes", 1251 | "professionalising": "professionalizing", 1252 | "programme": "program", 1253 | "programmes": "programs", 1254 | "prologue": "prolog", 1255 | "prologues": "prologs", 1256 | "propagandise": "propagandize", 1257 | "propagandised": "propagandized", 1258 | "propagandises": "propagandizes", 1259 | "propagandising": "propagandizing", 1260 | "proselytise": "proselytize", 1261 | "proselytised": "proselytized", 1262 | "proselytiser": "proselytizer", 1263 | "proselytisers": "proselytizers", 
1264 | "proselytises": "proselytizes", 1265 | "proselytising": "proselytizing", 1266 | "psychoanalyse": "psychoanalyze", 1267 | "psychoanalysed": "psychoanalyzed", 1268 | "psychoanalyses": "psychoanalyzes", 1269 | "psychoanalysing": "psychoanalyzing", 1270 | "publicise": "publicize", 1271 | "publicised": "publicized", 1272 | "publicises": "publicizes", 1273 | "publicising": "publicizing", 1274 | "pulverisation": "pulverization", 1275 | "pulverise": "pulverize", 1276 | "pulverised": "pulverized", 1277 | "pulverises": "pulverizes", 1278 | "pulverising": "pulverizing", 1279 | "pummelled": "pummeled", 1280 | "pummelling": "pummeling", 1281 | "pyjama": "pajama", 1282 | "pyjamas": "pajamas", 1283 | "pzazz": "pizzazz", 1284 | "quarrelled": "quarreled", 1285 | "quarrelling": "quarreling", 1286 | "radicalise": "radicalize", 1287 | "radicalised": "radicalized", 1288 | "radicalises": "radicalizes", 1289 | "radicalising": "radicalizing", 1290 | "rancour": "rancor", 1291 | "randomise": "randomize", 1292 | "randomised": "randomized", 1293 | "randomises": "randomizes", 1294 | "randomising": "randomizing", 1295 | "rationalisation": "rationalization", 1296 | "rationalisations": "rationalizations", 1297 | "rationalise": "rationalize", 1298 | "rationalised": "rationalized", 1299 | "rationalises": "rationalizes", 1300 | "rationalising": "rationalizing", 1301 | "ravelled": "raveled", 1302 | "ravelling": "raveling", 1303 | "realisable": "realizable", 1304 | "realisation": "realization", 1305 | "realisations": "realizations", 1306 | "realise": "realize", 1307 | "realised": "realized", 1308 | "realises": "realizes", 1309 | "realising": "realizing", 1310 | "recognisable": "recognizable", 1311 | "recognisably": "recognizably", 1312 | "recognisance": "recognizance", 1313 | "recognise": "recognize", 1314 | "recognised": "recognized", 1315 | "recognises": "recognizes", 1316 | "recognising": "recognizing", 1317 | "reconnoitre": "reconnoiter", 1318 | "reconnoitred": "reconnoitered", 1319 | 
"reconnoitres": "reconnoiters", 1320 | "reconnoitring": "reconnoitering", 1321 | "refuelled": "refueled", 1322 | "refuelling": "refueling", 1323 | "regularisation": "regularization", 1324 | "regularise": "regularize", 1325 | "regularised": "regularized", 1326 | "regularises": "regularizes", 1327 | "regularising": "regularizing", 1328 | "remodelled": "remodeled", 1329 | "remodelling": "remodeling", 1330 | "remould": "remold", 1331 | "remoulded": "remolded", 1332 | "remoulding": "remolding", 1333 | "remoulds": "remolds", 1334 | "reorganisation": "reorganization", 1335 | "reorganisations": "reorganizations", 1336 | "reorganise": "reorganize", 1337 | "reorganised": "reorganized", 1338 | "reorganises": "reorganizes", 1339 | "reorganising": "reorganizing", 1340 | "revelled": "reveled", 1341 | "reveller": "reveler", 1342 | "revellers": "revelers", 1343 | "revelling": "reveling", 1344 | "revitalise": "revitalize", 1345 | "revitalised": "revitalized", 1346 | "revitalises": "revitalizes", 1347 | "revitalising": "revitalizing", 1348 | "revolutionise": "revolutionize", 1349 | "revolutionised": "revolutionized", 1350 | "revolutionises": "revolutionizes", 1351 | "revolutionising": "revolutionizing", 1352 | "rhapsodise": "rhapsodize", 1353 | "rhapsodised": "rhapsodized", 1354 | "rhapsodises": "rhapsodizes", 1355 | "rhapsodising": "rhapsodizing", 1356 | "rigour": "rigor", 1357 | "rigours": "rigors", 1358 | "ritualised": "ritualized", 1359 | "rivalled": "rivaled", 1360 | "rivalling": "rivaling", 1361 | "romanticise": "romanticize", 1362 | "romanticised": "romanticized", 1363 | "romanticises": "romanticizes", 1364 | "romanticising": "romanticizing", 1365 | "rumour": "rumor", 1366 | "rumoured": "rumored", 1367 | "rumours": "rumors", 1368 | "sabre": "saber", 1369 | "sabres": "sabers", 1370 | "saltpetre": "saltpeter", 1371 | "sanitise": "sanitize", 1372 | "sanitised": "sanitized", 1373 | "sanitises": "sanitizes", 1374 | "sanitising": "sanitizing", 1375 | "satirise": "satirize", 1376 | 
"satirised": "satirized", 1377 | "satirises": "satirizes", 1378 | "satirising": "satirizing", 1379 | "saviour": "savior", 1380 | "saviours": "saviors", 1381 | "savour": "savor", 1382 | "savoured": "savored", 1383 | "savouries": "savories", 1384 | "savouring": "savoring", 1385 | "savours": "savors", 1386 | "savoury": "savory", 1387 | "scandalise": "scandalize", 1388 | "scandalised": "scandalized", 1389 | "scandalises": "scandalizes", 1390 | "scandalising": "scandalizing", 1391 | "sceptic": "skeptic", 1392 | "sceptical": "skeptical", 1393 | "sceptically": "skeptically", 1394 | "scepticism": "skepticism", 1395 | "sceptics": "skeptics", 1396 | "sceptre": "scepter", 1397 | "sceptres": "scepters", 1398 | "scrutinise": "scrutinize", 1399 | "scrutinised": "scrutinized", 1400 | "scrutinises": "scrutinizes", 1401 | "scrutinising": "scrutinizing", 1402 | "secularisation": "secularization", 1403 | "secularise": "secularize", 1404 | "secularised": "secularized", 1405 | "secularises": "secularizes", 1406 | "secularising": "secularizing", 1407 | "sensationalise": "sensationalize", 1408 | "sensationalised": "sensationalized", 1409 | "sensationalises": "sensationalizes", 1410 | "sensationalising": "sensationalizing", 1411 | "sensitise": "sensitize", 1412 | "sensitised": "sensitized", 1413 | "sensitises": "sensitizes", 1414 | "sensitising": "sensitizing", 1415 | "sentimentalise": "sentimentalize", 1416 | "sentimentalised": "sentimentalized", 1417 | "sentimentalises": "sentimentalizes", 1418 | "sentimentalising": "sentimentalizing", 1419 | "sepulchre": "sepulcher", 1420 | "sepulchres": "sepulchers", 1421 | "serialisation": "serialization", 1422 | "serialisations": "serializations", 1423 | "serialise": "serialize", 1424 | "serialised": "serialized", 1425 | "serialises": "serializes", 1426 | "serialising": "serializing", 1427 | "sermonise": "sermonize", 1428 | "sermonised": "sermonized", 1429 | "sermonises": "sermonizes", 1430 | "sermonising": "sermonizing", 1431 | "sheikh": "sheik", 
1432 | "shovelled": "shoveled", 1433 | "shovelling": "shoveling", 1434 | "shrivelled": "shriveled", 1435 | "shrivelling": "shriveling", 1436 | "signalise": "signalize", 1437 | "signalised": "signalized", 1438 | "signalises": "signalizes", 1439 | "signalising": "signalizing", 1440 | "signalled": "signaled", 1441 | "signalling": "signaling", 1442 | "smoulder": "smolder", 1443 | "smouldered": "smoldered", 1444 | "smouldering": "smoldering", 1445 | "smoulders": "smolders", 1446 | "snivelled": "sniveled", 1447 | "snivelling": "sniveling", 1448 | "snorkelled": "snorkeled", 1449 | "snorkelling": "snorkeling", 1450 | "snowplough": "snowplow", 1451 | "snowploughs": "snowplow", 1452 | "socialisation": "socialization", 1453 | "socialise": "socialize", 1454 | "socialised": "socialized", 1455 | "socialises": "socializes", 1456 | "socialising": "socializing", 1457 | "sodomise": "sodomize", 1458 | "sodomised": "sodomized", 1459 | "sodomises": "sodomizes", 1460 | "sodomising": "sodomizing", 1461 | "solemnise": "solemnize", 1462 | "solemnised": "solemnized", 1463 | "solemnises": "solemnizes", 1464 | "solemnising": "solemnizing", 1465 | "sombre": "somber", 1466 | "specialisation": "specialization", 1467 | "specialisations": "specializations", 1468 | "specialise": "specialize", 1469 | "specialised": "specialized", 1470 | "specialises": "specializes", 1471 | "specialising": "specializing", 1472 | "spectre": "specter", 1473 | "spectres": "specters", 1474 | "spiralled": "spiraled", 1475 | "spiralling": "spiraling", 1476 | "splendour": "splendor", 1477 | "splendours": "splendors", 1478 | "squirrelled": "squirreled", 1479 | "squirrelling": "squirreling", 1480 | "stabilisation": "stabilization", 1481 | "stabilise": "stabilize", 1482 | "stabilised": "stabilized", 1483 | "stabiliser": "stabilizer", 1484 | "stabilisers": "stabilizers", 1485 | "stabilises": "stabilizes", 1486 | "stabilising": "stabilizing", 1487 | "standardisation": "standardization", 1488 | "standardise": "standardize", 1489 
| "standardised": "standardized", 1490 | "standardises": "standardizes", 1491 | "standardising": "standardizing", 1492 | "stencilled": "stenciled", 1493 | "stencilling": "stenciling", 1494 | "sterilisation": "sterilization", 1495 | "sterilisations": "sterilizations", 1496 | "sterilise": "sterilize", 1497 | "sterilised": "sterilized", 1498 | "steriliser": "sterilizer", 1499 | "sterilisers": "sterilizers", 1500 | "sterilises": "sterilizes", 1501 | "sterilising": "sterilizing", 1502 | "stigmatisation": "stigmatization", 1503 | "stigmatise": "stigmatize", 1504 | "stigmatised": "stigmatized", 1505 | "stigmatises": "stigmatizes", 1506 | "stigmatising": "stigmatizing", 1507 | "storey": "story", 1508 | "storeys": "stories", 1509 | "subsidisation": "subsidization", 1510 | "subsidise": "subsidize", 1511 | "subsidised": "subsidized", 1512 | "subsidiser": "subsidizer", 1513 | "subsidisers": "subsidizers", 1514 | "subsidises": "subsidizes", 1515 | "subsidising": "subsidizing", 1516 | "succour": "succor", 1517 | "succoured": "succored", 1518 | "succouring": "succoring", 1519 | "succours": "succors", 1520 | "sulphate": "sulfate", 1521 | "sulphates": "sulfates", 1522 | "sulphide": "sulfide", 1523 | "sulphides": "sulfides", 1524 | "sulphur": "sulfur", 1525 | "sulphurous": "sulfurous", 1526 | "summarise": "summarize", 1527 | "summarised": "summarized", 1528 | "summarises": "summarizes", 1529 | "summarising": "summarizing", 1530 | "swivelled": "swiveled", 1531 | "swivelling": "swiveling", 1532 | "symbolise": "symbolize", 1533 | "symbolised": "symbolized", 1534 | "symbolises": "symbolizes", 1535 | "symbolising": "symbolizing", 1536 | "sympathise": "sympathize", 1537 | "sympathised": "sympathized", 1538 | "sympathiser": "sympathizer", 1539 | "sympathisers": "sympathizers", 1540 | "sympathises": "sympathizes", 1541 | "sympathising": "sympathizing", 1542 | "synchronisation": "synchronization", 1543 | "synchronise": "synchronize", 1544 | "synchronised": "synchronized", 1545 | 
"synchronises": "synchronizes", 1546 | "synchronising": "synchronizing", 1547 | "synthesise": "synthesize", 1548 | "synthesised": "synthesized", 1549 | "synthesiser": "synthesizer", 1550 | "synthesisers": "synthesizers", 1551 | "synthesises": "synthesizes", 1552 | "synthesising": "synthesizing", 1553 | "syphon": "siphon", 1554 | "syphoned": "siphoned", 1555 | "syphoning": "siphoning", 1556 | "syphons": "siphons", 1557 | "systematisation": "systematization", 1558 | "systematise": "systematize", 1559 | "systematised": "systematized", 1560 | "systematises": "systematizes", 1561 | "systematising": "systematizing", 1562 | "tantalise": "tantalize", 1563 | "tantalised": "tantalized", 1564 | "tantalises": "tantalizes", 1565 | "tantalising": "tantalizing", 1566 | "tantalisingly": "tantalizingly", 1567 | "tasselled": "tasseled", 1568 | "technicolour": "technicolor", 1569 | "temporise": "temporize", 1570 | "temporised": "temporized", 1571 | "temporises": "temporizes", 1572 | "temporising": "temporizing", 1573 | "tenderise": "tenderize", 1574 | "tenderised": "tenderized", 1575 | "tenderises": "tenderizes", 1576 | "tenderising": "tenderizing", 1577 | "terrorise": "terrorize", 1578 | "terrorised": "terrorized", 1579 | "terrorises": "terrorizes", 1580 | "terrorising": "terrorizing", 1581 | "theatre": "theater", 1582 | "theatregoer": "theatergoer", 1583 | "theatregoers": "theatergoers", 1584 | "theatres": "theaters", 1585 | "theorise": "theorize", 1586 | "theorised": "theorized", 1587 | "theorises": "theorizes", 1588 | "theorising": "theorizing", 1589 | "tonne": "ton", 1590 | "tonnes": "tons", 1591 | "towelled": "toweled", 1592 | "towelling": "toweling", 1593 | "toxaemia": "toxemia", 1594 | "tranquillise": "tranquilize", 1595 | "tranquillised": "tranquilized", 1596 | "tranquilliser": "tranquilizer", 1597 | "tranquillisers": "tranquilizers", 1598 | "tranquillises": "tranquilizes", 1599 | "tranquillising": "tranquilizing", 1600 | "tranquillity": "tranquility", 1601 | "tranquillize": 
"tranquilize", 1602 | "tranquillized": "tranquilized", 1603 | "tranquillizer": "tranquilizer", 1604 | "tranquillizers": "tranquilizers", 1605 | "tranquillizes": "tranquilizes", 1606 | "tranquillizing": "tranquilizing", 1607 | "tranquilly": "tranquility", 1608 | "transistorised": "transistorized", 1609 | "traumatise": "traumatize", 1610 | "traumatised": "traumatized", 1611 | "traumatises": "traumatizes", 1612 | "traumatising": "traumatizing", 1613 | "travelled": "traveled", 1614 | "traveller": "traveler", 1615 | "travellers": "travelers", 1616 | "travelling": "traveling", 1617 | "travelog": "travelogue", 1618 | "travelogs": "travelogues", 1619 | "trialled": "trialed", 1620 | "trialling": "trialing", 1621 | "tricolour": "tricolor", 1622 | "tricolours": "tricolors", 1623 | "trivialise": "trivialize", 1624 | "trivialised": "trivialized", 1625 | "trivialises": "trivializes", 1626 | "trivialising": "trivializing", 1627 | "tumour": "tumor", 1628 | "tumours": "tumors", 1629 | "tunnelled": "tunneled", 1630 | "tunnelling": "tunneling", 1631 | "tyrannise": "tyrannize", 1632 | "tyrannised": "tyrannized", 1633 | "tyrannises": "tyrannizes", 1634 | "tyrannising": "tyrannizing", 1635 | "tyre": "tire", 1636 | "tyres": "tires", 1637 | "unauthorised": "unauthorized", 1638 | "uncivilised": "uncivilized", 1639 | "underutilised": "underutilized", 1640 | "unequalled": "unequaled", 1641 | "unfavourable": "unfavorable", 1642 | "unfavourably": "unfavorably", 1643 | "unionisation": "unionization", 1644 | "unionise": "unionize", 1645 | "unionised": "unionized", 1646 | "unionises": "unionizes", 1647 | "unionising": "unionizing", 1648 | "unorganised": "unorganized", 1649 | "unravelled": "unraveled", 1650 | "unravelling": "unraveling", 1651 | "unrecognisable": "unrecognizable", 1652 | "unrecognised": "unrecognized", 1653 | "unrivalled": "unrivaled", 1654 | "unsavoury": "unsavory", 1655 | "untrammelled": "untrammeled", 1656 | "urbanisation": "urbanization", 1657 | "urbanise": "urbanize", 1658 | 
"urbanised": "urbanized", 1659 | "urbanises": "urbanizes", 1660 | "urbanising": "urbanizing", 1661 | "utilisable": "utilizable", 1662 | "utilisation": "utilization", 1663 | "utilise": "utilize", 1664 | "utilised": "utilized", 1665 | "utilises": "utilizes", 1666 | "utilising": "utilizing", 1667 | "valour": "valor", 1668 | "vandalise": "vandalize", 1669 | "vandalised": "vandalized", 1670 | "vandalises": "vandalizes", 1671 | "vandalising": "vandalizing", 1672 | "vaporisation": "vaporization", 1673 | "vaporise": "vaporize", 1674 | "vaporised": "vaporized", 1675 | "vaporises": "vaporizes", 1676 | "vaporising": "vaporizing", 1677 | "vapour": "vapor", 1678 | "vapours": "vapors", 1679 | "verbalise": "verbalize", 1680 | "verbalised": "verbalized", 1681 | "verbalises": "verbalizes", 1682 | "verbalising": "verbalizing", 1683 | "victimisation": "victimization", 1684 | "victimise": "victimize", 1685 | "victimised": "victimized", 1686 | "victimises": "victimizes", 1687 | "victimising": "victimizing", 1688 | "videodisc": "videodisk", 1689 | "videodiscs": "videodisks", 1690 | "vigour": "vigor", 1691 | "visualisation": "visualization", 1692 | "visualisations": "visualizations", 1693 | "visualise": "visualize", 1694 | "visualised": "visualized", 1695 | "visualises": "visualizes", 1696 | "visualising": "visualizing", 1697 | "vocalisation": "vocalization", 1698 | "vocalisations": "vocalizations", 1699 | "vocalise": "vocalize", 1700 | "vocalised": "vocalized", 1701 | "vocalises": "vocalizes", 1702 | "vocalising": "vocalizing", 1703 | "vulcanised": "vulcanized", 1704 | "vulgarisation": "vulgarization", 1705 | "vulgarise": "vulgarize", 1706 | "vulgarised": "vulgarized", 1707 | "vulgarises": "vulgarizes", 1708 | "vulgarising": "vulgarizing", 1709 | "waggon": "wagon", 1710 | "waggons": "wagons", 1711 | "watercolour": "watercolor", 1712 | "watercolours": "watercolors", 1713 | "weaselled": "weaseled", 1714 | "weaselling": "weaseling", 1715 | "westernisation": "westernization", 1716 | 
"westernise": "westernize", 1717 | "westernised": "westernized", 1718 | "westernises": "westernizes", 1719 | "westernising": "westernizing", 1720 | "womanise": "womanize", 1721 | "womanised": "womanized", 1722 | "womaniser": "womanizer", 1723 | "womanisers": "womanizers", 1724 | "womanises": "womanizes", 1725 | "womanising": "womanizing", 1726 | "woollen": "woolen", 1727 | "woollens": "woolens", 1728 | "woollies": "woolies", 1729 | "woolly": "wooly", 1730 | "worshipped": "worshiped", 1731 | "worshipping": "worshiping", 1732 | "worshipper": "worshiper", 1733 | "yodelled": "yodeled", 1734 | "yodelling": "yodeling", 1735 | "yoghourt": "yogurt", 1736 | "yoghourts": "yogurts", 1737 | "yoghurt": "yogurt", 1738 | "yoghurts": "yogurts", 1739 | "mhm": "hmm", 1740 | "mm": "hmm", 1741 | "mmm": "hmm" 1742 | } -------------------------------------------------------------------------------- /musetalk/whisper/normalizers/english.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | from fractions import Fraction 5 | from typing import Iterator, List, Match, Optional, Union 6 | 7 | from more_itertools import windowed 8 | 9 | from .basic import remove_symbols_and_diacritics 10 | 11 | 12 | class EnglishNumberNormalizer: 13 | """ 14 | Convert any spelled-out numbers into arabic numbers, while handling: 15 | 16 | - remove any commas 17 | - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc. 18 | - spell out currency symbols after the number. e.g. 
    def __init__(self) -> None:
        """Build the lookup tables used by `process_words`.

        All tables map a spelled-out token to either an int value or a
        `(value, suffix)` pair; `process_words` consults them to convert
        spelled-out numbers into arabic numerals.
        """
        super().__init__()

        # Words that denote a zero digit ("o" / "oh" as in "one oh one").
        self.zeros = {"o", "oh", "zero"}
        # "one" .. "nineteen" -> 1 .. 19
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one",
                    "two",
                    "three",
                    "four",
                    "five",
                    "six",
                    "seven",
                    "eight",
                    "nine",
                    "ten",
                    "eleven",
                    "twelve",
                    "thirteen",
                    "fourteen",
                    "fifteen",
                    "sixteen",
                    "seventeen",
                    "eighteen",
                    "nineteen",
                ],
                start=1,
            )
        }
        # Plural forms keep an "s" suffix; "six" pluralizes irregularly to "sixes".
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        # Ordinal forms; irregular ordinals listed explicitly, the rest derived
        # ("eight" -> "eighth" because it already ends in "t", others get "th").
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            "twelfth": (12, "th"),
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        # "twenty" .. "ninety" -> 20 .. 90
        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        # "twenties" -> (20, "s"), "twentieth" -> (20, "th"), etc.
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        # Scale words; powers of ten up to 10**33.
        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
        # Tokens that may follow "point" in a decimal reading.
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        # Sign words that precede a number ("minus three" -> "-3").
        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        # Currency words that follow a number but are emitted as a prefix symbol
        # ("three dollars" -> "$3").
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        # All symbols that may appear glued to the front of a numeric token.
        self.prefixes = set(
            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
        )
        # Suffix words; "per" only becomes "%" when followed by "cent".
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        # Words with special parsing rules in process_words.
        self.specials = {"and", "double", "triple", "point"}

        # Union of every token the normalizer understands.
        self.words = set(
            [
                key
                for mapping in [
                    self.zeros,
                    self.ones,
                    self.ones_suffixed,
                    self.tens,
                    self.tens_suffixed,
                    self.multipliers,
                    self.multipliers_suffixed,
                    self.preceding_prefixers,
                    self.following_prefixers,
                    self.suffixers,
                    self.specials,
                ]
                for key in mapping
            ]
        )
        # "one"/"ones" are kept literal in some contexts (see postprocess).
        self.literal_words = {"one", "ones"}
    def process_words(self, words: List[str]) -> Iterator[str]:
        """Scan `words` left to right, merging spelled-out number tokens.

        Yields output tokens one at a time; non-number words pass through
        unchanged. `value` accumulates the number being built — it is an int
        while pure arithmetic composition is possible and becomes a str once
        digits must be concatenated (e.g. "one oh one"). `prefix` holds a
        pending sign/currency symbol to glue onto the next emitted number.
        """
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False  # set when the current token also consumed the next one

        def to_fraction(s: str):
            # Parse a numeric string; None when it is not a valid fraction.
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # Emit `result` with any pending prefix attached and reset state.
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        if len(words) == 0:
            return

        # Slide a (prev, current, next) window over the padded word list.
        # NOTE: `next` shadows the builtin here (kept as-is from the original).
        for prev, current, next in windowed([None] + words + [None], 3):
            if skip:
                skip = False
                continue

            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words: flush any pending number, pass word through
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                # append a literal "0" digit (string concatenation mode)
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    # add when the hundreds place is clean, else concatenate
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        # e.g. "0.5 million" scales cleanly to an int
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # scale only the sub-thousand residual: "one hundred
                    # thousand" -> 100 * 1000, not (value * multiplier)
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if next in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        # two-word suffix such as "per cent": consume `next` too
                        if next in suffix:
                            yield output(str(value) + suffix[next])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if next not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    # "double three" -> "33", "triple oh" -> "000"
                    if next in self.ones or next in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(next, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    # start the fractional part of a decimal number
                    if next in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        # flush any number still being accumulated when the input ends
        if value is not None:
            yield output(value)

    def preprocess(self, s: str) -> str:
        """Rewrite phrasings that `process_words` cannot parse token-by-token."""
        # replace " and a half" with " point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        for i, segment in enumerate(segments):
            if len(segment.strip()) == 0:
                continue
            if i == len(segments) - 1:
                results.append(segment)
            else:
                results.append(segment)
                # only rewrite when the preceding word is numeric
                # ("two and a half" -> "two point five"); otherwise keep as-is
                last_word = segment.rsplit(maxsplit=2)[-1]
                if last_word in self.decimals or last_word in self.multipliers:
                    results.append("point five")
                else:
                    results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str) -> str:
        """Clean up currency notation and restore literal "one"/"ones"."""

        def combine_cents(m: Match):
            # "$2 and ¢7" -> "$2.07"; fall back to the untouched string on error
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                return m.string

        def extract_cents(m: Match):
            # "$0.79" -> "¢79"
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.string

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str) -> str:
        """Run the full normalize pipeline: preprocess -> per-word -> postprocess."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self):
        # Load the British->American word mapping shipped alongside this module.
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # Fix: the original `json.load(open(mapping_path))` never closed the
        # file handle; use a context manager, and pin the encoding so the JSON
        # parses identically regardless of the platform's default encoding.
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)

    def __call__(self, s: str) -> str:
        # Replace each whitespace-separated word that has a mapping entry;
        # unknown words pass through unchanged.
        return " ".join(self.mapping.get(word, word) for word in s.split())
be any past participles, but it's harder.. 500 | r"'d been\b": " had been", 501 | r"'s been\b": " has been", 502 | r"'d gone\b": " had gone", 503 | r"'s gone\b": " has gone", 504 | r"'d done\b": " had done", # "'s done" is ambiguous 505 | r"'s got\b": " has got", 506 | # general contractions 507 | r"n't\b": " not", 508 | r"'re\b": " are", 509 | r"'s\b": " is", 510 | r"'d\b": " would", 511 | r"'ll\b": " will", 512 | r"'t\b": " not", 513 | r"'ve\b": " have", 514 | r"'m\b": " am", 515 | } 516 | self.standardize_numbers = EnglishNumberNormalizer() 517 | self.standardize_spellings = EnglishSpellingNormalizer() 518 | 519 | def __call__(self, s: str): 520 | s = s.lower() 521 | 522 | s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets 523 | s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis 524 | s = re.sub(self.ignore_patterns, "", s) 525 | s = re.sub(r"\s+'", "'", s) # standardize when there's a space before an apostrophe 526 | 527 | for pattern, replacement in self.replacers.items(): 528 | s = re.sub(pattern, replacement, s) 529 | 530 | s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits 531 | s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers 532 | s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep some symbols for numerics 533 | 534 | s = self.standardize_numbers(s) 535 | s = self.standardize_spellings(s) 536 | 537 | # now remove prefix/suffix symbols that are not preceded/followed by numbers 538 | s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s) 539 | s = re.sub(r"([^0-9])%", r"\1 ", s) 540 | 541 | s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space 542 | 543 | return s 544 | -------------------------------------------------------------------------------- /musetalk/whisper/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from functools import 
# The languages Whisper's multilingual tokenizer carries a dedicated
# `<|xx|>` language token for, keyed by (mostly ISO 639-1) code.
# NOTE(review): "iw" is the legacy code for Hebrew and "jw" for Javanese —
# kept as-is because the tokenizer's special tokens use these codes.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "iw": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# language code lookup by name, with a few language aliases
# (built by inverting LANGUAGES, then adding alternate English names that
# should resolve to the same code, e.g. "burmese" -> "my" for myanmar).
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
147 | """ 148 | outputs = [[]] 149 | for token in tokens: 150 | if token >= self.timestamp_begin: 151 | timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" 152 | outputs.append(timestamp) 153 | outputs.append([]) 154 | else: 155 | outputs[-1].append(token) 156 | outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] 157 | return "".join(outputs) 158 | 159 | @property 160 | @lru_cache() 161 | def eot(self) -> int: 162 | return self.tokenizer.eos_token_id 163 | 164 | @property 165 | @lru_cache() 166 | def sot(self) -> int: 167 | return self._get_single_token_id("<|startoftranscript|>") 168 | 169 | @property 170 | @lru_cache() 171 | def sot_lm(self) -> int: 172 | return self._get_single_token_id("<|startoflm|>") 173 | 174 | @property 175 | @lru_cache() 176 | def sot_prev(self) -> int: 177 | return self._get_single_token_id("<|startofprev|>") 178 | 179 | @property 180 | @lru_cache() 181 | def no_speech(self) -> int: 182 | return self._get_single_token_id("<|nospeech|>") 183 | 184 | @property 185 | @lru_cache() 186 | def no_timestamps(self) -> int: 187 | return self._get_single_token_id("<|notimestamps|>") 188 | 189 | @property 190 | @lru_cache() 191 | def timestamp_begin(self) -> int: 192 | return self.tokenizer.all_special_ids[-1] + 1 193 | 194 | @property 195 | @lru_cache() 196 | def language_token(self) -> int: 197 | """Returns the token id corresponding to the value of the `language` field""" 198 | if self.language is None: 199 | raise ValueError(f"This tokenizer does not have language token configured") 200 | 201 | additional_tokens = dict( 202 | zip( 203 | self.tokenizer.additional_special_tokens, 204 | self.tokenizer.additional_special_tokens_ids, 205 | ) 206 | ) 207 | candidate = f"<|{self.language}|>" 208 | if candidate in additional_tokens: 209 | return additional_tokens[candidate] 210 | 211 | raise KeyError(f"Language {self.language} not found in tokenizer.") 212 | 213 | @property 214 | @lru_cache() 215 | def 
all_language_tokens(self) -> Tuple[int]: 216 | result = [] 217 | for token, token_id in zip( 218 | self.tokenizer.additional_special_tokens, 219 | self.tokenizer.additional_special_tokens_ids, 220 | ): 221 | if token.strip("<|>") in LANGUAGES: 222 | result.append(token_id) 223 | return tuple(result) 224 | 225 | @property 226 | @lru_cache() 227 | def all_language_codes(self) -> Tuple[str]: 228 | return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens) 229 | 230 | @property 231 | @lru_cache() 232 | def sot_sequence_including_notimestamps(self) -> Tuple[int]: 233 | return tuple(list(self.sot_sequence) + [self.no_timestamps]) 234 | 235 | @property 236 | @lru_cache() 237 | def non_speech_tokens(self) -> Tuple[int]: 238 | """ 239 | Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech 240 | annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. 241 | 242 | - ♪♪♪ 243 | - ( SPEAKING FOREIGN LANGUAGE ) 244 | - [DAVID] Hey there, 245 | 246 | keeping basic punctuations like commas, periods, question marks, exclamation points, etc. 247 | """ 248 | symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』") 249 | symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() 250 | 251 | # symbols that may be a single token or multiple tokens depending on the tokenizer. 252 | # In case they're multiple tokens, suppress the first token, which is safe because: 253 | # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress 254 | # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. 
255 | miscellaneous = set("♩♪♫♬♭♮♯") 256 | assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) 257 | 258 | # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word 259 | result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]} 260 | for symbol in symbols + list(miscellaneous): 261 | for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]: 262 | if len(tokens) == 1 or symbol in miscellaneous: 263 | result.add(tokens[0]) 264 | 265 | return tuple(sorted(result)) 266 | 267 | def _get_single_token_id(self, text) -> int: 268 | tokens = self.tokenizer.encode(text) 269 | assert len(tokens) == 1, f"{text} is not encoded as a single token" 270 | return tokens[0] 271 | 272 | 273 | @lru_cache(maxsize=None) 274 | def build_tokenizer(name: str = "gpt2"): 275 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 276 | path = os.path.join(os.path.dirname(__file__), "assets", name) 277 | tokenizer = GPT2TokenizerFast.from_pretrained(path) 278 | 279 | specials = [ 280 | "<|startoftranscript|>", 281 | *[f"<|{lang}|>" for lang in LANGUAGES.keys()], 282 | "<|translate|>", 283 | "<|transcribe|>", 284 | "<|startoflm|>", 285 | "<|startofprev|>", 286 | "<|nospeech|>", 287 | "<|notimestamps|>", 288 | ] 289 | 290 | tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) 291 | return tokenizer 292 | 293 | 294 | @lru_cache(maxsize=None) 295 | def get_tokenizer( 296 | multilingual: bool, 297 | *, 298 | task: Optional[str] = None, # Literal["transcribe", "translate", None] 299 | language: Optional[str] = None, 300 | ) -> Tokenizer: 301 | if language is not None: 302 | language = language.lower() 303 | if language not in LANGUAGES: 304 | if language in TO_LANGUAGE_CODE: 305 | language = TO_LANGUAGE_CODE[language] 306 | else: 307 | raise ValueError(f"Unsupported language: {language}") 308 | 309 | if multilingual: 310 | tokenizer_name = "multilingual" 311 | task = task or "transcribe" 312 | language = 
def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    verbose: Optional[bool] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    no_speech_threshold: Optional[float] = 0.6,
    condition_on_previous_text: bool = True,
    force_extraction: bool = False,
    **decode_options,
):
    """
    MuseTalk variant of Whisper transcription: runs only the audio ENCODER over
    30-second windows and collects the per-window encoder embeddings. No text
    is decoded.

    Parameters
    ----------
    model: Whisper
        The Whisper model instance.

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform.

    verbose: bool
        If not False, a progress bar over mel frames is shown.

    decode_options: dict
        Only "fp16" is consulted here; the remaining keyword arguments (and the
        threshold parameters above) are accepted for signature compatibility
        with upstream Whisper but are unused in this embedding-only variant.

    Returns
    -------
    dict with a single key "segments": a list of
    {"start": int, "end": int, "encoder_embeddings": ...} entries, where
    start/end are mel-frame indices of the window.
    """
    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
    if model.device == torch.device("cpu"):
        if torch.cuda.is_available():
            warnings.warn("Performing inference on CPU when CUDA is available")
        if dtype == torch.float16:
            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
            dtype = torch.float32

    if dtype == torch.float32:
        decode_options["fp16"] = False

    mel = log_mel_spectrogram(audio)

    segments = []
    num_frames = mel.shape[-1]
    seek = 0  # current window start, in mel frames
    window = 3000  # mel frames per encoder window (30 s at 100 frames/s)
    with tqdm.tqdm(total=num_frames, unit='frames', disable=verbose is not False) as pbar:
        while seek < num_frames:
            window_end = min(seek + window, num_frames)
            chunk = pad_or_trim(mel[:, seek:seek + window], N_FRAMES).to(model.device).to(dtype)

            # The encoder expects a batch dimension.
            if chunk.ndim == 2:
                chunk = chunk.unsqueeze(0)
            if dtype == torch.float16:
                chunk = chunk.half()
            _audio_features, embeddings = model.encoder(chunk, include_embeddings=True)

            segments.append(
                {
                    "start": seek,
                    "end": window_end,
                    "encoder_embeddings": embeddings,
                }
            )
            seek += window

    return dict(segments=segments)
def cli():
    """Command-line entry point: transcribe audio files and write TXT/VTT/SRT.

    NOTE(review): this file's `transcribe()` has been modified to return only
    encoder embeddings per segment (no 'text' key), while the writers below
    expect `segment['text']` — so this CLI appears vestigial here; verify
    before relying on it.
    """
    from . import available_models

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
    # Typo fix: "supercedes" -> "supersedes"
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    # English-only checkpoints (*.en) ignore any other requested language.
    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            # Typo fix: "receipted" -> "received"
            warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"

    # Expand a single temperature into a fallback schedule when requested.
    temperature = args.pop("temperature")
    temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
    if temperature_increment_on_fallback is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
    else:
        temperature = [temperature]

    threads = args.pop("threads")
    if threads > 0:
        torch.set_num_threads(threads)

    from . import load_model
    model = load_model(model_name, device=device, download_root=model_dir)

    for audio_path in args.pop("audio"):
        result = transcribe(model, audio_path, temperature=temperature, **args)

        audio_basename = os.path.basename(audio_path)

        # save TXT
        with open(os.path.join(output_dir, audio_basename + ".txt"), "w", encoding="utf-8") as txt:
            write_txt(result["segments"], file=txt)

        # save VTT
        with open(os.path.join(output_dir, audio_basename + ".vtt"), "w", encoding="utf-8") as vtt:
            write_vtt(result["segments"], file=vtt)

        # save SRT
        with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)


if __name__ == '__main__':
    cli()
def exact_div(x, y):
    """Integer-divide x by y, asserting that the division is exact."""
    assert x % y == 0
    return x // y


def str2bool(string):
    """Parse the literal strings "True"/"False" into booleans (argparse type)."""
    str2val = {"True": True, "False": False}
    try:
        return str2val[string]
    except KeyError:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


def optional_int(string):
    """argparse type: "None" -> None, anything else parsed as int."""
    return None if string == "None" else int(string)


def optional_float(string):
    """argparse type: "None" -> None, anything else parsed as float."""
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    """Ratio of raw UTF-8 length to zlib-compressed length (high = repetitive)."""
    raw = text.encode("utf-8")
    return len(raw) / len(zlib.compress(raw))


def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
    """Render seconds as [HH:]MM:SS<marker>mmm (hours shown if nonzero or forced)."""
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    seconds, milliseconds = divmod(milliseconds, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"


def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write one stripped segment text per line."""
    for segment in transcript:
        print(segment['text'].strip(), file=file, flush=True)


def write_vtt(transcript: Iterator[dict], file: TextIO):
    """Write segments as a WebVTT cue list."""
    print("WEBVTT\n", file=file)
    for segment in transcript:
        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{segment['text'].strip().replace('-->', '->')}\n",
            file=file,
            flush=True,
        )


def write_srt(transcript: Iterator[dict], file: TextIO):
    """
    Write a transcript to a file in SRT format.

    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt

        result = transcribe(model, audio_path, temperature=temperature, **args)

        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for i, segment in enumerate(transcript, start=1):
        # SRT cues are 1-indexed and use comma decimal markers.
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n"
            f"{segment['text'].strip().replace('-->', '->')}\n",
            file=file,
            flush=True,
        )
class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding, added to the input sequence."""

    def __init__(self, d_model=384, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Precompute the (1, max_len, d_model) sin/cos table once.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(position * div_term)  # even dims: sin
        table[:, 1::2] = torch.cos(position * div_term)  # odd dims: cos
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        # x: (batch, seq_len, d_model); broadcast the table over the batch.
        _, seq_len, _ = x.size()
        return x + self.pe[:, :seq_len, :].to(x.device)


class MuseModelConfig:
    """Minimal ComfyUI model config describing the MuseTalk UNet.

    Matches SD1.5 latent space, but with 8 input channels (masked latents
    concatenated with reference latents) and a 384-dim whisper context.
    """

    def __init__(self):
        unet_dtype = mm.unet_dtype()
        self.unet_config = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
                            'dtype': unet_dtype, 'in_channels': 8, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
                            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 384, 'num_heads': 8,
                            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                            'use_temporal_attention': False, 'use_temporal_resblock': False}
        self.latent_format = comfy.latent_formats.SD15
        self.manual_cast_dtype = None
        self.sampling_settings = {}


class UNETLoader_MuseTalk:
    """ComfyUI node: download (if needed) and load the MuseTalk UNet weights."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
        }}
    RETURN_TYPES = ("MODEL",)
    FUNCTION = "load_unet"

    CATEGORY = "MuseTalk"

    def load_unet(self):
        model_path = os.path.join(folder_paths.models_dir, 'musetalk')

        # Fetch the checkpoint from the hub on first use.
        if not os.path.exists(model_path):
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="TMElyralab/MuseTalk", local_dir=model_path, local_dir_use_symlinks=False)

        unet_weight_path = os.path.join(model_path, "musetalk", "pytorch_model.bin")
        sd = load_torch_file(unet_weight_path)

        # The shipped weights use diffusers key names; remap to ComfyUI's UNet keys.
        model_config = MuseModelConfig()
        diffusers_keys = unet_to_diffusers(model_config.unet_config)
        remapped = {}
        for k in diffusers_keys:
            if k in sd:
                remapped[diffusers_keys[k]] = sd.pop(k)

        model = BaseModel(model_config)
        model.diffusion_model.load_state_dict(remapped, strict=False)
        return (model,)
class muse_talk_sampler:
    """ComfyUI node: run the MuseTalk UNet over whisper features + face latents."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "model": ("MODEL",),
            "vae": ("VAE",),
            "whisper_features": ("WHISPERFEAT",),
            "images": ("IMAGE",),
            "masked_images": ("IMAGE",),
            "batch_size": ("INT", {"default": 8, "min": 1, "max": 4096, "step": 1}),
            "delay_frame": ("INT", {"default": 0, "min": 0, "max": 4096, "step": 1}),
        },
        }

    RETURN_TYPES = ("IMAGE", )
    RETURN_NAMES = ("image", )
    FUNCTION = "process"
    CATEGORY = "MuseTalk"

    def process(self, model, vae, whisper_features, images, masked_images, batch_size, delay_frame):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        dtype = mm.unet_dtype()
        vae_scale_factor = 0.18215  # SD1.5 latent scaling
        mm.unload_all_models()
        mm.soft_empty_cache()

        images = images.to(dtype).to(device)
        masked_images = masked_images.to(dtype).to(device)

        autocast_condition = (dtype != torch.float32) and not mm.is_device_mps(device)
        with torch.autocast(mm.get_autocast_device(device), dtype=dtype) if autocast_condition else nullcontext():
            # MuseTalk is a single-step model: timestep is always 0.
            timesteps = torch.tensor([0], device=device)
            vae.first_stage_model.to(device)

            # Encode each (masked, reference) pair into an 8-channel UNet input.
            input_latent_list = []
            for image, masked_image in zip(images, masked_images):
                ref_latent = vae.encode(image.unsqueeze(0)).to(dtype).to(device) * vae_scale_factor
                masked_latent = vae.encode(masked_image.unsqueeze(0)).to(dtype).to(device) * vae_scale_factor
                input_latent_list.append(torch.cat([masked_latent, ref_latent], dim=1))

            # Ping-pong the frames so the cycle loops smoothly.
            input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
            video_num = len(whisper_features)
            gen = self.datagen(whisper_features, input_latent_list_cycle, batch_size, delay_frame)
            total = int(np.ceil(float(video_num) / batch_size))

            out_frame_list = []
            pbar = ProgressBar(total)
            model.diffusion_model.to(device)
            for whisper_batch, latent_batch in tqdm(gen, total=total):
                # Whisper features arrive as numpy arrays; stack into (B, 5*N, 384).
                audio_feature_batch = torch.stack(
                    [torch.FloatTensor(arr) for arr in whisper_batch]
                ).to(device)
                audio_feature_batch = PositionalEncoding(d_model=384)(audio_feature_batch)

                pred_latents = model.diffusion_model(latent_batch, timesteps, context=audio_feature_batch)

                pred_latents = (1 / vae_scale_factor) * pred_latents
                decoded = vae.decode(pred_latents)

                out_frame_list.extend(decoded)
                pbar.update(1)

            out = torch.stack(out_frame_list, dim=0).float().cpu()
            model.diffusion_model.to(offload_device)
            vae.first_stage_model.to(offload_device)
            return (out,)

    def datagen(self, whisper_chunks, vae_encode_latents, batch_size, delay_frame):
        """Yield (whisper_batch, latent_batch) pairs, cycling latents with an
        optional frame delay to offset audio relative to video."""
        whisper_batch, latent_batch = [], []
        for i, w in enumerate(whisper_chunks):
            idx = (i + delay_frame) % len(vae_encode_latents)
            whisper_batch.append(w)
            latent_batch.append(vae_encode_latents[idx])

            if len(latent_batch) >= batch_size:
                yield np.asarray(whisper_batch), torch.cat(latent_batch, dim=0)
                whisper_batch, latent_batch = [], []

        # The last batch may be smaller than batch_size.
        if len(latent_batch) > 0:
            yield np.asarray(whisper_batch), torch.cat(latent_batch, dim=0)
class vhs_audio_to_audio_tensor:
    """ComfyUI node: decode a VHS audio byte stream into a resampled tensor."""

    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "vhs_audio": ("VHS_AUDIO",),
            "target_sample_rate": ("INT", {"default": 16000, "min": 0, "max": 48000}),
            "target_channels": ("INT", {"default": 1, "min": 1, "max": 2}),
        },

        }

    RETURN_TYPES = ("VCAUDIOTENSOR", "INT",)
    RETURN_NAMES = ("audio_tensor", "audio_dur",)
    FUNCTION = "process"
    CATEGORY = "VoiceCraft"

    def process(self, vhs_audio, target_sample_rate, target_channels):
        """Decode, remix to the requested channel count, resample, and return
        (tensor at target_sample_rate, duration in seconds)."""
        import io
        # vhs_audio is a callable returning the encoded audio bytes.
        audio_bytes = vhs_audio()
        audio_buffer = io.BytesIO(audio_bytes)
        audio_tensor, sample_rate = torchaudio.load(audio_buffer)
        assert audio_tensor.shape[0] in [1, 2], "Audio must be mono or stereo."
        if target_channels == 1:
            # Downmix to mono by averaging channels.
            audio_tensor = audio_tensor.mean(0, keepdim=True)
        elif target_channels == 2:
            *shape, _, length = audio_tensor.shape
            audio_tensor = audio_tensor.expand(*shape, target_channels, length)
        elif audio_tensor.shape[0] == 1:
            # NOTE(review): unreachable while target_channels is constrained to 1..2 above.
            audio_tensor = audio_tensor.expand(target_channels, -1)
        resampled_audio_tensor = torchaudio.functional.resample(audio_tensor, sample_rate, target_sample_rate)
        # Bug fix: duration was previously computed as
        # audio_tensor.shape[1] / target_sample_rate, i.e. the ORIGINAL-rate
        # sample count divided by the TARGET rate, which mis-reports the
        # duration whenever sample_rate != target_sample_rate. Use the
        # resampled tensor so the ratio is consistent.
        audio_dur = resampled_audio_tensor.shape[1] / target_sample_rate

        return (resampled_audio_tensor, audio_dur,)
class whisper_to_features:
    """ComfyUI node: run Whisper-tiny's encoder over audio and slice the
    embeddings into per-video-frame feature chunks for MuseTalk."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "audio_tensor": ("VCAUDIOTENSOR",),
                "fps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
            }
        }

    RETURN_TYPES = ("WHISPERFEAT", "INT",)
    RETURN_NAMES = ("whisper_chunks", "frame_count",)
    FUNCTION = "whispertranscribe"
    CATEGORY = "VoiceCraft"

    def whispertranscribe(self, audio_tensor, fps):
        from .musetalk.whisper.model import Whisper, ModelDimensions
        device = mm.get_torch_device()
        model_path = os.path.join(script_directory, "musetalk", "whisper", "checkpoints", "tiny.pt")

        if not os.path.exists(model_path):
            print(f"Downloading whisper tiny model (72MB) to {model_path}")
            import requests
            url = "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
            response = requests.get(url)
            if response.status_code == 200:
                with open(model_path, 'wb') as file:
                    file.write(response.content)
            else:
                # Bug fix: previously only printed the failure and fell through
                # to torch.load on a missing file, producing a confusing
                # FileNotFoundError. Fail fast with a clear error instead.
                raise RuntimeError(f"Failed to download {url} to {model_path}, status code: {response.status_code}")
        whisper_sd = torch.load(model_path, map_location=device)
        dims = ModelDimensions(**whisper_sd["dims"])
        model = Whisper(dims)
        model.load_state_dict(whisper_sd["model_state_dict"])
        del whisper_sd
        result = model.transcribe(audio_tensor.squeeze(0))

        # Collect encoder embeddings; each segment covers a 30 s window, of
        # which only the first half corresponds to real (unpadded) audio.
        embed_list = []
        for emb in result['segments']:
            encoder_embeddings = emb['encoder_embeddings']
            encoder_embeddings = encoder_embeddings.transpose(0, 2, 1, 3)
            encoder_embeddings = encoder_embeddings.squeeze(0)
            start_idx = int(emb['start'])
            end_idx = int(emb['end'])
            emb_end_idx = int((end_idx - start_idx) / 2)
            embed_list.append(encoder_embeddings[:emb_end_idx])
        whisper_feature = np.concatenate(embed_list, axis=0)

        audio_feat_length = [2, 2]
        whisper_chunks = []
        whisper_idx_multiplier = 50. / fps  # whisper features are at 50 FPS
        i = 0
        print(f"video in {fps} FPS, audio idx in 50FPS")
        # NOTE(review): intentionally kept the original loop shape (append
        # before the break test), so one chunk past the feature length is
        # produced — downstream frame counts depend on this.
        while 1:
            start_idx = int(i * whisper_idx_multiplier)
            selected_feature, selected_idx = self.get_sliced_feature(feature_array=whisper_feature, vid_idx=i, audio_feat_length=audio_feat_length, fps=fps)
            whisper_chunks.append(selected_feature)
            i += 1
            if start_idx > len(whisper_feature):
                break
        print(f"Whisper chunks: {len(whisper_chunks)}")
        return (whisper_chunks, len(whisper_chunks),)

    def get_sliced_feature(self, feature_array, vid_idx, audio_feat_length=(2, 2), fps=25):
        """
        Get sliced features based on a given video frame index.

        :param feature_array: (T, 384) whisper encoder features at 50 FPS
        :param vid_idx: video frame index to center the slice on
        :param audio_feat_length: (left, right) context in video frames
            (default changed from a mutable list [2, 2] to a tuple — same values)
        :return: (features reshaped to (-1, 384), list of source indices used)
        """
        length = len(feature_array)
        selected_feature = []
        selected_idx = []

        # Two whisper frames per video frame at 25 FPS.
        center_idx = int(vid_idx * 50 / fps)
        left_idx = center_idx - audio_feat_length[0] * 2
        right_idx = center_idx + (audio_feat_length[1] + 1) * 2

        for idx in range(left_idx, right_idx):
            # Clamp out-of-range indices to the valid span (repeats edge frames).
            idx = max(0, idx)
            idx = min(length - 1, idx)
            x = feature_array[idx]
            selected_feature.append(x)
            selected_idx.append(idx)

        selected_feature = np.concatenate(selected_feature, axis=0)
        selected_feature = selected_feature.reshape(-1, 384)  # 50*384
        return selected_feature, selected_idx
# Registry consumed by ComfyUI: internal node name -> implementing class.
NODE_CLASS_MAPPINGS = {
    "whisper_to_features": whisper_to_features,
    "vhs_audio_to_audio_tensor": vhs_audio_to_audio_tensor,
    "muse_talk_sampler": muse_talk_sampler,
    "UNETLoader_MuseTalk": UNETLoader_MuseTalk,
}
# Human-readable titles shown in the ComfyUI node picker.
NODE_DISPLAY_NAME_MAPPINGS = {
    "whisper_to_features": "Whisper To Features",
    "vhs_audio_to_audio_tensor": "VHS Audio To Audio Tensor",
    "muse_talk_sampler": "MuseTalk Sampler",
    "UNETLoader_MuseTalk": "UNETLoader_MuseTalk",
}