├── .idea
│   ├── $CACHE_FILE$
│   ├── .gitignore
│   ├── Deepfakes.iml
│   ├── dictionaries
│   │   └── 22staples.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── audio_to_vid.py
├── audio_transform.py
├── main.py
├── pic_to_vid.py
├── requirements.txt
├── test.py
├── text_to_audio.py
├── user_data
│   ├── Tate.png
│   ├── Tate_audio_samples
│   │   ├── sample0.wav
│   │   ├── sample1.wav
│   │   ├── sample2.wav
│   │   ├── sample3.wav
│   │   ├── sample4.wav
│   │   ├── sample5.wav
│   │   ├── sample6.wav
│   │   ├── sample7.wav
│   │   ├── sample8.wav
│   │   ├── sample9.wav
│   │   └── voice.npy
│   ├── activity_unproductive.wav
│   ├── declaration.txt
│   ├── obama.mp4
│   ├── output.wav
│   ├── putin.png
│   ├── quick.wav
│   ├── result.mp4
│   ├── stylized.wav
│   ├── thing.mp4
│   └── trump.mp4
└── utils.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/dictionaries/22staples.xml:
--------------------------------------------------------------------------------
1 | <component name="ProjectDictionaryState">
2 |   <dictionary name="22staples">
3 |     <words>
4 |       <w>deepfake</w>
5 |       <w>obama</w>
6 |     </words>
7 |   </dictionary>
8 | </component>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Premise
2 | A combination of various deepfake algorithms to quickly create fake audio and video
3 |
4 | # Options
5 | This project has 4 separate algorithms.
6 | 1) [first-order-model](https://github.com/AliaksandrSiarohin/first-order-model): a quick deepfake algorithm that generates a video from a base video and a driving image
7 | 2) [Speech-Driven Facial Animation](https://github.com/DinoMan/speech-driven-animation): animates a picture to speak an audio input
8 | 3) [Real-Time Voice Cloning Toolbox](https://github.com/CorentinJ/Real-Time-Voice-Cloning): a quick text-to-speech algorithm that clones a voice from a few seconds of driving audio
9 | 4) [One-shot Voice Conversion](https://github.com/jjery2243542/adaptive_voice_conversion): voice style transfer that re-speaks one person's words in another person's voice
10 |
11 | # Setup
12 | ## Import
13 | `pip install -r requirements.txt`
14 |
15 | Get from Version Control:
16 | 1) https://github.com/AliaksandrSiarohin/first-order-model.git
17 | 2) https://github.com/DinoMan/speech-driven-animation.git
18 | 3) https://github.com/CorentinJ/Real-Time-Voice-Cloning.git
19 | 4) https://github.com/jjery2243542/adaptive_voice_conversion.git
20 |
21 | and put these into the local project
22 |
23 | ## Modify
24 | Replace all the dashes in the cloned directory names with underscores (first-order-model becomes first_order_model, and so on) so they can be imported as Python packages.
25 |
26 | Modify each of the repositories in the following ways:
27 | 1) go [here](https://drive.google.com/drive/folders/1PyQJmkdCsAkOYwUyaj_l-l0as-iLDgeH) and download vox-cpk.pth.tar, then place it in first_order_model
28 | 2) go [here](https://drive.google.com/drive/folders/1pJdsnknLmMLvA8RQIAV3AQH8vU0FeK16) and download grid.dat, then replace sda/data/grid.dat
29 | 3) download the [model](https://drive.google.com/file/d/1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc/view), then delete toolbox/__init__.py
30 | 4) download the [model](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/vctk_model.ckpt) and [attr](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/attr.pkl), and move them into adaptive_voice_conversion
31 |
32 | ## Alter imports
33 | Finally, go through each cloned repository and correct its imports so they resolve from this project's content root: add *project_name.* before all necessary imports, e.g. `from first_order_model.modules...` (if you know a better way, please tell me)
34 |
--------------------------------------------------------------------------------
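The README stops at the setup steps, so here is a minimal usage sketch based on the function signatures in main.py. It is not part of the original repository: the asset pairings (which image goes with which audio or video) are illustrative assumptions, and the paths point at the samples bundled under user_data/.

```python
import main

# text -> cloned speech -> talking-head video of the still image
main.text_to_vid(
    "user_data/declaration.txt",                # what the target should say
    "user_data/Tate_audio_samples/voice.npy",   # pre-computed voice embedding
    "user_data/Tate.png",                       # who should appear to say it
    result_path="user_data/thing.mp4",
)

# re-voice and face-swap an existing clip in one call
main.imitate(
    "user_data/obama.mp4",   # original footage and speech
    "user_data/quick.wav",   # voice that should replace the original speaker
    "user_data/putin.png",   # face that should replace the original speaker
    "user_data/result.mp4",
)
```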
/audio_to_vid.py:
--------------------------------------------------------------------------------
1 | import speech_driven_animation.sda as sda
2 | import utils
3 |
4 |
5 | def generate_video(target_image, audio_path, save_path):
6 | animator = sda.VideoAnimator() # Instantiate the animator
7 |     fs = None if isinstance(audio_path, str) else utils.sample_rate  # file paths carry their own sample rate; raw audio needs one supplied
8 | video, audio_file = animator(target_image, audio_path, fs=fs, aligned=False)
9 | # print(video.shape)
10 | if save_path is not None:
11 | animator.save_video(video, audio_file, save_path)
12 | return video, audio_file
--------------------------------------------------------------------------------
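A small usage sketch for the wrapper above; it assumes the speech_driven_animation package and its grid.dat model are installed as described in the README, and the image/audio pairing is illustrative.

```python
import audio_to_vid

# animate the bundled portrait so it appears to speak the given recording
video, audio = audio_to_vid.generate_video(
    "user_data/Tate.png",     # still image of the target
    "user_data/output.wav",   # speech to lip-sync to
    "user_data/thing.mp4",    # where to save the rendered clip (or None to skip saving)
)
```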
/audio_transform.py:
--------------------------------------------------------------------------------
1 | from adaptive_voice_conversion.inference import Inferencer
2 | from utils import sample_rate
3 | import yaml
4 |
5 |
6 | class Blank: pass  # bare namespace standing in for an argparse.Namespace
7 |
8 | def transform_audio(content, style, output):
9 |     with open("adaptive_voice_conversion/config.yaml") as f:
10 |         config = yaml.safe_load(f)
11 | args = Blank()
12 | args.attr = "adaptive_voice_conversion/attr.pkl"
13 | args.config = config
14 | args.model = "adaptive_voice_conversion/vctk_model.ckpt"
15 | args.source = content
16 | args.target = style
17 | args.output = output
18 | args.sample_rate = sample_rate
19 |
20 | inferencer = Inferencer(config=config, args=args)
21 | return inferencer.inference_from_path(output is not None)
--------------------------------------------------------------------------------
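A hedged usage sketch for transform_audio; it assumes vctk_model.ckpt and attr.pkl have been placed in adaptive_voice_conversion as described in the README, and the content/style pairing is illustrative.

```python
import audio_transform

# re-speak the words in quick.wav using the voice from the first Tate sample
audio_transform.transform_audio(
    "user_data/quick.wav",                       # content: the words to keep
    "user_data/Tate_audio_samples/sample0.wav",  # style: the target voice
    "user_data/stylized.wav",                    # where to write the converted audio
)
```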
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # audio section
5 | def text_to_speech(text, driving_audio, save_audio_path=None, save_voice_path=None, play=False): # around 10 seconds
6 | """
7 |     Takes text (a string or a path to a text document) and creates audio of the target's voice saying those words
8 |     Algorithm: modified Real-Time Voice Cloning Toolbox
9 |     source: https://github.com/CorentinJ/Real-Time-Voice-Cloning
10 |     :param text: path to the words to be spoken or a string
11 |     :param driving_audio: single or list of .wav audio samples, or a pre-computed .npy voice embedding
12 |     :param save_audio_path: optional path to save the generated audio to
13 |     :param save_voice_path: optional path to save your voice embedding (speeds future transfers)
14 |     :param play: whether the result should be played at the end
15 |     :return: the synthesized waveform (numpy array)
16 | """
17 | import text_to_audio
18 | import utils
19 | # list of samples or pre-made voice embedding
20 |     if isinstance(driving_audio, list) or (isinstance(driving_audio, str) and driving_audio.endswith(".npy")):
21 | wav, voice = text_to_audio.generate_audio(text, driving_audio)
22 | # single voice sample
23 | else:
24 | wav, voice = text_to_audio.generate_audio(text, [driving_audio])
25 | if play:
26 | utils.play(wav)
27 | if save_audio_path is not None:
28 | utils.save(wav, save_audio_path)
29 | if save_voice_path is not None and voice is not None:
30 | np.save(save_voice_path, voice)
31 | return wav
32 |
33 |
34 | def audio_stylize(base_audio, driving_audio, result_path): # 5 seconds
35 | """
36 |     Take a recording of any voice and convert it into the target's voice
37 | Algorithm: One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization
38 | source: https://github.com/jjery2243542/adaptive_voice_conversion.git
39 | :param base_audio: someone speaking the words
40 | :param driving_audio: example of the target voice saying things (recommend large sample size)
41 | :param result_path: optional path to save result
42 | :return: audio in the target voice (.wav)
43 | """
44 | import audio_transform
45 |     return audio_transform.transform_audio(base_audio, driving_audio, result_path)
46 |
47 |
48 | # video
49 | def audio_to_image(driving_audio, base_img, result_path=None): # around 40 seconds
50 | """
51 |     Use a model trained on talking-head videos to imitate how a person would say the words
52 | Algorithm: Speech-Driven Facial Animation
53 | source: https://github.com/DinoMan/speech-driven-animation
54 | :param driving_audio: The audio you want the person to say
55 | :param base_img: A picture of who you want to speak the words
56 | :param result_path: optional path for where to store the result
57 | :return: video, audio
58 | """
59 | import audio_to_vid
60 | return audio_to_vid.generate_video(base_img, driving_audio, result_path)
61 |
62 |
63 | def deepfake_video(base_video, driving_img, result_path=None): # around 15 minutes
64 | """
65 | Take a video and an image and generate a new video with the faces swapped
66 | Algorithm: First Order Motion Model for Image Animation
67 | source: https://github.com/AliaksandrSiarohin/first-order-model
68 | :param base_video: video of random movements
69 | :param driving_img: target that will perform the motions
70 | :param result_path: optional path to save the resulting mp4
71 | :return: video file of target doing the actions and the fps of the video
72 | """
73 | import pic_to_vid
74 | video, fps = pic_to_vid.demo_video(driving_img, base_video, result_path, auto_crop=False)
75 | return video, fps
76 |
77 |
78 | # compound
79 | def text_to_vid(text, driving_audio, driving_img, result_path=None):
80 | """
81 | Creates a video of a person saying input text.
82 | :param text: a string or txt file path for what you want the target to say
83 |     :param driving_audio: a sample of the target's voice (.wav) or a pre-computed voice embedding (.npy)
84 | :param driving_img: an image of the target
85 | :param result_path: optional save path for the result
86 | :return: video, audio
87 | """
88 | synthesized_audio = text_to_speech(text, driving_audio)
89 | return audio_to_image(synthesized_audio, driving_img, result_path)
90 |
91 |
92 | def imitate(base_video, driving_audio, driving_img, result_path):
93 | """
94 | Takes a video of one person saying something and replaces it with someone else
95 | :param base_video: path to the original video (.mp4)
96 | :param driving_audio: path to the voice of who you want to speak (.wav or .mp3)
97 | :param driving_img: path to the img you want to copy
98 | :param result_path: path to save the resulting video
99 | :return: VideoFileClip (moviepy module) of the video
100 | """
101 |     import utils
102 |     base_audio = utils.audio_from_mp4(base_video)  # original speech, to be re-voiced
103 |     new_audio = audio_stylize(base_audio, driving_audio, None)
104 |     new_video, fps = deepfake_video(base_video, driving_img)
105 |     return utils.create_mp4(new_video, new_audio, result_path)
106 |
107 |
108 | if __name__ == '__main__':
109 | pass
110 |
--------------------------------------------------------------------------------
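One plausible way to fill the empty __main__ guard above: build the voice embedding once from the bundled samples and cache it, so later calls can reuse the .npy instead of re-encoding every clip. The sample paths match the files under user_data/; the choice of text is illustrative.

```python
import main

# encode ten short samples into one averaged voice embedding and cache it
samples = ["user_data/Tate_audio_samples/sample%d.wav" % i for i in range(10)]
main.text_to_speech(
    "user_data/declaration.txt",
    samples,
    save_audio_path="user_data/output.wav",
    save_voice_path="user_data/Tate_audio_samples/voice.npy",
)
```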
/pic_to_vid.py:
--------------------------------------------------------------------------------
1 | # actions
2 | from first_order_model.train import train
3 |
4 | # normal modules
5 | import yaml, os, imageio
6 | from first_order_model.demo import find_best_frame, resize, load_checkpoints, make_animation, img_as_ubyte
7 |
8 | # networks
9 | from first_order_model.modules.discriminator import MultiScaleDiscriminator
10 | from first_order_model.modules.generator import OcclusionAwareGenerator
11 | from first_order_model.modules.keypoint_detector import KPDetector
12 | from first_order_model.frames_dataset import FramesDataset
13 |
14 |
15 | def get_video(target_img_path, driving_video_path):  # based off first_order_model/run.py
16 |     config_path = "first_order_model/config/vox-256.yaml"
17 |     config = yaml.safe_load(open(config_path))
18 |     mode = "animate"
19 |     log_dir = "video"
20 |     checkpoint = None
21 |     device_ids = "0"
22 |     verbose = False
23 |     log_dir = os.path.join(log_dir, os.path.basename(config_path).split('.')[0])
24 | generator = OcclusionAwareGenerator(**config['model_params']['generator_params'],
25 | **config['model_params']['common_params'])
26 | discriminator = MultiScaleDiscriminator(**config['model_params']['discriminator_params'],
27 | **config['model_params']['common_params'])
28 |
29 | kp_detector = KPDetector(**config['model_params']['kp_detector_params'],
30 | **config['model_params']['common_params'])
31 | dataset = FramesDataset(is_train=(mode == 'train'), **config['dataset_params'])
32 | train(config, generator, discriminator, kp_detector, checkpoint, log_dir, dataset, device_ids)
33 |
34 |
35 | def demo_video(path_to_img, path_to_video, output_file, auto_crop=True):
36 | config = "first_order_model/config/vox-256.yaml" # model settings
37 | checkpoint = "first_order_model/vox-cpk.pth.tar" # actual model
38 | cpu = True # using cpu not gpu
39 | relative = False # make relative motions or move to absolute location
40 |     # auto_crop: search for the best starting frame before animating
41 | adapt_scale = True
42 | result_video = output_file
43 | best_frame = None # where to start
44 |
45 | source_image = imageio.imread(path_to_img)
46 | reader = imageio.get_reader(path_to_video)
47 | fps = reader.get_meta_data()['fps']
48 | driving_video = []
49 | try:
50 | for im in reader:
51 | driving_video.append(im)
52 | except RuntimeError:
53 | pass
54 | reader.close()
55 |
56 | source_image = resize(source_image, (256, 256))[..., :3]
57 | driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
58 | generator, kp_detector = load_checkpoints(config_path=config, checkpoint_path=checkpoint, cpu=cpu)
59 |
60 | if auto_crop or best_frame is not None:
61 | i = best_frame if best_frame is not None else find_best_frame(source_image, driving_video, cpu=cpu)
62 | print ("Best frame: " + str(i))
63 | driving_forward = driving_video[i:]
64 | driving_backward = driving_video[:(i+1)][::-1]
65 | predictions_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
66 | predictions_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
67 | predictions = predictions_backward[::-1] + predictions_forward[1:]
68 | else:
69 | predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
70 | video = [img_as_ubyte(frame) for frame in predictions]
71 | if output_file is not None:
72 | imageio.mimsave(result_video, video, fps=fps)
73 | return video, fps
74 |
--------------------------------------------------------------------------------
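A usage sketch for demo_video; it assumes vox-cpk.pth.tar has been downloaded into first_order_model as described in the README, and the image/video pairing is illustrative.

```python
import pic_to_vid

# make the face in putin.png perform the motions from trump.mp4
frames, fps = pic_to_vid.demo_video(
    "user_data/putin.png",    # source face
    "user_data/trump.mp4",    # driving motion
    "user_data/result.mp4",   # output video
    auto_crop=True,           # search for the best starting frame first
)
```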
/requirements.txt:
--------------------------------------------------------------------------------
1 | cffi==1.11.5
2 | cloudpickle==0.5.3
3 | cycler==0.10.0
4 | dask==0.18.2
5 | decorator==4.3.0
6 | imageio==2.3.0
7 | kiwisolver==1.0.1
8 | matplotlib==2.2.2
9 | networkx==2.1
10 | numpy==1.15.0
11 | pandas==0.23.4
12 | Pillow==5.2.0
13 | pycparser==2.18
14 | pygit==0.1
15 | pyparsing==2.2.0
16 | python-dateutil==2.7.3
17 | pytz==2018.5
18 | PyWavelets==0.5.2
19 | PyYAML==5.1
20 | scikit-image==0.14.0
21 | scikit-learn==0.19.2
22 | scipy==1.1.0
23 | six==1.11.0
24 | toolz==0.9.0
25 | torch==1.0.0
26 | torchvision==0.2.1
27 | tqdm==4.24.0
28 |
29 | tensorflow==1.15
30 | umap-learn
31 | visdom
32 | librosa>=0.5.1
33 | matplotlib>=2.0.2
34 | numpy>=1.14.0
35 | scipy>=1.0.0
36 | tqdm
37 | sounddevice
38 | SoundFile
39 | Unidecode
40 | inflect
41 | PyQt5
42 | multiprocess
43 | numba==0.48
44 |
45 | ffmpeg
46 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | if __name__ == '__main__':
2 | print(bool(-5))
--------------------------------------------------------------------------------
/text_to_audio.py:
--------------------------------------------------------------------------------
1 | # things for models
2 | from Real_Time_Voice_Cloning.encoder import inference as encoder
3 | from Real_Time_Voice_Cloning.synthesizer.inference import Synthesizer
4 | from Real_Time_Voice_Cloning.vocoder import inference as vocoder
5 |
6 | # other package file
7 | from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params
8 | from Real_Time_Voice_Cloning.toolbox.utterance import Utterance
9 |
10 | # normal
11 | import numpy as np
12 | import utils
13 |
14 | sample_rate = syn_params.hparams.sample_rate
15 | synthesizer = None
16 | current_synthesized_model = None
17 |
18 |
19 | def get_synthesizer(path=""): # create spectrogram for voice
20 | if synthesizer is None:
21 | checkpoints_dir = path + "/taco_pretrained"
22 | return Synthesizer(checkpoints_dir, low_mem=True, verbose=False)
23 | return synthesizer
24 |
25 |
26 | def generate_spectrogram(text, utterance):
27 | texts = text.split("\n")
28 | embed = utterance.embed
29 | embeds = np.stack([embed] * len(texts))
30 | specs = synthesizer.synthesize_spectrograms(texts, embeds)
31 | breaks = [spec.shape[1] for spec in specs]
32 | spec = np.concatenate(specs, axis=1)
33 |
34 | # self.ui.draw_spec(spec, "generated")
35 | # self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
36 | return spec, breaks
37 |
38 |
39 | def decode_spectrogram(spec, breaks=False):
40 | wav = vocoder.infer_waveform(spec)
41 |
42 | # Add breaks
43 | if breaks:
44 | b_ends = np.cumsum(np.array(breaks) * syn_params.hparams.hop_size)
45 | b_starts = np.concatenate(([0], b_ends[:-1]))
46 | wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
47 | breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
48 | wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
49 |
50 | # trim silences
51 | wav = encoder.preprocess_wav(wav)
52 |
53 | # Play it
54 | wav = wav / np.abs(wav).max() * 0.97
55 | return wav
56 |
57 |
58 | def create_utterance(wavs):
59 | amount_of_samples = len(wavs)
60 | embeds = []
61 | # Compute the mel spectrogram
62 | spec = Synthesizer.make_spectrogram(wavs[0])
63 | # self.ui.draw_spec(spec, "current")
64 |
65 | for wav in wavs:
66 | # Compute the embedding
67 | encoder_wav = encoder.preprocess_wav(wav)
68 | # embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
69 | embed = encoder.embed_utterance(encoder_wav, return_partials=False)
70 | embeds.append(embed)
71 | avg_embed = sum(embeds) / amount_of_samples
72 | speaker_name = "audio_sample"
73 | name = speaker_name + "_rec_%05d"
74 | # Add the utterance
75 | return Utterance(name, speaker_name, wavs[0], spec, avg_embed, None, False)
76 |
77 |
78 | def generate_audio(text, audio_samples):
79 | '''
80 | Return an audio file of a text in the voice of some utterances from the same person
81 | :param text: text file or string with line breaks to indicate pauses
82 | :param audio_samples: paths to any audio sample of .wav format (5-12 seconds)
83 | :return: audio file of .wav format
84 | '''
85 | # todo: import the pretrained models
86 | global current_synthesized_model, synthesizer
87 |     encoder.load_model("Real_Time_Voice_Cloning/pretrained/encoder/saved_models/pretrained.pt", "cpu")  # the encoder computes the speaker embedding
88 | vocoder.load_model("Real_Time_Voice_Cloning/pretrained/vocoder/saved_models/pretrained.pt", verbose=False)
89 | # todo: figure out how the multiple utterances work
90 | synthesizer = get_synthesizer("Real_Time_Voice_Cloning/pretrained/synthesizer/saved_models/logs-pretrained")
91 | if len(text) > 4 and text[-4:] == ".txt": # check if file
92 | words = ""
93 | with open(text) as file:
94 | for line in file:
95 | words += line
96 | text = words
97 | del words
98 | if isinstance(audio_samples, str):
99 | utterance = Utterance("name", "speaker_name", None, None, np.load(audio_samples), None, None)
100 | else:
101 | utterance = create_utterance(audio_samples)
102 | current_synthesized_model = generate_spectrogram(text, utterance)
103 | audio_file = decode_spectrogram(*current_synthesized_model)
104 | return audio_file, utterance.embed
105 |
106 |
107 | if __name__ == '__main__':
108 | sample_rate = syn_params.hparams.sample_rate
109 | while True:
110 | input("Hit enter to record:")
111 |         wav = utils.record(5)  # utils.record takes a duration in seconds
112 |         input("Hit enter to play")
113 |         utils.play(wav)
114 | print(wav.shape)
115 |
116 |
--------------------------------------------------------------------------------
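A direct usage sketch for generate_audio (normally reached through main.text_to_speech); the phrase and the single-sample voice are illustrative, and the pretrained Real-Time Voice Cloning models are assumed to be in place as described in the README.

```python
import text_to_audio
import utils

# clone a voice from one short sample and synthesize a test sentence
wav, embedding = text_to_audio.generate_audio(
    "The quick brown fox jumps over the lazy dog.",
    ["user_data/Tate_audio_samples/sample0.wav"],
)
utils.save(wav, "user_data/quick.wav")
```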
/user_data/Tate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate.png
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample0.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample1.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample2.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample3.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample4.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample5.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample5.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample6.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample7.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample7.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample8.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample9.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample9.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/voice.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/voice.npy
--------------------------------------------------------------------------------
/user_data/activity_unproductive.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/activity_unproductive.wav
--------------------------------------------------------------------------------
/user_data/declaration.txt:
--------------------------------------------------------------------------------
1 | The unanimous Declaration of the thirteen united States of America, When in the Course of human events,
2 | it becomes necessary for one people to dissolve the political bands which have connected them with another,
3 | and to assume among the powers of the earth,
4 | the separate and equal station to which the Laws of Nature and of Nature's God entitle them,
5 | a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.
6 |
--------------------------------------------------------------------------------
/user_data/obama.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/obama.mp4
--------------------------------------------------------------------------------
/user_data/output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/output.wav
--------------------------------------------------------------------------------
/user_data/putin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/putin.png
--------------------------------------------------------------------------------
/user_data/quick.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/quick.wav
--------------------------------------------------------------------------------
/user_data/result.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/result.mp4
--------------------------------------------------------------------------------
/user_data/stylized.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/stylized.wav
--------------------------------------------------------------------------------
/user_data/thing.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/thing.mp4
--------------------------------------------------------------------------------
/user_data/trump.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/trump.mp4
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import sounddevice, time
2 | from moviepy.video.io.VideoFileClip import VideoFileClip
3 | from moviepy.audio.io.AudioFileClip import AudioFileClip
4 | import imageio
5 | from pydub import AudioSegment
6 | from scipy.io import wavfile
7 | import librosa
8 | import numpy as np
9 | from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params
10 | sample_rate = syn_params.hparams.sample_rate
11 |
12 |
13 | def play(wav, pause_processes=True):
14 | if isinstance(wav, str):
15 | wav = load(wav)
16 | sounddevice.stop()
17 | sounddevice.play(wav, sample_rate)
18 | if pause_processes:
19 | time.sleep(wav.shape[0] / sample_rate + 1)
20 |
21 |
22 | def record(duration, pause_processes=True):
23 | # sounddevice.stop()
24 | try:
25 | wav = sounddevice.rec(duration * sample_rate, sample_rate, 1)
26 | if pause_processes:
27 | time.sleep(duration)
28 | wav = wav.reshape(wav.shape[0])
29 | except Exception as e:
30 | print(e)
31 | print("Could not record anything. Is your recording device enabled?")
32 | print("Your device must be connected before you start the toolbox.")
33 | return None
34 | return wav
35 |
36 |
37 | def save(wav, filepath):
38 | # sf.write(filepath, wav, sample_rate)
39 | wav *= 32767 / max(0.01, np.max(np.abs(wav)))
40 | # proposed by @dsmiller
41 | wavfile.write(filepath, sample_rate, wav.astype(np.int16))
42 |
43 |
44 | def load(filepath, get_sample=False):
45 | wav, sr = librosa.core.load(filepath, sr=sample_rate)
46 | if get_sample:
47 | return wav, sr
48 | return wav
49 |
50 |
51 | def mp3_to_wav(mp3_path):
52 | sound = AudioSegment.from_mp3(mp3_path)
53 | new_path = mp3_path[:-4] + ".wav"
54 | sound.export(new_path, format="wav")
55 | return load(new_path)
56 |
57 |
58 | def audio_from_mp4(mp4_file):
59 | vid = VideoFileClip(mp4_file)
60 | return vid.audio
61 |
62 |
63 | def create_mp4(video, audio, save_path=None):
64 | audio = AudioFileClip(audio) if audio is not None else None
65 |     fps = len(video) / audio.duration
66 |     imageio.mimsave("backend_files/mute.mp4", video, fps=fps)
67 |     video = VideoFileClip("backend_files/mute.mp4", audio=False)
68 |     video = video.set_audio(audio)  # set_audio returns a new clip rather than mutating in place
69 | if save_path is not None:
70 | video.write_videofile(save_path, fps=fps) # , codec='mpeg4')
71 | return video
72 |
--------------------------------------------------------------------------------
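A short round-trip sketch of the helpers above: record from the default microphone, play the result back, and write it to disk. The duration and output path are illustrative.

```python
import utils

# five-second record / playback / save round trip
wav = utils.record(5)
if wav is not None:
    utils.play(wav)
    utils.save(wav, "user_data/activity_unproductive.wav")
```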