├── .idea
│   ├── $CACHE_FILE$
│   ├── .gitignore
│   ├── Deepfakes.iml
│   ├── dictionaries
│   │   └── 22staples.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── audio_to_vid.py
├── audio_transform.py
├── main.py
├── pic_to_vid.py
├── requirements.txt
├── test.py
├── text_to_audio.py
├── user_data
│   ├── Tate.png
│   ├── Tate_audio_samples
│   │   ├── sample0.wav
│   │   ├── sample1.wav
│   │   ├── sample2.wav
│   │   ├── sample3.wav
│   │   ├── sample4.wav
│   │   ├── sample5.wav
│   │   ├── sample6.wav
│   │   ├── sample7.wav
│   │   ├── sample8.wav
│   │   ├── sample9.wav
│   │   └── voice.npy
│   ├── activity_unproductive.wav
│   ├── declaration.txt
│   ├── obama.mp4
│   ├── output.wav
│   ├── putin.png
│   ├── quick.wav
│   ├── result.mp4
│   ├── stylized.wav
│   ├── thing.mp4
│   └── trump.mp4
└── utils.py

/.idea/$CACHE_FILE$:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
--------------------------------------------------------------------------------
/.idea/Deepfakes.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/dictionaries/22staples.xml:
--------------------------------------------------------------------------------
deepfake
obama
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Premise
A combination of various deepfake algorithms to quickly create fake audio and video.

# Options
This project has 4 separate algorithms:
1) [first-order-model](https://github.com/AliaksandrSiarohin/first-order-model): a quick deepfake algorithm that generates a video from a base video and a driving image
2) [Speech-Driven Facial Animation](https://github.com/DinoMan/speech-driven-animation): animates a picture to speak an audio input
3) [Real-Time Voice Cloning Toolbox](https://github.com/CorentinJ/Real-Time-Voice-Cloning): a quick text-to-speech algorithm driven by a few seconds of sample audio
4) [One-shot Voice Conversion](https://github.com/jjery2243542/adaptive_voice_conversion): voice style transfer that re-renders one speaker's words in another speaker's voice

# Setup
## Import
    pip install -r requirements.txt

Get from Version Control:
1) https://github.com/AliaksandrSiarohin/first-order-model.git
2) https://github.com/DinoMan/speech-driven-animation.git
3) https://github.com/CorentinJ/Real-Time-Voice-Cloning.git
4) https://github.com/jjery2243542/adaptive_voice_conversion.git

and put these into the local project.

## Modify
Replace all the dashes in the cloned project folder names with underscores.

Then modify each of the projects in the following ways:
1) go [here](https://drive.google.com/drive/folders/1PyQJmkdCsAkOYwUyaj_l-l0as-iLDgeH) and download vox-cpk.pth.tar, then place it in first_order_model
2) go [here](https://drive.google.com/drive/folders/1pJdsnknLmMLvA8RQIAV3AQH8vU0FeK16) and download grid.dat, then replace sda/data/grid.dat
3) download the [model](https://drive.google.com/file/d/1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc/view) and delete toolbox/__init__.py
4) download the [model](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/vctk_model.ckpt) and [attr](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/attr.pkl) files and move them into adaptive_voice_conversion

## Alter imports
Next, go through each file and correct the imports to account for the new content root: add *project_name.* before all necessary imports (if you know a better way, please tell me).
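
## Usage
The functions in main.py chain these algorithms together. Below is a minimal sketch of the two main pipelines using the sample files that ship in user_data; it assumes the setup above is finished, and the paths and keyword values are only illustrative:

    import main

    # text -> cloned voice -> talking-head video
    # (Real-Time Voice Cloning + Speech-Driven Facial Animation)
    main.text_to_vid("user_data/declaration.txt",
                     "user_data/Tate_audio_samples/sample0.wav",
                     "user_data/Tate.png",
                     result_path="user_data/result.mp4")

    # face swap: the face in putin.png performs the motions from obama.mp4
    # (first-order-model)
    main.deepfake_video("user_data/obama.mp4", "user_data/putin.png",
                        result_path="user_data/thing.mp4")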
--------------------------------------------------------------------------------
/audio_to_vid.py:
--------------------------------------------------------------------------------
import speech_driven_animation.sda as sda
import utils


def generate_video(target_image, audio_path, save_path):
    animator = sda.VideoAnimator()  # instantiate the animator
    # a .wav path carries its own sample rate; raw audio arrays use the project-wide rate
    fs = None if isinstance(audio_path, str) else utils.sample_rate
    video, audio_file = animator(target_image, audio_path, fs=fs, aligned=False)
    # print(video.shape)
    if save_path is not None:
        animator.save_video(video, audio_file, save_path)
    return video, audio_file
--------------------------------------------------------------------------------
/audio_transform.py:
--------------------------------------------------------------------------------
from adaptive_voice_conversion.inference import Inferencer
from utils import sample_rate
import yaml


class Blank: pass


def transform_audio(content, style, output):
    with open("adaptive_voice_conversion/config.yaml") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    # the Inferencer expects an argparse-style namespace; Blank stands in for one
    args = Blank()
    args.attr = "adaptive_voice_conversion/attr.pkl"
    args.config = config
    args.model = "adaptive_voice_conversion/vctk_model.ckpt"
    args.source = content
    args.target = style
    args.output = output
    args.sample_rate = sample_rate

    inferencer = Inferencer(config=config, args=args)
    return inferencer.inference_from_path(output is not None)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np


# audio section
def text_to_speech(text, driving_audio, save_audio_path=None, save_voice_path=None, play=False):  # around 10 seconds
    """
    Takes a path to a text document (or a raw string) and creates audio of those words in the target's voice
    Algorithm: modified Real-Time Voice Cloning Toolbox
    source: https://github.com/CorentinJ/Real-Time-Voice-Cloning
    :param text: path to the words to be spoken, or a string
    :param driving_audio: a single .wav sample, a list of .wav samples, or a saved .npy voice embedding
    :param save_audio_path: optional path to save the generated audio to
    :param save_voice_path: optional path to save the voice embedding (speeds up future transfers)
    :param play: whether the result should be played at the end
    :return: wav audio (numpy array)
    """
    import text_to_audio
    import utils
    # list of samples or a pre-made voice embedding
    if isinstance(driving_audio, list) or isinstance(driving_audio, str) and driving_audio[-4:] == ".npy":
        wav, voice = text_to_audio.generate_audio(text, driving_audio)
    # single voice sample
    else:
        wav, voice = text_to_audio.generate_audio(text, [driving_audio])
    if play:
        utils.play(wav)
    if save_audio_path is not None:
        utils.save(wav, save_audio_path)
    if save_voice_path is not None and voice is not None:
        np.save(save_voice_path, voice)
    return wav


def audio_stylize(base_audio, driving_audio, result_path=None):  # around 5 seconds
    """
    Takes a recording of any voice and converts it into the target voice
    Algorithm: One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization
    source: https://github.com/jjery2243542/adaptive_voice_conversion.git
    :param base_audio: a recording of someone speaking the words
    :param driving_audio: an example of the target voice saying things (a large sample size is recommended)
    :param result_path: optional path to save the result
    :return: audio in the target voice (.wav)
    """
    import audio_transform
    return audio_transform.transform_audio(base_audio, driving_audio, result_path)


# video
def audio_to_image(driving_audio, base_img, result_path=None):  # around 40 seconds
    """
    Uses a pretrained facial-animation model to imitate how the person would say the words
    Algorithm: Speech-Driven Facial Animation
    source: https://github.com/DinoMan/speech-driven-animation
    :param driving_audio: the audio you want the person to say
    :param base_img: a picture of who you want to speak the words
    :param result_path: optional path for where to store the result
    :return: video, audio
    """
    import audio_to_vid
    return audio_to_vid.generate_video(base_img, driving_audio, result_path)


def deepfake_video(base_video, driving_img, result_path=None):  # around 15 minutes
    """
    Takes a video and an image and generates a new video with the face swapped
    Algorithm: First Order Motion Model for Image Animation
    source: https://github.com/AliaksandrSiarohin/first-order-model
    :param base_video: video that provides the movements
    :param driving_img: image of the target that will perform the motions
    :param result_path: optional path to save the resulting mp4
    :return: video frames of the target doing the actions, and the fps of the video
    """
    import pic_to_vid
    video, fps = pic_to_vid.demo_video(driving_img, base_video, result_path, auto_crop=False)
    return video, fps


# compound
def text_to_vid(text, driving_audio, driving_img, result_path=None):
    """
    Creates a video of a person saying the input text.
    :param text: a string or .txt file path with what you want the target to say
    :param driving_audio: a sample of the target's voice (.wav)
    :param driving_img: an image of the target
    :param result_path: optional save path for the result
    :return: video, audio
    """
    synthesized_audio = text_to_speech(text, driving_audio)
    return audio_to_image(synthesized_audio, driving_img, result_path)


def imitate(base_video, driving_audio, driving_img, result_path):
    """
    Takes a video of one person saying something and replaces the face and voice with someone else's
    :param base_video: path to the original video (.mp4)
    :param driving_audio: path to the voice of who you want to speak (.wav or .mp3)
    :param driving_img: path to the image of the face you want to copy
    :param result_path: path to save the resulting video
    :return: VideoFileClip (moviepy) of the video
    """
    import utils
    base_audio = utils.audio_from_mp4(base_video)
    new_audio = audio_stylize(base_audio, driving_audio)
    new_video, fps = deepfake_video(base_video, driving_img)
    return utils.create_mp4(new_video, new_audio, result_path)
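

# Example of chaining the pieces (a sketch: the user_data paths below ship with the repo,
# but the pretrained models from the README's Setup section must be downloaded first):
#   wav = text_to_speech("user_data/declaration.txt",
#                        "user_data/Tate_audio_samples/voice.npy",
#                        save_audio_path="user_data/output.wav")
#   audio_to_image("user_data/output.wav", "user_data/Tate.png",
#                  result_path="user_data/result.mp4")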


if __name__ == '__main__':
    pass
--------------------------------------------------------------------------------
/pic_to_vid.py:
--------------------------------------------------------------------------------
# actions
from first_order_model.train import train

# normal modules
import yaml, os, imageio
from first_order_model.demo import find_best_frame, resize, load_checkpoints, make_animation, img_as_ubyte

# networks
from first_order_model.modules.discriminator import MultiScaleDiscriminator
from first_order_model.modules.generator import OcclusionAwareGenerator
from first_order_model.modules.keypoint_detector import KPDetector
from first_order_model.frames_dataset import FramesDataset


def get_video(target_img_path, driving_video_path):  # based off first_order_model's run script
    config_path = "first_order_model/config/vox-256.yaml"
    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    mode = "animate"
    log_dir = "video"
    checkpoint = None
    device_ids = "0"
    verbose = False

    log_dir = os.path.join(log_dir, os.path.basename(config_path).split('.')[0])
    generator = OcclusionAwareGenerator(**config['model_params']['generator_params'],
                                        **config['model_params']['common_params'])
    discriminator = MultiScaleDiscriminator(**config['model_params']['discriminator_params'],
                                            **config['model_params']['common_params'])

    kp_detector = KPDetector(**config['model_params']['kp_detector_params'],
                             **config['model_params']['common_params'])
    dataset = FramesDataset(is_train=(mode == 'train'), **config['dataset_params'])
    train(config, generator, discriminator, kp_detector, checkpoint, log_dir, dataset, device_ids)


def demo_video(path_to_img, path_to_video, output_file, auto_crop=True):
    config = "first_order_model/config/vox-256.yaml"  # model settings
    checkpoint = "first_order_model/vox-cpk.pth.tar"  # actual model weights
    cpu = True  # use the cpu, not the gpu
    relative = False  # make relative motions or move to the absolute location
    adapt_scale = True
    result_video = output_file
    best_frame = None  # where to start

    source_image = imageio.imread(path_to_img)
    reader = imageio.get_reader(path_to_video)
    fps = reader.get_meta_data()['fps']
    driving_video = []
    try:
        for im in reader:
            driving_video.append(im)
    except RuntimeError:
        pass
    reader.close()

    source_image = resize(source_image, (256, 256))[..., :3]
    driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
    generator, kp_detector = load_checkpoints(config_path=config, checkpoint_path=checkpoint, cpu=cpu)

    if auto_crop or best_frame is not None:
        i = best_frame if best_frame is not None else find_best_frame(source_image, driving_video, cpu=cpu)
        print("Best frame: " + str(i))
        driving_forward = driving_video[i:]
        driving_backward = driving_video[:(i+1)][::-1]
        predictions_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
        predictions_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
        predictions = predictions_backward[::-1] + predictions_forward[1:]
    else:
        predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
    video = [img_as_ubyte(frame) for frame in predictions]
    if output_file is not None:
        imageio.mimsave(result_video, video, fps=fps)
    return video, fps
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
cffi==1.11.5
cloudpickle==0.5.3
cycler==0.10.0
dask==0.18.2
decorator==4.3.0
imageio==2.3.0
kiwisolver==1.0.1
matplotlib==2.2.2
networkx==2.1
numpy==1.15.0
pandas==0.23.4
Pillow==5.2.0
pycparser==2.18
pygit==0.1
pyparsing==2.2.0
python-dateutil==2.7.3
pytz==2018.5
PyWavelets==0.5.2
PyYAML==5.1
scikit-image==0.14.0
scikit-learn==0.19.2
scipy==1.1.0
six==1.11.0
toolz==0.9.0
torch==1.0.0
torchvision==0.2.1
tqdm==4.24.0

tensorflow==1.15
umap-learn
visdom
librosa>=0.5.1
matplotlib>=2.0.2
numpy>=1.14.0
scipy>=1.0.0
tqdm
sounddevice
SoundFile
Unidecode
inflect
PyQt5
multiprocess
numba==0.48

ffmpeg
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
if __name__ == '__main__':
    print(bool(-5))
--------------------------------------------------------------------------------
/text_to_audio.py:
--------------------------------------------------------------------------------
# things for models
from Real_Time_Voice_Cloning.encoder import inference as encoder
from Real_Time_Voice_Cloning.synthesizer.inference import Synthesizer
from Real_Time_Voice_Cloning.vocoder import inference as vocoder

# other package file
from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params
from Real_Time_Voice_Cloning.toolbox.utterance import Utterance

# normal
import numpy as np
import utils

sample_rate = syn_params.hparams.sample_rate
synthesizer = None
current_synthesized_model = None


def get_synthesizer(path=""):  # create spectrogram for voice
    if synthesizer is None:
        checkpoints_dir = path + "/taco_pretrained"
        return Synthesizer(checkpoints_dir, low_mem=True, verbose=False)
    return synthesizer


def generate_spectrogram(text, utterance):
    texts = text.split("\n")
    embed = utterance.embed
    embeds = np.stack([embed] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # self.ui.draw_spec(spec, "generated")
    # self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
    return spec, breaks


def decode_spectrogram(spec, breaks=False):
    wav = vocoder.infer_waveform(spec)

    # add breaks (short silences between the lines of the text)
    if breaks:
        b_ends = np.cumsum(np.array(breaks) * syn_params.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # trim silences
    wav = encoder.preprocess_wav(wav)

    # normalize the volume
    wav = wav / np.abs(wav).max() * 0.97
    return wav


def create_utterance(wavs):
    amount_of_samples = len(wavs)
    embeds = []
    # compute the mel spectrogram of the first sample
    spec = Synthesizer.make_spectrogram(wavs[0])
    # self.ui.draw_spec(spec, "current")

    for wav in wavs:
        # compute the speaker embedding for each sample
        encoder_wav = encoder.preprocess_wav(wav)
        # embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        embed = encoder.embed_utterance(encoder_wav, return_partials=False)
        embeds.append(embed)
    # average the embeddings into a single voice
    avg_embed = sum(embeds) / amount_of_samples
    speaker_name = "audio_sample"
    name = speaker_name + "_rec_%05d"
    # add the utterance
    return Utterance(name, speaker_name, wavs[0], spec, avg_embed, None, False)


def generate_audio(text, audio_samples):
    """
    Returns audio of the text spoken in the voice of some utterances from the same person
    :param text: text file path or string, with line breaks to indicate pauses
    :param audio_samples: paths to .wav audio samples (5-12 seconds each), or a path to a saved .npy voice embedding
    :return: audio (.wav) and the voice embedding
    """
    # todo: import the pretrained models
    global current_synthesized_model, synthesizer
    # the encoder turns audio samples into a speaker embedding; the vocoder turns spectrograms back into audio
    encoder.load_model("Real_Time_Voice_Cloning/pretrained/encoder/saved_models/pretrained.pt", "cpu")
    vocoder.load_model("Real_Time_Voice_Cloning/pretrained/vocoder/saved_models/pretrained.pt", verbose=False)
    # todo: figure out how the multiple utterances work
    synthesizer = get_synthesizer("Real_Time_Voice_Cloning/pretrained/synthesizer/saved_models/logs-pretrained")
    if len(text) > 4 and text[-4:] == ".txt":  # check if the text is a file path
        words = ""
        with open(text) as file:
            for line in file:
                words += line
        text = words
        del words
    if isinstance(audio_samples, str):
        # a pre-saved voice embedding (.npy)
        utterance = Utterance("name", "speaker_name", None, None, np.load(audio_samples), None, None)
    else:
        utterance = create_utterance(audio_samples)
    current_synthesized_model = generate_spectrogram(text, utterance)
    audio_file = decode_spectrogram(*current_synthesized_model)
    return audio_file, utterance.embed
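

# Example (a sketch: assumes the pretrained models referenced above have been downloaded
# and uses the sample recordings that ship in user_data):
#   wav, voice_embedding = generate_audio("user_data/declaration.txt",
#                                         ["user_data/Tate_audio_samples/sample0.wav",
#                                          "user_data/Tate_audio_samples/sample1.wav"])
#   utils.save(wav, "user_data/output.wav")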


if __name__ == '__main__':
    sample_rate = syn_params.hparams.sample_rate
    while True:
        input("Hit enter to record:")
        wav = utils.record(5)  # record 5 seconds from the microphone
        input("Hit enter to play")
        utils.play(wav)
        print(wav.shape)
--------------------------------------------------------------------------------
/user_data/Tate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate.png
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample0.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample1.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample2.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample3.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample4.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample5.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample5.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample6.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample7.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample7.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample8.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample9.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample9.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/voice.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/voice.npy
--------------------------------------------------------------------------------
/user_data/activity_unproductive.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/activity_unproductive.wav
--------------------------------------------------------------------------------
/user_data/declaration.txt:
--------------------------------------------------------------------------------
The unanimous Declaration of the thirteen united States of America, When in the Course of human events,
it becomes necessary for one people to dissolve the political bands which have connected them with another,
and to assume among the powers of the earth,
the separate and equal station to which the Laws of Nature and of Nature's God entitle them,
a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.
--------------------------------------------------------------------------------
/user_data/obama.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/obama.mp4
--------------------------------------------------------------------------------
/user_data/output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/output.wav
--------------------------------------------------------------------------------
/user_data/putin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/putin.png
--------------------------------------------------------------------------------
/user_data/quick.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/quick.wav
--------------------------------------------------------------------------------
/user_data/result.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/result.mp4
--------------------------------------------------------------------------------
/user_data/stylized.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/stylized.wav
--------------------------------------------------------------------------------
/user_data/thing.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/thing.mp4
--------------------------------------------------------------------------------
/user_data/trump.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/trump.mp4
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import sounddevice, time
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
import imageio
from pydub import AudioSegment
from scipy.io import wavfile
import librosa
import numpy as np
from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params

sample_rate = syn_params.hparams.sample_rate


def play(wav, pause_processes=True):
    if isinstance(wav, str):
        wav = load(wav)
    sounddevice.stop()
    sounddevice.play(wav, sample_rate)
    if pause_processes:
        # block until the playback should have finished
        time.sleep(wav.shape[0] / sample_rate + 1)


def record(duration, pause_processes=True):
    # sounddevice.stop()
    try:
        wav = sounddevice.rec(duration * sample_rate, sample_rate, 1)
        if pause_processes:
            time.sleep(duration)
        wav = wav.reshape(wav.shape[0])
    except Exception as e:
        print(e)
        print("Could not record anything. Is your recording device enabled?")
        print("Your device must be connected before you start the toolbox.")
        return None
    return wav


def save(wav, filepath):
    # sf.write(filepath, wav, sample_rate)
    # scale to 16-bit PCM before writing, proposed by @dsmiller
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(filepath, sample_rate, wav.astype(np.int16))


def load(filepath, get_sample=False):
    wav, sr = librosa.core.load(filepath, sr=sample_rate)
    if get_sample:
        return wav, sr
    return wav


def mp3_to_wav(mp3_path):
    sound = AudioSegment.from_mp3(mp3_path)
    new_path = mp3_path[:-4] + ".wav"
    sound.export(new_path, format="wav")
    return load(new_path)


def audio_from_mp4(mp4_file):
    vid = VideoFileClip(mp4_file)
    return vid.audio


def create_mp4(video, audio, save_path=None):
    audio = AudioFileClip(audio) if audio is not None else None
    fps = video.shape[0] / audio.duration
    imageio.mimsave("backend_files/mute.mp4", video)
    video = VideoFileClip("backend_files/mute.mp4", audio=False)
    video = video.set_audio(audio)  # set_audio returns a new clip
    if save_path is not None:
        video.write_videofile(save_path, fps=fps)  # , codec='mpeg4')
    return video
--------------------------------------------------------------------------------