├── .idea
│   ├── $CACHE_FILE$
│   ├── .gitignore
│   ├── Deepfakes.iml
│   ├── dictionaries
│   │   └── 22staples.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── audio_to_vid.py
├── audio_transform.py
├── main.py
├── pic_to_vid.py
├── requirements.txt
├── test.py
├── text_to_audio.py
├── user_data
│   ├── Tate.png
│   ├── Tate_audio_samples
│   │   ├── sample0.wav
│   │   ├── sample1.wav
│   │   ├── sample2.wav
│   │   ├── sample3.wav
│   │   ├── sample4.wav
│   │   ├── sample5.wav
│   │   ├── sample6.wav
│   │   ├── sample7.wav
│   │   ├── sample8.wav
│   │   ├── sample9.wav
│   │   └── voice.npy
│   ├── activity_unproductive.wav
│   ├── declaration.txt
│   ├── obama.mp4
│   ├── output.wav
│   ├── putin.png
│   ├── quick.wav
│   ├── result.mp4
│   ├── stylized.wav
│   ├── thing.mp4
│   └── trump.mp4
└── utils.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/dictionaries/22staples.xml:
--------------------------------------------------------------------------------
1 | <component name="ProjectDictionaryState">
2 |   <dictionary name="22staples">
3 |     <words>
4 |       <w>deepfake</w>
5 |       <w>obama</w>
6 |     </words>
7 |   </dictionary>
8 | </component>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Premise
2 | A combination of various deepfake algorithms to quickly create fake audio and video
3 |
4 | # Options
5 | This project has 4 separate algorithms.
6 | 1) [first-order-model](https://github.com/AliaksandrSiarohin/first-order-model): a quick deepfake algorithm that generates a video from a base video and a driving image
7 | 2) [Speech-Driven Facial Animation](https://github.com/DinoMan/speech-driven-animation): animates a picture to speak an audio input
8 | 3) [Real-Time Voice Cloning Toolbox](https://github.com/CorentinJ/Real-Time-Voice-Cloning): a quick text-to-speech algorithm that clones a voice from a few seconds of driving audio
9 | 4) [One-shot Voice Conversion](https://github.com/jjery2243542/adaptive_voice_conversion): voice style transfer that re-speaks one person's words in another person's voice
10 |
11 | # Setup
12 | ## Import
13 | `pip install -r requirements.txt`
14 |
15 | Get from Version Control:
16 | 1) https://github.com/AliaksandrSiarohin/first-order-model.git
17 | 2) https://github.com/DinoMan/speech-driven-animation.git
18 | 3) https://github.com/CorentinJ/Real-Time-Voice-Cloning.git
19 | 4) https://github.com/jjery2243542/adaptive_voice_conversion.git
20 |
21 | and put these into the local project
22 |
23 | ## Modify
24 | Replace all the dashes in the cloned directory names with underscores (first-order-model becomes first_order_model, and so on) so they can be imported as Python packages.
25 |
26 | Modify each of the repositories in the following ways:
27 | 1) go [here](https://drive.google.com/drive/folders/1PyQJmkdCsAkOYwUyaj_l-l0as-iLDgeH) and download vox-cpk.pth.tar, then place it in first_order_model
28 | 2) go [here](https://drive.google.com/drive/folders/1pJdsnknLmMLvA8RQIAV3AQH8vU0FeK16) and download grid.dat, then replace sda/data/grid.dat
29 | 3) download the [model](https://drive.google.com/file/d/1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc/view), then delete toolbox/__init__.py
30 | 4) download the [model](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/vctk_model.ckpt) and [attr](http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is19/attr.pkl), and move them into adaptive_voice_conversion
31 |
32 | ## Alter imports
33 | Finally, go through each cloned repository and correct its imports so they resolve from this project's content root: add *project_name.* before all necessary imports, e.g. `from first_order_model.modules...` (if you know a better way, please tell me)
34 |
--------------------------------------------------------------------------------
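The README stops at the setup steps, so here is a minimal usage sketch based on the function signatures in main.py. It is not part of the original repository: the asset pairings (which image goes with which audio or video) are illustrative assumptions, and the paths point at the samples bundled under user_data/.

```python
import main

# text -> cloned speech -> talking-head video of the still image
main.text_to_vid(
    "user_data/declaration.txt",                # what the target should say
    "user_data/Tate_audio_samples/voice.npy",   # pre-computed voice embedding
    "user_data/Tate.png",                       # who should appear to say it
    result_path="user_data/thing.mp4",
)

# re-voice and face-swap an existing clip in one call
main.imitate(
    "user_data/obama.mp4",   # original footage and speech
    "user_data/quick.wav",   # voice that should replace the original speaker
    "user_data/putin.png",   # face that should replace the original speaker
    "user_data/result.mp4",
)
```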
/audio_to_vid.py:
--------------------------------------------------------------------------------
1 | import speech_driven_animation.sda as sda
2 | import utils
3 |
4 |
5 | def generate_video(target_image, audio_path, save_path):
6 | animator = sda.VideoAnimator() # Instantiate the animator
7 |     fs = None if isinstance(audio_path, str) else utils.sample_rate  # file paths carry their own sample rate; raw audio needs one supplied
8 | video, audio_file = animator(target_image, audio_path, fs=fs, aligned=False)
9 | # print(video.shape)
10 | if save_path is not None:
11 | animator.save_video(video, audio_file, save_path)
12 | return video, audio_file
--------------------------------------------------------------------------------
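A small usage sketch for the wrapper above; it assumes the speech_driven_animation package and its grid.dat model are installed as described in the README, and the image/audio pairing is illustrative.

```python
import audio_to_vid

# animate the bundled portrait so it appears to speak the given recording
video, audio = audio_to_vid.generate_video(
    "user_data/Tate.png",     # still image of the target
    "user_data/output.wav",   # speech to lip-sync to
    "user_data/thing.mp4",    # where to save the rendered clip (or None to skip saving)
)
```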
/audio_transform.py:
--------------------------------------------------------------------------------
1 | from adaptive_voice_conversion.inference import Inferencer
2 | from utils import sample_rate
3 | import yaml
4 |
5 |
6 | class Blank: pass  # bare namespace standing in for an argparse.Namespace
7 |
8 | def transform_audio(content, style, output):
9 |     with open("adaptive_voice_conversion/config.yaml") as f:
10 |         config = yaml.safe_load(f)
11 | args = Blank()
12 | args.attr = "adaptive_voice_conversion/attr.pkl"
13 | args.config = config
14 | args.model = "adaptive_voice_conversion/vctk_model.ckpt"
15 | args.source = content
16 | args.target = style
17 | args.output = output
18 | args.sample_rate = sample_rate
19 |
20 | inferencer = Inferencer(config=config, args=args)
21 | return inferencer.inference_from_path(output is not None)
--------------------------------------------------------------------------------
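A hedged usage sketch for transform_audio; it assumes vctk_model.ckpt and attr.pkl have been placed in adaptive_voice_conversion as described in the README, and the content/style pairing is illustrative.

```python
import audio_transform

# re-speak the words in quick.wav using the voice from the first Tate sample
audio_transform.transform_audio(
    "user_data/quick.wav",                       # content: the words to keep
    "user_data/Tate_audio_samples/sample0.wav",  # style: the target voice
    "user_data/stylized.wav",                    # where to write the converted audio
)
```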
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # audio section
5 | def text_to_speech(text, driving_audio, save_audio_path=None, save_voice_path=None, play=False): # around 10 seconds
6 | """
7 |     Takes text (a string or a path to a text document) and creates audio of the target's voice saying those words
8 |     Algorithm: modified Real-Time Voice Cloning Toolbox
9 |     source: https://github.com/CorentinJ/Real-Time-Voice-Cloning
10 |     :param text: path to the words to be spoken or a string
11 |     :param driving_audio: single or list of .wav audio samples, or a pre-computed .npy voice embedding
12 |     :param save_audio_path: optional path to save the generated audio to
13 |     :param save_voice_path: optional path to save your voice embedding (speeds future transfers)
14 |     :param play: whether the result should be played at the end
15 |     :return: the synthesized waveform (numpy array)
16 | """
17 | import text_to_audio
18 | import utils
19 | # list of samples or pre-made voice embedding
20 |     if isinstance(driving_audio, list) or (isinstance(driving_audio, str) and driving_audio.endswith(".npy")):
21 | wav, voice = text_to_audio.generate_audio(text, driving_audio)
22 | # single voice sample
23 | else:
24 | wav, voice = text_to_audio.generate_audio(text, [driving_audio])
25 | if play:
26 | utils.play(wav)
27 | if save_audio_path is not None:
28 | utils.save(wav, save_audio_path)
29 | if save_voice_path is not None and voice is not None:
30 | np.save(save_voice_path, voice)
31 | return wav
32 |
33 |
34 | def audio_stylize(base_audio, driving_audio, result_path): # 5 seconds
35 | """
36 |     Take a recording of any voice and convert it into the target's voice
37 | Algorithm: One-shot Voice Conversion by Separating Speaker and Content Representations with Instance Normalization
38 | source: https://github.com/jjery2243542/adaptive_voice_conversion.git
39 | :param base_audio: someone speaking the words
40 | :param driving_audio: example of the target voice saying things (recommend large sample size)
41 | :param result_path: optional path to save result
42 | :return: audio in the target voice (.wav)
43 | """
44 | import audio_transform
45 |     return audio_transform.transform_audio(base_audio, driving_audio, result_path)
46 |
47 |
48 | # video
49 | def audio_to_image(driving_audio, base_img, result_path=None): # around 40 seconds
50 | """
51 |     Use a model trained on talking-head videos to imitate how a person would say the words
52 | Algorithm: Speech-Driven Facial Animation
53 | source: https://github.com/DinoMan/speech-driven-animation
54 | :param driving_audio: The audio you want the person to say
55 | :param base_img: A picture of who you want to speak the words
56 | :param result_path: optional path for where to store the result
57 | :return: video, audio
58 | """
59 | import audio_to_vid
60 | return audio_to_vid.generate_video(base_img, driving_audio, result_path)
61 |
62 |
63 | def deepfake_video(base_video, driving_img, result_path=None): # around 15 minutes
64 | """
65 | Take a video and an image and generate a new video with the faces swapped
66 | Algorithm: First Order Motion Model for Image Animation
67 | source: https://github.com/AliaksandrSiarohin/first-order-model
68 | :param base_video: video of random movements
69 | :param driving_img: target that will perform the motions
70 | :param result_path: optional path to save the resulting mp4
71 | :return: video file of target doing the actions and the fps of the video
72 | """
73 | import pic_to_vid
74 | video, fps = pic_to_vid.demo_video(driving_img, base_video, result_path, auto_crop=False)
75 | return video, fps
76 |
77 |
78 | # compound
79 | def text_to_vid(text, driving_audio, driving_img, result_path=None):
80 | """
81 | Creates a video of a person saying input text.
82 | :param text: a string or txt file path for what you want the target to say
83 |     :param driving_audio: a sample of the target's voice (.wav) or a pre-computed voice embedding (.npy)
84 | :param driving_img: an image of the target
85 | :param result_path: optional save path for the result
86 | :return: video, audio
87 | """
88 | synthesized_audio = text_to_speech(text, driving_audio)
89 | return audio_to_image(synthesized_audio, driving_img, result_path)
90 |
91 |
92 | def imitate(base_video, driving_audio, driving_img, result_path):
93 | """
94 | Takes a video of one person saying something and replaces it with someone else
95 | :param base_video: path to the original video (.mp4)
96 | :param driving_audio: path to the voice of who you want to speak (.wav or .mp3)
97 | :param driving_img: path to the img you want to copy
98 | :param result_path: path to save the resulting video
99 | :return: VideoFileClip (moviepy module) of the video
100 | """
101 |     import utils
102 |     base_audio = utils.audio_from_mp4(base_video)  # original speech, to be re-voiced
103 |     new_audio = audio_stylize(base_audio, driving_audio, None)
104 |     new_video, fps = deepfake_video(base_video, driving_img)
105 |     return utils.create_mp4(new_video, new_audio, result_path)
106 |
107 |
108 | if __name__ == '__main__':
109 | pass
110 |
--------------------------------------------------------------------------------
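One plausible way to fill the empty __main__ guard above: build the voice embedding once from the bundled samples and cache it, so later calls can reuse the .npy instead of re-encoding every clip. The sample paths match the files under user_data/; the choice of text is illustrative.

```python
import main

# encode ten short samples into one averaged voice embedding and cache it
samples = ["user_data/Tate_audio_samples/sample%d.wav" % i for i in range(10)]
main.text_to_speech(
    "user_data/declaration.txt",
    samples,
    save_audio_path="user_data/output.wav",
    save_voice_path="user_data/Tate_audio_samples/voice.npy",
)
```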
/pic_to_vid.py:
--------------------------------------------------------------------------------
1 | # actions
2 | from first_order_model.train import train
3 |
4 | # normal modules
5 | import yaml, os, imageio
6 | from first_order_model.demo import find_best_frame, resize, load_checkpoints, make_animation, img_as_ubyte
7 |
8 | # networks
9 | from first_order_model.modules.discriminator import MultiScaleDiscriminator
10 | from first_order_model.modules.generator import OcclusionAwareGenerator
11 | from first_order_model.modules.keypoint_detector import KPDetector
12 | from first_order_model.frames_dataset import FramesDataset
13 |
14 |
15 | def get_video(target_img_path, driving_video_path):  # based off first_order_model/run.py
16 |     config_path = "first_order_model/config/vox-256.yaml"
17 |     config = yaml.safe_load(open(config_path))
18 |     mode = "animate"
19 |     log_dir = "video"
20 |     checkpoint = None
21 |     device_ids = "0"
22 |     verbose = False
23 |     log_dir = os.path.join(log_dir, os.path.basename(config_path).split('.')[0])
24 | generator = OcclusionAwareGenerator(**config['model_params']['generator_params'],
25 | **config['model_params']['common_params'])
26 | discriminator = MultiScaleDiscriminator(**config['model_params']['discriminator_params'],
27 | **config['model_params']['common_params'])
28 |
29 | kp_detector = KPDetector(**config['model_params']['kp_detector_params'],
30 | **config['model_params']['common_params'])
31 | dataset = FramesDataset(is_train=(mode == 'train'), **config['dataset_params'])
32 | train(config, generator, discriminator, kp_detector, checkpoint, log_dir, dataset, device_ids)
33 |
34 |
35 | def demo_video(path_to_img, path_to_video, output_file, auto_crop=True):
36 | config = "first_order_model/config/vox-256.yaml" # model settings
37 | checkpoint = "first_order_model/vox-cpk.pth.tar" # actual model
38 | cpu = True # using cpu not gpu
39 | relative = False # make relative motions or move to absolute location
40 |     # auto_crop: search for the best starting frame before animating
41 | adapt_scale = True
42 | result_video = output_file
43 | best_frame = None # where to start
44 |
45 | source_image = imageio.imread(path_to_img)
46 | reader = imageio.get_reader(path_to_video)
47 | fps = reader.get_meta_data()['fps']
48 | driving_video = []
49 | try:
50 | for im in reader:
51 | driving_video.append(im)
52 | except RuntimeError:
53 | pass
54 | reader.close()
55 |
56 | source_image = resize(source_image, (256, 256))[..., :3]
57 | driving_video = [resize(frame, (256, 256))[..., :3] for frame in driving_video]
58 | generator, kp_detector = load_checkpoints(config_path=config, checkpoint_path=checkpoint, cpu=cpu)
59 |
60 | if auto_crop or best_frame is not None:
61 | i = best_frame if best_frame is not None else find_best_frame(source_image, driving_video, cpu=cpu)
62 | print ("Best frame: " + str(i))
63 | driving_forward = driving_video[i:]
64 | driving_backward = driving_video[:(i+1)][::-1]
65 | predictions_forward = make_animation(source_image, driving_forward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
66 | predictions_backward = make_animation(source_image, driving_backward, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
67 | predictions = predictions_backward[::-1] + predictions_forward[1:]
68 | else:
69 | predictions = make_animation(source_image, driving_video, generator, kp_detector, relative=relative, adapt_movement_scale=adapt_scale, cpu=cpu)
70 | video = [img_as_ubyte(frame) for frame in predictions]
71 | if output_file is not None:
72 | imageio.mimsave(result_video, video, fps=fps)
73 | return video, fps
74 |
--------------------------------------------------------------------------------
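A usage sketch for demo_video; it assumes vox-cpk.pth.tar has been downloaded into first_order_model as described in the README, and the image/video pairing is illustrative.

```python
import pic_to_vid

# make the face in putin.png perform the motions from trump.mp4
frames, fps = pic_to_vid.demo_video(
    "user_data/putin.png",    # source face
    "user_data/trump.mp4",    # driving motion
    "user_data/result.mp4",   # output video
    auto_crop=True,           # search for the best starting frame first
)
```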
/requirements.txt:
--------------------------------------------------------------------------------
1 | cffi==1.11.5
2 | cloudpickle==0.5.3
3 | cycler==0.10.0
4 | dask==0.18.2
5 | decorator==4.3.0
6 | imageio==2.3.0
7 | kiwisolver==1.0.1
8 | matplotlib==2.2.2
9 | networkx==2.1
10 | numpy==1.15.0
11 | pandas==0.23.4
12 | Pillow==5.2.0
13 | pycparser==2.18
14 | pygit==0.1
15 | pyparsing==2.2.0
16 | python-dateutil==2.7.3
17 | pytz==2018.5
18 | PyWavelets==0.5.2
19 | PyYAML==5.1
20 | scikit-image==0.14.0
21 | scikit-learn==0.19.2
22 | scipy==1.1.0
23 | six==1.11.0
24 | toolz==0.9.0
25 | torch==1.0.0
26 | torchvision==0.2.1
27 | tqdm==4.24.0
28 |
29 | tensorflow==1.15
30 | umap-learn
31 | visdom
32 | librosa>=0.5.1
33 | matplotlib>=2.0.2
34 | numpy>=1.14.0
35 | scipy>=1.0.0
36 | tqdm
37 | sounddevice
38 | SoundFile
39 | Unidecode
40 | inflect
41 | PyQt5
42 | multiprocess
43 | numba==0.48
44 |
45 | ffmpeg
46 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | if __name__ == '__main__':
2 | print(bool(-5))
--------------------------------------------------------------------------------
/text_to_audio.py:
--------------------------------------------------------------------------------
1 | # things for models
2 | from Real_Time_Voice_Cloning.encoder import inference as encoder
3 | from Real_Time_Voice_Cloning.synthesizer.inference import Synthesizer
4 | from Real_Time_Voice_Cloning.vocoder import inference as vocoder
5 |
6 | # other package file
7 | from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params
8 | from Real_Time_Voice_Cloning.toolbox.utterance import Utterance
9 |
10 | # normal
11 | import numpy as np
12 | import utils
13 |
14 | sample_rate = syn_params.hparams.sample_rate
15 | synthesizer = None
16 | current_synthesized_model = None
17 |
18 |
19 | def get_synthesizer(path=""): # create spectrogram for voice
20 | if synthesizer is None:
21 | checkpoints_dir = path + "/taco_pretrained"
22 | return Synthesizer(checkpoints_dir, low_mem=True, verbose=False)
23 | return synthesizer
24 |
25 |
26 | def generate_spectrogram(text, utterance):
27 | texts = text.split("\n")
28 | embed = utterance.embed
29 | embeds = np.stack([embed] * len(texts))
30 | specs = synthesizer.synthesize_spectrograms(texts, embeds)
31 | breaks = [spec.shape[1] for spec in specs]
32 | spec = np.concatenate(specs, axis=1)
33 |
34 | # self.ui.draw_spec(spec, "generated")
35 | # self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
36 | return spec, breaks
37 |
38 |
39 | def decode_spectrogram(spec, breaks=False):
40 | wav = vocoder.infer_waveform(spec)
41 |
42 | # Add breaks
43 | if breaks:
44 | b_ends = np.cumsum(np.array(breaks) * syn_params.hparams.hop_size)
45 | b_starts = np.concatenate(([0], b_ends[:-1]))
46 | wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
47 | breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
48 | wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
49 |
50 | # trim silences
51 | wav = encoder.preprocess_wav(wav)
52 |
53 | # Play it
54 | wav = wav / np.abs(wav).max() * 0.97
55 | return wav
56 |
57 |
58 | def create_utterance(wavs):
59 | amount_of_samples = len(wavs)
60 | embeds = []
61 | # Compute the mel spectrogram
62 | spec = Synthesizer.make_spectrogram(wavs[0])
63 | # self.ui.draw_spec(spec, "current")
64 |
65 | for wav in wavs:
66 | # Compute the embedding
67 | encoder_wav = encoder.preprocess_wav(wav)
68 | # embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
69 | embed = encoder.embed_utterance(encoder_wav, return_partials=False)
70 | embeds.append(embed)
71 | avg_embed = sum(embeds) / amount_of_samples
72 | speaker_name = "audio_sample"
73 | name = speaker_name + "_rec_%05d"
74 | # Add the utterance
75 | return Utterance(name, speaker_name, wavs[0], spec, avg_embed, None, False)
76 |
77 |
78 | def generate_audio(text, audio_samples):
79 | '''
80 | Return an audio file of a text in the voice of some utterances from the same person
81 | :param text: text file or string with line breaks to indicate pauses
82 | :param audio_samples: paths to any audio sample of .wav format (5-12 seconds)
83 | :return: audio file of .wav format
84 | '''
85 | # todo: import the pretrained models
86 | global current_synthesized_model, synthesizer
87 |     encoder.load_model("Real_Time_Voice_Cloning/pretrained/encoder/saved_models/pretrained.pt", "cpu")  # the encoder computes the speaker embedding
88 | vocoder.load_model("Real_Time_Voice_Cloning/pretrained/vocoder/saved_models/pretrained.pt", verbose=False)
89 | # todo: figure out how the multiple utterances work
90 | synthesizer = get_synthesizer("Real_Time_Voice_Cloning/pretrained/synthesizer/saved_models/logs-pretrained")
91 | if len(text) > 4 and text[-4:] == ".txt": # check if file
92 | words = ""
93 | with open(text) as file:
94 | for line in file:
95 | words += line
96 | text = words
97 | del words
98 | if isinstance(audio_samples, str):
99 | utterance = Utterance("name", "speaker_name", None, None, np.load(audio_samples), None, None)
100 | else:
101 | utterance = create_utterance(audio_samples)
102 | current_synthesized_model = generate_spectrogram(text, utterance)
103 | audio_file = decode_spectrogram(*current_synthesized_model)
104 | return audio_file, utterance.embed
105 |
106 |
107 | if __name__ == '__main__':
108 | sample_rate = syn_params.hparams.sample_rate
109 | while True:
110 | input("Hit enter to record:")
111 |         wav = utils.record(5)  # utils.record takes a duration in seconds
112 |         input("Hit enter to play")
113 |         utils.play(wav)
114 | print(wav.shape)
115 |
116 |
--------------------------------------------------------------------------------
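A direct usage sketch for generate_audio (normally reached through main.text_to_speech); the phrase and the single-sample voice are illustrative, and the pretrained Real-Time Voice Cloning models are assumed to be in place as described in the README.

```python
import text_to_audio
import utils

# clone a voice from one short sample and synthesize a test sentence
wav, embedding = text_to_audio.generate_audio(
    "The quick brown fox jumps over the lazy dog.",
    ["user_data/Tate_audio_samples/sample0.wav"],
)
utils.save(wav, "user_data/quick.wav")
```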
/user_data/Tate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate.png
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample0.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample1.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample2.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample3.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample4.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample5.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample5.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample6.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample6.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample7.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample7.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample8.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample8.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/sample9.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/sample9.wav
--------------------------------------------------------------------------------
/user_data/Tate_audio_samples/voice.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/Tate_audio_samples/voice.npy
--------------------------------------------------------------------------------
/user_data/activity_unproductive.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/activity_unproductive.wav
--------------------------------------------------------------------------------
/user_data/declaration.txt:
--------------------------------------------------------------------------------
1 | The unanimous Declaration of the thirteen united States of America, When in the Course of human events,
2 | it becomes necessary for one people to dissolve the political bands which have connected them with another,
3 | and to assume among the powers of the earth,
4 | the separate and equal station to which the Laws of Nature and of Nature's God entitle them,
5 | a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.
6 |
--------------------------------------------------------------------------------
/user_data/obama.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/obama.mp4
--------------------------------------------------------------------------------
/user_data/output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/output.wav
--------------------------------------------------------------------------------
/user_data/putin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/putin.png
--------------------------------------------------------------------------------
/user_data/quick.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/quick.wav
--------------------------------------------------------------------------------
/user_data/result.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/result.mp4
--------------------------------------------------------------------------------
/user_data/stylized.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/stylized.wav
--------------------------------------------------------------------------------
/user_data/thing.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/thing.mp4
--------------------------------------------------------------------------------
/user_data/trump.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TateStaples/Deepfakes/a31beb3785bceea93143379054486021aaf33dc3/user_data/trump.mp4
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import sounddevice, time
2 | from moviepy.video.io.VideoFileClip import VideoFileClip
3 | from moviepy.audio.io.AudioFileClip import AudioFileClip
4 | import imageio
5 | from pydub import AudioSegment
6 | from scipy.io import wavfile
7 | import librosa
8 | import numpy as np
9 | from Real_Time_Voice_Cloning.synthesizer import hparams as syn_params
10 | sample_rate = syn_params.hparams.sample_rate
11 |
12 |
13 | def play(wav, pause_processes=True):
14 | if isinstance(wav, str):
15 | wav = load(wav)
16 | sounddevice.stop()
17 | sounddevice.play(wav, sample_rate)
18 | if pause_processes:
19 | time.sleep(wav.shape[0] / sample_rate + 1)
20 |
21 |
22 | def record(duration, pause_processes=True):
23 | # sounddevice.stop()
24 | try:
25 | wav = sounddevice.rec(duration * sample_rate, sample_rate, 1)
26 | if pause_processes:
27 | time.sleep(duration)
28 | wav = wav.reshape(wav.shape[0])
29 | except Exception as e:
30 | print(e)
31 | print("Could not record anything. Is your recording device enabled?")
32 | print("Your device must be connected before you start the toolbox.")
33 | return None
34 | return wav
35 |
36 |
37 | def save(wav, filepath):
38 | # sf.write(filepath, wav, sample_rate)
39 | wav *= 32767 / max(0.01, np.max(np.abs(wav)))
40 | # proposed by @dsmiller
41 | wavfile.write(filepath, sample_rate, wav.astype(np.int16))
42 |
43 |
44 | def load(filepath, get_sample=False):
45 | wav, sr = librosa.core.load(filepath, sr=sample_rate)
46 | if get_sample:
47 | return wav, sr
48 | return wav
49 |
50 |
51 | def mp3_to_wav(mp3_path):
52 | sound = AudioSegment.from_mp3(mp3_path)
53 | new_path = mp3_path[:-4] + ".wav"
54 | sound.export(new_path, format="wav")
55 | return load(new_path)
56 |
57 |
58 | def audio_from_mp4(mp4_file):
59 | vid = VideoFileClip(mp4_file)
60 | return vid.audio
61 |
62 |
63 | def create_mp4(video, audio, save_path=None):
64 | audio = AudioFileClip(audio) if audio is not None else None
65 |     fps = len(video) / audio.duration
66 |     imageio.mimsave("backend_files/mute.mp4", video, fps=fps)
67 |     video = VideoFileClip("backend_files/mute.mp4", audio=False)
68 |     video = video.set_audio(audio)  # set_audio returns a new clip rather than mutating in place
69 | if save_path is not None:
70 | video.write_videofile(save_path, fps=fps) # , codec='mpeg4')
71 | return video
72 |
--------------------------------------------------------------------------------
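A short round-trip sketch of the helpers above: record from the default microphone, play the result back, and write it to disk. The duration and output path are illustrative.

```python
import utils

# five-second record / playback / save round trip
wav = utils.record(5)
if wav is not None:
    utils.play(wav)
    utils.save(wav, "user_data/activity_unproductive.wav")
```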