├── LICENSE
├── src
│   ├── packages
│   │   ├── AudioEmbeddingsManager.py
│   │   ├── BaseDeepFakeGenerator.py
│   │   ├── SavedFeatureLoader.py
│   │   ├── SmileFeatureGenerator.py
│   │   ├── ElevenLabsDeepFakeGenerator.py
│   │   ├── playhtDataGenerator.py
│   │   ├── BayesSearch.py
│   │   ├── SmileFeatureManager.py
│   │   ├── CadenceUtils.py
│   │   ├── LJDataLoader.py
│   │   ├── SmileFeatureSelector.py
│   │   ├── ModelManager.py
│   │   ├── AudioManager.py
│   │   ├── TIMITDataLoader.py
│   │   ├── CadenceModelManager.py
│   │   └── ExperimentPipeline.py
│   ├── run_pipeline_ljspeech.py
│   └── run_pipeline_multivoice.py
├── .gitignore
├── pip_requirements.txt
├── conda_requirements.txt
└── README.md
/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Sarah Barrington, Romit Barua, Gautham Koorma, Hany Farid 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /src/packages/AudioEmbeddingsManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # local packages 6 | from packages.SavedFeatureLoader import loadFeatures 7 | 8 | # directory to save embeddings to 9 | SAVED_EMBEDDINGS_DIR = "/home/ubuntu/data/wavefake_data/Embeddings/16000KHz" 10 | 11 | # helper function to generate Titanet embeddings 12 | def generateTitaNetEmbeddings(model, paths, normalize): 13 | embeddings = np.array( 14 | [ 15 | model.get_embedding(file_path).cpu().detach().numpy()[0] 16 | for file_path in paths 17 | ] 18 | ) 19 | 20 | if normalize: 21 | raise NotImplementedError("Normalizing embeddings is not implemented yet") 22 | 23 | return embeddings 24 | 25 | 26 | class AudioEmbeddingsManager: 27 | # initialize with model and data 28 | def __init__(self, model, data) -> None: 29 | self.model = model 30 | self.data = data 31 | 32 | # generate the dataframe of embeddings for experiments 33 | def generateFeatureDf( 34 | self, normalize: bool = False, regenerate_embeddings: bool = False 35 | ): 36 | # generate embeddings and save to disk 37 | if regenerate_embeddings: 38 | embeddings_df = pd.DataFrame(self.generateEmbeddings(normalize)) 39 | 40 | feature_cols = list(embeddings_df.columns) 41 | feature_df = pd.concat((self.data, embeddings_df), axis=1) 42 | 43 | # load embeddings from disk 44 | else: 45 | feature_df = loadFeatures(self.data.copy(), "titanet") 46 | feature_cols = list(set(feature_df.columns) ^ set(self.data.columns)) 47 | 48 | return feature_df, feature_cols 49 | 50 | # generate embeddings for each audio file 51 | def generateEmbeddings(self, normalize): 52 | return generateTitaNetEmbeddings(self.model, self.data["path"], normalize) 53 | -------------------------------------------------------------------------------- /src/packages/BaseDeepFakeGenerator.py: -------------------------------------------------------------------------------- 1 | from lib2to3.pgen2.tokenize import tokenize 2 | import os 3 | from secrets import token_urlsafe 4 | import pandas as pd 5 | 6 | #base class used by other generators to load text from a dataframe or directory 7 | #and process transcripts 8 | class BaseDeepFakeGenerator: 9 | def __init__(self, tokenize_type: str = None): 10 | if not isinstance(tokenize_type, type(None)): 11 | assert tokenize_type.lower() in [ 12 | "word", 13 | "sentence", 14 | ], "If you provide a tokenize type, it must be sentence or word" 15 | self.tokenize_type = tokenize_type 16 | 17 | def loadTextFromDataFrame( 18 | self, 19 | dataframe_path: str, 20 | source_col: str, 21 | transcript_col: str, 22 | punc_to_remove: list = None, 23 | ): 24 | metadata = pd.read_csv(dataframe_path) 25 | source_paths = list(metadata[source_col]) 26 | file_names = [os.path.basename(source_path) for source_path in source_paths] 27 | transcripts = list(metadata[transcript_col]) 28 | 29 | if punc_to_remove: 30 | transcripts = self.process_transcripts(transcripts, punc_to_remove) 31 | 32 | return file_names, transcripts 33 | 34 | def loadTextFromDirectory(self, dir_name: str): 35 | for file_name in os.listdir(dir_name): 36 | if ".txt" in file_name: 37 | pass 38 | 39 | def _readTextFile(self, text_path: str): 40 | with open(text_path) as f: 41 | lines = f.readlines() 42 | 43 | f.close() 44 | 45 | def process_transcripts(self, transcripts: list, punc_to_remove: list): 46 | 
processed_transcripts = [] 47 | for idx, transcript in enumerate(transcripts): 48 | for punc in punc_to_remove: 49 | print(idx) 50 | transcript = transcript.replace(punc, "") 51 | processed_transcripts.append(transcript) 52 | return processed_transcripts 53 | -------------------------------------------------------------------------------- /src/packages/SavedFeatureLoader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from tqdm import tqdm 4 | import json 5 | 6 | #path to the feature map json file that maps 7 | #the metadata path to the feature path 8 | FEATURE_MAP_PATH = ( 9 | "/home/ubuntu/data/FeatureMap.json" 10 | ) 11 | 12 | #list of valid feature types 13 | VALID_FEATURE_TYPES = [ 14 | "titanet", 15 | "openSmile", 16 | "cadence", 17 | ] 18 | #helper function 19 | def loadFeatures( 20 | metadata, 21 | feature_type, 22 | metadata_filepath_col="path", 23 | feature_filepath_col="path", 24 | feature_map_path=FEATURE_MAP_PATH, 25 | ): 26 | assert ( 27 | feature_type in VALID_FEATURE_TYPES 28 | ), f"Please ensure that {feature_type} is a valid feature type" 29 | 30 | #add a column to the metadata dataframe that contains the path to the feature 31 | metadata["path_keys"] = metadata["path"].apply(os.path.dirname) 32 | present_paths = metadata["path_keys"].unique().tolist() 33 | with open(feature_map_path) as f: 34 | feature_map = json.load(f) 35 | 36 | #load the feature dataframe and merge it with the metadata dataframe 37 | merged_df = pd.DataFrame() 38 | for path in tqdm(present_paths): 39 | feature_df = pd.read_csv(feature_map[path][feature_type]) 40 | filter_metadata = metadata[metadata["path_keys"] == path] 41 | merged_df = pd.concat( 42 | [ 43 | merged_df, 44 | pd.merge( 45 | filter_metadata, 46 | feature_df, 47 | how="left", 48 | left_on=metadata_filepath_col, 49 | right_on=feature_filepath_col, 50 | ), 51 | ], 52 | axis=0, 53 | ).reset_index(drop=True) 54 | 55 | #drop the feature path column if it is not the same as the metadata path column 56 | if feature_filepath_col != metadata_filepath_col: 57 | merged_df = merged_df.drop(columns=[feature_filepath_col]) 58 | 59 | #drop the path_keys columnß 60 | merged_df = merged_df.drop(columns=["path_keys"]) 61 | return merged_df 62 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureGenerator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import random 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | import opensmile 8 | 9 | # base_path 10 | base_path = "/home/ubuntu" 11 | 12 | 13 | class smileFeatureGenerator: 14 | # initiate the class to generate openSMILE ComParE_2016 functionals 15 | def __init__( 16 | self, 17 | data_path: str, 18 | feature_set=opensmile.FeatureSet.ComParE_2016, 19 | feature_level=opensmile.FeatureLevel.Functionals, 20 | ) -> None: 21 | self.data_path = data_path 22 | self.feature_extractor = opensmile.Smile( 23 | feature_set=feature_set, feature_level=feature_level 24 | ) 25 | # store the wav files in a list 26 | self.wav_list = self._getWavList() 27 | assert len(self.wav_list) > 0, "No wav files found in data path" 28 | 29 | # private method to iterate through the data path and store the wav files in a list 30 | def _getWavList(self): 31 | wav_list = [] 32 | for file_name in os.listdir(self.data_path): 33 | if ".wav" in file_name.lower(): 34 | wav_list.append(file_name) 35 | 
return wav_list 36 | 37 | # generate openSMILE features 38 | def generateFeatures(self): 39 | print("Generating openSMILE features...\n") 40 | 41 | self.smile_df = pd.DataFrame() 42 | 43 | for i in tqdm(range(len(self.wav_list))): 44 | file_path = os.path.join(self.data_path, self.wav_list[i]) 45 | try: 46 | features = self.feature_extractor.process_file(file_path).reset_index() 47 | except: 48 | print("Error processing file: {}".format(file_path)) 49 | continue 50 | 51 | # compute file duration 52 | duration = features["end"] - features["start"] 53 | duration = duration.astype("timedelta64[ms]") / 1000 54 | features.insert(1, "duration(seconds)", duration) 55 | 56 | features.drop(columns=["start", "end"], inplace=True) 57 | 58 | self.smile_df = pd.concat([self.smile_df, features]).reset_index(drop=True) 59 | 60 | print("\nopenSMILE features generated... call saveFeatures(filename)\n") 61 | 62 | # save the feature to disk for loading during experiments 63 | def saveFeatures(self, filename: str): 64 | self.smile_df.to_csv(filename, index=False) 65 | print("Features saved to {}\n".format(filename)) 66 | -------------------------------------------------------------------------------- /src/packages/ElevenLabsDeepFakeGenerator.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import requests 3 | import os 4 | 5 | from packages.BaseDeepFakeGenerator import BaseDeepFakeGenerator 6 | from packages.AudioManager import AudioManager 7 | 8 | #class used to generate deepfakes using the ElevenLabs API 9 | class ElevenLabsDeepFakeGenerator(BaseDeepFakeGenerator): 10 | def __init__(self): 11 | super().__init__() 12 | self.api_key = self._load_API_key() 13 | 14 | def _load_API_key( 15 | self, config_path="/home/ubuntu/MultiModalDeepFake/Configs/secret/config.yaml" 16 | ): 17 | with open(config_path, "r") as file: 18 | inputs = yaml.safe_load(file) 19 | xi_api_key = inputs["eleven_labs_api_key"] 20 | return xi_api_key 21 | 22 | def generateDeepFakeFromDataFrame( 23 | self, 24 | dataframe_path: str, 25 | output_dir: str, 26 | source_col: str, 27 | transcript_col: str, 28 | voice_id: str, 29 | voice_name: str = None, 30 | convert_audio_to_format: str = None, 31 | punc_to_remove: list = None, 32 | ): 33 | file_names, transcripts = self.loadTextFromDataFrame( 34 | dataframe_path=dataframe_path, 35 | source_col=source_col, 36 | transcript_col=transcript_col, 37 | punc_to_remove=punc_to_remove, 38 | ) 39 | print(file_names) 40 | if convert_audio_to_format: 41 | audio_manager = AudioManager() 42 | 43 | for idx, transcript in enumerate(transcripts): 44 | try: 45 | audio_clip = self.generateDeepfake(voice_id=voice_id, text=transcript) 46 | 47 | file_name = file_names[idx].replace( 48 | os.path.splitext(file_names[idx])[1], ".mpeg" 49 | ) 50 | with open(os.path.join(output_dir, file_name), "wb") as f: 51 | f.write(audio_clip.content) 52 | f.close() 53 | except Exception as e: 54 | print(f"Failed to Generate DeepFake for {file_names[idx]}") 55 | print(f"Error: {str(e)}") 56 | print() 57 | 58 | if convert_audio_to_format: 59 | audio_manager.convertAudioFileTypes( 60 | os.path.join(output_dir, file_name), 61 | output_format=convert_audio_to_format, 62 | delete_original=True, 63 | ) 64 | 65 | def generateDeepfake(self, voice_id, text): 66 | headers = { 67 | "accept": "audio/mpeg", 68 | "xi-api-key": self.api_key, 69 | "Content-Type": "application/json", 70 | } 71 | 72 | data = f'{{"text": "{text}"}}' 73 | 74 | return requests.post( 75 | 
f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}", 76 | headers=headers, 77 | data=data, 78 | ) 79 | -------------------------------------------------------------------------------- /src/packages/playhtDataGenerator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | from IPython.display import Audio 5 | import os 6 | import time 7 | import json 8 | import requests 9 | 10 | base_path = "/home/ubuntu/" 11 | 12 | 13 | class PlayHTVoiceClone: 14 | # refresh token -- this is on GitHub 15 | def __init__( 16 | self, credentials_json="/home/ubuntu/configs/playht_api_credentials.json" 17 | ) -> None: 18 | with open(filepath, "r") as f: 19 | api_credentials = json.load(f) 20 | 21 | self.authorization = api_credentials["Authorization"] 22 | self.user_id = api_credentials["X-User-ID"] 23 | self.base_url = "https://play.ht/api/v1/" 24 | 25 | # hold urls here for now 26 | self.convert_url = self.base_url + "convert" 27 | self.download_url = self.base_url + "articleStatus" 28 | 29 | self.headers = api_credentials 30 | # temporarily set content type to json 31 | self.headers["Content-Type"] = "application/json" 32 | 33 | def select_cloned_voice(self): 34 | self.cloned_voices_url = self.base_url + "getClonedVoices" 35 | 36 | self.cloned_voice_resp = requests.get( 37 | self.cloned_voices_url, headers=self.headers 38 | ) 39 | 40 | self.cloned_voice_id = self.cloned_voice_resp.json()["clonedVoices"][0]["id"] 41 | self.cloned_voice_name = self.cloned_voice_resp.json()["clonedVoices"][0][ 42 | "name" 43 | ] 44 | print("Cloned voice name: {}".format(self.cloned_voice_name)) 45 | 46 | def run_tts(self, text): 47 | tid = self._start_conversion(text) 48 | print("_start_conversion completed!! 
tid: {}".format(tid)) 49 | 50 | audio_url = self._poll_status(tid) 51 | 52 | print(audio_url) 53 | 54 | # self._download_audio(audio_url) 55 | 56 | def _start_conversion(self, text): 57 | payload = {"voice": self.cloned_voice_id} 58 | payload["content"] = [text] 59 | 60 | convert_payload = json.dumps(payload) 61 | 62 | converted_voice_resp = requests.post( 63 | self.convert_url, headers=self.headers, data=convert_payload 64 | ) 65 | 66 | return converted_voice_resp.json()["transcriptionId"] 67 | 68 | def _poll_status(self, tid): 69 | url = self.download_url + f"?transcriptionId={tid}" 70 | 71 | delay = 5 72 | 73 | print("Polling status loop started") 74 | 75 | while True: 76 | # get response 77 | download_resp = requests.get(url, headers=self.headers) 78 | # check if transcription is complete 79 | msg = download_resp.json().get("message") 80 | print(f"Messsage: {msg}") 81 | 82 | if msg == "Transcription completed": 83 | audio_url = download_resp.json().get("audioUrl") 84 | return audio_url 85 | break 86 | 87 | # if not, wait and try again 88 | print("wait and try again") 89 | time.sleep(delay) 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /src/packages/BayesSearch.py: -------------------------------------------------------------------------------- 1 | from sklearn.gaussian_process import GaussianProcessRegressor 2 | import pandas as pd 3 | import numpy as np 4 | from scipy.optimize import minimize 5 | from scipy.stats import norm 6 | 7 | 8 | # implemented using 9 | # https://towardsdatascience.com/bayesian-optimization-a-step-by-step-approach-a1cb678dd2ec 10 | class BayesSearch: 11 | def __init__( 12 | self, 13 | data, 14 | target_function, 15 | sampling_function, 16 | n_iter, 17 | init_ex_count=20, 18 | gp_ex_count=1000, 19 | ): 20 | self.target_function = target_function 21 | self.gp_reg = GaussianProcessRegressor() 22 | self.output = pd.DataFrame( 23 | columns=["WindowSize", "SilenceThreshold", "Acc", "EI"] 24 | ) ## SB_COmment - any reason for EI over UCB or PI (Upper Confidence Bound/Probability of Improvement) 25 | self.sampling_function = sampling_function 26 | self.n_iter = n_iter 27 | self.data = data 28 | self.init_ex_count = init_ex_count 29 | self.gp_ex_count = gp_ex_count 30 | self.distances_ = [] 31 | 32 | self.h = None 33 | self.y = None 34 | self._generate_initial() 35 | 36 | def _generate_initial(self): 37 | print(f"Initializing the {self.init_ex_count} hyper-parameters") 38 | 39 | self.h = self.sampling_function(self.init_ex_count) 40 | self.y = self.target_function(self.h, self.data) 41 | 42 | def expected_improvement(self, h_new): 43 | mean_y_new, sigma_y_new = self.gp_reg.predict( 44 | np.array([h_new]), return_std=True 45 | ) 46 | sigma_y_new = sigma_y_new.reshape(-1, 1) 47 | if sigma_y_new == 0.0: 48 | return 0.0 49 | 50 | mean_y = self.gp_reg.predict(self.h) 51 | max_mean_y = np.max(mean_y) 52 | z = (mean_y_new - max_mean_y) / sigma_y_new 53 | exp_imp = (mean_y_new - max_mean_y) * norm.cdf(z) + sigma_y_new * 
norm.pdf(z) 54 | 55 | return exp_imp 56 | 57 | def next_params(self, explore_exploit_ratio=0.2): 58 | min_ei = np.inf 59 | max_ei = 0 60 | h_optimal = None 61 | h_new_sample = self.sampling_function(self.gp_ex_count) 62 | 63 | for x_new in h_new_sample: 64 | # response = minimize(fun=self.expected_improvement, x0=x_new, method='L-BFGS-B') 65 | # if response.fun < min_ei: 66 | # min_ei = response.fun 67 | # h_optimal = response.x 68 | exp_imp = self.expected_improvement(x_new) 69 | if exp_imp < min_ei: 70 | min_ei = exp_imp 71 | h_optimal = x_new 72 | if exp_imp > max_ei: 73 | max_ei = exp_imp 74 | h_optimal = x_new 75 | 76 | print("Optimal H: ", h_optimal) 77 | 78 | if np.random.rand() < explore_exploit_ratio: 79 | return h_optimal, max_ei 80 | else: 81 | return h_optimal, min_ei 82 | 83 | def optimize(self): 84 | y_max_ind = np.argmax(self.y) 85 | y_max = self.y[y_max_ind] 86 | optimal_h = self.h[y_max_ind] 87 | optimal_ei = None 88 | 89 | for i in range(self.n_iter): 90 | self.gp_reg.fit(self.h, self.y) 91 | h_next, ei = self.next_params() 92 | y_next = self.target_function(np.array([h_next]), self.data) 93 | print("acc: ", y_next) 94 | 95 | self.h = np.concatenate((self.h, np.array([h_next]))) 96 | self.y = np.concatenate((self.y, np.array(y_next))) 97 | 98 | if y_next[0] > y_max: 99 | y_max = y_next[0] 100 | optimal_h = h_next 101 | optimal_ei = ei 102 | 103 | if i == 0: 104 | prev_h = h_next 105 | else: 106 | self.distances_.append(np.linalg.norm(prev_h - h_next)) 107 | prev_h = h_next 108 | 109 | # self.best_samples_ = self.best_samples_.append({"y": y_max, "ei": optimal_ei},ignore_index=True) 110 | 111 | return optimal_h, y_max 112 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | # local packages 5 | from packages.SavedFeatureLoader import loadFeatures 6 | from packages.SmileFeatureSelector import * 7 | 8 | # list of valid feature selectors 9 | VALID_FEATURE_SELECTORS = ["random_forest"] 10 | 11 | 12 | class SmileFeatureManager: 13 | # initialize the class 14 | def __init__(self, data) -> None: 15 | self.data = data 16 | self.metadata_cols = data.columns 17 | self.loadSavedFeatures() 18 | 19 | # load saved features from disk into a dataframe 20 | # feature generation is done for the data separately and features are saved to disk 21 | def loadSavedFeatures(self): 22 | self.feature_df = loadFeatures( 23 | self.data.copy(), "openSmile", feature_filepath_col="file" 24 | ) 25 | # drop duration column since it is not used as a feature 26 | self.feature_df = self.feature_df.drop(columns=["duration(seconds)"]) 27 | 28 | # generate the final dataframe with selected features 29 | def generateFeatureDf(self, feature_selector_type, label_type, feature_count=10): 30 | assert ( 31 | feature_selector_type in VALID_FEATURE_SELECTORS 32 | ), f"{feature_selector_type} not valid. 
Valid types include {VALID_FEATURE_SELECTORS}" 33 | assert label_type in [ 34 | "binary", 35 | "multiclass", 36 | ], "Label type must be either binary or multiclass" 37 | 38 | if feature_selector_type == "random_forest": 39 | # set random state for reproducibility while selecting features 40 | selector = smileFeatureSelectFromModel( 41 | self.feature_df, 42 | metadata=list(self.metadata_cols), 43 | model=RandomForestClassifier(random_state=12), 44 | ) 45 | 46 | # features for the binary classification task 47 | if label_type == "binary": 48 | df = selector.select_features_binary( 49 | max_features=feature_count, return_df=True 50 | ) 51 | # features for the multiclass classification task 52 | else: 53 | df = selector.select_features_multiclass( 54 | max_features=feature_count, return_df=True 55 | ) 56 | 57 | return df 58 | -------------------------------------------------------------------------------- /src/packages/CadenceUtils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from math import trunc 4 | from scipy import signal 5 | from numpy import diff 6 | 7 | def filter_signal(audio, sr, low_pass_filter_cutoff): 8 | 9 | # Smooth signal with low pass filter, the parameters for which were tuned locally 10 | t = np.arange(len(audio)) / sr 11 | w = low_pass_filter_cutoff / (sr / 2) 12 | b, a = signal.butter(5, w, "low") 13 | smoothed_signal = signal.filtfilt(b, a, audio) 14 | 15 | return smoothed_signal 16 | 17 | def get_amplitude(audio, window_size, silence_threshold, sr, low_pass_filter_cutoff): 18 | 19 | # Generate amplitude features 20 | abs_audio = abs(audio) 21 | smoothed_signal = filter_signal(abs_audio, sr, low_pass_filter_cutoff) 22 | 23 | deriv_amplitude = np.mean(diff(smoothed_signal)) 24 | mean_amplitude = np.mean(smoothed_signal) 25 | 26 | return { 27 | "abs_deriv_amplitude": abs(deriv_amplitude), 28 | "mean_amplitude": mean_amplitude, 29 | } 30
| 31 | 32 | def normalize_audio_amplitudes(paths): 33 | 34 | # Normalize amplitudes to be within [-1, 1] according to max absolute value 35 | normalized_audio = [] 36 | for file in paths: 37 | sample = librosa.load(file)[0] 38 | max_abs = np.max(np.abs(sample)) 39 | normalized_sample = sample / max_abs 40 | normalized_audio.append(normalized_sample) 41 | 42 | return normalized_audio 43 | 44 | 45 | def truncate_silences( 46 | normalized_audio, 47 | window_size, 48 | silence_threshold, 49 | sr=None, 50 | low_pass_filter_cutoff=None, 51 | counter=0, 52 | ): 53 | # Remove start and end silences from clips 54 | start_ids = [] 55 | end_ids = [] 56 | truncated_audio = [] 57 | 58 | for audio in normalized_audio: 59 | truncation_id_start = None 60 | truncation_id_end = None 61 | 62 | counter += 1 63 | if counter % 100 == 0: 64 | print( 65 | f"Truncating audio {counter}/{len(normalized_audio)} ({round(counter*100/len(normalized_audio))}%)" 66 | ) 67 | 68 | for j in range(len(audio)): 69 | roll_average = np.mean(np.abs(audio[j : j + window_size])) 70 | if roll_average > silence_threshold: 71 | truncation_id_start = j 72 | break 73 | 74 | for j in reversed(range(len(audio))): 75 | roll_average = np.mean(np.abs(audio[j - window_size : j])) 76 | if roll_average > silence_threshold: 77 | truncation_id_end = j - window_size 78 | break 79 | 80 | if truncation_id_start is not None and truncation_id_end is not None: 81 | truncated_audio.append(audio[truncation_id_start:truncation_id_end]) 82 | start_ids.append(truncation_id_start) 83 | end_ids.append(truncation_id_end) 84 | 85 | return start_ids, end_ids, truncated_audio 86 | 87 | 88 | def moving_average(x, w): 89 | #compute moving average 90 | return np.convolve(x, np.ones(w), "valid") / w 91 | 92 | 93 | def get_silence( 94 | audio, window_size, silence_threshold, sr=None, low_pass_filter_cutoff=None 95 | ): 96 | #computes silent and voiced portions of audio 97 | thresh = max(abs(audio)) * silence_threshold 98 | moving_avg = moving_average(abs(audio), window_size) 99 | silent = np.where(abs(moving_avg) < thresh) 100 | voiced = np.where(abs(moving_avg) >= thresh) 101 | 102 | # Get percentage of silence and voiced 103 | pct_pause = len(silent[0]) * 100 / (len(silent[0]) + len(voiced[0])) 104 | pct_voiced = len(voiced[0]) * 100 / (len(silent[0]) + len(voiced[0])) 105 | 106 | if len(voiced[0]) == 0: 107 | ratio_pause_voiced = None 108 | else: 109 | ratio_pause_voiced = len(silent[0]) / len(voiced[0]) 110 | 111 | return { 112 | "pct_pause": pct_pause, 113 | "pct_voiced": pct_voiced, 114 | "ratio_pause_voiced": ratio_pause_voiced, 115 | } 116 | 117 | 118 | def get_silence_spread( 119 | audio, window_size, silence_threshold, sr=None, low_pass_filter_cutoff=None 120 | ): 121 | 122 | thresh = max(abs(audio)) * silence_threshold 123 | moving_avg = moving_average(abs(audio), window_size) 124 | 125 | silent_windows = np.where(moving_avg < thresh) 126 | moving_avg[silent_windows] = 0 127 | silence_count = 0 128 | silence_counts = [] 129 | 130 | for i in range(len(moving_avg) - 1): 131 | item = moving_avg[i] 132 | next_item = moving_avg[i + 1] 133 | 134 | if item != 0 and next_item == 0: 135 | silence_count = 0 136 | 137 | elif item == 0 and next_item == 0: 138 | silence_count += 1 139 | 140 | elif item == 0 and next_item != 0: 141 | silence_counts.append(silence_count) 142 | 143 | else: 144 | continue 145 | 146 | # Get spreads/means and normalise 147 | spread_of_silences = np.std(silence_counts) / len(moving_avg) 148 | mean_of_silences = np.mean(silence_counts) / 
len(moving_avg) 149 | n_pauses = len(silence_counts) 150 | 151 | return { 152 | "spread_of_silences": spread_of_silences, 153 | "mean_of_silences": mean_of_silences, 154 | "silence_counts": silence_counts, 155 | "n_pauses": n_pauses, 156 | } 157 | -------------------------------------------------------------------------------- /src/packages/LJDataLoader.py: -------------------------------------------------------------------------------- 1 | from random import random, sample, seed 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # helper function 6 | def loadExistingFile(file_path): 7 | return pd.read_csv(file_path) 8 | 9 | class LJDataLoader: 10 | # initialization 11 | def __init__( 12 | self, data_path: str, id_col: str = "id", filter_cols: list = [] 13 | ) -> None: 14 | assert ".csv" in data_path, "Data Path should be a csv file." 15 | self.metadata = pd.read_csv(data_path) 16 | self._validateData() 17 | # self._filterCols(filter_cols) 18 | self.id_col = id_col 19 | 20 | # data validation 21 | def _validateData(self): 22 | self.metadata = self.metadata.dropna().reset_index() 23 | 24 | # filtering columns 25 | def _filterCols(self, filter_cols): 26 | for col in filter_cols: 27 | self.metadata = self.metadata[self.metadata[col] == 0] 28 | 29 | # data sampling 30 | def sample(self, perc: float = 0.1): 31 | self.metadata = self.metadata.sample(frac=perc, ignore_index=True) 32 | 33 | # splitting data into train, dev, and test sets 34 | def splitData( 35 | self, train_perc=0.6, dev_perc=0.2, test_perc=0.2, shuffle: bool = True 36 | ): 37 | assert train_perc + dev_perc + test_perc == 1, "train_perc, dev_perc, and test_perc must sum to 1" 38 | 39 | if shuffle: 40 | self.metadata = self.metadata.sample( 41 | frac=1, ignore_index=True, random_state=12 42 | ) 43 | 44 | self.metadata["type"] = None 45 | 46 | train_idx, dev_idx = int(self.metadata.shape[0] * train_perc), int( 47 | self.metadata.shape[0] * (train_perc + dev_perc) 48 | ) 49 | 50 | self.metadata.loc[:train_idx, "type"] = "train" 51 | self.metadata.loc[train_idx:dev_idx, "type"] = "dev" 52 | self.metadata.loc[dev_idx:, "type"] = "test" 53 | 54 | # selecting random architecture from a list of columns containing architecture names for mixing data 55 | def selectRandomArchitecture(self, target_col: str, source_cols: list): 56 | def randomlySelectCols(rw): 57 | # setting random seed for reproducibility 58 | # np.random.seed(12) 59 | rand_idx = np.random.randint(0, len(source_cols)) 60 | return rw[source_cols[rand_idx]] 61 | 62 | self.metadata[target_col] = self.metadata.apply( 63 | lambda row: randomlySelectCols(row), axis=1 64 | ) 65 | 66 | # generating final dataframe for experiments 67 | def generateFinalDataFrame( 68 | self, 69 | real_col: str, 70 | fake_cols: list, 71 | single_id_entry: bool = False, 72 | balanced: bool = False, 73 | ): 74 | agg_cols = [real_col] + fake_cols 75 | 76 | if single_id_entry: 77 | filter_df = self.metadata[agg_cols].copy() 78 | multiclass_labels = np.random.randint( 79 | 0, len(agg_cols), filter_df.shape[0] 80 | ).reshape(filter_df.shape[0], -1) 81 | chosen_data = np.take_along_axis( 82 | filter_df.to_numpy(), multiclass_labels, axis=1 83 | ).squeeze() 84 | multiclass_labels = multiclass_labels.squeeze() 85 | labels = np.where( 86 | multiclass_labels == 0, 0, 1 87 | ) # in the future, may need to double check that this works for varying column orders 88 | architectures = [agg_cols[i] for i in multiclass_labels] 89 | return pd.DataFrame( 90 | { 91 | "path": chosen_data, 92 | "label": labels, 93 | "multiclass_label": multiclass_labels, 94 | "type": 
self.metadata["type"], 95 | "id": self.metadata["id"], 96 | "architecture": architectures, 97 | } 98 | ) 99 | 100 | filter_df = self.metadata[agg_cols + ["type", "id"]].copy() 101 | output = pd.melt( 102 | filter_df, 103 | id_vars=["type", "id"], 104 | value_vars=agg_cols, 105 | value_name="path", 106 | var_name="architecture", 107 | ) 108 | output["label"] = np.where(output["architecture"] == real_col, 0, 1) 109 | multiclass_map = {k: v for v, k in enumerate(agg_cols)} 110 | output["multiclass_label"] = output["architecture"].map(multiclass_map) 111 | # output = output.drop(columns=['architecture']) 112 | 113 | ### balancing code ## 114 | if balanced: 115 | seed(4) 116 | 117 | binary_class_labels = output["label"] 118 | real_indices = list(np.where(binary_class_labels == 0)[0]) 119 | fake_indices = list(np.where(binary_class_labels == 1)[0]) 120 | 121 | # Apply random sampling to rebalance data 122 | # NOTE: currently using equal p(sample) from each all fake samples. 123 | # E.g. we just random sample from all with a 1 class. 124 | if len(real_indices) < len(fake_indices): 125 | fake_indices = sample(fake_indices, len(real_indices)) 126 | elif len(real_indices) > len(fake_indices): 127 | real_indices = sample(real_indices, len(fake_indices)) 128 | 129 | output = output.iloc[real_indices + fake_indices, :].sort_index() 130 | 131 | ### END ### 132 | return output 133 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureSelector.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import opensmile 3 | from tqdm import tqdm 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.feature_selection import SelectFromModel 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | # base_path 9 | base_path = "/home/ubuntu/" 10 | 11 | ############################################################################################ 12 | # Base Class ############################################################################### 13 | ############################################################################################ 14 | 15 | class smileFeatureSelectorBase: 16 | # initialize the class to select features 17 | def __init__( 18 | self, df, metadata, standardize: bool = True, scaler=StandardScaler() 19 | ) -> None: 20 | print("Initializing data...") 21 | 22 | self.data = df 23 | self.metadata = metadata 24 | self.all_features = self.data.drop(columns=self.metadata).columns 25 | 26 | self.train_df = self.data[self.data["type"] == "train"].copy() 27 | self.dev_df = self.data[self.data["type"] == "dev"].copy() 28 | self.test_df = self.data[self.data["type"] == "test"].copy() 29 | 30 | ## standardize the features inside the train, dev, and test sets for the selected features 31 | if standardize: 32 | print("Standardizing features...") 33 | cols_to_scale = list(self.all_features) 34 | scaler.fit(self.train_df[cols_to_scale]) 35 | self.train_df.loc[:, cols_to_scale] = scaler.transform( 36 | self.train_df.loc[:, cols_to_scale] 37 | ) 38 | self.dev_df.loc[:, cols_to_scale] = scaler.transform( 39 | self.dev_df.loc[:, cols_to_scale] 40 | ) 41 | self.test_df.loc[:, cols_to_scale] = scaler.transform( 42 | self.test_df.loc[:, cols_to_scale] 43 | ) 44 | self.scaler = scaler 45 | else: 46 | self.scaler = None 47 | 48 | # print('smileFeatureSelector object initialized.\n') 49 | 50 | ############################################################################################ 51 | # 
Feature Selection From Model ############################################################# 52 | ############################################################################################ 53 | class smileFeatureSelectFromModel(smileFeatureSelectorBase): 54 | def __init__( 55 | self, df, metadata, standardize: bool = True, model=RandomForestClassifier() 56 | ): 57 | """ 58 | Initialize the smileFeatureSelectorBruteForce class. 59 | """ 60 | # initialize the base class 61 | super().__init__(df, metadata, standardize) 62 | 63 | # load the model to use for brute force feature selection 64 | self.model = model 65 | print("smileFeatureSelectFromModel object initialized.\n") 66 | 67 | # ... (rest of the methods in smileFeatureSelectFromModel) 68 | def select_features_binary( 69 | self, 70 | max_features=10, 71 | return_df=False, 72 | print_features=True, 73 | return_features=False, 74 | ): 75 | """ 76 | Selects the top num_features features based on the model specified 77 | """ 78 | 79 | # for binary classification 80 | sfm_features = self._run_sfm( 81 | self.train_df, self.dev_df, max_features, multiclass=False 82 | ) 83 | self.binary_feature_set = set(sfm_features) 84 | 85 | if print_features: 86 | print("\nSelected features:.\n") 87 | for count, item in enumerate(self.binary_feature_set): 88 | print("{}. {}".format(count + 1, item)) 89 | 90 | if return_features: 91 | return list(self.binary_feature_set) 92 | 93 | if return_df: 94 | return self.data[ 95 | self.data.columns.intersection( 96 | self.metadata + list(self.binary_feature_set) 97 | ) 98 | ], list(self.binary_feature_set) 99 | 100 | def select_features_multiclass( 101 | self, 102 | max_features=10, 103 | archs="all_archs", 104 | return_df=False, 105 | print_features=True, 106 | return_features=False, 107 | ): 108 | # for multiclass classification 109 | sfm_features = self._run_sfm( 110 | self.train_df, self.dev_df, max_features, multiclass=True 111 | ) 112 | self.multiclass_feature_set = set(sfm_features) 113 | 114 | if print_features: 115 | print("\nSelected features:.\n") 116 | for count, item in enumerate(self.multiclass_feature_set): 117 | print("{}. 
{}".format(count + 1, item)) 118 | 119 | if return_features: 120 | return list(self.multiclass_feature_set) 121 | 122 | if return_df: 123 | return self.data[ 124 | self.data.columns.intersection( 125 | self.metadata + list(self.multiclass_feature_set) 126 | ) 127 | ], list(self.multiclass_feature_set) 128 | 129 | def _run_sfm(self, trdf, dvdf, max_features, multiclass=False): 130 | # split train data into X and y 131 | X_train = trdf.drop(columns=self.metadata).copy() 132 | if multiclass: 133 | y_train = trdf["multiclass_label"].copy() 134 | else: 135 | y_train = trdf["label"].copy() 136 | 137 | # instantiating the model and fitting it 138 | sfm_model = SelectFromModel(self.model, max_features=max_features) 139 | sfm_model.fit(X_train, y_train) 140 | 141 | # getting the selected features 142 | sfm_features = list(X_train.columns[sfm_model.get_support()]) 143 | return sfm_features 144 | import pandas as pd 145 | -------------------------------------------------------------------------------- /src/packages/ModelManager.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.metrics import accuracy_score, log_loss, roc_curve 3 | import pandas as pd 4 | from sklearn.svm import SVC 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.neighbors import KNeighborsClassifier 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.ensemble import RandomForestClassifier 9 | import numpy as np 10 | from scipy.optimize import brentq 11 | from scipy.interpolate import interp1d 12 | 13 | VALID_MODELS = ["svm", "logreg", "knn", "decision_tree", "random_forest"] 14 | 15 | 16 | class ModelManager: 17 | def __init__(self, model_name, data, feature_cols, merge_train_dev: bool = False): 18 | self.model_name = model_name 19 | self.data = data 20 | self._splitDataframe(merge_train_dev=merge_train_dev) 21 | self.init_model() 22 | self.feature_cols = feature_cols 23 | 24 | #model initialization 25 | def init_model(self, params=None): 26 | assert ( 27 | self.model_name.lower() in VALID_MODELS 28 | ), f"{self.model_name} is not valid. 
Valid models include {VALID_MODELS}" 29 | 30 | if self.model_name == "svm": 31 | if params is None: 32 | self.model = SVC() 33 | else: 34 | self.model = SVC(**params) 35 | elif self.model_name == "logreg": 36 | if params is None: 37 | self.model = LogisticRegression() 38 | else: 39 | self.model = LogisticRegression(**params) 40 | elif self.model_name == "knn": 41 | if params is None: 42 | self.model = KNeighborsClassifier() 43 | else: 44 | self.model = KNeighborsClassifier(**params) 45 | elif self.model_name == "decision_tree": 46 | if params is None: 47 | self.model = DecisionTreeClassifier() 48 | else: 49 | self.model = DecisionTreeClassifier(**params) 50 | elif self.model_name == "random_forest": 51 | if params is None: 52 | self.model = RandomForestClassifier(random_state=12) 53 | else: 54 | self.model = RandomForestClassifier(**params) 55 | 56 | def _splitDataframe(self, merge_train_dev: bool): 57 | 58 | if merge_train_dev: 59 | self.train = self.data[ 60 | (self.data.type == "train") | (self.data.type == "dev") 61 | ] 62 | self.dev = None 63 | else: 64 | self.train = self.data[(self.data.type == "train")] 65 | self.dev = self.data[(self.data.type == "dev")] 66 | 67 | self.test = self.data[(self.data.type == "test")] 68 | 69 | def trainModel(self, label_col: str): 70 | # Train the model using the training data 71 | self.y_train = self.train[label_col] 72 | self.X_train = self.train[self.feature_cols].copy() 73 | 74 | self.X_train.to_csv("/home/ubuntu/features.csv", index=False) 75 | 76 | self.model.fit(self.X_train, self.y_train) 77 | 78 | def predict(self, label_col: str): 79 | # Make predictions on the test data 80 | self.y_test = self.test[label_col] 81 | self.X_test = self.test[self.feature_cols].copy() 82 | 83 | self.y_pred = self.model.predict(self.X_test) 84 | 85 | # Calculate accuracy and log loss 86 | self.accuracy = accuracy_score(self.y_test, self.y_pred) 87 | 88 | self.class_accuracy = {} 89 | cls_y_test = self.y_test.copy() 90 | cls_y_test = cls_y_test.reset_index(drop=True) 91 | for cls in range(len(set(self.y_test))): 92 | cls_name = self.data.loc[ 93 | self.data[label_col] == cls, "architecture" 94 | ].unique()[0] 95 | 96 | cls_idx = np.where(self.y_test == cls)[0] 97 | cls_test = cls_y_test[cls_idx] 98 | cls_pred = self.y_pred[cls_idx] 99 | self.class_accuracy[cls_name] = accuracy_score(cls_test, cls_pred) 100 | 101 | self.eer_score, self.eer_threshold = None, None 102 | 103 | if self.model_name not in ["svm"]: 104 | self.y_prob = self.model.predict_proba(self.X_test) 105 | self.log_loss_value = log_loss(self.y_test, self.y_prob) 106 | 107 | #calculate eer score 108 | if "multi" not in label_col: 109 | self.eer_score, self.eer_threshold = self.calculate_eer() 110 | 111 | return ( 112 | self.accuracy, 113 | self.log_loss_value, 114 | self.eer_score, 115 | self.eer_threshold, 116 | ) 117 | 118 | self.log_loss_value = None 119 | return self.accuracy, self.log_loss_value, self.eer_score, self.eer_threshold 120 | 121 | #train and predict using model 122 | def trainPredict(self, label_col: str): 123 | self.trainModel(label_col=label_col) 124 | acc, log_loss, eer_score, eer_threshold = self.predict(label_col=label_col) 125 | return acc, log_loss, eer_score, eer_threshold 126 | 127 | def plotRocCurve(self): 128 | # Create a ROC curve plot 129 | fpr, tpr, _ = roc_curve(self.y_test, self.y_prob[:, 1]) 130 | plt.plot(fpr, tpr) 131 | plt.xlabel("False Positive Rate") 132 | plt.ylabel("True Positive Rate") 133 | plt.title("ROC Curve") 134 | plt.show() 135 | 136 | def 
plotProbaDistribution(self): 137 | # Create a histogram of test set probability scores 138 | plt.hist(self.y_prob) 139 | plt.xlabel("Probability Score") 140 | plt.ylabel("Frequency") 141 | plt.title("Test Set Probability Score Distribution") 142 | plt.show() 143 | 144 | def calculate_eer(self): 145 | # Calculate the False Positive Rate (FPR) and True Positive Rate (TPR) 146 | fpr, tpr, thresholds = roc_curve(self.y_test, self.y_prob[:, 1], pos_label=1) 147 | 148 | # Interpolate the FPR and TPR values 149 | interpolated = interp1d(fpr, tpr) 150 | 151 | # Find the point where FAR and FRR are equal (EER) 152 | eer = brentq(lambda x: 1.0 - x - interpolated(x), 0.0, 1.0) 153 | 154 | optimal_threshold = thresholds[np.nanargmin(np.abs((1.0 - tpr) - fpr))] 155 | 156 | return eer, optimal_threshold 157 | -------------------------------------------------------------------------------- /src/packages/AudioManager.py: -------------------------------------------------------------------------------- 1 | from pydub import AudioSegment 2 | import os 3 | from packages.LibrosaManager import LibrosaManager 4 | import soundfile as sf 5 | import librosa 6 | import numpy as np 7 | import random 8 | import shutil 9 | 10 | #utilities for converting audio files to appropriate sample rates 11 | #and for performing adversarial laundering 12 | class AudioManager: 13 | def __init__(self) -> None: 14 | pass 15 | 16 | def convertAudioDirectory( 17 | self, 18 | audio_dir: str, 19 | input_format: str, 20 | output_format: str = ".wav", 21 | output_dir: str = None, 22 | delete_original: bool = False, 23 | bitrate: str = None, 24 | codec: str = None, 25 | ): 26 | for file in os.listdir(audio_dir): 27 | if input_format in file: 28 | self.convertAudioFileTypes( 29 | os.path.join(audio_dir, file), 30 | output_format=output_format, 31 | delete_original=delete_original, 32 | output_dir=output_dir, 33 | bitrate=bitrate, 34 | codec=codec, 35 | ) 36 | 37 | def convertAudioFileTypes( 38 | self, 39 | audio_path: str, 40 | output_format: str = ".wav", 41 | delete_original: bool = False, 42 | output_dir: str = None, 43 | output_file_name: str = None, 44 | bitrate: str = None, 45 | codec: str = None, 46 | ): 47 | assert output_format in [ 48 | ".wav", 49 | ".mp4", 50 | ], f"{output_format} is an invalid output format. Please enter types: (.wav, .mp4)." 
51 | try: 52 | import_audio = AudioSegment.from_file(audio_path) 53 | 54 | if isinstance(output_file_name, type(None)): 55 | output_file_name = os.path.basename(audio_path) 56 | output_file_name = output_file_name.replace( 57 | os.path.splitext(output_file_name)[1], output_format 58 | ) 59 | 60 | if not output_dir: 61 | output_dir = os.path.dirname(audio_path) 62 | 63 | import_audio.export( 64 | os.path.join(output_dir, output_file_name), 65 | format=output_format.replace(".", ""), 66 | codec=codec, 67 | bitrate=bitrate, 68 | ) 69 | 70 | if delete_original: 71 | os.remove(audio_path) 72 | 73 | except Exception as e: 74 | print(f"Failed to Convert Audio File: {audio_path}") 75 | print("Error: ", e) 76 | 77 | #resampling 78 | def resampleAudioDirectory( 79 | self, 80 | input_directory: str, 81 | output_directory: str, 82 | target_sample_rate: int, 83 | replace_existing: bool = False, 84 | ): 85 | for file in os.listdir(input_directory): 86 | if os.path.splitext(file)[1] not in [".wav", ".mp4", ".WAV"]: 87 | continue 88 | 89 | if not replace_existing: 90 | if os.path.isfile(os.path.join(output_directory, file)): 91 | continue 92 | 93 | try: 94 | librosa_manager = LibrosaManager(os.path.join(input_directory, file)) 95 | resampled_audio = librosa_manager.resample( 96 | target_sample_rate 97 | ) ## SB_Comment - see librosa manager re: resampling 98 | sf.write( 99 | os.path.join(output_directory, file), 100 | resampled_audio, 101 | target_sample_rate, 102 | subtype="PCM_24", 103 | ) 104 | except Exception as e: 105 | print(f"Failed to Resample: {file}") 106 | print(f"Error Msg: {e}") 107 | print() 108 | 109 | #function for adding noise to audio 110 | def addNoiseWithSnr(self, audio_path: str, snr_range: list = [10, 80]): 111 | audio, sr = librosa.load( 112 | audio_path 113 | ) 114 | 115 | audio_power = np.mean(audio**2) 116 | 117 | noise_snr = random.randint(snr_range[0], snr_range[1]) 118 | noise_power = audio_power / (10 ** (noise_snr / 10)) 119 | noise = np.random.normal(scale=np.sqrt(noise_power) * 100, size=len(audio)) 120 | 121 | noisy_audio = audio + noise 122 | 123 | return noisy_audio, noise_snr, sr 124 | 125 | #adversarial laundering 126 | def launderAudioDirectory( 127 | self, 128 | input_dir: str, 129 | output_dir: str, 130 | noise_type: str = "random_gaussian", 131 | replace_existing: bool = False, 132 | transcode_prob=0.5, 133 | noise_prob=0.5, 134 | ): 135 | full_launder_details = [] 136 | 137 | # Loop through files for laundering them 138 | for file in os.listdir(input_dir): 139 | 140 | file_launder_details = [os.path.join(input_dir, file), 0, None, 0, None] 141 | 142 | try: 143 | #get flags for laundering 144 | is_transcode = np.random.rand() <= transcode_prob 145 | is_noise = np.random.rand() <= noise_prob 146 | 147 | bitrate_options = ["64k", "127k", "196k"] 148 | 149 | #transcoding 150 | if is_transcode: 151 | bitrate = random.choice(bitrate_options) 152 | 153 | file_launder_details[1] = 1 154 | file_launder_details[2] = bitrate 155 | 156 | self.convertAudioFileTypes( 157 | os.path.join(input_dir, file), 158 | output_dir=output_dir, 159 | output_format=".mp4", 160 | delete_original=False, 161 | bitrate=bitrate, 162 | codec="aac", 163 | ) 164 | 165 | self.convertAudioFileTypes( 166 | os.path.join(output_dir, file.replace("wav", "mp4")), 167 | output_format=".wav", 168 | delete_original=True, 169 | ) 170 | 171 | else: 172 | # if no transcode is necessary, just move the file to the new directory 173 | shutil.copy( 174 | os.path.join(input_dir, file), os.path.join(output_dir, 
file) 175 | ) 176 | 177 | #adding noise 178 | if is_noise: 179 | noisy_audio, noise_snr, sr = self.addNoiseWithSnr( 180 | os.path.join(output_dir, file) 181 | ) 182 | 183 | file_launder_details[3] = 1 184 | file_launder_details[4] = noise_snr 185 | 186 | sf.write( 187 | os.path.join(output_dir, file), noisy_audio, sr 188 | ) 189 | 190 | full_launder_details.append(file_launder_details) 191 | 192 | except Exception as e: 193 | print(f"Failed to add noise: {file}") 194 | print(f"Error Msg: {e}") 195 | print() 196 | 197 | return full_launder_details 198 | -------------------------------------------------------------------------------- /src/run_pipeline_ljspeech.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import os 3 | import sys 4 | import multiprocessing 5 | import pandas as pd 6 | import mlflow 7 | import time 8 | import argparse 9 | 10 | # local packages 11 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 12 | import packages.ExperimentPipeline as ep 13 | 14 | 15 | # function that runs the pipeline asynchonously 16 | def run_pipeline( 17 | fake_cols, 18 | metadata_path, 19 | open_smile_feature_count, 20 | run_name_prefix, 21 | run_tags, 22 | models, 23 | create_df_artifact, 24 | label_type="label", 25 | ) -> None: 26 | # create and run pipeline object 27 | exp = ep.ExperimentPipeline(fake_cols, metadata_path) 28 | exp.generate_features( 29 | feature_method="all", open_smile_feature_count=open_smile_feature_count 30 | ) 31 | exp.train_predict_using_models( 32 | run_name_prefix=run_name_prefix, 33 | run_tags=run_tags, 34 | models=models, 35 | create_df_artifact=create_df_artifact, 36 | label_type=label_type, 37 | ) 38 | 39 | 40 | # main function 41 | def main(experiment_name, open_smile_feature_count, create_df_artifact, num_processes): 42 | # start timing 43 | start_time = time.time() 44 | 45 | print("\nRunning pipeline for experiment: \n", experiment_name) 46 | mlflow.set_experiment(experiment_name) 47 | 48 | print("\nopen_smile_feature_count: \n", open_smile_feature_count) 49 | print("\ncreate_df_artifact: \n", create_df_artifact) 50 | print("\nnum_processes: \n", num_processes) 51 | print( 52 | "\nusing {} processes out of {} available processes: \n".format( 53 | num_processes, multiprocessing.cpu_count() 54 | ) 55 | ) 56 | 57 | # set the models to run 58 | models = ["logreg", "random_forest"] 59 | 60 | #################################### 61 | ##### start mutliprocessing ######## 62 | #################################### 63 | 64 | # Create a pool of worker processes 65 | pool = multiprocessing.Pool(processes=num_processes) 66 | 67 | # list for holding task arguments 68 | task_args = [] 69 | 70 | ###################################### 71 | ##### tasks for unlaundered data ##### 72 | ###################################### 73 | 74 | # mlflow tag setting 75 | run_tags = {"laundered": 0} 76 | # metadata path 77 | metadata_path_unlaundered = ( 78 | "/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv" 79 | ) 80 | 81 | # pipeline params 82 | run_params = {} 83 | run_params["EL"] = ["ElevenLabs"] 84 | run_params["UD"] = ["UberDuck"] 85 | run_params["WF"] = ["RandWaveFake"] 86 | run_params["EL_UD"] = ["ElevenLabs", "UberDuck"] 87 | run_params["EL_UD_WF"] = ["ElevenLabs", "UberDuck", "RandWaveFake"] 88 | run_params["EL_UD_Fake"] = ["EL_UD_Fake"] 89 | run_params["Fake"] = ["Fake"] 90 | 91 | # get the task params for unlaundered data 92 | for run_name_prefix, fake_cols in run_params.items(): 93 | # binary 
classifiaction tasks 94 | if len(fake_cols) == 1: 95 | # get args tuple and append to task_args list 96 | args = ( 97 | fake_cols, 98 | metadata_path_unlaundered, 99 | open_smile_feature_count, 100 | run_name_prefix, 101 | run_tags, 102 | models, 103 | create_df_artifact, 104 | "label", 105 | ) 106 | task_args.append(args) 107 | 108 | # multiclass classification tasks 109 | else: 110 | # get args tuple and append to task_args list 111 | args = ( 112 | fake_cols, 113 | metadata_path_unlaundered, 114 | open_smile_feature_count, 115 | run_name_prefix, 116 | run_tags, 117 | models, 118 | create_df_artifact, 119 | "multiclass_label", 120 | ) 121 | task_args.append(args) 122 | 123 | #################################### 124 | ##### tasks for laundered data ##### 125 | #################################### 126 | 127 | # mlflow tag setting 128 | run_tags = {"laundered": 1} 129 | # metadata path 130 | metadata_path_laundered = ( 131 | "/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv" 132 | ) 133 | 134 | # pipeline params 135 | run_params = {} 136 | run_params["EL"] = ["ElevenLabs"] 137 | run_params["UD"] = ["UberDuck"] 138 | run_params["WF"] = ["RandWaveFake"] 139 | run_params["EL_UD"] = ["ElevenLabs", "UberDuck"] 140 | run_params["EL_UD_WF"] = ["ElevenLabs", "UberDuck", "RandWaveFake"] 141 | run_params["EL_UD_Fake"] = ["EL_UD_Fake"] 142 | run_params["Fake"] = ["Fake"] 143 | 144 | # get the task params for laundered data 145 | for run_name_prefix, fake_cols in run_params.items(): 146 | # binary classifiaction tasks 147 | if len(fake_cols) == 1: 148 | # get args tuple and append to task_args list 149 | args = ( 150 | fake_cols, 151 | metadata_path_laundered, 152 | open_smile_feature_count, 153 | run_name_prefix, 154 | run_tags, 155 | models, 156 | create_df_artifact, 157 | "label", 158 | ) 159 | task_args.append(args) 160 | 161 | # multiclass classification tasks 162 | else: 163 | # get args tuple and append to task_args list 164 | args = ( 165 | fake_cols, 166 | metadata_path_laundered, 167 | open_smile_feature_count, 168 | run_name_prefix, 169 | run_tags, 170 | models, 171 | create_df_artifact, 172 | "multiclass_label", 173 | ) 174 | task_args.append(args) 175 | 176 | #################################### 177 | ##### run multiprocessing ########## 178 | #################################### 179 | 180 | # run the pipeline in parallel 181 | pool.starmap_async(run_pipeline, task_args) 182 | 183 | # close the pool and wait for the work to finish 184 | pool.close() 185 | pool.join() 186 | 187 | # end timing 188 | end_time = time.time() 189 | execution_time_seconds = end_time - start_time 190 | 191 | # convert to minutes 192 | execution_time_minutes = execution_time_seconds / 60 193 | 194 | print("\nAll async pipeline runs complete \n") 195 | print(f"Execution time: {execution_time_minutes} minutes") 196 | 197 | 198 | # main function 199 | if __name__ == "__main__": 200 | # Create an argument parser 201 | parser = argparse.ArgumentParser(description="Run pipeline") 202 | 203 | # Add the command-line arguments 204 | parser.add_argument("experiment_name", type=str, help="Name of the experiment") 205 | parser.add_argument( 206 | "--create_df_artifact", 207 | action="store_true", 208 | help="Flag to enable creating df artifact", 209 | ) 210 | parser.add_argument( 211 | "--open_smile_feature_count", 212 | type=int, 213 | default=20, 214 | help="Value for open smile feature count", 215 | ) 216 | parser.add_argument( 217 | "--num_processes", 218 | type=int, 219 | default=15, 220 | help="Number of 
parallel processes to run", 221 | ) 222 | 223 | # Parse the command-line arguments 224 | args = parser.parse_args() 225 | 226 | # Check if the experiment name is provided 227 | if not args.experiment_name: 228 | parser.error("Experiment name is required.") 229 | 230 | # Extract the arguments 231 | experiment_name = args.experiment_name 232 | create_df_artifact = args.create_df_artifact 233 | open_smile_feature_count = args.open_smile_feature_count 234 | num_processes = args.num_processes 235 | 236 | # Call the main function with the arguments 237 | main(experiment_name, open_smile_feature_count, create_df_artifact, num_processes) 238 | -------------------------------------------------------------------------------- /src/packages/TIMITDataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pathlib 4 | from random import sample, seed, shuffle 5 | import pandas as pd 6 | import numpy as np 7 | 8 | # class for loading TIMIT data for multivoice experiments 9 | class TIMITDataLoader: 10 | def __init__(self, data_path: str, id_col: str = "id") -> None: 11 | self.file_path = data_path 12 | # set seed 13 | seed(12) 14 | 15 | def flatten(self, l): 16 | return [item for sublist in l for item in sublist] 17 | 18 | def get_all_files(self): 19 | files = [] 20 | for r, d, f in os.walk(self.file_path): 21 | for file in f: 22 | if ".wav" in file.lower(): 23 | files.append(os.path.join(r, file)) 24 | 25 | cleaned_files = [item for item in files if not "_processed" in item] 26 | 27 | return cleaned_files 28 | 29 | # generate train-dev-test split 30 | def generate_split(self, folder=False, balanced=True): 31 | if folder: 32 | data_df = self.generateFinalDataFrame_folder() 33 | else: 34 | data_df = self.generateFinalDataFrame(balanced=balanced) 35 | 36 | indices = list(data_df.index) 37 | 38 | shuffle(indices) 39 | total_samples = len(indices) 40 | train_size = int(0.6 * total_samples) 41 | dev_size = int(0.2 * total_samples) 42 | 43 | train_indices = indices[:train_size] 44 | dev_indices = indices[train_size : train_size + dev_size] 45 | test_indices = indices[train_size + dev_size :] 46 | 47 | data_df.loc[train_indices, "type"] = "train" 48 | data_df.loc[dev_indices, "type"] = "dev" 49 | data_df.loc[test_indices, "type"] = "test" 50 | 51 | train_count = data_df[data_df["type"] == "train"].shape[0] 52 | dev_count = data_df[data_df["type"] == "dev"].shape[0] 53 | test_count = data_df[data_df["type"] == "test"].shape[0] 54 | 55 | print(f"# of Train instances: {train_count}") 56 | print(f"# of Dev instances: {dev_count}") 57 | print(f"# of Test instances: {test_count}") 58 | 59 | return data_df 60 | 61 | def generate_split_speaker( 62 | self, speakers_to_remove, folder=False 63 | ): 64 | if folder: 65 | data_df = self.generateFinalDataFrame_folder() 66 | else: 67 | data_df = self.generateFinalDataFrame() 68 | 69 | data_df["speaker"] = [ 70 | item.split("/")[-1].split("_")[0] for item in data_df["path"] 71 | ] 72 | data_df["remove"] = [ 73 | 1 if item in speakers_to_remove else 0 for item in data_df["speaker"] 74 | ] 75 | 76 | data_df_without_test_speakers = data_df[data_df["remove"] == 0] 77 | data_df_with_test_speakers = data_df[data_df["remove"] == 1] 78 | 79 | cleaned_indices = list(data_df_without_test_speakers.index) 80 | removed_indices = list(data_df_with_test_speakers.index) 81 | 82 | shuffle(cleaned_indices) 83 | total_samples = len(cleaned_indices) 84 | train_size = int(0.6 * len(cleaned_indices)) 85 | dev_size = int(0.2 * 
len(cleaned_indices)) 86 | 87 | train_indices = cleaned_indices[:train_size] 88 | dev_indices = cleaned_indices[train_size : train_size + dev_size] 89 | test_indices = cleaned_indices[train_size + dev_size :] 90 | 91 | data_df.loc[train_indices, "type"] = "train" 92 | data_df.loc[dev_indices, "type"] = "dev" 93 | data_df.loc[test_indices, "type"] = "test" 94 | 95 | # Drop the original 'test' indices 96 | data_df = data_df[data_df.type != "test"] 97 | 98 | # Set the left out speakers to be the only 'test' indices 99 | data_df.loc[removed_indices, "type"] = "test" 100 | 101 | # Clean up dataframe 102 | data_df.drop(["remove"], axis=1, inplace=True) 103 | 104 | train_count = data_df[data_df["type"] == "train"].shape[0] 105 | dev_count = data_df[data_df["type"] == "dev"].shape[0] 106 | test_count = data_df[data_df["type"] == "test"].shape[0] 107 | 108 | print(f"# of Train instances: {train_count}") 109 | print(f"# of Dev instances: {dev_count}") 110 | print(f"# of Test instances: {test_count}") 111 | 112 | return data_df.reset_index(drop=True) 113 | 114 | def generateFinalDataFrame(self, balanced: bool = True): 115 | 116 | # Get resampled real and fake files 117 | all_wav_files = pathlib.Path(self.file_path) 118 | all_wav_files = list(all_wav_files.rglob("*.wav")) + list( 119 | all_wav_files.rglob("*.WAV") 120 | ) 121 | 122 | real_resampled_wav_files = [ 123 | str(file) for file in all_wav_files if "real" in str(file) 124 | ] 125 | fake_resampled_wav_files = [ 126 | str(file) for file in all_wav_files if "fake/" in str(file) 127 | ] 128 | 129 | # Extract phrases and file names 130 | final_folders = [] 131 | 132 | for folder in os.listdir(self.file_path): 133 | phrase_files = [ 134 | phrase for phrase in real_resampled_wav_files if folder in phrase 135 | ] 136 | 137 | file_names = set( 138 | [name.split("_")[-1].split(".")[0] for name in phrase_files] 139 | ) 140 | 141 | if len(file_names) > 1: 142 | continue 143 | 144 | # Ensure each file has at least 2 real samples 145 | elif len(phrase_files) > 1: 146 | final_folders.append(folder) 147 | 148 | print(len(final_folders)) 149 | 150 | real_files = [] 151 | fake_files = [] 152 | 153 | print(f"Params: {len(final_folders)} different phrases") 154 | 155 | # Remove any potential duplicates 156 | file_dict = {} 157 | for i in range(len(real_resampled_wav_files)): 158 | file_name = real_resampled_wav_files[i].split("/")[-1] 159 | file_dict[file_name] = real_resampled_wav_files[i] 160 | 161 | real_resampled_wav_files = [file_dict[item] for item in file_dict.keys()] 162 | 163 | for n in range(len(final_folders)): 164 | phrase = final_folders[n] 165 | 166 | real_examples = [ 167 | file for file in real_resampled_wav_files if f"_{phrase}." in file 168 | ] 169 | real_examples = [ 170 | file for file in real_resampled_wav_files if f"/{phrase}/" in file 171 | ] 172 | 173 | fake_examples = [ 174 | file for file in fake_resampled_wav_files if f"_{phrase}." 
in file 175 | ] 176 | fake_examples = [ 177 | file for file in fake_resampled_wav_files if f"/{phrase}/" in file 178 | ] 179 | 180 | # Ensure we take the same number of each phrase for real and fake, downsample the real/fake files accordingly 181 | if len(real_examples) > len(fake_examples): 182 | real_examples = sample(real_examples, len(fake_examples)) 183 | else: 184 | fake_examples = sample(fake_examples, len(real_examples)) 185 | 186 | [real_files.append(file) for file in real_examples] 187 | [fake_files.append(file) for file in fake_examples] 188 | 189 | balanced_real_paths = real_files 190 | balanced_fake_paths = fake_files 191 | 192 | df = pd.DataFrame( 193 | { 194 | "type": [ 195 | "tbc" 196 | for i in range(len(balanced_real_paths) + len(balanced_fake_paths)) 197 | ], 198 | "id": [ 199 | i 200 | for i in range(len(balanced_real_paths) + len(balanced_fake_paths)) 201 | ], 202 | "architecture": [0 for item in balanced_real_paths] 203 | + [1 for item in balanced_fake_paths], 204 | "orig_path": balanced_real_paths + balanced_fake_paths, 205 | "label": [0 for item in balanced_real_paths] 206 | + [1 for item in balanced_fake_paths], 207 | "multiclass_label": [0 for item in balanced_real_paths] 208 | + [1 for item in balanced_fake_paths], 209 | } 210 | ) 211 | 212 | downsampled_src = "/home/ubuntu/data/TIMIT_and_ElevenLabs/16KHz" 213 | orig_paths = df["orig_path"].tolist() 214 | downsampled_paths = [ 215 | os.path.join(downsampled_src, os.path.basename(path)) for path in orig_paths 216 | ] 217 | 218 | df["path"] = downsampled_paths 219 | 220 | return df 221 | -------------------------------------------------------------------------------- /pip_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | aiohttp==3.8.4 3 | aiosignal==1.3.1 4 | alabaster==0.7.13 5 | alembic==1.11.1 6 | aniso8601==9.0.1 7 | antlr4-python3-runtime==4.9.3 8 | anyio==3.6.2 9 | appdirs==1.4.4 10 | argon2-cffi==21.3.0 11 | argon2-cffi-bindings==21.2.0 12 | astor==0.8.1 13 | asttokens==2.2.1 14 | astunparse==1.6.3 15 | async-timeout==4.0.2 16 | attrdict==2.0.1 17 | attrs==22.2.0 18 | audeer==1.19.0 19 | audformat==0.16.1 20 | audinterface==1.0.1 21 | audiofile==1.2.1 22 | audioread==3.0.0 23 | audmath==1.2.1 24 | audobject==0.7.9 25 | audresample==1.2.1 26 | av==10.0.0 27 | Babel==2.10.3 28 | backcall==0.2.0 29 | bcrypt==4.0.1 30 | beautifulsoup4==4.11.2 31 | black==19.10b0 32 | boto3==1.26.81 33 | botocore==1.29.81 34 | braceexpand==0.1.7 35 | cachetools==5.3.0 36 | certifi @ file:///croot/certifi_1671487769961/work/certifi 37 | cffi==1.15.1 38 | chardet==5.1.0 39 | charset-normalizer==2.1.1 40 | click==8.1.3 41 | cloudpickle==2.2.1 42 | colorama==0.4.6 43 | comm==0.1.3 44 | commonmark==0.9.1 45 | cryptography==41.0.1 46 | cycler==0.11.0 47 | Cython==0.29.33 48 | databricks-cli==0.17.7 49 | datasets==2.10.1 50 | debugpy==1.6.6 51 | decorator==5.1.1 52 | deeplake==3.2.11 53 | dill==0.3.6 54 | Distance==0.1.3 55 | disvoice==0.1.8 56 | disvoice-prosody==0.0.5 57 | dm-tree==0.1.8 58 | docker==6.1.3 59 | docker-pycreds==0.4.0 60 | docopt==0.6.2 61 | docutils==0.19 62 | editdistance==0.6.2 63 | einops==0.6.0 64 | entrypoints @ file:///tmp/build/80754af9/entrypoints_1649926445639/work 65 | etils==1.0.0 66 | evdev==1.6.1 67 | exceptiongroup==1.1.0 68 | executing==1.2.0 69 | faiss-cpu==1.7.3 70 | fastapi==0.92.0 71 | fastjsonschema==2.16.2 72 | fasttext==0.9.2 73 | ffmpy==0.3.0 74 | filelock==3.9.0 75 | Flask==2.2.3 76 | Flask-RESTful==0.3.9 77 | 
flatbuffers==23.1.21 78 | flit_core @ file:///opt/conda/conda-bld/flit-core_1644941570762/work/source/flit_core 79 | fonttools==4.38.0 80 | frozenlist==1.3.3 81 | fsspec==2023.1.0 82 | ftfy==6.1.1 83 | future==0.18.3 84 | g2p-en==2.1.0 85 | gast==0.5.3 86 | gdown==4.6.4 87 | gitdb==4.0.10 88 | GitPython==3.1.31 89 | google-api-core==2.11.0 90 | google-api-python-client==2.83.0 91 | google-auth==1.4.2 92 | google-auth-httplib2==0.1.0 93 | google-auth-oauthlib==0.4.6 94 | google-cloud-texttospeech==2.14.1 95 | googleapis-common-protos==1.58.0 96 | gradio==3.4.0 97 | greenlet==2.0.2 98 | grpcio==1.53.0 99 | grpcio-status==1.53.0 100 | gunicorn==20.1.0 101 | h11==0.12.0 102 | h5py==3.8.0 103 | htmlmin==0.1.12 104 | httpcore==0.15.0 105 | httplib2==0.22.0 106 | httpx==0.23.3 107 | hub==3.0.1 108 | huggingface-hub==0.12.1 109 | humbug==0.2.8 110 | hydra-core==1.2.0 111 | idna==3.4 112 | ijson==3.2.0.post0 113 | ImageHash==4.3.1 114 | imageio==2.4.1 115 | imageio-ffmpeg==0.4.8 116 | imagesize==1.4.1 117 | importlib-metadata==6.1.0 118 | importlib-resources==5.12.0 119 | inflect==6.0.2 120 | iniconfig==2.0.0 121 | ipadic==1.0.0 122 | ipykernel==6.22.0 123 | ipython==8.12.0 124 | ipywidgets==8.0.2 125 | iso-639==0.4.5 126 | iso3166==2.1.1 127 | isort==4.3.21 128 | itsdangerous==2.1.2 129 | jedi==0.18.2 130 | jieba==0.42.1 131 | Jinja2==3.1.2 132 | jiwer==2.5.1 133 | jmespath==1.0.1 134 | joblib==1.2.0 135 | json5==0.9.10 136 | jupyter-server==1.21.0 137 | jupyter_client==8.1.0 138 | jupyter_core==5.3.0 139 | jupyterlab==3.5.0 140 | jupyterlab-pygments==0.2.2 141 | jupyterlab-widgets==3.0.3 142 | jupyterlab_server==2.16.1 143 | kaldi-io==0.9.5 144 | kaldi-python-io==1.2.2 145 | kaldiio==2.17.2 146 | kiwisolver==1.4.4 147 | latexcodec==2.0.1 148 | lazy_loader==0.1 149 | Levenshtein==0.20.2 150 | libclang==15.0.6.1 151 | librosa==0.10.0 152 | lightning-utilities==0.7.1 153 | linkify-it-py==2.0.0 154 | llvmlite==0.39.1 155 | loguru==0.6.0 156 | lxml==4.9.2 157 | Mako==1.2.4 158 | Markdown==3.4.1 159 | markdown-it-py==2.2.0 160 | MarkupSafe==2.1.1 161 | marshmallow==3.19.0 162 | matplotlib==3.5.3 163 | matplotlib-inline==0.1.6 164 | mdit-py-plugins==0.3.4 165 | mdurl==0.1.2 166 | mecab-python3==1.0.5 167 | missingno==0.5.1 168 | mistune==2.0.4 169 | mkl-fft==1.3.1 170 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186064646/work 171 | mkl-service==2.4.0 172 | mlflow==2.4.1 173 | moviepy==1.0.3 174 | mpmath==1.2.1 175 | msgpack==1.0.4 176 | multidict==6.0.4 177 | multimethod==1.9 178 | multiprocess==0.70.14 179 | nbclassic==0.4.7 180 | nbclient==0.7.0 181 | nbconvert==7.2.2 182 | nbformat==5.7.0 183 | nemo-toolkit==1.15.0 184 | nest-asyncio==1.5.6 185 | nltk==3.8.1 186 | notebook_shim==0.2.0 187 | numba==0.56.4 188 | numcodecs==0.11.0 189 | numexpr==2.8.4 190 | numpy==1.23.4 191 | oauth2client==4.1.3 192 | oauthlib==3.2.2 193 | omegaconf==2.2.3 194 | onnx==1.13.1 195 | OpenCC==1.1.6 196 | opensmile==2.4.2 197 | opt-einsum==3.3.0 198 | orjson==3.8.6 199 | oyaml==1.0 200 | packaging==23.0 201 | pandas==1.5.1 202 | pandas-profiling==3.4.0 203 | pandasgui==0.2.14 204 | pandastable==0.13.1 205 | pangu==4.0.6.1 206 | parameterized==0.8.1 207 | paramiko==3.2.0 208 | parso==0.8.3 209 | pathos==0.3.0 210 | pathspec==0.11.0 211 | pathtools==0.1.2 212 | patsy==0.5.3 213 | pexpect==4.8.0 214 | phik==0.12.2 215 | phonet==0.3.7 216 | pickleshare==0.7.5 217 | Pillow==9.4.0 218 | pip-api==0.0.30 219 | pipreqs==0.4.11 220 | plac==1.3.5 221 | platformdirs==3.2.0 222 | plotly==5.15.0 223 | pluggy==1.0.0 224 | 
pooch==1.7.0 225 | portalocker==2.7.0 226 | portpicker==1.2.0 227 | pox==0.3.2 228 | ppft==1.7.6.6 229 | proglog==0.1.10 230 | progress==1.6 231 | promise==2.3 232 | prompt-toolkit==3.0.38 233 | proto-plus==1.22.2 234 | protobuf==3.20.3 235 | psutil==5.9.4 236 | ptyprocess==0.7.0 237 | pure-eval==0.2.2 238 | pyannote.core==5.0.0 239 | pyannote.database==4.1.3 240 | pyannote.metrics==3.2.1 241 | pyarrow==11.0.0 242 | pyasn1==0.4.8 243 | pyasn1-modules==0.2.8 244 | pybind11==2.10.3 245 | pybtex==0.24.0 246 | pybtex-docutils==1.0.2 247 | pycparser==2.21 248 | pycryptodome==3.17 249 | pydantic==1.10.2 250 | PyDrive==1.3.1 251 | pydub==0.25.1 252 | Pygments==2.14.0 253 | PyJWT==2.7.0 254 | PyNaCl==1.5.0 255 | pynput==1.7.6 256 | pyparsing==3.0.9 257 | pypinyin==0.48.0 258 | pypinyin-dict==0.5.0 259 | PyQt5==5.15.9 260 | PyQt5-Qt5==5.15.2 261 | PyQt5-sip==12.12.1 262 | PyQtWebEngine==5.15.6 263 | PyQtWebEngine-Qt5==5.15.2 264 | PySocks==1.7.1 265 | pysptk==0.2.0 266 | pytest==7.2.1 267 | pytest-runner==6.0.0 268 | python-dateutil==2.8.2 269 | python-multipart==0.0.5 270 | python-speech-features==0.6 271 | python-xlib==0.33 272 | pytorch-lightning==1.8.6 273 | pytz==2022.5 274 | PyYAML==6.0 275 | pyzmq==25.0.2 276 | qtstylish==0.1.5 277 | querystring-parser==1.2.4 278 | rapidfuzz==2.13.7 279 | regex==2022.10.31 280 | requests==2.31.0 281 | requests-oauthlib==1.3.1 282 | responses==0.18.0 283 | rfc3986==1.5.0 284 | rich==12.6.0 285 | rsa==4.9 286 | ruamel.yaml==0.17.21 287 | ruamel.yaml.clib==0.2.7 288 | s3transfer==0.6.0 289 | sacrebleu==2.3.1 290 | sacremoses==0.0.53 291 | scikit-learn==1.2.2 292 | scipy==1.9.3 293 | seaborn==0.12.1 294 | Send2Trash==1.8.0 295 | sentence-transformers==2.2.2 296 | sentencepiece==0.1.97 297 | sentry-sdk==1.15.0 298 | setproctitle==1.3.2 299 | shellingham==1.5.0.post1 300 | simplegeneric==0.8.1 301 | six==1.16.0 302 | smmap==5.0.0 303 | sniffio==1.3.0 304 | snowballstemmer==2.2.0 305 | sortedcontainers==2.4.0 306 | soundfile==0.12.1 307 | soupsieve==2.4 308 | sox==1.4.1 309 | soxr==0.3.3 310 | Sphinx==6.1.3 311 | sphinxcontrib-applehelp==1.0.4 312 | sphinxcontrib-bibtex==2.5.0 313 | sphinxcontrib-devhelp==1.0.2 314 | sphinxcontrib-htmlhelp==2.0.1 315 | sphinxcontrib-jsmath==1.0.1 316 | sphinxcontrib-qthelp==1.0.3 317 | sphinxcontrib-serializinghtml==1.1.5 318 | SQLAlchemy==2.0.17 319 | sqlparse==0.4.4 320 | stack-data==0.6.2 321 | starlette==0.25.0 322 | statsmodels==0.13.2 323 | sympy==1.11.1 324 | tabulate==0.9.0 325 | tangled-up-in-unicode==0.2.0 326 | tenacity==8.2.2 327 | tensorboard==2.12.0 328 | tensorboard-data-server==0.7.0 329 | tensorboard-plugin-wit==1.8.1 330 | tensorboardX==2.6 331 | tensorflow==2.11.0 332 | tensorflow-datasets==4.8.3 333 | tensorflow-io==0.31.0 334 | tensorflow-io-gcs-filesystem==0.31.0 335 | tensorflow-metadata==1.12.0 336 | termcolor==2.2.0 337 | terminado==0.13.3 338 | text-unidecode==1.3 339 | textdistance==4.5.0 340 | texterrors==0.4.4 341 | threadpoolctl==3.1.0 342 | tinycss2==1.2.1 343 | tokenizers==0.13.2 344 | toml==0.10.2 345 | tomli==2.0.1 346 | torch==1.13.1 347 | torch-summary==1.4.5 348 | torchaudio==0.13.1 349 | torchmetrics==0.11.1 350 | torchvision==0.2.2 351 | tornado==6.2 352 | tqdm==4.64.1 353 | traitlets==5.9.0 354 | transformers==4.26.1 355 | typed-ast==1.5.4 356 | typer==0.7.0 357 | typing_extensions==4.5.0 358 | uc-micro-py==1.0.1 359 | uritemplate==4.1.1 360 | urllib3==1.26.14 361 | uvicorn==0.20.0 362 | visions==0.7.5 363 | wandb==0.13.10 364 | wcwidth==0.2.6 365 | webdataset==0.1.62 366 | 
websocket-client==1.6.1 367 | websockets==10.4 368 | Werkzeug==2.2.3 369 | wget==3.2 370 | widgetsnbextension==4.0.3 371 | wordcloud==1.9.2 372 | wrapt==1.15.0 373 | xlrd==2.0.1 374 | xxhash==3.2.0 375 | yarg==0.1.9 376 | yarl==1.8.2 377 | yellowbrick==1.5 378 | youtokentome==1.0.6 379 | zipp==3.15.0 380 | -------------------------------------------------------------------------------- /src/run_pipeline_multivoice.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import os 4 | import multiprocessing 5 | import pandas as pd 6 | import mlflow 7 | import time 8 | import argparse 9 | 10 | # local packages 11 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 12 | import packages.ExperimentPipeline as ep 13 | from packages.TIMITDataLoader import TIMITDataLoader 14 | from packages.LJDataLoader import LJDataLoader 15 | from packages.AudioEmbeddingsManager import AudioEmbeddingsManager 16 | from packages.ModelManager import ModelManager 17 | from packages.CadenceModelManager import CadenceModelManager 18 | from packages.SmileFeatureManager import SmileFeatureManager 19 | 20 | # fixed values 21 | timit_data_path = "/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs" 22 | fake_voices = [ 23 | "Adam", 24 | "Antoni", 25 | "Arnold", 26 | "Bella", 27 | "Biden", 28 | "Domi", 29 | "Elli", 30 | "Josh", 31 | "Obama", 32 | "Rachel", 33 | "Sam", 34 | ] 35 | # set the models to run 36 | models = ["logreg", "random_forest"] 37 | 38 | 39 | # helper functions 40 | def chunks(lst, n): 41 | # sort the list 42 | lst.sort() 43 | for i in range(0, len(lst), n): 44 | yield lst[i : i + n] 45 | 46 | 47 | # function that runs the pipeline asynchonously 48 | def run_pipeline( 49 | data_df, open_smile_feature_count, run_name_prefix, run_tags, create_df_artifact 50 | ) -> None: 51 | # create and run pipeline object 52 | exp = ep.ExperimentPipeline( 53 | fake_cols=["ElevenLabs"], metadata_path=None, data_df=data_df 54 | ) 55 | exp.generate_features( 56 | feature_method="all", open_smile_feature_count=open_smile_feature_count 57 | ) 58 | exp.train_predict_using_models( 59 | run_name_prefix=run_name_prefix, 60 | run_tags=run_tags, 61 | models=models, 62 | create_df_artifact=create_df_artifact, 63 | label_type="label", 64 | ) 65 | 66 | 67 | # main function 68 | def main( 69 | experiment_name, 70 | open_smile_feature_count, 71 | create_df_artifact, 72 | num_processes, 73 | save_path, 74 | ): 75 | # start timing 76 | start_time = time.time() 77 | 78 | print("\nRunning pipeline for experiment: \n", experiment_name) 79 | mlflow.set_experiment(experiment_name) 80 | 81 | print("\nopen_smile_feature_count: \n", open_smile_feature_count) 82 | print("\ncreate_df_artifact: \n", create_df_artifact) 83 | print("\nnum_processes: \n", num_processes) 84 | print( 85 | "\nusing {} processes out of {} available processes: \n".format( 86 | num_processes, multiprocessing.cpu_count() 87 | ) 88 | ) 89 | 90 | # load the timit data 91 | timit_data_loader = TIMITDataLoader(timit_data_path) 92 | # generate the split 93 | df = timit_data_loader.generate_split() 94 | # get speakers 95 | df["speaker"] = [item.split("/")[-1].split("_")[0] for item in df["path"]] 96 | 97 | # create partitions 98 | real_speakers = list( 99 | set([item for item in df["speaker"] if not item.startswith(tuple(fake_voices))]) 100 | ) 101 | fake_speakers = list( 102 | set([item for item in df["speaker"] if item.startswith(tuple(fake_voices))]) 103 | ) 104 | 105 | real_speaker_partitions = 
list(chunks(real_speakers, 20)) 106 | fake_speaker_partitions = list(chunks(fake_speakers, 2)) 107 | 108 | #################################### 109 | ##### start mutliprocessing ######## 110 | #################################### 111 | 112 | # Create a pool of worker processes 113 | pool = multiprocessing.Pool(processes=num_processes) 114 | 115 | # list for holding task arguments 116 | task_args = [] 117 | 118 | ###################################### 119 | ############# create tasks ########### 120 | ###################################### 121 | 122 | # counter for labeling runs 123 | counter = 1 124 | 125 | # loop through the partitions to remove voices 126 | for fake_speaker_chunk in fake_speaker_partitions: 127 | for real_speaker_chunk in real_speaker_partitions: 128 | # voices to remove 129 | voices_to_remove = fake_speaker_chunk + real_speaker_chunk 130 | 131 | # re-instantiate the loader 132 | timit_data_loader = TIMITDataLoader(timit_data_path) 133 | 134 | # generating split speaker test from the 135 | data_df = timit_data_loader.generate_split_speaker( 136 | voices_to_remove, folder=False 137 | ) 138 | 139 | # other task arguments 140 | run_name_prefix = f"multivoice_run_{counter}" 141 | run_tags = {"voices_to_remove": voices_to_remove} 142 | 143 | # arguments for the task 144 | args = ( 145 | data_df, 146 | open_smile_feature_count, 147 | run_name_prefix, 148 | run_tags, 149 | create_df_artifact, 150 | ) 151 | 152 | task_args.append(args) 153 | 154 | counter += 1 155 | 156 | #################################### 157 | ##### run multiprocessing ########## 158 | #################################### 159 | 160 | # run the pipeline in parallel 161 | pool.starmap_async(run_pipeline, task_args) 162 | 163 | # close the pool and wait for the work to finish 164 | pool.close() 165 | pool.join() 166 | 167 | #################################### 168 | ####### aggregate results ########## 169 | #################################### 170 | 171 | # get all the runs for the experiment 172 | experiment = mlflow.get_experiment_by_name(experiment_name) 173 | experiment_id = experiment.experiment_id 174 | runs = mlflow.search_runs(experiment_ids=experiment_id) 175 | 176 | # aggregate results and save to csv 177 | agg_results = ( 178 | runs.groupby(["tags.feature_method", "tags.estimator_name", "tags.label_type"])[ 179 | "metrics.accuracy", 180 | "metrics.0_accuracy", 181 | "metrics.1_accuracy", 182 | "metrics.eer_score", 183 | ] 184 | .mean() 185 | .reset_index() 186 | ) 187 | new_column_names = { 188 | "tags.feature_method": "feature_method", 189 | "tags.estimator_name": "estimator_name", 190 | "tags.label_type": "label_type", 191 | "metrics.accuracy": "accuracy", 192 | "metrics.0_accuracy": "real_accuracy", 193 | "metrics.1_accuracy": "fake_accuracy", 194 | "metrics.eer_score": "eer_score", 195 | } 196 | if save_path.lower().endswith(".csv"): 197 | agg_results.to_csv(save_path) 198 | else: 199 | agg_results.to_csv(save_path + f"/results_{experiment_name}.csv", index=False) 200 | 201 | print("\nAggregated results saved to: \n", save_path) 202 | 203 | #################################### 204 | ######### end the script ########### 205 | #################################### 206 | 207 | # end timing 208 | end_time = time.time() 209 | execution_time_seconds = end_time - start_time 210 | 211 | # convert to minutes 212 | execution_time_minutes = execution_time_seconds / 60 213 | 214 | print("\nAll async pipeline runs complete \n") 215 | print(f"Execution time: {execution_time_minutes} minutes") 216 | 217 | 218 | 
# main function 219 | if __name__ == "__main__": 220 | # Create an argument parser 221 | parser = argparse.ArgumentParser(description="Run pipeline") 222 | 223 | # Add the command-line arguments 224 | parser.add_argument("experiment_name", type=str, help="Name of the experiment") 225 | parser.add_argument( 226 | "--create_df_artifact", 227 | action="store_true", 228 | help="Flag to enable creating df artifact", 229 | ) 230 | parser.add_argument( 231 | "--open_smile_feature_count", 232 | type=int, 233 | default=10, 234 | help="Value for open smile feature count", 235 | ) 236 | parser.add_argument( 237 | "--num_processes", 238 | type=int, 239 | default=15, 240 | help="Number of parallel processes to run", 241 | ) 242 | parser.add_argument( 243 | "--save_path", 244 | type=str, 245 | default="results_multivoice.csv", 246 | help="Path of the CSV file to save", 247 | ) 248 | 249 | # Parse the command-line arguments 250 | args = parser.parse_args() 251 | 252 | # Check if the experiment name is provided 253 | if not args.experiment_name: 254 | parser.error("Experiment name is required.") 255 | 256 | # Extract the arguments 257 | experiment_name = args.experiment_name 258 | create_df_artifact = args.create_df_artifact 259 | open_smile_feature_count = args.open_smile_feature_count 260 | num_processes = args.num_processes 261 | save_path = args.save_path 262 | 263 | # Call the main function with the arguments 264 | main( 265 | experiment_name, 266 | open_smile_feature_count, 267 | create_df_artifact, 268 | num_processes, 269 | save_path, 270 | ) 271 | -------------------------------------------------------------------------------- /conda_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=5.1=1_gnu 6 | absl-py=1.4.0=pypi_0 7 | aiohttp=3.8.4=pypi_0 8 | aiosignal=1.3.1=pypi_0 9 | alabaster=0.7.13=pypi_0 10 | alembic=1.11.1=pypi_0 11 | aniso8601=9.0.1=pypi_0 12 | antlr4-python3-runtime=4.9.3=pypi_0 13 | appdirs=1.4.4=pypi_0 14 | astor=0.8.1=pypi_0 15 | asttokens=2.2.1=pypi_0 16 | astunparse=1.6.3=pypi_0 17 | async-timeout=4.0.2=pypi_0 18 | attrdict=2.0.1=pypi_0 19 | attrs=22.2.0=pypi_0 20 | audioread=3.0.0=pypi_0 21 | backcall=0.2.0=pyhd3eb1b0_0 22 | bcrypt=4.0.1=pypi_0 23 | beautifulsoup4=4.11.2=pypi_0 24 | black=19.10b0=pypi_0 25 | blas=1.0=mkl 26 | boto3=1.26.79=pypi_0 27 | botocore=1.29.79=pypi_0 28 | braceexpand=0.1.7=pypi_0 29 | ca-certificates=2023.01.10=h06a4308_0 30 | cachetools=5.3.0=pypi_0 31 | certifi=2022.12.7=py38h06a4308_0 32 | cffi=1.15.1=pypi_0 33 | chardet=5.1.0=pypi_0 34 | click=8.0.2=pypi_0 35 | cloudpickle=2.2.1=pypi_0 36 | colorama=0.4.6=pypi_0 37 | comm=0.1.2=py38h06a4308_0 38 | commonmark=0.9.1=pypi_0 39 | cryptography=39.0.1=pypi_0 40 | cudatoolkit=11.3.1=h2bc3f7f_2 41 | cycler=0.11.0=pypi_0 42 | cython=0.29.33=pypi_0 43 | databricks-cli=0.17.7=pypi_0 44 | debugpy=1.6.6=pypi_0 45 | decorator=5.1.1=pyhd3eb1b0_0 46 | distance=0.1.3=pypi_0 47 | docker=6.1.3=pypi_0 48 | docker-pycreds=0.4.0=pypi_0 49 | docopt=0.6.2=pypi_0 50 | docutils=0.19=pypi_0 51 | editdistance=0.6.2=pypi_0 52 | einops=0.6.0=pypi_0 53 | entrypoints=0.4=py38h06a4308_0 54 | evdev=1.6.1=pypi_0 55 | exceptiongroup=1.1.0=pypi_0 56 | executing=1.2.0=pypi_0 57 | faiss-cpu=1.7.3=pypi_0 58 | fastapi=0.92.0=pypi_0 59 | fasttext=0.9.2=pypi_0 60 | ffmpy=0.3.0=pypi_0 61 | filelock=3.9.0=pypi_0 62 | flask=2.2.3=pypi_0 
63 | flask-restful=0.3.9=pypi_0 64 | flit-core=3.6.0=pyhd3eb1b0_0 65 | freetype=2.12.1=h4a9f257_0 66 | frozenlist=1.3.3=pypi_0 67 | fsspec=2023.1.0=pypi_0 68 | ftfy=6.1.1=pypi_0 69 | future=0.18.3=pypi_0 70 | g2p-en=2.1.0=pypi_0 71 | gast=0.5.3=pypi_0 72 | gdown=4.6.4=pypi_0 73 | giflib=5.2.1=h5eee18b_3 74 | gitdb=4.0.10=pypi_0 75 | gitpython=3.1.31=pypi_0 76 | google-auth=2.16.1=pypi_0 77 | google-auth-oauthlib=0.4.6=pypi_0 78 | gradio=3.4.0=pypi_0 79 | greenlet=2.0.2=pypi_0 80 | grpcio=1.51.3=pypi_0 81 | gunicorn=20.1.0=pypi_0 82 | h11=0.12.0=pypi_0 83 | h5py=3.8.0=pypi_0 84 | httpcore=0.15.0=pypi_0 85 | httpx=0.23.3=pypi_0 86 | huggingface-hub=0.12.1=pypi_0 87 | hydra-core=1.2.0=pypi_0 88 | idna=3.4=pypi_0 89 | ijson=3.2.0.post0=pypi_0 90 | imagesize=1.4.1=pypi_0 91 | importlib-resources=5.12.0=pypi_0 92 | inflect=6.0.2=pypi_0 93 | iniconfig=2.0.0=pypi_0 94 | intel-openmp=2021.4.0=h06a4308_3561 95 | ipadic=1.0.0=pypi_0 96 | ipykernel=6.21.2=pypi_0 97 | ipython=8.10.0=pypi_0 98 | isort=4.3.21=pypi_0 99 | itsdangerous=2.1.2=pypi_0 100 | jedi=0.18.2=pypi_0 101 | jieba=0.42.1=pypi_0 102 | jiwer=2.5.1=pypi_0 103 | jmespath=1.0.1=pypi_0 104 | jpeg=9e=h7f8727e_0 105 | jupyter-core=5.2.0=pypi_0 106 | jupyter_client=7.4.8=py38h06a4308_0 107 | jupyter_core=5.1.1=py38h06a4308_0 108 | kaldi-python-io=1.2.2=pypi_0 109 | kaldiio=2.17.2=pypi_0 110 | kiwisolver=1.4.4=pypi_0 111 | latexcodec=2.0.1=pypi_0 112 | lazy-loader=0.1=pypi_0 113 | lcms2=2.12=h3be6417_0 114 | lerc=3.0=h295c915_0 115 | levenshtein=0.20.2=pypi_0 116 | libdeflate=1.8=h7f8727e_5 117 | libedit=3.1.20221030=h5eee18b_0 118 | libffi=3.2.1=hf484d3e_1007 119 | libgcc-ng=11.2.0=h1234567_1 120 | libgomp=11.2.0=h1234567_1 121 | libpng=1.6.37=hbc83047_0 122 | librosa=0.10.0=pypi_0 123 | libsodium=1.0.18=h7b6447c_0 124 | libstdcxx-ng=11.2.0=h1234567_1 125 | libtiff=4.5.0=h6a678d5_1 126 | libwebp=1.2.4=h11a3e52_1 127 | libwebp-base=1.2.4=h5eee18b_1 128 | lightning-utilities=0.7.1=pypi_0 129 | linkify-it-py=2.0.0=pypi_0 130 | llvmlite=0.39.1=pypi_0 131 | loguru=0.6.0=pypi_0 132 | lxml=4.9.2=pypi_0 133 | lz4-c=1.9.4=h6a678d5_0 134 | mako=1.2.4=pypi_0 135 | markdown=3.4.1=pypi_0 136 | markdown-it-py=2.2.0=pypi_0 137 | marshmallow=3.19.0=pypi_0 138 | matplotlib-inline=0.1.6=py38h06a4308_0 139 | mdit-py-plugins=0.3.4=pypi_0 140 | mdurl=0.1.2=pypi_0 141 | mecab-python3=1.0.5=pypi_0 142 | mkl=2021.4.0=h06a4308_640 143 | mkl-service=2.4.0=py38h7f8727e_0 144 | mkl_fft=1.3.1=py38hd3c417c_0 145 | mkl_random=1.2.2=py38h51133e4_0 146 | mlflow=2.4.1=pypi_0 147 | mpmath=1.2.1=pypi_0 148 | msgpack=1.0.4=pypi_0 149 | multidict=6.0.4=pypi_0 150 | ncurses=6.4=h6a678d5_0 151 | nemo-toolkit=1.15.0=pypi_0 152 | nest-asyncio=1.5.6=py38h06a4308_0 153 | nltk=3.8.1=pypi_0 154 | numba=0.56.4=pypi_0 155 | numexpr=2.8.4=pypi_0 156 | numpy=1.23.5=py38h14f4228_0 157 | numpy-base=1.23.5=py38h31eccc5_0 158 | oauthlib=3.2.2=pypi_0 159 | omegaconf=2.2.3=pypi_0 160 | onnx=1.13.1=pypi_0 161 | opencc=1.1.6=pypi_0 162 | openssl=1.1.1s=h7f8727e_0 163 | opt-einsum=3.3.0=pypi_0 164 | orjson=3.8.6=pypi_0 165 | packaging=22.0=py38h06a4308_0 166 | pandasgui=0.2.14=pypi_0 167 | pandastable=0.13.1=pypi_0 168 | pangu=4.0.6.1=pypi_0 169 | parameterized=0.8.1=pypi_0 170 | paramiko=3.0.0=pypi_0 171 | parso=0.8.3=pyhd3eb1b0_0 172 | pathspec=0.11.0=pypi_0 173 | pathtools=0.1.2=pypi_0 174 | pexpect=4.8.0=pyhd3eb1b0_3 175 | pickleshare=0.7.5=pyhd3eb1b0_1003 176 | pillow=9.3.0=py38h6a678d5_2 177 | pip=23.1.2=pypi_0 178 | pip-api=0.0.30=pypi_0 179 | pipreqs=0.4.11=pypi_0 180 | plac=1.3.5=pypi_0 181 | 
platformdirs=3.0.0=pypi_0 182 | plotly=5.15.0=pypi_0 183 | pluggy=1.0.0=pypi_0 184 | pooch=1.6.0=pypi_0 185 | portalocker=2.7.0=pypi_0 186 | progress=1.6=pypi_0 187 | prompt-toolkit=3.0.37=pypi_0 188 | protobuf=3.20.3=pypi_0 189 | psutil=5.9.4=pypi_0 190 | ptyprocess=0.7.0=pyhd3eb1b0_2 191 | pure_eval=0.2.2=pyhd3eb1b0_0 192 | pyannote-core=5.0.0=pypi_0 193 | pyannote-database=4.1.3=pypi_0 194 | pyannote-metrics=3.2.1=pypi_0 195 | pyasn1=0.4.8=pypi_0 196 | pyasn1-modules=0.2.8=pypi_0 197 | pybind11=2.10.3=pypi_0 198 | pybtex=0.24.0=pypi_0 199 | pybtex-docutils=1.0.2=pypi_0 200 | pycparser=2.21=pypi_0 201 | pycryptodome=3.17=pypi_0 202 | pydub=0.25.1=pypi_0 203 | pygments=2.11.2=pyhd3eb1b0_0 204 | pyjwt=2.7.0=pypi_0 205 | pynacl=1.5.0=pypi_0 206 | pynput=1.7.6=pypi_0 207 | pyparsing=3.0.9=pypi_0 208 | pypinyin=0.48.0=pypi_0 209 | pypinyin-dict=0.5.0=pypi_0 210 | pyqt5=5.15.9=pypi_0 211 | pyqt5-qt5=5.15.2=pypi_0 212 | pyqt5-sip=12.12.1=pypi_0 213 | pyqtwebengine=5.15.6=pypi_0 214 | pyqtwebengine-qt5=5.15.2=pypi_0 215 | pysocks=1.7.1=pypi_0 216 | pytest=7.2.1=pypi_0 217 | pytest-runner=6.0.0=pypi_0 218 | python=3.8.0=h0371630_2 219 | python-dateutil=2.8.2=pyhd3eb1b0_0 220 | python-multipart=0.0.5=pypi_0 221 | python-xlib=0.33=pypi_0 222 | pytorch=1.13.1=py3.8_cpu_0 223 | pytorch-lightning=1.8.6=pypi_0 224 | pytorch-mutex=1.0=cpu 225 | pyyaml=5.4.1=pypi_0 226 | pyzmq=23.2.0=py38h6a678d5_0 227 | qtstylish=0.1.5=pypi_0 228 | querystring-parser=1.2.4=pypi_0 229 | rapidfuzz=2.13.7=pypi_0 230 | readline=7.0=h7b6447c_5 231 | regex=2022.10.31=pypi_0 232 | requests=2.31.0=pypi_0 233 | requests-oauthlib=1.3.1=pypi_0 234 | rfc3986=1.5.0=pypi_0 235 | rich=12.6.0=pypi_0 236 | rsa=4.9=pypi_0 237 | ruamel-yaml=0.17.21=pypi_0 238 | ruamel-yaml-clib=0.2.7=pypi_0 239 | s3transfer=0.6.0=pypi_0 240 | sacrebleu=2.3.1=pypi_0 241 | sacremoses=0.0.53=pypi_0 242 | scikit-learn=1.2.1=pypi_0 243 | sentence-transformers=2.2.2=pypi_0 244 | sentencepiece=0.1.97=pypi_0 245 | sentry-sdk=1.15.0=pypi_0 246 | setproctitle=1.3.2=pypi_0 247 | setuptools=59.5.0=pypi_0 248 | shellingham=1.5.0.post1=pypi_0 249 | six=1.16.0=pyhd3eb1b0_1 250 | smmap=5.0.0=pypi_0 251 | snowballstemmer=2.2.0=pypi_0 252 | sortedcontainers=2.4.0=pypi_0 253 | soundfile=0.12.1=pypi_0 254 | soupsieve=2.4=pypi_0 255 | sox=1.4.1=pypi_0 256 | soxr=0.3.3=pypi_0 257 | sphinx=6.1.3=pypi_0 258 | sphinxcontrib-applehelp=1.0.4=pypi_0 259 | sphinxcontrib-bibtex=2.5.0=pypi_0 260 | sphinxcontrib-devhelp=1.0.2=pypi_0 261 | sphinxcontrib-htmlhelp=2.0.1=pypi_0 262 | sphinxcontrib-jsmath=1.0.1=pypi_0 263 | sphinxcontrib-qthelp=1.0.3=pypi_0 264 | sphinxcontrib-serializinghtml=1.1.5=pypi_0 265 | sqlalchemy=2.0.17=pypi_0 266 | sqlite=3.33.0=h62c20be_0 267 | sqlparse=0.4.4=pypi_0 268 | stack-data=0.6.2=pypi_0 269 | stack_data=0.2.0=pyhd3eb1b0_0 270 | starlette=0.25.0=pypi_0 271 | sympy=1.11.1=pypi_0 272 | tabulate=0.9.0=pypi_0 273 | tenacity=8.2.2=pypi_0 274 | tensorboard=2.12.0=pypi_0 275 | tensorboard-data-server=0.7.0=pypi_0 276 | tensorboard-plugin-wit=1.8.1=pypi_0 277 | tensorboardx=2.6=pypi_0 278 | termcolor=2.2.0=pypi_0 279 | text-unidecode=1.3=pypi_0 280 | textdistance=4.5.0=pypi_0 281 | texterrors=0.4.4=pypi_0 282 | threadpoolctl=3.1.0=pypi_0 283 | tk=8.6.12=h1ccaba5_0 284 | tokenizers=0.12.1=pypi_0 285 | toml=0.10.2=pypi_0 286 | torchaudio=0.13.1=py38_cpu 287 | torchmetrics=0.11.1=pypi_0 288 | torchvision=0.2.2=py_3 289 | tornado=6.2=py38h5eee18b_0 290 | traitlets=5.7.1=py38h06a4308_0 291 | transformers=4.21.2=pypi_0 292 | typed-ast=1.5.4=pypi_0 293 | 
typer=0.7.0=pypi_0 294 | typing_extensions=4.4.0=py38h06a4308_0 295 | uc-micro-py=1.0.1=pypi_0 296 | urllib3=1.26.16=pypi_0 297 | uvicorn=0.20.0=pypi_0 298 | wandb=0.13.10=pypi_0 299 | wcwidth=0.2.6=pypi_0 300 | webdataset=0.1.62=pypi_0 301 | websocket-client=1.6.1=pypi_0 302 | websockets=10.4=pypi_0 303 | werkzeug=2.2.3=pypi_0 304 | wget=3.2=pypi_0 305 | wheel=0.38.4=py38h06a4308_0 306 | wordcloud=1.9.2=pypi_0 307 | wrapt=1.15.0=pypi_0 308 | xlrd=2.0.1=pypi_0 309 | xz=5.2.10=h5eee18b_1 310 | yarg=0.1.9=pypi_0 311 | yarl=1.8.2=pypi_0 312 | youtokentome=1.0.6=pypi_0 313 | zeromq=4.3.4=h2531618_0 314 | zipp=3.15.0=pypi_0 315 | zlib=1.2.13=h5eee18b_0 316 | zstd=1.5.2=ha4553b6_0 317 | -------------------------------------------------------------------------------- /src/packages/CadenceModelManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import pandas as pd 4 | import os 5 | import librosa 6 | import numpy as np 7 | from sklearn.preprocessing import MinMaxScaler 8 | from sklearn.tree import DecisionTreeClassifier 9 | from sklearn.model_selection import cross_val_score 10 | import json 11 | 12 | # local packages 13 | from packages.SavedFeatureLoader import loadFeatures 14 | from packages.CadenceUtils import * 15 | from packages.BayesSearch import BayesSearch 16 | 17 | 18 | class CadenceModelManager: 19 | def __init__( 20 | self, data, low_pass_filter_cutoff: int = 10, trunc_window_size: int = 100 21 | ) -> None: 22 | self.data = data 23 | self.low_pass_filter_cutoff = low_pass_filter_cutoff 24 | 25 | # assume fixed sampling rate for all files 26 | self.sr = sr = librosa.load(self.data["path"][0])[1] 27 | 28 | # generate cadence features 29 | def generate_features(self, window_size, silence_threshold, paths): 30 | 31 | window_size = int(window_size) 32 | 33 | # Normalise amplitudes 34 | print("Normalizing amplitudes") 35 | norm_audio = normalize_audio_amplitudes(paths) 36 | 37 | # Truncate silences 38 | print("Truncating silences") 39 | _, _, trunc_audio = truncate_silences( 40 | norm_audio, window_size, silence_threshold 41 | ) 42 | # Extract pauses 43 | print("Extracting pauses") 44 | pauses = self.run_all_files( 45 | get_silence, window_size, silence_threshold, trunc_audio 46 | ) 47 | 48 | # Extract pause spreads 49 | print("Extracting pause spreads") 50 | silence_spreads = self.run_all_files( 51 | get_silence_spread, window_size, silence_threshold, trunc_audio 52 | ) 53 | 54 | # Extract amplitude and derivative 55 | print("Extracting amplitude features") 56 | amps = self.run_all_files( 57 | get_amplitude, window_size, silence_threshold, trunc_audio 58 | ) 59 | 60 | # Create dataframe 61 | print("Creating dataframe") 62 | features = pd.DataFrame( 63 | { 64 | "pause_ratio": [item["ratio_pause_voiced"] for item in pauses], 65 | "pause_mean": [item["mean_of_silences"] for item in silence_spreads], 66 | "pause_std": [item["spread_of_silences"] for item in silence_spreads], 67 | "n_pauses": [item["n_pauses"] for item in silence_spreads], 68 | "amp_deriv": [item["abs_deriv_amplitude"] for item in amps], 69 | "amp_mean": [item["mean_amplitude"] for item in amps], 70 | } 71 | ) 72 | 73 | print("Complete") 74 | 75 | return features 76 | 77 | # run the cadence feature extraction pipeline 78 | def run_cadence_feature_extraction_pipeline( 79 | self, 80 | window_size=None, 81 | silence_threshold=None, 82 | data=None, 83 | scaler=None, 84 | fill_na=None, 85 | regenerate_features: bool = False, 86 | ): 87 | # feature 
regeneration block 88 | if regenerate_features: 89 | if data is None: 90 | features = self.generate_features( 91 | window_size, silence_threshold, self.data["path"] 92 | ) 93 | full_df = pd.concat((self.data, features), axis=1) 94 | else: 95 | features = self.generate_features( 96 | window_size, silence_threshold, data["path"] 97 | ) 98 | full_df = pd.concat((data, features), axis=1) 99 | feature_cols = list(features.columns) 100 | 101 | # if features are not being regenerated, load them from the saved features 102 | # this block is tested and used in pipeline 103 | else: 104 | full_df = loadFeatures(self.data.copy(), "cadence") 105 | feature_cols = list(set(full_df.columns) ^ set(self.data.columns)) 106 | 107 | full_df, scaler = self.normalize_data(full_df, feature_cols, scaler=scaler) 108 | 109 | if fill_na is not None: 110 | full_df = full_df.fillna(fill_na) 111 | 112 | return full_df, feature_cols, scaler 113 | 114 | # data normalization using minmaxscaler 115 | def normalize_data(self, full_df, feature_cols, scaler=None): 116 | if scaler is None: 117 | scaler = MinMaxScaler() 118 | full_df.loc[ 119 | full_df["type"] == "train", feature_cols 120 | ] = scaler.fit_transform( 121 | full_df.loc[full_df["type"] == "train", feature_cols] 122 | ) 123 | full_df.loc[~(full_df["type"] == "train"), feature_cols] = scaler.transform( 124 | full_df.loc[~(full_df["type"] == "train"), feature_cols] 125 | ) 126 | else: 127 | full_df.loc[:, list(features.columns)] = scaler.transform( 128 | full_df.loc[:, list(features.columns)] 129 | ) 130 | 131 | return full_df, scaler 132 | 133 | # helper function to run a function on all files 134 | def run_all_files(self, function, window_size, silence_threshold, truncated_audio): 135 | results = [] 136 | for item in truncated_audio: 137 | results.append( 138 | function( 139 | item, 140 | window_size, 141 | silence_threshold, 142 | self.sr, 143 | self.low_pass_filter_cutoff, 144 | ) 145 | ) 146 | return results 147 | 148 | # target function for bayesian optimization 149 | def target_function( 150 | self, 151 | data, 152 | window_size, 153 | silence_threshold, 154 | label_col="label", 155 | model=DecisionTreeClassifier(random_state=12), 156 | ): 157 | features, feature_cols, _ = self.run_cadence_feature_extraction_pipeline( 158 | window_size, 159 | silence_threshold, 160 | data=data, 161 | fill_na=-1, 162 | regenerate_features=True, 163 | ) 164 | X = features[feature_cols] 165 | y = features[label_col] 166 | return cross_val_score(model, X, y, cv=10).mean() 167 | 168 | # run target function on a set of parameters 169 | def run_target_function(self, z, data): 170 | scores = [] 171 | for i in range(z.shape[0]): 172 | window_size, silence_threshold = int(z[i, 0]), z[i, 1] 173 | print(f"Running Params: {window_size}, {silence_threshold}") 174 | scores.append(self.target_function(data, window_size, silence_threshold)) 175 | return np.array(scores) 176 | 177 | # sample parameters for bayesian optimization 178 | def sample_params(self, count): 179 | window_size_mean = 300 180 | window_size_std = 100 181 | window_min = 25 182 | silence_threshold_mean = 0.05 183 | silence_threshold_std = 0.04 184 | silence_min = 0.005 185 | silence_max = 0.2 186 | 187 | window_size = np.random.normal(window_size_mean, window_size_std, count) 188 | window_size[window_size < window_min] = window_min 189 | window_size = window_size.astype(int) 190 | silence_threshold = np.random.normal( 191 | silence_threshold_mean, silence_threshold_std, count 192 | ) 193 | silence_threshold[silence_threshold < 
silence_min] = silence_min 194 | silence_threshold[silence_threshold > silence_max] = silence_max 195 | 196 | return np.concatenate( 197 | (window_size.reshape(-1, 1), silence_threshold.reshape(-1, 1)), axis=1 198 | ) 199 | 200 | # run bayesian optimization 201 | def hyperparam_search(self, n_iter, sample_size, init_ex_count, gp_ex_count): 202 | search_data = ( 203 | self.data[self.data["type"].isin(["train", "dev"])] 204 | .sample(sample_size) 205 | .copy() 206 | .reset_index() 207 | ) 208 | search_data.to_csv("/home/ubuntu/search_data.csv", index=False) 209 | 210 | bayes_search = BayesSearch( 211 | search_data, 212 | self.run_target_function, 213 | self.sample_params, 214 | n_iter=n_iter, 215 | init_ex_count=init_ex_count, 216 | gp_ex_count=gp_ex_count, 217 | ) 218 | params, acc = bayes_search.optimize() 219 | return params, acc 220 | 221 | # run bayesian optimization and save down the best params 222 | def hyperparam_search_and_features( 223 | self, 224 | output_dir, 225 | output_name, 226 | n_iter=25, 227 | sample_size=300, 228 | init_ex_count=20, 229 | gp_ex_count=1000, 230 | ): 231 | params, _ = self.hyperparam_search( 232 | n_iter=n_iter, 233 | sample_size=sample_size, 234 | init_ex_count=init_ex_count, 235 | gp_ex_count=gp_ex_count, 236 | ) 237 | window_size, silence_threshold = params[0], params[1] 238 | 239 | # save down the best params in a json file 240 | if os.path.exists(os.path.join(output_dir, "params.json")): 241 | with open(os.path.join(output_dir, "params.json")) as file: 242 | params = json.load(file) 243 | else: 244 | params = {} 245 | 246 | if "." in output_name: 247 | output_name = os.splitext(output_name)[0] 248 | 249 | params[output_name] = { 250 | "window_size": window_size, 251 | "silence_threshold": silence_threshold, 252 | } 253 | 254 | with open(os.path.join(output_dir, "params.json"), "w") as file: 255 | json.dump(params, file) 256 | 257 | fake_data = self.data[self.data["label"] == 1].copy() 258 | features = self.generate_features( 259 | window_size, silence_threshold, fake_data["path"] 260 | ) 261 | full_df = pd.concat((self.data, features), axis=1) 262 | full_df.to_csv(os.path.join(output_dir, f"{output_name}.csv"), index=False) 263 | 264 | 265 | def save_features(metadata_path, params_json_path): 266 | pass 267 | -------------------------------------------------------------------------------- /src/packages/ExperimentPipeline.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import os 4 | import nemo.collections.asr as nemo_asr 5 | import pandas as pd 6 | import mlflow 7 | import copy 8 | 9 | # local packages 10 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 11 | from packages.LJDataLoader import LJDataLoader 12 | from packages.AudioEmbeddingsManager import AudioEmbeddingsManager 13 | from packages.ModelManager import ModelManager 14 | from packages.CadenceModelManager import CadenceModelManager 15 | from packages.SmileFeatureManager import SmileFeatureManager 16 | 17 | 18 | class ExperimentPipeline: 19 | ################################################################################# 20 | ################################# Initialization ################################ 21 | ################################################################################# 22 | 23 | def __init__(self, fake_cols, metadata_path, data_df=None) -> None: 24 | # intialize the class and generate the data for experiment pipeline if data is not provided 25 | self.fake_cols = fake_cols 26 | 
self.metadata_path = metadata_path 27 | if data_df is None: 28 | self.data_df = self._generate_split(self.fake_cols, self.metadata_path) 29 | else: 30 | # for multivoice experiments, data_df is generated separately and needs to be provided to the class 31 | self.data_df = data_df 32 | 33 | # initialize feature store 34 | self.feature_store = {} 35 | 36 | def _generate_split(self, fake_cols, metadata_path): 37 | # filter data used in training of elevenlabs and initialize the data loader 38 | loader = LJDataLoader( 39 | data_path=self.metadata_path, filter_cols=["ElevenLabsCloneClip"] 40 | ) 41 | 42 | # train-dev-test split 43 | loader.splitData() 44 | 45 | # aggregate wavefake architectures into one column by randomly selecting from architectures 46 | source_architectures = [ 47 | "Full_Band_MelGan", 48 | "HifiGan", 49 | "MelGan", 50 | "MelGanLarge", 51 | "Multi_Band_MelGan", 52 | "Parallel_WaveGan", 53 | "Waveglow", 54 | ] 55 | new_col_name = "RandWaveFake" 56 | loader.selectRandomArchitecture( 57 | target_col=new_col_name, source_cols=source_architectures 58 | ) 59 | 60 | # combine elevenlabs and uberduck into one column for binary classification 61 | source_architectures = ["ElevenLabs", "UberDuck"] 62 | new_col_name = "EL_UD_Fake" 63 | loader.selectRandomArchitecture( 64 | target_col=new_col_name, source_cols=source_architectures 65 | ) 66 | 67 | # combine randwavefake, elevenlabs, and uberduck into one column for binary classification 68 | source_architectures = ["RandWaveFake", "ElevenLabs", "UberDuck"] 69 | new_col_name = "Fake" 70 | loader.selectRandomArchitecture( 71 | target_col=new_col_name, source_cols=source_architectures 72 | ) 73 | 74 | # generate final dataframe 75 | data_df = loader.generateFinalDataFrame(real_col="Real", fake_cols=fake_cols) 76 | 77 | return data_df 78 | 79 | ################################################################################# 80 | ################################# Feature Generation ############################ 81 | ################################################################################# 82 | 83 | def generate_features(self, feature_method="all", open_smile_feature_count=10): 84 | #### titanet features #### 85 | if feature_method == "titanet": 86 | self.feature_store["titanet"] = self._generate_titanet_features() 87 | 88 | #### openSmile features #### 89 | if feature_method == "openSmile_binary": 90 | self.feature_store["openSmile_binary"] = self._generate_openSmile_features( 91 | feature_selector_type="random_forest", 92 | label_type="binary", 93 | feature_count=open_smile_feature_count, 94 | ) 95 | 96 | if feature_method == "openSmile_multiclass": 97 | self.feature_store[ 98 | "openSmile_multiclass" 99 | ] = self._generate_openSmile_features( 100 | feature_selector_type="random_forest", 101 | label_type="multiclass", 102 | feature_count=open_smile_feature_count, 103 | ) 104 | #### cadence features ####ß 105 | if feature_method == "cadence": 106 | self.feature_store["cadence"] = self._generate_cadence_features() 107 | 108 | #### all features ####ß 109 | if feature_method == "all": 110 | self.feature_store["titanet"] = self._generate_titanet_features() 111 | self.feature_store["openSmile_binary"] = self._generate_openSmile_features( 112 | feature_selector_type="random_forest", 113 | label_type="binary", 114 | feature_count=open_smile_feature_count, 115 | ) 116 | self.feature_store[ 117 | "openSmile_multiclass" 118 | ] = self._generate_openSmile_features( 119 | feature_selector_type="random_forest", 120 | label_type="multiclass", 121 
| feature_count=open_smile_feature_count, 122 | ) 123 | self.feature_store["cadence"] = self._generate_cadence_features() 124 | 125 | #### private methods for feature generation #### 126 | 127 | def _generate_titanet_features(self): 128 | speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( 129 | model_name="titanet_large" 130 | ) 131 | embedding_manager = AudioEmbeddingsManager( 132 | model=speaker_model, data=self.data_df 133 | ) 134 | 135 | return embedding_manager.generateFeatureDf() 136 | 137 | def _generate_openSmile_features( 138 | self, feature_selector_type, label_type, feature_count 139 | ): 140 | smile_manager = SmileFeatureManager(self.data_df) 141 | 142 | if label_type == "binary": 143 | return smile_manager.generateFeatureDf( 144 | feature_selector_type, label_type, feature_count 145 | ) 146 | 147 | if label_type == "multiclass": 148 | return smile_manager.generateFeatureDf( 149 | feature_selector_type, label_type, feature_count 150 | ) 151 | 152 | def _generate_cadence_features(self): 153 | cadence_manager = CadenceModelManager(self.data_df) 154 | ( 155 | cad_feature_df, 156 | cad_feature_cols, 157 | scalar, 158 | ) = cadence_manager.run_cadence_feature_extraction_pipeline(fill_na=-1) 159 | 160 | return cad_feature_df, cad_feature_cols 161 | 162 | ################################################################################# 163 | ################################# Train Predict ################################# 164 | ################################################################################# 165 | 166 | def train_predict_using_models( 167 | self, 168 | models=["logreg", "random_forest"], 169 | run_tags=None, 170 | run_name_prefix=None, 171 | create_df_artifact=False, 172 | label_type="label", 173 | ): 174 | # run train eval loop 175 | for model_type in models: 176 | for feature_method, feature_data in self.feature_store.items(): 177 | # condition to skip certain feature methods for certain label types 178 | if ( 179 | label_type == "multiclass_label" 180 | and feature_method == "openSmile_binary" 181 | ): 182 | continue 183 | if label_type == "label" and feature_method == "openSmile_multiclass": 184 | continue 185 | 186 | # generate mlflow run details 187 | run_tags, run_name = self._generate_mlflow_run_details( 188 | run_tags, run_name_prefix, model_type, label_type, feature_method 189 | ) 190 | 191 | # start mlflow run 192 | with mlflow.start_run(tags=run_tags, run_name=run_name) as run: 193 | # instantiate model and perform trainPredict 194 | model = ModelManager( 195 | model_name=model_type, 196 | data=feature_data[0], 197 | feature_cols=feature_data[1], 198 | merge_train_dev=True, 199 | ) 200 | 201 | model.trainPredict(label_col=label_type) 202 | 203 | # mlflow logging 204 | self._log_mlflow_run_details(run, model, create_df_artifact) 205 | 206 | # end mlflow run 207 | mlflow.end_run() 208 | 209 | print( 210 | "Finished run: " 211 | + run.info.run_name 212 | + "with feature method: " 213 | + feature_method 214 | ) 215 | 216 | #### private methods for train predict #### 217 | 218 | def _generate_mlflow_run_details( 219 | self, run_tags, run_name_prefix, model_type, label_type, feature_method 220 | ): 221 | # tag details 222 | _run_tags = copy.deepcopy(run_tags) 223 | _run_tags.update( 224 | { 225 | "feature_method": feature_method, 226 | "label_type": label_type, 227 | "selected_architectures": self.fake_cols, 228 | } 229 | ) 230 | 231 | # run name 232 | if ( 233 | (run_tags is not None) 234 | and ("laundered" in run_tags.keys()) 235 | and 
(run_tags["laundered"] == 1) 236 | ): 237 | _run_name = ( 238 | run_name_prefix 239 | + "_" 240 | + feature_method 241 | + "_" 242 | + label_type 243 | + "_" 244 | + model_type 245 | + "_laundered" 246 | ) 247 | else: 248 | _run_name = ( 249 | run_name_prefix 250 | + "_" 251 | + feature_method 252 | + "_" 253 | + label_type 254 | + "_" 255 | + model_type 256 | ) 257 | 258 | return _run_tags, _run_name 259 | 260 | def _log_mlflow_run_details(self, run, model, create_df_artifact) -> None: 261 | ##### update tags ##### 262 | mlflow.set_tag("estimator_name", type(model.model).__name__) 263 | 264 | ##### 1) mlflow log model with schema i.e. signature ##### 265 | signature = mlflow.models.signature.infer_signature( 266 | model.X_train, model.y_train 267 | ) 268 | mlflow.sklearn.log_model( 269 | model.model, "model_" + run.info.run_name, signature=signature 270 | ) 271 | 272 | ##### 2) mlflow log model params ##### 273 | mlflow.log_params(model.model.get_params()) 274 | 275 | ##### 3) mlflow log model artifacts ##### 276 | ## train_dev_test data 277 | if create_df_artifact: 278 | data_path = "/home/ubuntu/data/temp/data.csv" 279 | model.data.to_csv(data_path, index=False) 280 | mlflow.log_artifact(data_path) 281 | os.remove(data_path) 282 | 283 | ##### 4) mlflow log model metrics ##### 284 | # save class accuracies independently 285 | for key, value in model.class_accuracy.items(): 286 | mlflow.log_metric(str(key) + "_accuracy", value) 287 | 288 | # save aggregate accuracy 289 | if len(self.fake_cols) > 1: 290 | agg_accuracy = 0 291 | for key, value in model.class_accuracy.items(): 292 | if key in self.fake_cols: 293 | agg_accuracy += value 294 | # compute average accuracy for fake classes 295 | agg_accuracy = agg_accuracy / len(self.fake_cols) 296 | mlflow.log_metric("fake_accuracy", agg_accuracy) 297 | 298 | # save aggregate accuracy 299 | mlflow.log_metric("accuracy", model.accuracy) 300 | 301 | # save log loss 302 | mlflow.log_metric("log_loss", model.log_loss_value) 303 | 304 | # save eer score 305 | if model.eer_score is not None: 306 | mlflow.log_metric("eer_score", model.eer_score) 307 | 308 | # save eer threshold 309 | if model.eer_threshold is not None: 310 | mlflow.log_metric("eer_threshold", model.eer_threshold) 311 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single- and Multi-Speaker Cloned Voice Detection: From Perceptual to Learned Features 2 | 3 | 4 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/LICENSE) 5 | [![Python 3.8.0](https://img.shields.io/badge/python-3.8.0-blue.svg)](https://www.python.org/downloads/release/python-380/) 6 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 7 | 8 | This is the repository for the paper titled [Single and Multi Speaker Cloned Voice Detection: From Perceptual to Learned Features](https://arxiv.org/abs/2307.07683) submitted to the 2023 IEEE International Workshop on Information Forensics and Security (WIFS 2023). 9 | 10 | The provided source code includes implementations of both the single-speaker and multi-speaker pipelines. However, please note that the dataset used in the experiments is not included in this repository. 
To replicate the experiments, you would need to create an analogous experimental dataset with cloned voices using different voice cloning architectures or providers. 11 | 12 | The repository does provide code for data generation and adversarial laundering, specifically tailored for an example provider called ElevenLabs. You would need to generate features from the analogous dataset and save them to disk, and to modify the relevant data handling code so that the pipeline is compatible with your new dataset. 13 | 14 | Please refer to the repository and the paper for more detailed instructions on how to use the code and conduct the experiments; a minimal usage sketch is also provided at the end of this README. 15 | 16 | # Folder Structure 17 | 18 | The repository is structured as follows: 19 | 20 | | Folder | File | Description | 21 | |-----------|------------|---------------------------------------------------| 22 | |__Experiment Pipeline__| 23 | | `/src/` |[`run_pipeline_ljspeech.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/run_pipeline_ljspeech.py)| Runs the pipeline for single-voice (LJSpeech) experiments| 24 | | `/src/` |[`run_pipeline_multivoice.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/run_pipeline_multivoice.py)| Runs the pipeline for multivoice experiments| 25 | | `/src/packages/` | [`ExperimentPipeline.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ExperimentPipeline.py) | Class for running the experiment pipeline and logging results| 26 | | `/src/packages/` | [`ModelManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ModelManager.py) |Class for managing the final classification models | 27 | |__Feature Generation__| 28 | | `/src/packages/` | [`AudioEmbeddingsManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/AudioEmbeddingsManager.py) | Class for managing learned features generated using [NVIDIA TitaNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speaker_recognition/models.html)| 29 | | `/src/packages/` | [`SmileFeatureManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureManager.py) | Class for managing spectral features generated using [openSMILE](https://audeering.github.io/opensmile-python/usage.html)| 30 | | `/src/packages/` | [`SmileFeatureGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureGenerator.py) | Class for generating spectral features and saving to disk for collections of audio files| 31 | | `/src/packages/` | [`SmileFeatureSelector.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureSelector.py) | Class for selecting spectral features using `sklearn.feature_selection` | 32 | | `/src/packages/` | [`CadenceModelManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/CadenceModelManager.py) | Class for managing perceptual features generated using handcrafted techniques| 33 | | `/src/packages/` | [`CadenceUtils.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/CadenceUtils.py) | Utility functions used by `CadenceModelManager` for generating features | 34 | | `/src/packages/` | [`BayesSearch.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/BayesSearch.py) | A class that implements Bayesian Hyperparameter Optimization for the perceptual model | 35 | 
`/src/packages/` | [`SavedFeatureLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SavedFeatureLoader.py) | Helper function for loading generated features saved to disk during experiments| 36 | |__Data Loaders__| 37 | | `/src/packages/` | [`LJDataLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/LJDataLoader.py) | Class for loading and handling the LJSpeech data for experiments| 38 | | `/src/packages/` | [`TIMITDataLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/TIMITDataLoader.py) | Class for loading and handling the TIMIT data for multi-voice experiments| 39 | |__Data Generation__| 40 | | `/src/packages/` | [`BaseDeepFakeGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/BaseDeepFakeGenerator.py) | Base class for processing the data used for voice cloning | 41 | | `/src/packages/` | [`ElevenLabsDeepFakeGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ElevenLabsDeepFakeGenerator.py)| Class used to generate deepfakes using the ElevenLabs API | 42 | | `/src/packages/` | [`AudioManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/AudioManager.py) | Class for resampling audio files and performing adversarial laundering | 43 | |__Misc__| 44 | | `.` | [`README.md`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/README.md) | Provides an overview of the project| 45 | | `.` | [`conda_requirements.txt`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/conda_requirements.txt) | Dependencies for creating the `conda` environment| 46 | | `.` | [`pip_requirements.txt`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/pip_requirements.txt) | Dependencies installed with `pip`| 47 | 48 | # Data 49 | 50 | An overview of the real and synthetic datasets used in our single-speaker (top) and multi-speaker (bottom) evaluations. The 91,700 WaveFake samples correspond to 13,100 samples for each of seven different vocoder architectures, hence the larger number of clips and longer total duration. 51 | 52 | ### Single-speaker 53 | 54 | | **Type** | **Name** | **Clips (#)** | **Duration (sec)** | 55 | |:--------:|:--------:|:-------------:|:------------------:| 56 | | Real | LJSpeech | 13,100 | 86,117 | 57 | | Synthetic | WaveFake | 91,700 | 603,081 | 58 | | Synthetic | ElevenLabs | 13,077 | 78,441 | 59 | | Synthetic | Uberduck | 13,094 | 83,322 | 60 | 61 | ### Multi-speaker 62 | 63 | | **Type** | **Name** | **Clips (#)** | **Duration (sec)** | 64 | |:--------:|:--------:|:-------------:|:------------------:| 65 | | Real | TIMIT | 4,620 | 14,192 | 66 | | Synthetic | ElevenLabs | 5,499 | 15,413 | 67 | 68 | ### Publicly Available Data 69 | 70 | 1. The LJ Speech 1.1 Dataset -- [Data](https://keithito.com/LJ-Speech-Dataset/) 71 | 2. WaveFake: A Data Set to Facilitate Audio Deepfake Detection -- [Paper](https://arxiv.org/abs/2111.02813), [Data](https://zenodo.org/record/5642694) 72 | 3. TIMIT Acoustic-Phonetic Continuous Speech Corpus -- [Data](https://catalog.ldc.upenn.edu/LDC93S1) 73 | 74 | ### Commercial Voice Cloning Tools 75 | 76 | 1. ElevenLabs (EL) -- https://beta.elevenlabs.io/ 77 | 2. 
UberDuck (UD) -- https://app.uberduck.ai/ 78 | 79 | # Results 80 | 81 | ### Single-speaker 82 | 83 | Accuracies for a personalized, single-speaker classification of unlaundered audio (top) and audio subject to adversarial laundering in the form of additive noise and transcoding (bottom). Dataset corresponds to ElevenLabs (EL), UberDuck (UD), and WaveFake (WF); Model corresponds to a linear (L) or non-linear (NL) classifier, configured either as a single-classifier (real vs. synthetic) or as a multi-classifier (real vs. specific synthesis architecture). Accuracy (%) is reported separately for synthetic and real audio, and the equal error rate (EER) is also reported for the single-classifiers (a brief sketch of how the EER can be computed appears after the table). 84 | 85 | 86 | | | | Synthetic Accuracy (%) | | | Real Accuracy (%) | | | EER (%) | | | 87 | |----------|--------|:----------------------:|-----|-----|:-----------------:|-----|-----|:-------:|-----|-----| 88 | | **Dataset** | **Model** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | 89 | |__Unlaundered__| 90 | |Binary| 91 | | EL | single (L) | 100.0 | 99.2 | 78.2 | 100.0 | 99.9 | 72.5 | 0.0 | 0.5 | 24.9 | 92 | | | single (NL) | 100.0 | 99.9 | 82.2 | 100.0 | 100.0 | 80.4 | 0.0 | 0.1 | 18.6 | 93 | | UD | single (L) | 99.8 | 98.9 | 51.9 | 99.9 | 98.9 | 54.0 | 0.1 | 1.1 | 47.2 | 94 | | | single (NL) | 99.7 | 99.2 | 54.4 | 99.9 | 99.0 | 56.5 | 0.2 | 0.9 | 44.5 | 95 | | WF | single (L) | 96.5 | 78.4 | 57.8 | 97.1 | 82.3 | 45.6 | 3.3 | 19.7 | 48.5 | 96 | | | single (NL) | 94.5 | 87.6 | 50.3 | 96.7 | 90.2 | 52.7 | 4.4 | 11.2 | 48.6 | 97 | | EL+UD | single (L) | 99.7 | 94.8 | 63.4 | 99.9 | 97.1 | 60.3 | 0.2 | 4.2 | 37.9 | 98 | | | single (NL) | 99.7 | 99.2 | 57.3 | 99.9 | 99.6 | 69.0 | 0.2 | 0.8 | 37.6 | 99 | | EL+UD+WF | single (L) | 93.2 | 79.7 | 58.4 | 98.7 | 93.0 | 57.6 | 3.6 | 15.9 | 42.1 | 100 | | | single (NL) | 91.2 | 90.6 | 53.1 | 99.0 | 94.1 | 64.7 | 4.1 | 7.9 | 41.6 | 101 | |Multiclass| 102 | | EL+UD | multi (L) | 99.9 | 96.6 | 61.0 | 100.0 | 94.6 | 35.7 | - | - | - | 103 | | | multi (NL) | 99.7 | 98.3 | 65.6 | 100.0 | 97.2 | 43.2 | - | - | - | 104 | | EL+UD+WF | multi (L) | 98.8 | 80.2 | 45.1 | 97.3 | 64.3 | 22.9 | - | - | - | 105 | | | multi (NL) | 98.1 | 94.2 | 48.6 | 96.3 | 84.4 | 27.6 | - | - | - | 106 | |__Laundered__| 107 | |Binary| 108 | | EL | single (L) | 95.5 | 94.3 | 61.1 | 94.5 | 92.6 | 65.2 | 4.9 | 6.7 | 36.6 | 109 | | | single (NL) | 96.0 | 96.2 | 70.4 | 95.4 | 95.6 | 69.6 | 4.1 | 4.1 | 30.1 | 110 | | UD | single (L) | 95.4 | 81.1 | 61.4 | 91.8 | 84.3 | 44.7 | 6.3 | 17.3 | 46.7 | 111 | | | single (NL) | 95.4 | 86.8 | 52.9 | 93.3 | 86.1 | 55.9 | 5.5 | 13.6 | 45.6 | 112 | | WF | single (L) | 87.6 | 60.7 | 59.6 | 85.0 | 70.4 | 42.5 | 13.9 | 34.4 | 49.4 | 113 | | | single (NL) | 83.6 | 77.1 | 51.4 | 85.6 | 76.7 | 53.9 | 15.3 | 23.1 | 47.3 | 114 | | EL+UD | single (L) | 95.2 | 79.1 | 54.0 | 91.7 | 78.4 | 59.8 | 6.2 | 21.3 | 43.1 | 115 | | | single (NL) | 94.8 | 86.1 | 55.2 | 93.3 | 90.0 | 62.4 | 6.0 | 12.0 | 41.4 | 116 | | EL+UD+WF | single (L) | 83.7 | 70.9 | 50.6 | 88.6 | 72.9 | 59.7 | 13.2 | 28.2 | 44.8 | 117 | | | single (NL) | 83.4 | 79.2 | 53.0 | 90.7 | 85.1 | 60.7 | 12.5 | 17.9 | 43.6 | 118 | |Multiclass| 119 | | EL+UD | multi (L) | 94.2 | 85.6 | 50.9 | 91.0 | 77.1 | 29.1 | - | - | - | 120 | | | multi (NL) | 94.5 | 91.7 | 53.2 | 90.3 | 82.9 | 41.3 | - | - | - | 121 | | EL+UD+WF | multi (L) | 89.8 | 65.4 | 35.3 | 83.1 | 44.3 | 26.2 | - | - | - | 122 | | | multi (NL) | 88.8 | 78.8 | 39.8 | 82.1 | 63.0 | 28.6 | - | - | - | 
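The equal error rate (EER) reported for the single-classifiers is the operating point at which the two error rates (synthetic audio classified as real, and real audio classified as synthetic) are equal. For reference only, below is a minimal sketch of how an EER and its threshold can be computed from binary classifier scores using `numpy` and `scikit-learn`; it is an illustrative example with made-up labels and scores, not necessarily the exact implementation used in `ModelManager.py`.

```python
import numpy as np
from sklearn.metrics import roc_curve


def compute_eer(y_true, scores):
    """Estimate the equal error rate (EER) and its decision threshold.

    y_true: 1 for synthetic audio (positive class), 0 for real audio.
    scores: classifier scores for the synthetic class.
    """
    fpr, tpr, thresholds = roc_curve(y_true, scores, pos_label=1)
    fnr = 1.0 - tpr  # false negative rate at each threshold
    idx = np.nanargmin(np.abs(fnr - fpr))  # threshold where FPR and FNR cross
    eer = (fpr[idx] + fnr[idx]) / 2.0
    return eer, thresholds[idx]


# toy usage with fabricated scores (for illustration only)
labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
scores = np.array([0.05, 0.20, 0.45, 0.60, 0.40, 0.70, 0.85, 0.95])
eer, threshold = compute_eer(labels, scores)
print(f"EER: {eer:.3f} at threshold {threshold:.3f}")
```

Averaging the false-positive and false-negative rates at their crossing point is a common convention when the ROC curve is only sampled at discrete thresholds; an EER of 0% in the tables above corresponds to a threshold that separates real and synthetic audio perfectly.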
123 | 124 | ### Multi-speaker 125 | 126 | Accuracies for a non-personalized, multi-speaker classification of unlaundered audio. Dataset corresponds to ElevenLabs (EL); Model corresponds to a linear (L) or non-linear (NL) classifier, configured either as a single-classifier (real vs. synthetic) or as a multi-classifier (real vs. specific synthesis architecture). Accuracy (%) is reported separately for synthetic and real audio, and the equal error rate (EER) is also reported for the single-classifiers. 127 | 128 | 129 | | | | Synthetic Accuracy (%) | | | Real Accuracy (%) | | | EER (%) | | | 130 | |----------|--------|:----------------------:|-----|-----|:-----------------:|-----|-----|:-------:|-----|-----| 131 | | **Dataset** | **Model** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | 132 | | EL | single (L) | 100.0 | 94.2 | 83.8 | 99.9 | 98.3 | 86.9 | 0.0 | 3.0 | 1.3 | 133 | | | single (NL) | 92.3 | 96.3 | 82.2 | 100.0 | 99.7 | 87.7 | 0.1 | 1.6 | 1.4 | 134 | 135 | 136 | # Research Group 137 | 138 | * Sarah Barrington1 -- 139 | * Romit Barua1 -- 140 | * Gautham Koorma1 -- 141 | * Hany Farid1,2 -- 142 | 143 | School of Information1 and Electrical Engineering and Computer Sciences2 at the University of California, Berkeley 144 | 145 | This work was partially funded by a [grant from the UC Berkeley Center for Long-Term Cybersecurity (CLTC)](https://cltc.berkeley.edu/publication/digital-fingerprinting-to-protect-against-deepfakes/), an [award for open-source innovation from the Digital Public Goods Alliance and United Nations Development Program](https://digitalpublicgoods.net/information-pollution/), and an unrestricted gift from Meta. 146 | 147 | # Citation 148 | 149 | Please cite the following paper if you use this code: 150 | 151 | ``` 152 | @misc{barrington2023single, 153 | title={Single and Multi-Speaker Cloned Voice Detection: From Perceptual to Learned Features}, 154 | author={Sarah Barrington and Romit Barua and Gautham Koorma and Hany Farid}, 155 | year={2023}, 156 | eprint={2307.07683}, 157 | archivePrefix={arXiv}, 158 | primaryClass={cs.SD} 159 | } 160 | ``` 161 | --------------------------------------------------------------------------------