├── LICENSE
├── src
│   ├── packages
│   │   ├── AudioEmbeddingsManager.py
│   │   ├── BaseDeepFakeGenerator.py
│   │   ├── SavedFeatureLoader.py
│   │   ├── SmileFeatureGenerator.py
│   │   ├── ElevenLabsDeepFakeGenerator.py
│   │   ├── playhtDataGenerator.py
│   │   ├── BayesSearch.py
│   │   ├── SmileFeatureManager.py
│   │   ├── CadenceUtils.py
│   │   ├── LJDataLoader.py
│   │   ├── SmileFeatureSelector.py
│   │   ├── ModelManager.py
│   │   ├── AudioManager.py
│   │   ├── TIMITDataLoader.py
│   │   ├── CadenceModelManager.py
│   │   └── ExperimentPipeline.py
│   ├── run_pipeline_ljspeech.py
│   └── run_pipeline_multivoice.py
├── .gitignore
├── pip_requirements.txt
├── conda_requirements.txt
└── README.md
/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Sarah Barrington, Romit Barua, Gautham Koorma, Hany Farid 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /src/packages/AudioEmbeddingsManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # local packages 6 | from packages.SavedFeatureLoader import loadFeatures 7 | 8 | # directory to save embeddings to 9 | SAVED_EMBEDDINGS_DIR = "/home/ubuntu/data/wavefake_data/Embeddings/16000KHz" 10 | 11 | # helper function to generate Titanet embeddings 12 | def generateTitaNetEmbeddings(model, paths, normalize): 13 | embeddings = np.array( 14 | [ 15 | model.get_embedding(file_path).cpu().detach().numpy()[0] 16 | for file_path in paths 17 | ] 18 | ) 19 | 20 | if normalize: 21 | raise NotImplementedError("Normalizing embeddings is not implemented yet") 22 | 23 | return embeddings 24 | 25 | 26 | class AudioEmbeddingsManager: 27 | # initialize with model and data 28 | def __init__(self, model, data) -> None: 29 | self.model = model 30 | self.data = data 31 | 32 | # generate the dataframe of embeddings for experiments 33 | def generateFeatureDf( 34 | self, normalize: bool = False, regenerate_embeddings: bool = False 35 | ): 36 | # generate embeddings and save to disk 37 | if regenerate_embeddings: 38 | embeddings_df = pd.DataFrame(self.generateEmbeddings(normalize)) 39 | 40 | feature_cols = list(embeddings_df.columns) 41 | feature_df = pd.concat((self.data, embeddings_df), axis=1) 42 | 43 | # load embeddings from disk 44 | else: 45 | feature_df = loadFeatures(self.data.copy(), "titanet") 46 | feature_cols = list(set(feature_df.columns) ^ set(self.data.columns)) 47 | 48 | return feature_df, feature_cols 49 | 50 | # generate embeddings for each audio file 51 | def generateEmbeddings(self, normalize): 52 | return generateTitaNetEmbeddings(self.model, self.data["path"], normalize) 53 | -------------------------------------------------------------------------------- /src/packages/BaseDeepFakeGenerator.py: -------------------------------------------------------------------------------- 1 | from lib2to3.pgen2.tokenize import tokenize 2 | import os 3 | from secrets import token_urlsafe 4 | import pandas as pd 5 | 6 | #base class used by other generators to load text from a dataframe or directory 7 | #and process transcripts 8 | class BaseDeepFakeGenerator: 9 | def __init__(self, tokenize_type: str = None): 10 | if not isinstance(tokenize_type, type(None)): 11 | assert tokenize_type.lower() in [ 12 | "word", 13 | "sentence", 14 | ], "If you provide a tokenize type, it must be sentence or word" 15 | self.tokenize_type = tokenize_type 16 | 17 | def loadTextFromDataFrame( 18 | self, 19 | dataframe_path: str, 20 | source_col: str, 21 | transcript_col: str, 22 | punc_to_remove: list = None, 23 | ): 24 | metadata = pd.read_csv(dataframe_path) 25 | source_paths = list(metadata[source_col]) 26 | file_names = [os.path.basename(source_path) for source_path in source_paths] 27 | transcripts = list(metadata[transcript_col]) 28 | 29 | if punc_to_remove: 30 | transcripts = self.process_transcripts(transcripts, punc_to_remove) 31 | 32 | return file_names, transcripts 33 | 34 | def loadTextFromDirectory(self, dir_name: str): 35 | for file_name in os.listdir(dir_name): 36 | if ".txt" in file_name: 37 | pass 38 | 39 | def _readTextFile(self, text_path: str): 40 | with open(text_path) as f: 41 | lines = f.readlines() 42 | 43 | f.close() 44 | 45 | def process_transcripts(self, transcripts: list, punc_to_remove: list): 46 | 
processed_transcripts = [] 47 | for idx, transcript in enumerate(transcripts): 48 | for punc in punc_to_remove: 49 | print(idx) 50 | transcript = transcript.replace(punc, "") 51 | processed_transcripts.append(transcript) 52 | return processed_transcripts 53 | -------------------------------------------------------------------------------- /src/packages/SavedFeatureLoader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from tqdm import tqdm 4 | import json 5 | 6 | #path to the feature map json file that maps 7 | #the metadata path to the feature path 8 | FEATURE_MAP_PATH = ( 9 | "/home/ubuntu/data/FeatureMap.json" 10 | ) 11 | 12 | #list of valid feature types 13 | VALID_FEATURE_TYPES = [ 14 | "titanet", 15 | "openSmile", 16 | "cadence", 17 | ] 18 | #helper function 19 | def loadFeatures( 20 | metadata, 21 | feature_type, 22 | metadata_filepath_col="path", 23 | feature_filepath_col="path", 24 | feature_map_path=FEATURE_MAP_PATH, 25 | ): 26 | assert ( 27 | feature_type in VALID_FEATURE_TYPES 28 | ), f"Please ensure that {feature_type} is a valid feature type" 29 | 30 | #add a column to the metadata dataframe that contains the path to the feature 31 | metadata["path_keys"] = metadata["path"].apply(os.path.dirname) 32 | present_paths = metadata["path_keys"].unique().tolist() 33 | with open(feature_map_path) as f: 34 | feature_map = json.load(f) 35 | 36 | #load the feature dataframe and merge it with the metadata dataframe 37 | merged_df = pd.DataFrame() 38 | for path in tqdm(present_paths): 39 | feature_df = pd.read_csv(feature_map[path][feature_type]) 40 | filter_metadata = metadata[metadata["path_keys"] == path] 41 | merged_df = pd.concat( 42 | [ 43 | merged_df, 44 | pd.merge( 45 | filter_metadata, 46 | feature_df, 47 | how="left", 48 | left_on=metadata_filepath_col, 49 | right_on=feature_filepath_col, 50 | ), 51 | ], 52 | axis=0, 53 | ).reset_index(drop=True) 54 | 55 | #drop the feature path column if it is not the same as the metadata path column 56 | if feature_filepath_col != metadata_filepath_col: 57 | merged_df = merged_df.drop(columns=[feature_filepath_col]) 58 | 59 | #drop the path_keys columnß 60 | merged_df = merged_df.drop(columns=["path_keys"]) 61 | return merged_df 62 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureGenerator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import random 5 | from tqdm import tqdm 6 | from pathlib import Path 7 | import opensmile 8 | 9 | # base_path 10 | base_path = "/home/ubuntu" 11 | 12 | 13 | class smileFeatureGenerator: 14 | # initiate the class to generate openSMILE ComParE_2016 functionals 15 | def __init__( 16 | self, 17 | data_path: str, 18 | feature_set=opensmile.FeatureSet.ComParE_2016, 19 | feature_level=opensmile.FeatureLevel.Functionals, 20 | ) -> None: 21 | self.data_path = data_path 22 | self.feature_extractor = opensmile.Smile( 23 | feature_set=feature_set, feature_level=feature_level 24 | ) 25 | # store the wav files in a list 26 | self.wav_list = self._getWavList() 27 | assert len(self.wav_list) > 0, "No wav files found in data path" 28 | 29 | # private method to iterate through the data path and store the wav files in a list 30 | def _getWavList(self): 31 | wav_list = [] 32 | for file_name in os.listdir(self.data_path): 33 | if ".wav" in file_name.lower(): 34 | wav_list.append(file_name) 35 | 
return wav_list 36 | 37 | # generate openSMILE features 38 | def generateFeatures(self): 39 | print("Generating openSMILE features...\n") 40 | 41 | self.smile_df = pd.DataFrame() 42 | 43 | for i in tqdm(range(len(self.wav_list))): 44 | file_path = os.path.join(self.data_path, self.wav_list[i]) 45 | try: 46 | features = self.feature_extractor.process_file(file_path).reset_index() 47 | except: 48 | print("Error processing file: {}".format(file_path)) 49 | continue 50 | 51 | # compute file duration 52 | duration = features["end"] - features["start"] 53 | duration = duration.astype("timedelta64[ms]") / 1000 54 | features.insert(1, "duration(seconds)", duration) 55 | 56 | features.drop(columns=["start", "end"], inplace=True) 57 | 58 | self.smile_df = pd.concat([self.smile_df, features]).reset_index(drop=True) 59 | 60 | print("\nopenSMILE features generated... call saveFeatures(filename)\n") 61 | 62 | # save the feature to disk for loading during experiments 63 | def saveFeatures(self, filename: str): 64 | self.smile_df.to_csv(filename, index=False) 65 | print("Features saved to {}\n".format(filename)) 66 | -------------------------------------------------------------------------------- /src/packages/ElevenLabsDeepFakeGenerator.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import requests 3 | import os 4 | 5 | from packages.BaseDeepFakeGenerator import BaseDeepFakeGenerator 6 | from packages.AudioManager import AudioManager 7 | 8 | #class used to generate deepfakes using the ElevenLabs API 9 | class ElevenLabsDeepFakeGenerator(BaseDeepFakeGenerator): 10 | def __init__(self): 11 | super().__init__() 12 | self.api_key = self._load_API_key() 13 | 14 | def _load_API_key( 15 | self, config_path="/home/ubuntu/MultiModalDeepFake/Configs/secret/config.yaml" 16 | ): 17 | with open(config_path, "r") as file: 18 | inputs = yaml.safe_load(file) 19 | xi_api_key = inputs["eleven_labs_api_key"] 20 | return xi_api_key 21 | 22 | def generateDeepFakeFromDataFrame( 23 | self, 24 | dataframe_path: str, 25 | output_dir: str, 26 | source_col: str, 27 | transcript_col: str, 28 | voice_id: str, 29 | voice_name: str = None, 30 | convert_audio_to_format: str = None, 31 | punc_to_remove: list = None, 32 | ): 33 | file_names, transcripts = self.loadTextFromDataFrame( 34 | dataframe_path=dataframe_path, 35 | source_col=source_col, 36 | transcript_col=transcript_col, 37 | punc_to_remove=punc_to_remove, 38 | ) 39 | print(file_names) 40 | if convert_audio_to_format: 41 | audio_manager = AudioManager() 42 | 43 | for idx, transcript in enumerate(transcripts): 44 | try: 45 | audio_clip = self.generateDeepfake(voice_id=voice_id, text=transcript) 46 | 47 | file_name = file_names[idx].replace( 48 | os.path.splitext(file_names[idx])[1], ".mpeg" 49 | ) 50 | with open(os.path.join(output_dir, file_name), "wb") as f: 51 | f.write(audio_clip.content) 52 | f.close() 53 | except Exception as e: 54 | print(f"Failed to Generate DeepFake for {file_names[idx]}") 55 | print(f"Error: {str(e)}") 56 | print() 57 | 58 | if convert_audio_to_format: 59 | audio_manager.convertAudioFileTypes( 60 | os.path.join(output_dir, file_name), 61 | output_format=convert_audio_to_format, 62 | delete_original=True, 63 | ) 64 | 65 | def generateDeepfake(self, voice_id, text): 66 | headers = { 67 | "accept": "audio/mpeg", 68 | "xi-api-key": self.api_key, 69 | "Content-Type": "application/json", 70 | } 71 | 72 | data = f'{{"text": "{text}"}}' 73 | 74 | return requests.post( 75 | 
f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}", 76 | headers=headers, 77 | data=data, 78 | ) 79 | -------------------------------------------------------------------------------- /src/packages/playhtDataGenerator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | from IPython.display import Audio 5 | import os 6 | import time 7 | import json 8 | import requests 9 | 10 | base_path = "/home/ubuntu/" 11 | 12 | 13 | class PlayHTVoiceClone: 14 | # refresh token -- this is on GitHub 15 | def __init__( 16 | self, credentials_json="/home/ubuntu/configs/playht_api_credentials.json" 17 | ) -> None: 18 | with open(filepath, "r") as f: 19 | api_credentials = json.load(f) 20 | 21 | self.authorization = api_credentials["Authorization"] 22 | self.user_id = api_credentials["X-User-ID"] 23 | self.base_url = "https://play.ht/api/v1/" 24 | 25 | # hold urls here for now 26 | self.convert_url = self.base_url + "convert" 27 | self.download_url = self.base_url + "articleStatus" 28 | 29 | self.headers = api_credentials 30 | # temporarily set content type to json 31 | self.headers["Content-Type"] = "application/json" 32 | 33 | def select_cloned_voice(self): 34 | self.cloned_voices_url = self.base_url + "getClonedVoices" 35 | 36 | self.cloned_voice_resp = requests.get( 37 | self.cloned_voices_url, headers=self.headers 38 | ) 39 | 40 | self.cloned_voice_id = self.cloned_voice_resp.json()["clonedVoices"][0]["id"] 41 | self.cloned_voice_name = self.cloned_voice_resp.json()["clonedVoices"][0][ 42 | "name" 43 | ] 44 | print("Cloned voice name: {}".format(self.cloned_voice_name)) 45 | 46 | def run_tts(self, text): 47 | tid = self._start_conversion(text) 48 | print("_start_conversion completed!! 
tid: {}".format(tid)) 49 | 50 | audio_url = self._poll_status(tid) 51 | 52 | print(audio_url) 53 | 54 | # self._download_audio(audio_url) 55 | 56 | def _start_conversion(self, text): 57 | payload = {"voice": self.cloned_voice_id} 58 | payload["content"] = [text] 59 | 60 | convert_payload = json.dumps(payload) 61 | 62 | converted_voice_resp = requests.post( 63 | self.convert_url, headers=self.headers, data=convert_payload 64 | ) 65 | 66 | return converted_voice_resp.json()["transcriptionId"] 67 | 68 | def _poll_status(self, tid): 69 | url = self.download_url + f"?transcriptionId={tid}" 70 | 71 | delay = 5 72 | 73 | print("Polling status loop started") 74 | 75 | while True: 76 | # get response 77 | download_resp = requests.get(url, headers=self.headers) 78 | # check if transcription is complete 79 | msg = download_resp.json().get("message") 80 | print(f"Messsage: {msg}") 81 | 82 | if msg == "Transcription completed": 83 | audio_url = download_resp.json().get("audioUrl") 84 | return audio_url 85 | break 86 | 87 | # if not, wait and try again 88 | print("wait and try again") 89 | time.sleep(delay) 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /src/packages/BayesSearch.py: -------------------------------------------------------------------------------- 1 | from sklearn.gaussian_process import GaussianProcessRegressor 2 | import pandas as pd 3 | import numpy as np 4 | from scipy.optimize import minimize 5 | from scipy.stats import norm 6 | 7 | 8 | # implemented using 9 | # https://towardsdatascience.com/bayesian-optimization-a-step-by-step-approach-a1cb678dd2ec 10 | class BayesSearch: 11 | def __init__( 12 | self, 13 | data, 14 | target_function, 15 | sampling_function, 16 | n_iter, 17 | init_ex_count=20, 18 | gp_ex_count=1000, 19 | ): 20 | self.target_function = target_function 21 | self.gp_reg = GaussianProcessRegressor() 22 | self.output = pd.DataFrame( 23 | columns=["WindowSize", "SilenceThreshold", "Acc", "EI"] 24 | ) ## SB_COmment - any reason for EI over UCB or PI (Upper Confidence Bound/Probability of Improvement) 25 | self.sampling_function = sampling_function 26 | self.n_iter = n_iter 27 | self.data = data 28 | self.init_ex_count = init_ex_count 29 | self.gp_ex_count = gp_ex_count 30 | self.distances_ = [] 31 | 32 | self.h = None 33 | self.y = None 34 | self._generate_initial() 35 | 36 | def _generate_initial(self): 37 | print(f"Initializing the {self.init_ex_count} hyper-parameters") 38 | 39 | self.h = self.sampling_function(self.init_ex_count) 40 | self.y = self.target_function(self.h, self.data) 41 | 42 | def expected_improvement(self, h_new): 43 | mean_y_new, sigma_y_new = self.gp_reg.predict( 44 | np.array([h_new]), return_std=True 45 | ) 46 | sigma_y_new = sigma_y_new.reshape(-1, 1) 47 | if sigma_y_new == 0.0: 48 | return 0.0 49 | 50 | mean_y = self.gp_reg.predict(self.h) 51 | max_mean_y = np.max(mean_y) 52 | z = (mean_y_new - max_mean_y) / sigma_y_new 53 | exp_imp = (mean_y_new - max_mean_y) * norm.cdf(z) + sigma_y_new * 
norm.pdf(z) 54 | 55 | return exp_imp 56 | 57 | def next_params(self, explore_exploit_ratio=0.2): 58 | min_ei = np.inf 59 | max_ei = 0 60 | h_optimal = None 61 | h_new_sample = self.sampling_function(self.gp_ex_count) 62 | 63 | for x_new in h_new_sample: 64 | # response = minimize(fun=self.expected_improvement, x0=x_new, method='L-BFGS-B') 65 | # if response.fun < min_ei: 66 | # min_ei = response.fun 67 | # h_optimal = response.x 68 | exp_imp = self.expected_improvement(x_new) 69 | if exp_imp < min_ei: 70 | min_ei = exp_imp 71 | h_optimal = x_new 72 | if exp_imp > max_ei: 73 | max_ei = exp_imp 74 | h_optimal = x_new 75 | 76 | print("Optimal H: ", h_optimal) 77 | 78 | if np.random.rand() < explore_exploit_ratio: 79 | return h_optimal, max_ei 80 | else: 81 | return h_optimal, min_ei 82 | 83 | def optimize(self): 84 | y_max_ind = np.argmax(self.y) 85 | y_max = self.y[y_max_ind] 86 | optimal_h = self.h[y_max_ind] 87 | optimal_ei = None 88 | 89 | for i in range(self.n_iter): 90 | self.gp_reg.fit(self.h, self.y) 91 | h_next, ei = self.next_params() 92 | y_next = self.target_function(np.array([h_next]), self.data) 93 | print("acc: ", y_next) 94 | 95 | self.h = np.concatenate((self.h, np.array([h_next]))) 96 | self.y = np.concatenate((self.y, np.array(y_next))) 97 | 98 | if y_next[0] > y_max: 99 | y_max = y_next[0] 100 | optimal_h = h_next 101 | optimal_ei = ei 102 | 103 | if i == 0: 104 | prev_h = h_next 105 | else: 106 | self.distances_.append(np.linalg.norm(prev_h - h_next)) 107 | prev_h = h_next 108 | 109 | # self.best_samples_ = self.best_samples_.append({"y": y_max, "ei": optimal_ei},ignore_index=True) 110 | 111 | return optimal_h, y_max 112 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | # local packages 5 | from packages.SavedFeatureLoader import loadFeatures 6 | from packages.SmileFeatureSelector import * 7 | 8 | # list of valid feature selectors 9 | VALID_FEATURE_SELECTORS = ["random_forest"] 10 | 11 | 12 | class SmileFeatureManager: 13 | # initialize the class 14 | def __init__(self, data) -> None: 15 | self.data = data 16 | self.metadata_cols = data.columns 17 | self.loadSavedFeatures() 18 | 19 | # load saved features from disk into a dataframe 20 | # feature generation is done for the data separately and features are saved to disk 21 | def loadSavedFeatures(self): 22 | self.feature_df = loadFeatures( 23 | self.data.copy(), "openSmile", feature_filepath_col="file" 24 | ) 25 | # drop duration column since it is not used as a feature 26 | self.feature_df = self.feature_df.drop(columns=["duration(seconds)"]) 27 | 28 | # generate the final dataframe with selected features 29 | def generateFeatureDf(self, feature_selector_type, label_type, feature_count=10): 30 | assert ( 31 | feature_selector_type in VALID_FEATURE_SELECTORS 32 | ), f"{feature_selector_type} not valid. 
Valid types include {VALID_FEATURE_SELECTORS}" 33 | assert label_type in [ 34 | "binary", 35 | "multiclass", 36 | ], "Label type must be either binary or multiclass" 37 | 38 | if feature_selector_type == "random_forest": 39 | # set random state for reproducibility while selecting features 40 | selector = smileFeatureSelectFromModel( 41 | self.feature_df, 42 | metadata=list(self.metadata_cols), 43 | model=RandomForestClassifier(random_state=12), 44 | ) 45 | 46 | # features for the binary classification task 47 | if label_type == "binary": 48 | df = selector.select_features_binary( 49 | max_features=feature_count, return_df=True 50 | ) 51 | # features for the multiclass classification task 52 | else: 53 | df = selector.select_features_multiclass( 54 | max_features=feature_count, return_df=True 55 | ) 56 | 57 | return df 58 | -------------------------------------------------------------------------------- /src/packages/CadenceUtils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | from math import trunc 4 | from scipy import signal 5 | from numpy import diff 6 | 7 | def filter_signal(audio, sr, low_pass_filter_cutoff): 8 | 9 | # Smooth signal with low pass filter, the parameters for which were tuned locally 10 | t = np.arange(len(audio)) / sr 11 | w = low_pass_filter_cutoff / (sr / 2) 12 | b, a = signal.butter(5, w, "low") 13 | smoothed_signal = signal.filtfilt(b, a, audio) 14 | 15 | return smoothed_signal 16 | 17 | def get_amplitude(audio, window_size, silence_threshold, sr, low_pass_filter_cutoff): 18 | 19 | # Generate amplitude features 20 | abs_audio = abs(audio) 21 | smoothed_signal = filter_signal(abs_audio, sr, low_pass_filter_cutoff) 22 | 23 | deriv_amplitude = np.mean(diff(smoothed_signal)) 24 | mean_amplitude = np.mean(smoothed_signal) 25 | 26 | return { 27 | "abs_deriv_amplitude": abs(deriv_amplitude), 28 | "mean_amplitude": mean_amplitude, 29 | } 30
| 31 | 32 | def normalize_audio_amplitudes(paths): 33 | 34 | # Normalize amplitudes to be within [-1, 1] according to max absolute value 35 | normalized_audio = [] 36 | for file in paths: 37 | sample = librosa.load(file)[0] 38 | max_abs = np.max(np.abs(sample)) 39 | normalized_sample = sample / max_abs 40 | normalized_audio.append(normalized_sample) 41 | 42 | return normalized_audio 43 | 44 | 45 | def truncate_silences( 46 | normalized_audio, 47 | window_size, 48 | silence_threshold, 49 | sr=None, 50 | low_pass_filter_cutoff=None, 51 | counter=0, 52 | ): 53 | # Remove start and end silences from clips 54 | start_ids = [] 55 | end_ids = [] 56 | truncated_audio = [] 57 | 58 | for audio in normalized_audio: 59 | truncation_id_start = None 60 | truncation_id_end = None 61 | 62 | counter += 1 63 | if counter % 100 == 0: 64 | print( 65 | f"Truncating audio {counter}/{len(normalized_audio)} ({round(counter*100/len(normalized_audio))}%)" 66 | ) 67 | 68 | for j in range(len(audio)): 69 | roll_average = np.mean(np.abs(audio[j : j + window_size])) 70 | if roll_average > silence_threshold: 71 | truncation_id_start = j 72 | break 73 | 74 | for j in reversed(range(len(audio))): 75 | roll_average = np.mean(np.abs(audio[j - window_size : j])) 76 | if roll_average > silence_threshold: 77 | truncation_id_end = j - window_size 78 | break 79 | 80 | if truncation_id_start is not None and truncation_id_end is not None: 81 | truncated_audio.append(audio[truncation_id_start:truncation_id_end]) 82 | start_ids.append(truncation_id_start) 83 | end_ids.append(truncation_id_end) 84 | 85 | return start_ids, end_ids, truncated_audio 86 | 87 | 88 | def moving_average(x, w): 89 | #compute moving average 90 | return np.convolve(x, np.ones(w), "valid") / w 91 | 92 | 93 | def get_silence( 94 | audio, window_size, silence_threshold, sr=None, low_pass_filter_cutoff=None 95 | ): 96 | #computes silent and voiced portions of audio 97 | thresh = max(abs(audio)) * silence_threshold 98 | moving_avg = moving_average(abs(audio), window_size) 99 | silent = np.where(abs(moving_avg) < thresh) 100 | voiced = np.where(abs(moving_avg) >= thresh) 101 | 102 | # Get percentage of silence and voiced 103 | pct_pause = len(silent[0]) * 100 / (len(silent[0]) + len(voiced[0])) 104 | pct_voiced = len(voiced[0]) * 100 / (len(silent[0]) + len(voiced[0])) 105 | 106 | if len(voiced[0]) == 0: 107 | ratio_pause_voiced = None 108 | else: 109 | ratio_pause_voiced = len(silent[0]) / len(voiced[0]) 110 | 111 | return { 112 | "pct_pause": pct_pause, 113 | "pct_voiced": pct_voiced, 114 | "ratio_pause_voiced": ratio_pause_voiced, 115 | } 116 | 117 | 118 | def get_silence_spread( 119 | audio, window_size, silence_threshold, sr=None, low_pass_filter_cutoff=None 120 | ): 121 | 122 | thresh = max(abs(audio)) * silence_threshold 123 | moving_avg = moving_average(abs(audio), window_size) 124 | 125 | silent_windows = np.where(moving_avg < thresh) 126 | moving_avg[silent_windows] = 0 127 | silence_count = 0 128 | silence_counts = [] 129 | 130 | for i in range(len(moving_avg) - 1): 131 | item = moving_avg[i] 132 | next_item = moving_avg[i + 1] 133 | 134 | if item != 0 and next_item == 0: 135 | silence_count = 0 136 | 137 | elif item == 0 and next_item == 0: 138 | silence_count += 1 139 | 140 | elif item == 0 and next_item != 0: 141 | silence_counts.append(silence_count) 142 | 143 | else: 144 | continue 145 | 146 | # Get spreads/means and normalise 147 | spread_of_silences = np.std(silence_counts) / len(moving_avg) 148 | mean_of_silences = np.mean(silence_counts) / 
len(moving_avg) 149 | n_pauses = len(silence_counts) 150 | 151 | return { 152 | "spread_of_silences": spread_of_silences, 153 | "mean_of_silences": mean_of_silences, 154 | "silence_counts": silence_counts, 155 | "n_pauses": n_pauses, 156 | } 157 | -------------------------------------------------------------------------------- /src/packages/LJDataLoader.py: -------------------------------------------------------------------------------- 1 | from random import random, sample, seed 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # helper function 6 | def loadExistingFile(file_path): 7 | return pd.read_csv(file_path) 8 | 9 | class LJDataLoader: 10 | # initialization 11 | def __init__( 12 | self, data_path: str, id_col: str = "id", filter_cols: list = [] 13 | ) -> None: 14 | assert ".csv" in data_path, "Data Path should be a csv file." 15 | self.metadata = pd.read_csv(data_path) 16 | self._validateData() 17 | # self._filterCols(filter_cols) 18 | self.id_col = id_col 19 | 20 | # data validation 21 | def _validateData(self): 22 | self.metadata = self.metadata.dropna().reset_index() 23 | 24 | # filtering columns 25 | def _filterCols(self, filter_cols): 26 | for col in filter_cols: 27 | self.metadata = self.metadata[self.metadata[col] == 0] 28 | 29 | # data sampling 30 | def sample(self, perc: float = 0.1): 31 | self.metadata = self.metadata.sample(frac=perc, ignore_index=True) 32 | 33 | # splitting data into train, dev, and test sets 34 | def splitData( 35 | self, train_perc=0.6, dev_perc=0.2, test_perc=0.2, shuffle: bool = True 36 | ): 37 | assert train_perc + dev_perc + test_perc == 1, "train_perc, dev_perc, and test_perc must sum to 1" 38 | 39 | if shuffle: 40 | self.metadata = self.metadata.sample( 41 | frac=1, ignore_index=True, random_state=12 42 | ) 43 | 44 | self.metadata["type"] = None 45 | 46 | train_idx, dev_idx = int(self.metadata.shape[0] * train_perc), int( 47 | self.metadata.shape[0] * (train_perc + dev_perc) 48 | ) 49 | 50 | self.metadata.loc[:train_idx, "type"] = "train" 51 | self.metadata.loc[train_idx:dev_idx, "type"] = "dev" 52 | self.metadata.loc[dev_idx:, "type"] = "test" 53 | 54 | # selecting random architecture from a list of columns containing architecture names for mixing data 55 | def selectRandomArchitecture(self, target_col: str, source_cols: list): 56 | def randomlySelectCols(rw): 57 | # setting random seed for reproducibility 58 | # np.random.seed(12) 59 | rand_idx = np.random.randint(0, len(source_cols)) 60 | return rw[source_cols[rand_idx]] 61 | 62 | self.metadata[target_col] = self.metadata.apply( 63 | lambda row: randomlySelectCols(row), axis=1 64 | ) 65 | 66 | # generating final dataframe for experiments 67 | def generateFinalDataFrame( 68 | self, 69 | real_col: str, 70 | fake_cols: list, 71 | single_id_entry: bool = False, 72 | balanced: bool = False, 73 | ): 74 | agg_cols = [real_col] + fake_cols 75 | 76 | if single_id_entry: 77 | filter_df = self.metadata[agg_cols].copy() 78 | multiclass_labels = np.random.randint( 79 | 0, len(agg_cols), filter_df.shape[0] 80 | ).reshape(filter_df.shape[0], -1) 81 | chosen_data = np.take_along_axis( 82 | filter_df.to_numpy(), multiclass_labels, axis=1 83 | ).squeeze() 84 | multiclass_labels = multiclass_labels.squeeze() 85 | labels = np.where( 86 | multiclass_labels == 0, 0, 1 87 | ) # in the future, may need to double check that this works for varying column orders 88 | architectures = [agg_cols[i] for i in multiclass_labels] 89 | return pd.DataFrame( 90 | { 91 | "path": chosen_data, 92 | "label": labels, 93 | "multiclass_label": multiclass_labels, 94 | "type": 
self.metadata["type"], 95 | "id": self.metadata["id"], 96 | "architecture": architectures, 97 | } 98 | ) 99 | 100 | filter_df = self.metadata[agg_cols + ["type", "id"]].copy() 101 | output = pd.melt( 102 | filter_df, 103 | id_vars=["type", "id"], 104 | value_vars=agg_cols, 105 | value_name="path", 106 | var_name="architecture", 107 | ) 108 | output["label"] = np.where(output["architecture"] == real_col, 0, 1) 109 | multiclass_map = {k: v for v, k in enumerate(agg_cols)} 110 | output["multiclass_label"] = output["architecture"].map(multiclass_map) 111 | # output = output.drop(columns=['architecture']) 112 | 113 | ### balancing code ## 114 | if balanced: 115 | seed(4) 116 | 117 | binary_class_labels = output["label"] 118 | real_indices = list(np.where(binary_class_labels == 0)[0]) 119 | fake_indices = list(np.where(binary_class_labels == 1)[0]) 120 | 121 | # Apply random sampling to rebalance data 122 | # NOTE: currently using equal p(sample) from each all fake samples. 123 | # E.g. we just random sample from all with a 1 class. 124 | if len(real_indices) < len(fake_indices): 125 | fake_indices = sample(fake_indices, len(real_indices)) 126 | elif len(real_indices) > len(fake_indices): 127 | real_indices = sample(real_indices, len(fake_indices)) 128 | 129 | output = output.iloc[real_indices + fake_indices, :].sort_index() 130 | 131 | ### END ### 132 | return output 133 | -------------------------------------------------------------------------------- /src/packages/SmileFeatureSelector.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import opensmile 3 | from tqdm import tqdm 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.feature_selection import SelectFromModel 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | # base_path 9 | base_path = "/home/ubuntu/" 10 | 11 | ############################################################################################ 12 | # Base Class ############################################################################### 13 | ############################################################################################ 14 | 15 | class smileFeatureSelectorBase: 16 | # initialize the class to select features 17 | def __init__( 18 | self, df, metadata, standardize: bool = True, scaler=StandardScaler() 19 | ) -> None: 20 | print("Initializing data...") 21 | 22 | self.data = df 23 | self.metadata = metadata 24 | self.all_features = self.data.drop(columns=self.metadata).columns 25 | 26 | self.train_df = self.data[self.data["type"] == "train"].copy() 27 | self.dev_df = self.data[self.data["type"] == "dev"].copy() 28 | self.test_df = self.data[self.data["type"] == "test"].copy() 29 | 30 | ## standardize the features inside the train, dev, and test sets for the selected features 31 | if standardize: 32 | print("Standardizing features...") 33 | cols_to_scale = list(self.all_features) 34 | scaler.fit(self.train_df[cols_to_scale]) 35 | self.train_df.loc[:, cols_to_scale] = scaler.transform( 36 | self.train_df.loc[:, cols_to_scale] 37 | ) 38 | self.dev_df.loc[:, cols_to_scale] = scaler.transform( 39 | self.dev_df.loc[:, cols_to_scale] 40 | ) 41 | self.test_df.loc[:, cols_to_scale] = scaler.transform( 42 | self.test_df.loc[:, cols_to_scale] 43 | ) 44 | self.scaler = scaler 45 | else: 46 | self.scaler = None 47 | 48 | # print('smileFeatureSelector object initialized.\n') 49 | 50 | ############################################################################################ 51 | # 
Feature Selection From Model ############################################################# 52 | ############################################################################################ 53 | class smileFeatureSelectFromModel(smileFeatureSelectorBase): 54 | def __init__( 55 | self, df, metadata, standardize: bool = True, model=RandomForestClassifier() 56 | ): 57 | """ 58 | Initialize the smileFeatureSelectorBruteForce class. 59 | """ 60 | # initialize the base class 61 | super().__init__(df, metadata, standardize) 62 | 63 | # load the model to use for brute force feature selection 64 | self.model = model 65 | print("smileFeatureSelectFromModel object initialized.\n") 66 | 67 | # ... (rest of the methods in smileFeatureSelectFromModel) 68 | def select_features_binary( 69 | self, 70 | max_features=10, 71 | return_df=False, 72 | print_features=True, 73 | return_features=False, 74 | ): 75 | """ 76 | Selects the top num_features features based on the model specified 77 | """ 78 | 79 | # for binary classification 80 | sfm_features = self._run_sfm( 81 | self.train_df, self.dev_df, max_features, multiclass=False 82 | ) 83 | self.binary_feature_set = set(sfm_features) 84 | 85 | if print_features: 86 | print("\nSelected features:.\n") 87 | for count, item in enumerate(self.binary_feature_set): 88 | print("{}. {}".format(count + 1, item)) 89 | 90 | if return_features: 91 | return list(self.binary_feature_set) 92 | 93 | if return_df: 94 | return self.data[ 95 | self.data.columns.intersection( 96 | self.metadata + list(self.binary_feature_set) 97 | ) 98 | ], list(self.binary_feature_set) 99 | 100 | def select_features_multiclass( 101 | self, 102 | max_features=10, 103 | archs="all_archs", 104 | return_df=False, 105 | print_features=True, 106 | return_features=False, 107 | ): 108 | # for multiclass classification 109 | sfm_features = self._run_sfm( 110 | self.train_df, self.dev_df, max_features, multiclass=True 111 | ) 112 | self.multiclass_feature_set = set(sfm_features) 113 | 114 | if print_features: 115 | print("\nSelected features:.\n") 116 | for count, item in enumerate(self.multiclass_feature_set): 117 | print("{}. 
{}".format(count + 1, item)) 118 | 119 | if return_features: 120 | return list(self.multiclass_feature_set) 121 | 122 | if return_df: 123 | return self.data[ 124 | self.data.columns.intersection( 125 | self.metadata + list(self.multiclass_feature_set) 126 | ) 127 | ], list(self.multiclass_feature_set) 128 | 129 | def _run_sfm(self, trdf, dvdf, max_features, multiclass=False): 130 | # split train data into X and y 131 | X_train = trdf.drop(columns=self.metadata).copy() 132 | if multiclass: 133 | y_train = trdf["multiclass_label"].copy() 134 | else: 135 | y_train = trdf["label"].copy() 136 | 137 | # instantiating the model and fitting it 138 | sfm_model = SelectFromModel(self.model, max_features=max_features) 139 | sfm_model.fit(X_train, y_train) 140 | 141 | # getting the selected features 142 | sfm_features = list(X_train.columns[sfm_model.get_support()]) 143 | return sfm_features 144 | import pandas as pd 145 | -------------------------------------------------------------------------------- /src/packages/ModelManager.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.metrics import accuracy_score, log_loss, roc_curve 3 | import pandas as pd 4 | from sklearn.svm import SVC 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.neighbors import KNeighborsClassifier 7 | from sklearn.tree import DecisionTreeClassifier 8 | from sklearn.ensemble import RandomForestClassifier 9 | import numpy as np 10 | from scipy.optimize import brentq 11 | from scipy.interpolate import interp1d 12 | 13 | VALID_MODELS = ["svm", "logreg", "knn", "decision_tree", "random_forest"] 14 | 15 | 16 | class ModelManager: 17 | def __init__(self, model_name, data, feature_cols, merge_train_dev: bool = False): 18 | self.model_name = model_name 19 | self.data = data 20 | self._splitDataframe(merge_train_dev=merge_train_dev) 21 | self.init_model() 22 | self.feature_cols = feature_cols 23 | 24 | #model initialization 25 | def init_model(self, params=None): 26 | assert ( 27 | self.model_name.lower() in VALID_MODELS 28 | ), f"{self.model_name} is not valid. 
Valid models include {VALID_MODELS}" 29 | 30 | if self.model_name == "svm": 31 | if params is None: 32 | self.model = SVC() 33 | else: 34 | self.model = SVC(**params) 35 | elif self.model_name == "logreg": 36 | if params is None: 37 | self.model = LogisticRegression() 38 | else: 39 | self.model = LogisticRegression(**params) 40 | elif self.model_name == "knn": 41 | if params is None: 42 | self.model = KNeighborsClassifier() 43 | else: 44 | self.model = KNeighborsClassifier(**params) 45 | elif self.model_name == "decision_tree": 46 | if params is None: 47 | self.model = DecisionTreeClassifier() 48 | else: 49 | self.model = DecisionTreeClassifier(**params) 50 | elif self.model_name == "random_forest": 51 | if params is None: 52 | self.model = RandomForestClassifier(random_state=12) 53 | else: 54 | self.model = RandomForestClassifier(**params) 55 | 56 | def _splitDataframe(self, merge_train_dev: bool): 57 | 58 | if merge_train_dev: 59 | self.train = self.data[ 60 | (self.data.type == "train") | (self.data.type == "dev") 61 | ] 62 | self.dev = None 63 | else: 64 | self.train = self.data[(self.data.type == "train")] 65 | self.dev = self.data[(self.data.type == "dev")] 66 | 67 | self.test = self.data[(self.data.type == "test")] 68 | 69 | def trainModel(self, label_col: str): 70 | # Train the model using the training data 71 | self.y_train = self.train[label_col] 72 | self.X_train = self.train[self.feature_cols].copy() 73 | 74 | self.X_train.to_csv("/home/ubuntu/features.csv", index=False) 75 | 76 | self.model.fit(self.X_train, self.y_train) 77 | 78 | def predict(self, label_col: str): 79 | # Make predictions on the test data 80 | self.y_test = self.test[label_col] 81 | self.X_test = self.test[self.feature_cols].copy() 82 | 83 | self.y_pred = self.model.predict(self.X_test) 84 | 85 | # Calculate accuracy and log loss 86 | self.accuracy = accuracy_score(self.y_test, self.y_pred) 87 | 88 | self.class_accuracy = {} 89 | cls_y_test = self.y_test.copy() 90 | cls_y_test = cls_y_test.reset_index(drop=True) 91 | for cls in range(len(set(self.y_test))): 92 | cls_name = self.data.loc[ 93 | self.data[label_col] == cls, "architecture" 94 | ].unique()[0] 95 | 96 | cls_idx = np.where(self.y_test == cls)[0] 97 | cls_test = cls_y_test[cls_idx] 98 | cls_pred = self.y_pred[cls_idx] 99 | self.class_accuracy[cls_name] = accuracy_score(cls_test, cls_pred) 100 | 101 | self.eer_score, self.eer_threshold = None, None 102 | 103 | if self.model_name not in ["svm"]: 104 | self.y_prob = self.model.predict_proba(self.X_test) 105 | self.log_loss_value = log_loss(self.y_test, self.y_prob) 106 | 107 | #calculate eer score 108 | if "multi" not in label_col: 109 | self.eer_score, self.eer_threshold = self.calculate_eer() 110 | 111 | return ( 112 | self.accuracy, 113 | self.log_loss_value, 114 | self.eer_score, 115 | self.eer_threshold, 116 | ) 117 | 118 | self.log_loss_value = None 119 | return self.accuracy, self.log_loss_value, self.eer_score, self.eer_threshold 120 | 121 | #train and predict using model 122 | def trainPredict(self, label_col: str): 123 | self.trainModel(label_col=label_col) 124 | acc, log_loss, eer_score, eer_threshold = self.predict(label_col=label_col) 125 | return acc, log_loss, eer_score, eer_threshold 126 | 127 | def plotRocCurve(self): 128 | # Create a ROC curve plot 129 | fpr, tpr, _ = roc_curve(self.y_test, self.y_prob[:, 1]) 130 | plt.plot(fpr, tpr) 131 | plt.xlabel("False Positive Rate") 132 | plt.ylabel("True Positive Rate") 133 | plt.title("ROC Curve") 134 | plt.show() 135 | 136 | def 
plotProbaDistribution(self): 137 | # Create a histogram of test set probability scores 138 | plt.hist(self.y_prob) 139 | plt.xlabel("Probability Score") 140 | plt.ylabel("Frequency") 141 | plt.title("Test Set Probability Score Distribution") 142 | plt.show() 143 | 144 | def calculate_eer(self): 145 | # Calculate the False Positive Rate (FPR) and True Positive Rate (TPR) 146 | fpr, tpr, thresholds = roc_curve(self.y_test, self.y_prob[:, 1], pos_label=1) 147 | 148 | # Interpolate the FPR and TPR values 149 | interpolated = interp1d(fpr, tpr) 150 | 151 | # Find the point where FAR and FRR are equal (EER) 152 | eer = brentq(lambda x: 1.0 - x - interpolated(x), 0.0, 1.0) 153 | 154 | optimal_threshold = thresholds[np.nanargmin(np.abs((1.0 - tpr) - fpr))] 155 | 156 | return eer, optimal_threshold 157 | -------------------------------------------------------------------------------- /src/packages/AudioManager.py: -------------------------------------------------------------------------------- 1 | from pydub import AudioSegment 2 | import os 3 | from packages.LibrosaManager import LibrosaManager 4 | import soundfile as sf 5 | import librosa 6 | import numpy as np 7 | import random 8 | import shutil 9 | 10 | #utilities for converting audio files to appropriate sample rates 11 | #and for performing adversarial laundering 12 | class AudioManager: 13 | def __init__(self) -> None: 14 | pass 15 | 16 | def convertAudioDirectory( 17 | self, 18 | audio_dir: str, 19 | input_format: str, 20 | output_format: str = ".wav", 21 | output_dir: str = None, 22 | delete_original: bool = False, 23 | bitrate: str = None, 24 | codec: str = None, 25 | ): 26 | for file in os.listdir(audio_dir): 27 | if input_format in file: 28 | self.convertAudioFileTypes( 29 | os.path.join(audio_dir, file), 30 | output_format=output_format, 31 | delete_original=delete_original, 32 | output_dir=output_dir, 33 | bitrate=bitrate, 34 | codec=codec, 35 | ) 36 | 37 | def convertAudioFileTypes( 38 | self, 39 | audio_path: str, 40 | output_format: str = ".wav", 41 | delete_original: bool = False, 42 | output_dir: str = None, 43 | output_file_name: str = None, 44 | bitrate: str = None, 45 | codec: str = None, 46 | ): 47 | assert output_format in [ 48 | ".wav", 49 | ".mp4", 50 | ], f"{output_format} is an invalid output format. Please enter types: (.wav, .mp4)." 
51 | try: 52 | import_audio = AudioSegment.from_file(audio_path) 53 | 54 | if isinstance(output_file_name, type(None)): 55 | output_file_name = os.path.basename(audio_path) 56 | output_file_name = output_file_name.replace( 57 | os.path.splitext(output_file_name)[1], output_format 58 | ) 59 | 60 | if not output_dir: 61 | output_dir = os.path.dirname(audio_path) 62 | 63 | import_audio.export( 64 | os.path.join(output_dir, output_file_name), 65 | format=output_format.replace(".", ""), 66 | codec=codec, 67 | bitrate=bitrate, 68 | ) 69 | 70 | if delete_original: 71 | os.remove(audio_path) 72 | 73 | except Exception as e: 74 | print(f"Failed to Convert Audio File: {audio_path}") 75 | print("Error: ", e) 76 | 77 | #resampling 78 | def resampleAudioDirectory( 79 | self, 80 | input_directory: str, 81 | output_directory: str, 82 | target_sample_rate: int, 83 | replace_existing: bool = False, 84 | ): 85 | for file in os.listdir(input_directory): 86 | if os.path.splitext(file)[1] not in [".wav", ".mp4", ".WAV"]: 87 | continue 88 | 89 | if not replace_existing: 90 | if os.path.isfile(os.path.join(output_directory, file)): 91 | continue 92 | 93 | try: 94 | librosa_manager = LibrosaManager(os.path.join(input_directory, file)) 95 | resampled_audio = librosa_manager.resample( 96 | target_sample_rate 97 | ) ## SB_Comment - see librosa manager re: resampling 98 | sf.write( 99 | os.path.join(output_directory, file), 100 | resampled_audio, 101 | target_sample_rate, 102 | subtype="PCM_24", 103 | ) 104 | except Exception as e: 105 | print(f"Failed to Resample: {file}") 106 | print(f"Error Msg: {e}") 107 | print() 108 | 109 | #function for adding noise to audio 110 | def addNoiseWithSnr(self, audio_path: str, snr_range: list = [10, 80]): 111 | audio, sr = librosa.load( 112 | audio_path 113 | ) 114 | 115 | audio_power = np.mean(audio**2) 116 | 117 | noise_snr = random.randint(snr_range[0], snr_range[1]) 118 | noise_power = audio_power / (10 ** (noise_snr / 10)) 119 | noise = np.random.normal(scale=np.sqrt(noise_power) * 100, size=len(audio)) 120 | 121 | noisy_audio = audio + noise 122 | 123 | return noisy_audio, noise_snr, sr 124 | 125 | #adversarial laundering 126 | def launderAudioDirectory( 127 | self, 128 | input_dir: str, 129 | output_dir: str, 130 | noise_type: str = "random_gaussian", 131 | replace_existing: bool = False, 132 | transcode_prob=0.5, 133 | noise_prob=0.5, 134 | ): 135 | full_launder_details = [] 136 | 137 | # Loop through files for laundering them 138 | for file in os.listdir(input_dir): 139 | 140 | file_launder_details = [os.path.join(input_dir, file), 0, None, 0, None] 141 | 142 | try: 143 | #get flags for laundering 144 | is_transcode = np.random.rand() <= transcode_prob 145 | is_noise = np.random.rand() <= noise_prob 146 | 147 | bitrate_options = ["64k", "127k", "196k"] 148 | 149 | #transcoding 150 | if is_transcode: 151 | bitrate = random.choice(bitrate_options) 152 | 153 | file_launder_details[1] = 1 154 | file_launder_details[2] = bitrate 155 | 156 | self.convertAudioFileTypes( 157 | os.path.join(input_dir, file), 158 | output_dir=output_dir, 159 | output_format=".mp4", 160 | delete_original=False, 161 | bitrate=bitrate, 162 | codec="aac", 163 | ) 164 | 165 | self.convertAudioFileTypes( 166 | os.path.join(output_dir, file.replace("wav", "mp4")), 167 | output_format=".wav", 168 | delete_original=True, 169 | ) 170 | 171 | else: 172 | # if no transcode is necessary, just move the file to the new directory 173 | shutil.copy( 174 | os.path.join(input_dir, file), os.path.join(output_dir, 
file) 175 | ) 176 | 177 | #adding noise 178 | if is_noise: 179 | noisy_audio, noise_snr, sr = self.addNoiseWithSnr( 180 | os.path.join(output_dir, file) 181 | ) 182 | 183 | file_launder_details[3] = 1 184 | file_launder_details[4] = noise_snr 185 | 186 | sf.write( 187 | os.path.join(output_dir, file), noisy_audio, sr 188 | ) 189 | 190 | full_launder_details.append(file_launder_details) 191 | 192 | except Exception as e: 193 | print(f"Failed to add noise: {file}") 194 | print(f"Error Msg: {e}") 195 | print() 196 | 197 | return full_launder_details 198 | -------------------------------------------------------------------------------- /src/run_pipeline_ljspeech.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import os 3 | import sys 4 | import multiprocessing 5 | import pandas as pd 6 | import mlflow 7 | import time 8 | import argparse 9 | 10 | # local packages 11 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 12 | import packages.ExperimentPipeline as ep 13 | 14 | 15 | # function that runs the pipeline asynchonously 16 | def run_pipeline( 17 | fake_cols, 18 | metadata_path, 19 | open_smile_feature_count, 20 | run_name_prefix, 21 | run_tags, 22 | models, 23 | create_df_artifact, 24 | label_type="label", 25 | ) -> None: 26 | # create and run pipeline object 27 | exp = ep.ExperimentPipeline(fake_cols, metadata_path) 28 | exp.generate_features( 29 | feature_method="all", open_smile_feature_count=open_smile_feature_count 30 | ) 31 | exp.train_predict_using_models( 32 | run_name_prefix=run_name_prefix, 33 | run_tags=run_tags, 34 | models=models, 35 | create_df_artifact=create_df_artifact, 36 | label_type=label_type, 37 | ) 38 | 39 | 40 | # main function 41 | def main(experiment_name, open_smile_feature_count, create_df_artifact, num_processes): 42 | # start timing 43 | start_time = time.time() 44 | 45 | print("\nRunning pipeline for experiment: \n", experiment_name) 46 | mlflow.set_experiment(experiment_name) 47 | 48 | print("\nopen_smile_feature_count: \n", open_smile_feature_count) 49 | print("\ncreate_df_artifact: \n", create_df_artifact) 50 | print("\nnum_processes: \n", num_processes) 51 | print( 52 | "\nusing {} processes out of {} available processes: \n".format( 53 | num_processes, multiprocessing.cpu_count() 54 | ) 55 | ) 56 | 57 | # set the models to run 58 | models = ["logreg", "random_forest"] 59 | 60 | #################################### 61 | ##### start mutliprocessing ######## 62 | #################################### 63 | 64 | # Create a pool of worker processes 65 | pool = multiprocessing.Pool(processes=num_processes) 66 | 67 | # list for holding task arguments 68 | task_args = [] 69 | 70 | ###################################### 71 | ##### tasks for unlaundered data ##### 72 | ###################################### 73 | 74 | # mlflow tag setting 75 | run_tags = {"laundered": 0} 76 | # metadata path 77 | metadata_path_unlaundered = ( 78 | "/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv" 79 | ) 80 | 81 | # pipeline params 82 | run_params = {} 83 | run_params["EL"] = ["ElevenLabs"] 84 | run_params["UD"] = ["UberDuck"] 85 | run_params["WF"] = ["RandWaveFake"] 86 | run_params["EL_UD"] = ["ElevenLabs", "UberDuck"] 87 | run_params["EL_UD_WF"] = ["ElevenLabs", "UberDuck", "RandWaveFake"] 88 | run_params["EL_UD_Fake"] = ["EL_UD_Fake"] 89 | run_params["Fake"] = ["Fake"] 90 | 91 | # get the task params for unlaundered data 92 | for run_name_prefix, fake_cols in run_params.items(): 93 | # binary 
classifiaction tasks 94 | if len(fake_cols) == 1: 95 | # get args tuple and append to task_args list 96 | args = ( 97 | fake_cols, 98 | metadata_path_unlaundered, 99 | open_smile_feature_count, 100 | run_name_prefix, 101 | run_tags, 102 | models, 103 | create_df_artifact, 104 | "label", 105 | ) 106 | task_args.append(args) 107 | 108 | # multiclass classification tasks 109 | else: 110 | # get args tuple and append to task_args list 111 | args = ( 112 | fake_cols, 113 | metadata_path_unlaundered, 114 | open_smile_feature_count, 115 | run_name_prefix, 116 | run_tags, 117 | models, 118 | create_df_artifact, 119 | "multiclass_label", 120 | ) 121 | task_args.append(args) 122 | 123 | #################################### 124 | ##### tasks for laundered data ##### 125 | #################################### 126 | 127 | # mlflow tag setting 128 | run_tags = {"laundered": 1} 129 | # metadata path 130 | metadata_path_laundered = ( 131 | "/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv" 132 | ) 133 | 134 | # pipeline params 135 | run_params = {} 136 | run_params["EL"] = ["ElevenLabs"] 137 | run_params["UD"] = ["UberDuck"] 138 | run_params["WF"] = ["RandWaveFake"] 139 | run_params["EL_UD"] = ["ElevenLabs", "UberDuck"] 140 | run_params["EL_UD_WF"] = ["ElevenLabs", "UberDuck", "RandWaveFake"] 141 | run_params["EL_UD_Fake"] = ["EL_UD_Fake"] 142 | run_params["Fake"] = ["Fake"] 143 | 144 | # get the task params for laundered data 145 | for run_name_prefix, fake_cols in run_params.items(): 146 | # binary classifiaction tasks 147 | if len(fake_cols) == 1: 148 | # get args tuple and append to task_args list 149 | args = ( 150 | fake_cols, 151 | metadata_path_laundered, 152 | open_smile_feature_count, 153 | run_name_prefix, 154 | run_tags, 155 | models, 156 | create_df_artifact, 157 | "label", 158 | ) 159 | task_args.append(args) 160 | 161 | # multiclass classification tasks 162 | else: 163 | # get args tuple and append to task_args list 164 | args = ( 165 | fake_cols, 166 | metadata_path_laundered, 167 | open_smile_feature_count, 168 | run_name_prefix, 169 | run_tags, 170 | models, 171 | create_df_artifact, 172 | "multiclass_label", 173 | ) 174 | task_args.append(args) 175 | 176 | #################################### 177 | ##### run multiprocessing ########## 178 | #################################### 179 | 180 | # run the pipeline in parallel 181 | pool.starmap_async(run_pipeline, task_args) 182 | 183 | # close the pool and wait for the work to finish 184 | pool.close() 185 | pool.join() 186 | 187 | # end timing 188 | end_time = time.time() 189 | execution_time_seconds = end_time - start_time 190 | 191 | # convert to minutes 192 | execution_time_minutes = execution_time_seconds / 60 193 | 194 | print("\nAll async pipeline runs complete \n") 195 | print(f"Execution time: {execution_time_minutes} minutes") 196 | 197 | 198 | # main function 199 | if __name__ == "__main__": 200 | # Create an argument parser 201 | parser = argparse.ArgumentParser(description="Run pipeline") 202 | 203 | # Add the command-line arguments 204 | parser.add_argument("experiment_name", type=str, help="Name of the experiment") 205 | parser.add_argument( 206 | "--create_df_artifact", 207 | action="store_true", 208 | help="Flag to enable creating df artifact", 209 | ) 210 | parser.add_argument( 211 | "--open_smile_feature_count", 212 | type=int, 213 | default=20, 214 | help="Value for open smile feature count", 215 | ) 216 | parser.add_argument( 217 | "--num_processes", 218 | type=int, 219 | default=15, 220 | help="Number of 
parallel processes to run", 221 | ) 222 | 223 | # Parse the command-line arguments 224 | args = parser.parse_args() 225 | 226 | # Check if the experiment name is provided 227 | if not args.experiment_name: 228 | parser.error("Experiment name is required.") 229 | 230 | # Extract the arguments 231 | experiment_name = args.experiment_name 232 | create_df_artifact = args.create_df_artifact 233 | open_smile_feature_count = args.open_smile_feature_count 234 | num_processes = args.num_processes 235 | 236 | # Call the main function with the arguments 237 | main(experiment_name, open_smile_feature_count, create_df_artifact, num_processes) 238 | -------------------------------------------------------------------------------- /src/packages/TIMITDataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pathlib 4 | from random import sample, seed, shuffle 5 | import pandas as pd 6 | import numpy as np 7 | 8 | # class for loading TIMIT data for multivoice experiments 9 | class TIMITDataLoader: 10 | def __init__(self, data_path: str, id_col: str = "id") -> None: 11 | self.file_path = data_path 12 | # set seed 13 | seed(12) 14 | 15 | def flatten(self, l): 16 | return [item for sublist in l for item in sublist] 17 | 18 | def get_all_files(self): 19 | files = [] 20 | for r, d, f in os.walk(self.file_path): 21 | for file in f: 22 | if ".wav" in file.lower(): 23 | files.append(os.path.join(r, file)) 24 | 25 | cleaned_files = [item for item in files if not "_processed" in item] 26 | 27 | return cleaned_files 28 | 29 | # generate train-dev-test split 30 | def generate_split(self, folder=False, balanced=True): 31 | if folder: 32 | data_df = self.generateFinalDataFrame_folder() 33 | else: 34 | data_df = self.generateFinalDataFrame(balanced=balanced) 35 | 36 | indices = list(data_df.index) 37 | 38 | shuffle(indices) 39 | total_samples = len(indices) 40 | train_size = int(0.6 * total_samples) 41 | dev_size = int(0.2 * total_samples) 42 | 43 | train_indices = indices[:train_size] 44 | dev_indices = indices[train_size : train_size + dev_size] 45 | test_indices = indices[train_size + dev_size :] 46 | 47 | data_df.loc[train_indices, "type"] = "train" 48 | data_df.loc[dev_indices, "type"] = "dev" 49 | data_df.loc[test_indices, "type"] = "test" 50 | 51 | train_count = data_df[data_df["type"] == "train"].shape[0] 52 | dev_count = data_df[data_df["type"] == "dev"].shape[0] 53 | test_count = data_df[data_df["type"] == "test"].shape[0] 54 | 55 | print(f"# of Train instances: {train_count}") 56 | print(f"# of Dev instances: {dev_count}") 57 | print(f"# of Test instances: {test_count}") 58 | 59 | return data_df 60 | 61 | def generate_split_speaker( 62 | self, speakers_to_remove, folder=False 63 | ): 64 | if folder: 65 | data_df = self.generateFinalDataFrame_folder() 66 | else: 67 | data_df = self.generateFinalDataFrame() 68 | 69 | data_df["speaker"] = [ 70 | item.split("/")[-1].split("_")[0] for item in data_df["path"] 71 | ] 72 | data_df["remove"] = [ 73 | 1 if item in speakers_to_remove else 0 for item in data_df["speaker"] 74 | ] 75 | 76 | data_df_without_test_speakers = data_df[data_df["remove"] == 0] 77 | data_df_with_test_speakers = data_df[data_df["remove"] == 1] 78 | 79 | cleaned_indices = list(data_df_without_test_speakers.index) 80 | removed_indices = list(data_df_with_test_speakers.index) 81 | 82 | shuffle(cleaned_indices) 83 | total_samples = len(cleaned_indices) 84 | train_size = int(0.6 * len(cleaned_indices)) 85 | dev_size = int(0.2 * 
len(cleaned_indices)) 86 | 87 | train_indices = cleaned_indices[:train_size] 88 | dev_indices = cleaned_indices[train_size : train_size + dev_size] 89 | test_indices = cleaned_indices[train_size + dev_size :] 90 | 91 | data_df.loc[train_indices, "type"] = "train" 92 | data_df.loc[dev_indices, "type"] = "dev" 93 | data_df.loc[test_indices, "type"] = "test" 94 | 95 | # Drop the original 'test' indices 96 | data_df = data_df[data_df.type != "test"] 97 | 98 | # Set the left out speakers to be the only 'test' indices 99 | data_df.loc[removed_indices, "type"] = "test" 100 | 101 | # Clean up dataframe 102 | data_df.drop(["remove"], axis=1, inplace=True) 103 | 104 | train_count = data_df[data_df["type"] == "train"].shape[0] 105 | dev_count = data_df[data_df["type"] == "dev"].shape[0] 106 | test_count = data_df[data_df["type"] == "test"].shape[0] 107 | 108 | print(f"# of Train instances: {train_count}") 109 | print(f"# of Dev instances: {dev_count}") 110 | print(f"# of Test instances: {test_count}") 111 | 112 | return data_df.reset_index(drop=True) 113 | 114 | def generateFinalDataFrame(self, balanced: bool = True): 115 | 116 | # Get resampled real and fake files 117 | all_wav_files = pathlib.Path(self.file_path) 118 | all_wav_files = list(all_wav_files.rglob("*.wav")) + list( 119 | all_wav_files.rglob("*.WAV") 120 | ) 121 | 122 | real_resampled_wav_files = [ 123 | str(file) for file in all_wav_files if "real" in str(file) 124 | ] 125 | fake_resampled_wav_files = [ 126 | str(file) for file in all_wav_files if "fake/" in str(file) 127 | ] 128 | 129 | # Extract phrases and file names 130 | final_folders = [] 131 | 132 | for folder in os.listdir(self.file_path): 133 | phrase_files = [ 134 | phrase for phrase in real_resampled_wav_files if folder in phrase 135 | ] 136 | 137 | file_names = set( 138 | [name.split("_")[-1].split(".")[0] for name in phrase_files] 139 | ) 140 | 141 | if len(file_names) > 1: 142 | continue 143 | 144 | # Ensure each file has at least 2 real samples 145 | elif len(phrase_files) > 1: 146 | final_folders.append(folder) 147 | 148 | print(len(final_folders)) 149 | 150 | real_files = [] 151 | fake_files = [] 152 | 153 | print(f"Params: {len(final_folders)} different phrases") 154 | 155 | # Remove any potential duplicates 156 | file_dict = {} 157 | for i in range(len(real_resampled_wav_files)): 158 | file_name = real_resampled_wav_files[i].split("/")[-1] 159 | file_dict[file_name] = real_resampled_wav_files[i] 160 | 161 | real_resampled_wav_files = [file_dict[item] for item in file_dict.keys()] 162 | 163 | for n in range(len(final_folders)): 164 | phrase = final_folders[n] 165 | 166 | real_examples = [ 167 | file for file in real_resampled_wav_files if f"_{phrase}." in file 168 | ] 169 | real_examples = [ 170 | file for file in real_resampled_wav_files if f"/{phrase}/" in file 171 | ] 172 | 173 | fake_examples = [ 174 | file for file in fake_resampled_wav_files if f"_{phrase}." 
in file 175 | ] 176 | fake_examples = [ 177 | file for file in fake_resampled_wav_files if f"/{phrase}/" in file 178 | ] 179 | 180 | # Ensure we take the same number of each phrase for real and fake, downsample the real/fake files accordingly 181 | if len(real_examples) > len(fake_examples): 182 | real_examples = sample(real_examples, len(fake_examples)) 183 | else: 184 | fake_examples = sample(fake_examples, len(real_examples)) 185 | 186 | [real_files.append(file) for file in real_examples] 187 | [fake_files.append(file) for file in fake_examples] 188 | 189 | balanced_real_paths = real_files 190 | balanced_fake_paths = fake_files 191 | 192 | df = pd.DataFrame( 193 | { 194 | "type": [ 195 | "tbc" 196 | for i in range(len(balanced_real_paths) + len(balanced_fake_paths)) 197 | ], 198 | "id": [ 199 | i 200 | for i in range(len(balanced_real_paths) + len(balanced_fake_paths)) 201 | ], 202 | "architecture": [0 for item in balanced_real_paths] 203 | + [1 for item in balanced_fake_paths], 204 | "orig_path": balanced_real_paths + balanced_fake_paths, 205 | "label": [0 for item in balanced_real_paths] 206 | + [1 for item in balanced_fake_paths], 207 | "multiclass_label": [0 for item in balanced_real_paths] 208 | + [1 for item in balanced_fake_paths], 209 | } 210 | ) 211 | 212 | downsampled_src = "/home/ubuntu/data/TIMIT_and_ElevenLabs/16KHz" 213 | orig_paths = df["orig_path"].tolist() 214 | downsampled_paths = [ 215 | os.path.join(downsampled_src, os.path.basename(path)) for path in orig_paths 216 | ] 217 | 218 | df["path"] = downsampled_paths 219 | 220 | return df 221 | -------------------------------------------------------------------------------- /pip_requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | aiohttp==3.8.4 3 | aiosignal==1.3.1 4 | alabaster==0.7.13 5 | alembic==1.11.1 6 | aniso8601==9.0.1 7 | antlr4-python3-runtime==4.9.3 8 | anyio==3.6.2 9 | appdirs==1.4.4 10 | argon2-cffi==21.3.0 11 | argon2-cffi-bindings==21.2.0 12 | astor==0.8.1 13 | asttokens==2.2.1 14 | astunparse==1.6.3 15 | async-timeout==4.0.2 16 | attrdict==2.0.1 17 | attrs==22.2.0 18 | audeer==1.19.0 19 | audformat==0.16.1 20 | audinterface==1.0.1 21 | audiofile==1.2.1 22 | audioread==3.0.0 23 | audmath==1.2.1 24 | audobject==0.7.9 25 | audresample==1.2.1 26 | av==10.0.0 27 | Babel==2.10.3 28 | backcall==0.2.0 29 | bcrypt==4.0.1 30 | beautifulsoup4==4.11.2 31 | black==19.10b0 32 | boto3==1.26.81 33 | botocore==1.29.81 34 | braceexpand==0.1.7 35 | cachetools==5.3.0 36 | certifi @ file:///croot/certifi_1671487769961/work/certifi 37 | cffi==1.15.1 38 | chardet==5.1.0 39 | charset-normalizer==2.1.1 40 | click==8.1.3 41 | cloudpickle==2.2.1 42 | colorama==0.4.6 43 | comm==0.1.3 44 | commonmark==0.9.1 45 | cryptography==41.0.1 46 | cycler==0.11.0 47 | Cython==0.29.33 48 | databricks-cli==0.17.7 49 | datasets==2.10.1 50 | debugpy==1.6.6 51 | decorator==5.1.1 52 | deeplake==3.2.11 53 | dill==0.3.6 54 | Distance==0.1.3 55 | disvoice==0.1.8 56 | disvoice-prosody==0.0.5 57 | dm-tree==0.1.8 58 | docker==6.1.3 59 | docker-pycreds==0.4.0 60 | docopt==0.6.2 61 | docutils==0.19 62 | editdistance==0.6.2 63 | einops==0.6.0 64 | entrypoints @ file:///tmp/build/80754af9/entrypoints_1649926445639/work 65 | etils==1.0.0 66 | evdev==1.6.1 67 | exceptiongroup==1.1.0 68 | executing==1.2.0 69 | faiss-cpu==1.7.3 70 | fastapi==0.92.0 71 | fastjsonschema==2.16.2 72 | fasttext==0.9.2 73 | ffmpy==0.3.0 74 | filelock==3.9.0 75 | Flask==2.2.3 76 | Flask-RESTful==0.3.9 77 | 
flatbuffers==23.1.21 78 | flit_core @ file:///opt/conda/conda-bld/flit-core_1644941570762/work/source/flit_core 79 | fonttools==4.38.0 80 | frozenlist==1.3.3 81 | fsspec==2023.1.0 82 | ftfy==6.1.1 83 | future==0.18.3 84 | g2p-en==2.1.0 85 | gast==0.5.3 86 | gdown==4.6.4 87 | gitdb==4.0.10 88 | GitPython==3.1.31 89 | google-api-core==2.11.0 90 | google-api-python-client==2.83.0 91 | google-auth==1.4.2 92 | google-auth-httplib2==0.1.0 93 | google-auth-oauthlib==0.4.6 94 | google-cloud-texttospeech==2.14.1 95 | googleapis-common-protos==1.58.0 96 | gradio==3.4.0 97 | greenlet==2.0.2 98 | grpcio==1.53.0 99 | grpcio-status==1.53.0 100 | gunicorn==20.1.0 101 | h11==0.12.0 102 | h5py==3.8.0 103 | htmlmin==0.1.12 104 | httpcore==0.15.0 105 | httplib2==0.22.0 106 | httpx==0.23.3 107 | hub==3.0.1 108 | huggingface-hub==0.12.1 109 | humbug==0.2.8 110 | hydra-core==1.2.0 111 | idna==3.4 112 | ijson==3.2.0.post0 113 | ImageHash==4.3.1 114 | imageio==2.4.1 115 | imageio-ffmpeg==0.4.8 116 | imagesize==1.4.1 117 | importlib-metadata==6.1.0 118 | importlib-resources==5.12.0 119 | inflect==6.0.2 120 | iniconfig==2.0.0 121 | ipadic==1.0.0 122 | ipykernel==6.22.0 123 | ipython==8.12.0 124 | ipywidgets==8.0.2 125 | iso-639==0.4.5 126 | iso3166==2.1.1 127 | isort==4.3.21 128 | itsdangerous==2.1.2 129 | jedi==0.18.2 130 | jieba==0.42.1 131 | Jinja2==3.1.2 132 | jiwer==2.5.1 133 | jmespath==1.0.1 134 | joblib==1.2.0 135 | json5==0.9.10 136 | jupyter-server==1.21.0 137 | jupyter_client==8.1.0 138 | jupyter_core==5.3.0 139 | jupyterlab==3.5.0 140 | jupyterlab-pygments==0.2.2 141 | jupyterlab-widgets==3.0.3 142 | jupyterlab_server==2.16.1 143 | kaldi-io==0.9.5 144 | kaldi-python-io==1.2.2 145 | kaldiio==2.17.2 146 | kiwisolver==1.4.4 147 | latexcodec==2.0.1 148 | lazy_loader==0.1 149 | Levenshtein==0.20.2 150 | libclang==15.0.6.1 151 | librosa==0.10.0 152 | lightning-utilities==0.7.1 153 | linkify-it-py==2.0.0 154 | llvmlite==0.39.1 155 | loguru==0.6.0 156 | lxml==4.9.2 157 | Mako==1.2.4 158 | Markdown==3.4.1 159 | markdown-it-py==2.2.0 160 | MarkupSafe==2.1.1 161 | marshmallow==3.19.0 162 | matplotlib==3.5.3 163 | matplotlib-inline==0.1.6 164 | mdit-py-plugins==0.3.4 165 | mdurl==0.1.2 166 | mecab-python3==1.0.5 167 | missingno==0.5.1 168 | mistune==2.0.4 169 | mkl-fft==1.3.1 170 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186064646/work 171 | mkl-service==2.4.0 172 | mlflow==2.4.1 173 | moviepy==1.0.3 174 | mpmath==1.2.1 175 | msgpack==1.0.4 176 | multidict==6.0.4 177 | multimethod==1.9 178 | multiprocess==0.70.14 179 | nbclassic==0.4.7 180 | nbclient==0.7.0 181 | nbconvert==7.2.2 182 | nbformat==5.7.0 183 | nemo-toolkit==1.15.0 184 | nest-asyncio==1.5.6 185 | nltk==3.8.1 186 | notebook_shim==0.2.0 187 | numba==0.56.4 188 | numcodecs==0.11.0 189 | numexpr==2.8.4 190 | numpy==1.23.4 191 | oauth2client==4.1.3 192 | oauthlib==3.2.2 193 | omegaconf==2.2.3 194 | onnx==1.13.1 195 | OpenCC==1.1.6 196 | opensmile==2.4.2 197 | opt-einsum==3.3.0 198 | orjson==3.8.6 199 | oyaml==1.0 200 | packaging==23.0 201 | pandas==1.5.1 202 | pandas-profiling==3.4.0 203 | pandasgui==0.2.14 204 | pandastable==0.13.1 205 | pangu==4.0.6.1 206 | parameterized==0.8.1 207 | paramiko==3.2.0 208 | parso==0.8.3 209 | pathos==0.3.0 210 | pathspec==0.11.0 211 | pathtools==0.1.2 212 | patsy==0.5.3 213 | pexpect==4.8.0 214 | phik==0.12.2 215 | phonet==0.3.7 216 | pickleshare==0.7.5 217 | Pillow==9.4.0 218 | pip-api==0.0.30 219 | pipreqs==0.4.11 220 | plac==1.3.5 221 | platformdirs==3.2.0 222 | plotly==5.15.0 223 | pluggy==1.0.0 224 | 
pooch==1.7.0 225 | portalocker==2.7.0 226 | portpicker==1.2.0 227 | pox==0.3.2 228 | ppft==1.7.6.6 229 | proglog==0.1.10 230 | progress==1.6 231 | promise==2.3 232 | prompt-toolkit==3.0.38 233 | proto-plus==1.22.2 234 | protobuf==3.20.3 235 | psutil==5.9.4 236 | ptyprocess==0.7.0 237 | pure-eval==0.2.2 238 | pyannote.core==5.0.0 239 | pyannote.database==4.1.3 240 | pyannote.metrics==3.2.1 241 | pyarrow==11.0.0 242 | pyasn1==0.4.8 243 | pyasn1-modules==0.2.8 244 | pybind11==2.10.3 245 | pybtex==0.24.0 246 | pybtex-docutils==1.0.2 247 | pycparser==2.21 248 | pycryptodome==3.17 249 | pydantic==1.10.2 250 | PyDrive==1.3.1 251 | pydub==0.25.1 252 | Pygments==2.14.0 253 | PyJWT==2.7.0 254 | PyNaCl==1.5.0 255 | pynput==1.7.6 256 | pyparsing==3.0.9 257 | pypinyin==0.48.0 258 | pypinyin-dict==0.5.0 259 | PyQt5==5.15.9 260 | PyQt5-Qt5==5.15.2 261 | PyQt5-sip==12.12.1 262 | PyQtWebEngine==5.15.6 263 | PyQtWebEngine-Qt5==5.15.2 264 | PySocks==1.7.1 265 | pysptk==0.2.0 266 | pytest==7.2.1 267 | pytest-runner==6.0.0 268 | python-dateutil==2.8.2 269 | python-multipart==0.0.5 270 | python-speech-features==0.6 271 | python-xlib==0.33 272 | pytorch-lightning==1.8.6 273 | pytz==2022.5 274 | PyYAML==6.0 275 | pyzmq==25.0.2 276 | qtstylish==0.1.5 277 | querystring-parser==1.2.4 278 | rapidfuzz==2.13.7 279 | regex==2022.10.31 280 | requests==2.31.0 281 | requests-oauthlib==1.3.1 282 | responses==0.18.0 283 | rfc3986==1.5.0 284 | rich==12.6.0 285 | rsa==4.9 286 | ruamel.yaml==0.17.21 287 | ruamel.yaml.clib==0.2.7 288 | s3transfer==0.6.0 289 | sacrebleu==2.3.1 290 | sacremoses==0.0.53 291 | scikit-learn==1.2.2 292 | scipy==1.9.3 293 | seaborn==0.12.1 294 | Send2Trash==1.8.0 295 | sentence-transformers==2.2.2 296 | sentencepiece==0.1.97 297 | sentry-sdk==1.15.0 298 | setproctitle==1.3.2 299 | shellingham==1.5.0.post1 300 | simplegeneric==0.8.1 301 | six==1.16.0 302 | smmap==5.0.0 303 | sniffio==1.3.0 304 | snowballstemmer==2.2.0 305 | sortedcontainers==2.4.0 306 | soundfile==0.12.1 307 | soupsieve==2.4 308 | sox==1.4.1 309 | soxr==0.3.3 310 | Sphinx==6.1.3 311 | sphinxcontrib-applehelp==1.0.4 312 | sphinxcontrib-bibtex==2.5.0 313 | sphinxcontrib-devhelp==1.0.2 314 | sphinxcontrib-htmlhelp==2.0.1 315 | sphinxcontrib-jsmath==1.0.1 316 | sphinxcontrib-qthelp==1.0.3 317 | sphinxcontrib-serializinghtml==1.1.5 318 | SQLAlchemy==2.0.17 319 | sqlparse==0.4.4 320 | stack-data==0.6.2 321 | starlette==0.25.0 322 | statsmodels==0.13.2 323 | sympy==1.11.1 324 | tabulate==0.9.0 325 | tangled-up-in-unicode==0.2.0 326 | tenacity==8.2.2 327 | tensorboard==2.12.0 328 | tensorboard-data-server==0.7.0 329 | tensorboard-plugin-wit==1.8.1 330 | tensorboardX==2.6 331 | tensorflow==2.11.0 332 | tensorflow-datasets==4.8.3 333 | tensorflow-io==0.31.0 334 | tensorflow-io-gcs-filesystem==0.31.0 335 | tensorflow-metadata==1.12.0 336 | termcolor==2.2.0 337 | terminado==0.13.3 338 | text-unidecode==1.3 339 | textdistance==4.5.0 340 | texterrors==0.4.4 341 | threadpoolctl==3.1.0 342 | tinycss2==1.2.1 343 | tokenizers==0.13.2 344 | toml==0.10.2 345 | tomli==2.0.1 346 | torch==1.13.1 347 | torch-summary==1.4.5 348 | torchaudio==0.13.1 349 | torchmetrics==0.11.1 350 | torchvision==0.2.2 351 | tornado==6.2 352 | tqdm==4.64.1 353 | traitlets==5.9.0 354 | transformers==4.26.1 355 | typed-ast==1.5.4 356 | typer==0.7.0 357 | typing_extensions==4.5.0 358 | uc-micro-py==1.0.1 359 | uritemplate==4.1.1 360 | urllib3==1.26.14 361 | uvicorn==0.20.0 362 | visions==0.7.5 363 | wandb==0.13.10 364 | wcwidth==0.2.6 365 | webdataset==0.1.62 366 | 
websocket-client==1.6.1 367 | websockets==10.4 368 | Werkzeug==2.2.3 369 | wget==3.2 370 | widgetsnbextension==4.0.3 371 | wordcloud==1.9.2 372 | wrapt==1.15.0 373 | xlrd==2.0.1 374 | xxhash==3.2.0 375 | yarg==0.1.9 376 | yarl==1.8.2 377 | yellowbrick==1.5 378 | youtokentome==1.0.6 379 | zipp==3.15.0 380 | -------------------------------------------------------------------------------- /src/run_pipeline_multivoice.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import os 4 | import multiprocessing 5 | import pandas as pd 6 | import mlflow 7 | import time 8 | import argparse 9 | 10 | # local packages 11 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 12 | import packages.ExperimentPipeline as ep 13 | from packages.TIMITDataLoader import TIMITDataLoader 14 | from packages.LJDataLoader import LJDataLoader 15 | from packages.AudioEmbeddingsManager import AudioEmbeddingsManager 16 | from packages.ModelManager import ModelManager 17 | from packages.CadenceModelManager import CadenceModelManager 18 | from packages.SmileFeatureManager import SmileFeatureManager 19 | 20 | # fixed values 21 | timit_data_path = "/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs" 22 | fake_voices = [ 23 | "Adam", 24 | "Antoni", 25 | "Arnold", 26 | "Bella", 27 | "Biden", 28 | "Domi", 29 | "Elli", 30 | "Josh", 31 | "Obama", 32 | "Rachel", 33 | "Sam", 34 | ] 35 | # set the models to run 36 | models = ["logreg", "random_forest"] 37 | 38 | 39 | # helper functions 40 | def chunks(lst, n): 41 | # sort the list 42 | lst.sort() 43 | for i in range(0, len(lst), n): 44 | yield lst[i : i + n] 45 | 46 | 47 | # function that runs the pipeline asynchonously 48 | def run_pipeline( 49 | data_df, open_smile_feature_count, run_name_prefix, run_tags, create_df_artifact 50 | ) -> None: 51 | # create and run pipeline object 52 | exp = ep.ExperimentPipeline( 53 | fake_cols=["ElevenLabs"], metadata_path=None, data_df=data_df 54 | ) 55 | exp.generate_features( 56 | feature_method="all", open_smile_feature_count=open_smile_feature_count 57 | ) 58 | exp.train_predict_using_models( 59 | run_name_prefix=run_name_prefix, 60 | run_tags=run_tags, 61 | models=models, 62 | create_df_artifact=create_df_artifact, 63 | label_type="label", 64 | ) 65 | 66 | 67 | # main function 68 | def main( 69 | experiment_name, 70 | open_smile_feature_count, 71 | create_df_artifact, 72 | num_processes, 73 | save_path, 74 | ): 75 | # start timing 76 | start_time = time.time() 77 | 78 | print("\nRunning pipeline for experiment: \n", experiment_name) 79 | mlflow.set_experiment(experiment_name) 80 | 81 | print("\nopen_smile_feature_count: \n", open_smile_feature_count) 82 | print("\ncreate_df_artifact: \n", create_df_artifact) 83 | print("\nnum_processes: \n", num_processes) 84 | print( 85 | "\nusing {} processes out of {} available processes: \n".format( 86 | num_processes, multiprocessing.cpu_count() 87 | ) 88 | ) 89 | 90 | # load the timit data 91 | timit_data_loader = TIMITDataLoader(timit_data_path) 92 | # generate the split 93 | df = timit_data_loader.generate_split() 94 | # get speakers 95 | df["speaker"] = [item.split("/")[-1].split("_")[0] for item in df["path"]] 96 | 97 | # create partitions 98 | real_speakers = list( 99 | set([item for item in df["speaker"] if not item.startswith(tuple(fake_voices))]) 100 | ) 101 | fake_speakers = list( 102 | set([item for item in df["speaker"] if item.startswith(tuple(fake_voices))]) 103 | ) 104 | 105 | real_speaker_partitions = 
list(chunks(real_speakers, 20)) 106 | fake_speaker_partitions = list(chunks(fake_speakers, 2)) 107 | 108 | #################################### 109 | ##### start mutliprocessing ######## 110 | #################################### 111 | 112 | # Create a pool of worker processes 113 | pool = multiprocessing.Pool(processes=num_processes) 114 | 115 | # list for holding task arguments 116 | task_args = [] 117 | 118 | ###################################### 119 | ############# create tasks ########### 120 | ###################################### 121 | 122 | # counter for labeling runs 123 | counter = 1 124 | 125 | # loop through the partitions to remove voices 126 | for fake_speaker_chunk in fake_speaker_partitions: 127 | for real_speaker_chunk in real_speaker_partitions: 128 | # voices to remove 129 | voices_to_remove = fake_speaker_chunk + real_speaker_chunk 130 | 131 | # re-instantiate the loader 132 | timit_data_loader = TIMITDataLoader(timit_data_path) 133 | 134 | # generating split speaker test from the 135 | data_df = timit_data_loader.generate_split_speaker( 136 | voices_to_remove, folder=False 137 | ) 138 | 139 | # other task arguments 140 | run_name_prefix = f"multivoice_run_{counter}" 141 | run_tags = {"voices_to_remove": voices_to_remove} 142 | 143 | # arguments for the task 144 | args = ( 145 | data_df, 146 | open_smile_feature_count, 147 | run_name_prefix, 148 | run_tags, 149 | create_df_artifact, 150 | ) 151 | 152 | task_args.append(args) 153 | 154 | counter += 1 155 | 156 | #################################### 157 | ##### run multiprocessing ########## 158 | #################################### 159 | 160 | # run the pipeline in parallel 161 | pool.starmap_async(run_pipeline, task_args) 162 | 163 | # close the pool and wait for the work to finish 164 | pool.close() 165 | pool.join() 166 | 167 | #################################### 168 | ####### aggregate results ########## 169 | #################################### 170 | 171 | # get all the runs for the experiment 172 | experiment = mlflow.get_experiment_by_name(experiment_name) 173 | experiment_id = experiment.experiment_id 174 | runs = mlflow.search_runs(experiment_ids=experiment_id) 175 | 176 | # aggregate results and save to csv 177 | agg_results = ( 178 | runs.groupby(["tags.feature_method", "tags.estimator_name", "tags.label_type"])[ 179 | "metrics.accuracy", 180 | "metrics.0_accuracy", 181 | "metrics.1_accuracy", 182 | "metrics.eer_score", 183 | ] 184 | .mean() 185 | .reset_index() 186 | ) 187 | new_column_names = { 188 | "tags.feature_method": "feature_method", 189 | "tags.estimator_name": "estimator_name", 190 | "tags.label_type": "label_type", 191 | "metrics.accuracy": "accuracy", 192 | "metrics.0_accuracy": "real_accuracy", 193 | "metrics.1_accuracy": "fake_accuracy", 194 | "metrics.eer_score": "eer_score", 195 | } 196 | if save_path.lower().endswith(".csv"): 197 | agg_results.to_csv(save_path) 198 | else: 199 | agg_results.to_csv(save_path + f"/results_{experiment_name}.csv", index=False) 200 | 201 | print("\nAggregated results saved to: \n", save_path) 202 | 203 | #################################### 204 | ######### end the script ########### 205 | #################################### 206 | 207 | # end timing 208 | end_time = time.time() 209 | execution_time_seconds = end_time - start_time 210 | 211 | # convert to minutes 212 | execution_time_minutes = execution_time_seconds / 60 213 | 214 | print("\nAll async pipeline runs complete \n") 215 | print(f"Execution time: {execution_time_minutes} minutes") 216 | 217 | 218 | 
# main function 219 | if __name__ == "__main__": 220 | # Create an argument parser 221 | parser = argparse.ArgumentParser(description="Run pipeline") 222 | 223 | # Add the command-line arguments 224 | parser.add_argument("experiment_name", type=str, help="Name of the experiment") 225 | parser.add_argument( 226 | "--create_df_artifact", 227 | action="store_true", 228 | help="Flag to enable creating df artifact", 229 | ) 230 | parser.add_argument( 231 | "--open_smile_feature_count", 232 | type=int, 233 | default=10, 234 | help="Value for open smile feature count", 235 | ) 236 | parser.add_argument( 237 | "--num_processes", 238 | type=int, 239 | default=15, 240 | help="Number of parallel processes to run", 241 | ) 242 | parser.add_argument( 243 | "--save_path", 244 | type=str, 245 | default="results_multivoice.csv", 246 | help="Path of the CSV file to save", 247 | ) 248 | 249 | # Parse the command-line arguments 250 | args = parser.parse_args() 251 | 252 | # Check if the experiment name is provided 253 | if not args.experiment_name: 254 | parser.error("Experiment name is required.") 255 | 256 | # Extract the arguments 257 | experiment_name = args.experiment_name 258 | create_df_artifact = args.create_df_artifact 259 | open_smile_feature_count = args.open_smile_feature_count 260 | num_processes = args.num_processes 261 | save_path = args.save_path 262 | 263 | # Call the main function with the arguments 264 | main( 265 | experiment_name, 266 | open_smile_feature_count, 267 | create_df_artifact, 268 | num_processes, 269 | save_path, 270 | ) 271 | -------------------------------------------------------------------------------- /conda_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | _openmp_mutex=5.1=1_gnu 6 | absl-py=1.4.0=pypi_0 7 | aiohttp=3.8.4=pypi_0 8 | aiosignal=1.3.1=pypi_0 9 | alabaster=0.7.13=pypi_0 10 | alembic=1.11.1=pypi_0 11 | aniso8601=9.0.1=pypi_0 12 | antlr4-python3-runtime=4.9.3=pypi_0 13 | appdirs=1.4.4=pypi_0 14 | astor=0.8.1=pypi_0 15 | asttokens=2.2.1=pypi_0 16 | astunparse=1.6.3=pypi_0 17 | async-timeout=4.0.2=pypi_0 18 | attrdict=2.0.1=pypi_0 19 | attrs=22.2.0=pypi_0 20 | audioread=3.0.0=pypi_0 21 | backcall=0.2.0=pyhd3eb1b0_0 22 | bcrypt=4.0.1=pypi_0 23 | beautifulsoup4=4.11.2=pypi_0 24 | black=19.10b0=pypi_0 25 | blas=1.0=mkl 26 | boto3=1.26.79=pypi_0 27 | botocore=1.29.79=pypi_0 28 | braceexpand=0.1.7=pypi_0 29 | ca-certificates=2023.01.10=h06a4308_0 30 | cachetools=5.3.0=pypi_0 31 | certifi=2022.12.7=py38h06a4308_0 32 | cffi=1.15.1=pypi_0 33 | chardet=5.1.0=pypi_0 34 | click=8.0.2=pypi_0 35 | cloudpickle=2.2.1=pypi_0 36 | colorama=0.4.6=pypi_0 37 | comm=0.1.2=py38h06a4308_0 38 | commonmark=0.9.1=pypi_0 39 | cryptography=39.0.1=pypi_0 40 | cudatoolkit=11.3.1=h2bc3f7f_2 41 | cycler=0.11.0=pypi_0 42 | cython=0.29.33=pypi_0 43 | databricks-cli=0.17.7=pypi_0 44 | debugpy=1.6.6=pypi_0 45 | decorator=5.1.1=pyhd3eb1b0_0 46 | distance=0.1.3=pypi_0 47 | docker=6.1.3=pypi_0 48 | docker-pycreds=0.4.0=pypi_0 49 | docopt=0.6.2=pypi_0 50 | docutils=0.19=pypi_0 51 | editdistance=0.6.2=pypi_0 52 | einops=0.6.0=pypi_0 53 | entrypoints=0.4=py38h06a4308_0 54 | evdev=1.6.1=pypi_0 55 | exceptiongroup=1.1.0=pypi_0 56 | executing=1.2.0=pypi_0 57 | faiss-cpu=1.7.3=pypi_0 58 | fastapi=0.92.0=pypi_0 59 | fasttext=0.9.2=pypi_0 60 | ffmpy=0.3.0=pypi_0 61 | filelock=3.9.0=pypi_0 62 | flask=2.2.3=pypi_0 
63 | flask-restful=0.3.9=pypi_0 64 | flit-core=3.6.0=pyhd3eb1b0_0 65 | freetype=2.12.1=h4a9f257_0 66 | frozenlist=1.3.3=pypi_0 67 | fsspec=2023.1.0=pypi_0 68 | ftfy=6.1.1=pypi_0 69 | future=0.18.3=pypi_0 70 | g2p-en=2.1.0=pypi_0 71 | gast=0.5.3=pypi_0 72 | gdown=4.6.4=pypi_0 73 | giflib=5.2.1=h5eee18b_3 74 | gitdb=4.0.10=pypi_0 75 | gitpython=3.1.31=pypi_0 76 | google-auth=2.16.1=pypi_0 77 | google-auth-oauthlib=0.4.6=pypi_0 78 | gradio=3.4.0=pypi_0 79 | greenlet=2.0.2=pypi_0 80 | grpcio=1.51.3=pypi_0 81 | gunicorn=20.1.0=pypi_0 82 | h11=0.12.0=pypi_0 83 | h5py=3.8.0=pypi_0 84 | httpcore=0.15.0=pypi_0 85 | httpx=0.23.3=pypi_0 86 | huggingface-hub=0.12.1=pypi_0 87 | hydra-core=1.2.0=pypi_0 88 | idna=3.4=pypi_0 89 | ijson=3.2.0.post0=pypi_0 90 | imagesize=1.4.1=pypi_0 91 | importlib-resources=5.12.0=pypi_0 92 | inflect=6.0.2=pypi_0 93 | iniconfig=2.0.0=pypi_0 94 | intel-openmp=2021.4.0=h06a4308_3561 95 | ipadic=1.0.0=pypi_0 96 | ipykernel=6.21.2=pypi_0 97 | ipython=8.10.0=pypi_0 98 | isort=4.3.21=pypi_0 99 | itsdangerous=2.1.2=pypi_0 100 | jedi=0.18.2=pypi_0 101 | jieba=0.42.1=pypi_0 102 | jiwer=2.5.1=pypi_0 103 | jmespath=1.0.1=pypi_0 104 | jpeg=9e=h7f8727e_0 105 | jupyter-core=5.2.0=pypi_0 106 | jupyter_client=7.4.8=py38h06a4308_0 107 | jupyter_core=5.1.1=py38h06a4308_0 108 | kaldi-python-io=1.2.2=pypi_0 109 | kaldiio=2.17.2=pypi_0 110 | kiwisolver=1.4.4=pypi_0 111 | latexcodec=2.0.1=pypi_0 112 | lazy-loader=0.1=pypi_0 113 | lcms2=2.12=h3be6417_0 114 | lerc=3.0=h295c915_0 115 | levenshtein=0.20.2=pypi_0 116 | libdeflate=1.8=h7f8727e_5 117 | libedit=3.1.20221030=h5eee18b_0 118 | libffi=3.2.1=hf484d3e_1007 119 | libgcc-ng=11.2.0=h1234567_1 120 | libgomp=11.2.0=h1234567_1 121 | libpng=1.6.37=hbc83047_0 122 | librosa=0.10.0=pypi_0 123 | libsodium=1.0.18=h7b6447c_0 124 | libstdcxx-ng=11.2.0=h1234567_1 125 | libtiff=4.5.0=h6a678d5_1 126 | libwebp=1.2.4=h11a3e52_1 127 | libwebp-base=1.2.4=h5eee18b_1 128 | lightning-utilities=0.7.1=pypi_0 129 | linkify-it-py=2.0.0=pypi_0 130 | llvmlite=0.39.1=pypi_0 131 | loguru=0.6.0=pypi_0 132 | lxml=4.9.2=pypi_0 133 | lz4-c=1.9.4=h6a678d5_0 134 | mako=1.2.4=pypi_0 135 | markdown=3.4.1=pypi_0 136 | markdown-it-py=2.2.0=pypi_0 137 | marshmallow=3.19.0=pypi_0 138 | matplotlib-inline=0.1.6=py38h06a4308_0 139 | mdit-py-plugins=0.3.4=pypi_0 140 | mdurl=0.1.2=pypi_0 141 | mecab-python3=1.0.5=pypi_0 142 | mkl=2021.4.0=h06a4308_640 143 | mkl-service=2.4.0=py38h7f8727e_0 144 | mkl_fft=1.3.1=py38hd3c417c_0 145 | mkl_random=1.2.2=py38h51133e4_0 146 | mlflow=2.4.1=pypi_0 147 | mpmath=1.2.1=pypi_0 148 | msgpack=1.0.4=pypi_0 149 | multidict=6.0.4=pypi_0 150 | ncurses=6.4=h6a678d5_0 151 | nemo-toolkit=1.15.0=pypi_0 152 | nest-asyncio=1.5.6=py38h06a4308_0 153 | nltk=3.8.1=pypi_0 154 | numba=0.56.4=pypi_0 155 | numexpr=2.8.4=pypi_0 156 | numpy=1.23.5=py38h14f4228_0 157 | numpy-base=1.23.5=py38h31eccc5_0 158 | oauthlib=3.2.2=pypi_0 159 | omegaconf=2.2.3=pypi_0 160 | onnx=1.13.1=pypi_0 161 | opencc=1.1.6=pypi_0 162 | openssl=1.1.1s=h7f8727e_0 163 | opt-einsum=3.3.0=pypi_0 164 | orjson=3.8.6=pypi_0 165 | packaging=22.0=py38h06a4308_0 166 | pandasgui=0.2.14=pypi_0 167 | pandastable=0.13.1=pypi_0 168 | pangu=4.0.6.1=pypi_0 169 | parameterized=0.8.1=pypi_0 170 | paramiko=3.0.0=pypi_0 171 | parso=0.8.3=pyhd3eb1b0_0 172 | pathspec=0.11.0=pypi_0 173 | pathtools=0.1.2=pypi_0 174 | pexpect=4.8.0=pyhd3eb1b0_3 175 | pickleshare=0.7.5=pyhd3eb1b0_1003 176 | pillow=9.3.0=py38h6a678d5_2 177 | pip=23.1.2=pypi_0 178 | pip-api=0.0.30=pypi_0 179 | pipreqs=0.4.11=pypi_0 180 | plac=1.3.5=pypi_0 181 | 
platformdirs=3.0.0=pypi_0 182 | plotly=5.15.0=pypi_0 183 | pluggy=1.0.0=pypi_0 184 | pooch=1.6.0=pypi_0 185 | portalocker=2.7.0=pypi_0 186 | progress=1.6=pypi_0 187 | prompt-toolkit=3.0.37=pypi_0 188 | protobuf=3.20.3=pypi_0 189 | psutil=5.9.4=pypi_0 190 | ptyprocess=0.7.0=pyhd3eb1b0_2 191 | pure_eval=0.2.2=pyhd3eb1b0_0 192 | pyannote-core=5.0.0=pypi_0 193 | pyannote-database=4.1.3=pypi_0 194 | pyannote-metrics=3.2.1=pypi_0 195 | pyasn1=0.4.8=pypi_0 196 | pyasn1-modules=0.2.8=pypi_0 197 | pybind11=2.10.3=pypi_0 198 | pybtex=0.24.0=pypi_0 199 | pybtex-docutils=1.0.2=pypi_0 200 | pycparser=2.21=pypi_0 201 | pycryptodome=3.17=pypi_0 202 | pydub=0.25.1=pypi_0 203 | pygments=2.11.2=pyhd3eb1b0_0 204 | pyjwt=2.7.0=pypi_0 205 | pynacl=1.5.0=pypi_0 206 | pynput=1.7.6=pypi_0 207 | pyparsing=3.0.9=pypi_0 208 | pypinyin=0.48.0=pypi_0 209 | pypinyin-dict=0.5.0=pypi_0 210 | pyqt5=5.15.9=pypi_0 211 | pyqt5-qt5=5.15.2=pypi_0 212 | pyqt5-sip=12.12.1=pypi_0 213 | pyqtwebengine=5.15.6=pypi_0 214 | pyqtwebengine-qt5=5.15.2=pypi_0 215 | pysocks=1.7.1=pypi_0 216 | pytest=7.2.1=pypi_0 217 | pytest-runner=6.0.0=pypi_0 218 | python=3.8.0=h0371630_2 219 | python-dateutil=2.8.2=pyhd3eb1b0_0 220 | python-multipart=0.0.5=pypi_0 221 | python-xlib=0.33=pypi_0 222 | pytorch=1.13.1=py3.8_cpu_0 223 | pytorch-lightning=1.8.6=pypi_0 224 | pytorch-mutex=1.0=cpu 225 | pyyaml=5.4.1=pypi_0 226 | pyzmq=23.2.0=py38h6a678d5_0 227 | qtstylish=0.1.5=pypi_0 228 | querystring-parser=1.2.4=pypi_0 229 | rapidfuzz=2.13.7=pypi_0 230 | readline=7.0=h7b6447c_5 231 | regex=2022.10.31=pypi_0 232 | requests=2.31.0=pypi_0 233 | requests-oauthlib=1.3.1=pypi_0 234 | rfc3986=1.5.0=pypi_0 235 | rich=12.6.0=pypi_0 236 | rsa=4.9=pypi_0 237 | ruamel-yaml=0.17.21=pypi_0 238 | ruamel-yaml-clib=0.2.7=pypi_0 239 | s3transfer=0.6.0=pypi_0 240 | sacrebleu=2.3.1=pypi_0 241 | sacremoses=0.0.53=pypi_0 242 | scikit-learn=1.2.1=pypi_0 243 | sentence-transformers=2.2.2=pypi_0 244 | sentencepiece=0.1.97=pypi_0 245 | sentry-sdk=1.15.0=pypi_0 246 | setproctitle=1.3.2=pypi_0 247 | setuptools=59.5.0=pypi_0 248 | shellingham=1.5.0.post1=pypi_0 249 | six=1.16.0=pyhd3eb1b0_1 250 | smmap=5.0.0=pypi_0 251 | snowballstemmer=2.2.0=pypi_0 252 | sortedcontainers=2.4.0=pypi_0 253 | soundfile=0.12.1=pypi_0 254 | soupsieve=2.4=pypi_0 255 | sox=1.4.1=pypi_0 256 | soxr=0.3.3=pypi_0 257 | sphinx=6.1.3=pypi_0 258 | sphinxcontrib-applehelp=1.0.4=pypi_0 259 | sphinxcontrib-bibtex=2.5.0=pypi_0 260 | sphinxcontrib-devhelp=1.0.2=pypi_0 261 | sphinxcontrib-htmlhelp=2.0.1=pypi_0 262 | sphinxcontrib-jsmath=1.0.1=pypi_0 263 | sphinxcontrib-qthelp=1.0.3=pypi_0 264 | sphinxcontrib-serializinghtml=1.1.5=pypi_0 265 | sqlalchemy=2.0.17=pypi_0 266 | sqlite=3.33.0=h62c20be_0 267 | sqlparse=0.4.4=pypi_0 268 | stack-data=0.6.2=pypi_0 269 | stack_data=0.2.0=pyhd3eb1b0_0 270 | starlette=0.25.0=pypi_0 271 | sympy=1.11.1=pypi_0 272 | tabulate=0.9.0=pypi_0 273 | tenacity=8.2.2=pypi_0 274 | tensorboard=2.12.0=pypi_0 275 | tensorboard-data-server=0.7.0=pypi_0 276 | tensorboard-plugin-wit=1.8.1=pypi_0 277 | tensorboardx=2.6=pypi_0 278 | termcolor=2.2.0=pypi_0 279 | text-unidecode=1.3=pypi_0 280 | textdistance=4.5.0=pypi_0 281 | texterrors=0.4.4=pypi_0 282 | threadpoolctl=3.1.0=pypi_0 283 | tk=8.6.12=h1ccaba5_0 284 | tokenizers=0.12.1=pypi_0 285 | toml=0.10.2=pypi_0 286 | torchaudio=0.13.1=py38_cpu 287 | torchmetrics=0.11.1=pypi_0 288 | torchvision=0.2.2=py_3 289 | tornado=6.2=py38h5eee18b_0 290 | traitlets=5.7.1=py38h06a4308_0 291 | transformers=4.21.2=pypi_0 292 | typed-ast=1.5.4=pypi_0 293 | 
typer=0.7.0=pypi_0 294 | typing_extensions=4.4.0=py38h06a4308_0 295 | uc-micro-py=1.0.1=pypi_0 296 | urllib3=1.26.16=pypi_0 297 | uvicorn=0.20.0=pypi_0 298 | wandb=0.13.10=pypi_0 299 | wcwidth=0.2.6=pypi_0 300 | webdataset=0.1.62=pypi_0 301 | websocket-client=1.6.1=pypi_0 302 | websockets=10.4=pypi_0 303 | werkzeug=2.2.3=pypi_0 304 | wget=3.2=pypi_0 305 | wheel=0.38.4=py38h06a4308_0 306 | wordcloud=1.9.2=pypi_0 307 | wrapt=1.15.0=pypi_0 308 | xlrd=2.0.1=pypi_0 309 | xz=5.2.10=h5eee18b_1 310 | yarg=0.1.9=pypi_0 311 | yarl=1.8.2=pypi_0 312 | youtokentome=1.0.6=pypi_0 313 | zeromq=4.3.4=h2531618_0 314 | zipp=3.15.0=pypi_0 315 | zlib=1.2.13=h5eee18b_0 316 | zstd=1.5.2=ha4553b6_0 317 | -------------------------------------------------------------------------------- /src/packages/CadenceModelManager.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import pandas as pd 4 | import os 5 | import librosa 6 | import numpy as np 7 | from sklearn.preprocessing import MinMaxScaler 8 | from sklearn.tree import DecisionTreeClassifier 9 | from sklearn.model_selection import cross_val_score 10 | import json 11 | 12 | # local packages 13 | from packages.SavedFeatureLoader import loadFeatures 14 | from packages.CadenceUtils import * 15 | from packages.BayesSearch import BayesSearch 16 | 17 | 18 | class CadenceModelManager: 19 | def __init__( 20 | self, data, low_pass_filter_cutoff: int = 10, trunc_window_size: int = 100 21 | ) -> None: 22 | self.data = data 23 | self.low_pass_filter_cutoff = low_pass_filter_cutoff 24 | 25 | # assume fixed sampling rate for all files 26 | self.sr = sr = librosa.load(self.data["path"][0])[1] 27 | 28 | # generate cadence features 29 | def generate_features(self, window_size, silence_threshold, paths): 30 | 31 | window_size = int(window_size) 32 | 33 | # Normalise amplitudes 34 | print("Normalizing amplitudes") 35 | norm_audio = normalize_audio_amplitudes(paths) 36 | 37 | # Truncate silences 38 | print("Truncating silences") 39 | _, _, trunc_audio = truncate_silences( 40 | norm_audio, window_size, silence_threshold 41 | ) 42 | # Extract pauses 43 | print("Extracting pauses") 44 | pauses = self.run_all_files( 45 | get_silence, window_size, silence_threshold, trunc_audio 46 | ) 47 | 48 | # Extract pause spreads 49 | print("Extracting pause spreads") 50 | silence_spreads = self.run_all_files( 51 | get_silence_spread, window_size, silence_threshold, trunc_audio 52 | ) 53 | 54 | # Extract amplitude and derivative 55 | print("Extracting amplitude features") 56 | amps = self.run_all_files( 57 | get_amplitude, window_size, silence_threshold, trunc_audio 58 | ) 59 | 60 | # Create dataframe 61 | print("Creating dataframe") 62 | features = pd.DataFrame( 63 | { 64 | "pause_ratio": [item["ratio_pause_voiced"] for item in pauses], 65 | "pause_mean": [item["mean_of_silences"] for item in silence_spreads], 66 | "pause_std": [item["spread_of_silences"] for item in silence_spreads], 67 | "n_pauses": [item["n_pauses"] for item in silence_spreads], 68 | "amp_deriv": [item["abs_deriv_amplitude"] for item in amps], 69 | "amp_mean": [item["mean_amplitude"] for item in amps], 70 | } 71 | ) 72 | 73 | print("Complete") 74 | 75 | return features 76 | 77 | # run the cadence feature extraction pipeline 78 | def run_cadence_feature_extraction_pipeline( 79 | self, 80 | window_size=None, 81 | silence_threshold=None, 82 | data=None, 83 | scaler=None, 84 | fill_na=None, 85 | regenerate_features: bool = False, 86 | ): 87 | # feature 
regeneration block 88 | if regenerate_features: 89 | if data is None: 90 | features = self.generate_features( 91 | window_size, silence_threshold, self.data["path"] 92 | ) 93 | full_df = pd.concat((self.data, features), axis=1) 94 | else: 95 | features = self.generate_features( 96 | window_size, silence_threshold, data["path"] 97 | ) 98 | full_df = pd.concat((data, features), axis=1) 99 | feature_cols = list(features.columns) 100 | 101 | # if features are not being regenerated, load them from the saved features 102 | # this block is tested and used in pipeline 103 | else: 104 | full_df = loadFeatures(self.data.copy(), "cadence") 105 | feature_cols = list(set(full_df.columns) ^ set(self.data.columns)) 106 | 107 | full_df, scaler = self.normalize_data(full_df, feature_cols, scaler=scaler) 108 | 109 | if fill_na is not None: 110 | full_df = full_df.fillna(fill_na) 111 | 112 | return full_df, feature_cols, scaler 113 | 114 | # data normalization using minmaxscaler 115 | def normalize_data(self, full_df, feature_cols, scaler=None): 116 | if scaler is None: 117 | scaler = MinMaxScaler() 118 | full_df.loc[ 119 | full_df["type"] == "train", feature_cols 120 | ] = scaler.fit_transform( 121 | full_df.loc[full_df["type"] == "train", feature_cols] 122 | ) 123 | full_df.loc[~(full_df["type"] == "train"), feature_cols] = scaler.transform( 124 | full_df.loc[~(full_df["type"] == "train"), feature_cols] 125 | ) 126 | else: 127 | full_df.loc[:, list(features.columns)] = scaler.transform( 128 | full_df.loc[:, list(features.columns)] 129 | ) 130 | 131 | return full_df, scaler 132 | 133 | # helper function to run a function on all files 134 | def run_all_files(self, function, window_size, silence_threshold, truncated_audio): 135 | results = [] 136 | for item in truncated_audio: 137 | results.append( 138 | function( 139 | item, 140 | window_size, 141 | silence_threshold, 142 | self.sr, 143 | self.low_pass_filter_cutoff, 144 | ) 145 | ) 146 | return results 147 | 148 | # target function for bayesian optimization 149 | def target_function( 150 | self, 151 | data, 152 | window_size, 153 | silence_threshold, 154 | label_col="label", 155 | model=DecisionTreeClassifier(random_state=12), 156 | ): 157 | features, feature_cols, _ = self.run_cadence_feature_extraction_pipeline( 158 | window_size, 159 | silence_threshold, 160 | data=data, 161 | fill_na=-1, 162 | regenerate_features=True, 163 | ) 164 | X = features[feature_cols] 165 | y = features[label_col] 166 | return cross_val_score(model, X, y, cv=10).mean() 167 | 168 | # run target function on a set of parameters 169 | def run_target_function(self, z, data): 170 | scores = [] 171 | for i in range(z.shape[0]): 172 | window_size, silence_threshold = int(z[i, 0]), z[i, 1] 173 | print(f"Running Params: {window_size}, {silence_threshold}") 174 | scores.append(self.target_function(data, window_size, silence_threshold)) 175 | return np.array(scores) 176 | 177 | # sample parameters for bayesian optimization 178 | def sample_params(self, count): 179 | window_size_mean = 300 180 | window_size_std = 100 181 | window_min = 25 182 | silence_threshold_mean = 0.05 183 | silence_threshold_std = 0.04 184 | silence_min = 0.005 185 | silence_max = 0.2 186 | 187 | window_size = np.random.normal(window_size_mean, window_size_std, count) 188 | window_size[window_size < window_min] = window_min 189 | window_size = window_size.astype(int) 190 | silence_threshold = np.random.normal( 191 | silence_threshold_mean, silence_threshold_std, count 192 | ) 193 | silence_threshold[silence_threshold < 
silence_min] = silence_min 194 | silence_threshold[silence_threshold > silence_max] = silence_max 195 | 196 | return np.concatenate( 197 | (window_size.reshape(-1, 1), silence_threshold.reshape(-1, 1)), axis=1 198 | ) 199 | 200 | # run bayesian optimization 201 | def hyperparam_search(self, n_iter, sample_size, init_ex_count, gp_ex_count): 202 | search_data = ( 203 | self.data[self.data["type"].isin(["train", "dev"])] 204 | .sample(sample_size) 205 | .copy() 206 | .reset_index() 207 | ) 208 | search_data.to_csv("/home/ubuntu/search_data.csv", index=False) 209 | 210 | bayes_search = BayesSearch( 211 | search_data, 212 | self.run_target_function, 213 | self.sample_params, 214 | n_iter=n_iter, 215 | init_ex_count=init_ex_count, 216 | gp_ex_count=gp_ex_count, 217 | ) 218 | params, acc = bayes_search.optimize() 219 | return params, acc 220 | 221 | # run bayesian optimization and save down the best params 222 | def hyperparam_search_and_features( 223 | self, 224 | output_dir, 225 | output_name, 226 | n_iter=25, 227 | sample_size=300, 228 | init_ex_count=20, 229 | gp_ex_count=1000, 230 | ): 231 | params, _ = self.hyperparam_search( 232 | n_iter=n_iter, 233 | sample_size=sample_size, 234 | init_ex_count=init_ex_count, 235 | gp_ex_count=gp_ex_count, 236 | ) 237 | window_size, silence_threshold = params[0], params[1] 238 | 239 | # save down the best params in a json file 240 | if os.path.exists(os.path.join(output_dir, "params.json")): 241 | with open(os.path.join(output_dir, "params.json")) as file: 242 | params = json.load(file) 243 | else: 244 | params = {} 245 | 246 | if "." in output_name: 247 | output_name = os.splitext(output_name)[0] 248 | 249 | params[output_name] = { 250 | "window_size": window_size, 251 | "silence_threshold": silence_threshold, 252 | } 253 | 254 | with open(os.path.join(output_dir, "params.json"), "w") as file: 255 | json.dump(params, file) 256 | 257 | fake_data = self.data[self.data["label"] == 1].copy() 258 | features = self.generate_features( 259 | window_size, silence_threshold, fake_data["path"] 260 | ) 261 | full_df = pd.concat((self.data, features), axis=1) 262 | full_df.to_csv(os.path.join(output_dir, f"{output_name}.csv"), index=False) 263 | 264 | 265 | def save_features(metadata_path, params_json_path): 266 | pass 267 | -------------------------------------------------------------------------------- /src/packages/ExperimentPipeline.py: -------------------------------------------------------------------------------- 1 | # global packages 2 | import sys 3 | import os 4 | import nemo.collections.asr as nemo_asr 5 | import pandas as pd 6 | import mlflow 7 | import copy 8 | 9 | # local packages 10 | sys.path.append("/home/ubuntu/ClonedVoiceDetection/src") 11 | from packages.LJDataLoader import LJDataLoader 12 | from packages.AudioEmbeddingsManager import AudioEmbeddingsManager 13 | from packages.ModelManager import ModelManager 14 | from packages.CadenceModelManager import CadenceModelManager 15 | from packages.SmileFeatureManager import SmileFeatureManager 16 | 17 | 18 | class ExperimentPipeline: 19 | ################################################################################# 20 | ################################# Initialization ################################ 21 | ################################################################################# 22 | 23 | def __init__(self, fake_cols, metadata_path, data_df=None) -> None: 24 | # intialize the class and generate the data for experiment pipeline if data is not provided 25 | self.fake_cols = fake_cols 26 | 
self.metadata_path = metadata_path 27 | if data_df is None: 28 | self.data_df = self._generate_split(self.fake_cols, self.metadata_path) 29 | else: 30 | # for multivoice experiments, data_df is generated separately and needs to be provided to the class 31 | self.data_df = data_df 32 | 33 | # initialize feature store 34 | self.feature_store = {} 35 | 36 | def _generate_split(self, fake_cols, metadata_path): 37 | # filter data used in training of elevenlabs and initialize the data loader 38 | loader = LJDataLoader( 39 | data_path=self.metadata_path, filter_cols=["ElevenLabsCloneClip"] 40 | ) 41 | 42 | # train-dev-test split 43 | loader.splitData() 44 | 45 | # aggregate wavefake architectures into one column by randomly selecting from architectures 46 | source_architectures = [ 47 | "Full_Band_MelGan", 48 | "HifiGan", 49 | "MelGan", 50 | "MelGanLarge", 51 | "Multi_Band_MelGan", 52 | "Parallel_WaveGan", 53 | "Waveglow", 54 | ] 55 | new_col_name = "RandWaveFake" 56 | loader.selectRandomArchitecture( 57 | target_col=new_col_name, source_cols=source_architectures 58 | ) 59 | 60 | # combine elevenlabs and uberduck into one column for binary classification 61 | source_architectures = ["ElevenLabs", "UberDuck"] 62 | new_col_name = "EL_UD_Fake" 63 | loader.selectRandomArchitecture( 64 | target_col=new_col_name, source_cols=source_architectures 65 | ) 66 | 67 | # combine randwavefake, elevenlabs, and uberduck into one column for binary classification 68 | source_architectures = ["RandWaveFake", "ElevenLabs", "UberDuck"] 69 | new_col_name = "Fake" 70 | loader.selectRandomArchitecture( 71 | target_col=new_col_name, source_cols=source_architectures 72 | ) 73 | 74 | # generate final dataframe 75 | data_df = loader.generateFinalDataFrame(real_col="Real", fake_cols=fake_cols) 76 | 77 | return data_df 78 | 79 | ################################################################################# 80 | ################################# Feature Generation ############################ 81 | ################################################################################# 82 | 83 | def generate_features(self, feature_method="all", open_smile_feature_count=10): 84 | #### titanet features #### 85 | if feature_method == "titanet": 86 | self.feature_store["titanet"] = self._generate_titanet_features() 87 | 88 | #### openSmile features #### 89 | if feature_method == "openSmile_binary": 90 | self.feature_store["openSmile_binary"] = self._generate_openSmile_features( 91 | feature_selector_type="random_forest", 92 | label_type="binary", 93 | feature_count=open_smile_feature_count, 94 | ) 95 | 96 | if feature_method == "openSmile_multiclass": 97 | self.feature_store[ 98 | "openSmile_multiclass" 99 | ] = self._generate_openSmile_features( 100 | feature_selector_type="random_forest", 101 | label_type="multiclass", 102 | feature_count=open_smile_feature_count, 103 | ) 104 | #### cadence features ####ß 105 | if feature_method == "cadence": 106 | self.feature_store["cadence"] = self._generate_cadence_features() 107 | 108 | #### all features ####ß 109 | if feature_method == "all": 110 | self.feature_store["titanet"] = self._generate_titanet_features() 111 | self.feature_store["openSmile_binary"] = self._generate_openSmile_features( 112 | feature_selector_type="random_forest", 113 | label_type="binary", 114 | feature_count=open_smile_feature_count, 115 | ) 116 | self.feature_store[ 117 | "openSmile_multiclass" 118 | ] = self._generate_openSmile_features( 119 | feature_selector_type="random_forest", 120 | label_type="multiclass", 121 
| feature_count=open_smile_feature_count, 122 | ) 123 | self.feature_store["cadence"] = self._generate_cadence_features() 124 | 125 | #### private methods for feature generation #### 126 | 127 | def _generate_titanet_features(self): 128 | speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained( 129 | model_name="titanet_large" 130 | ) 131 | embedding_manager = AudioEmbeddingsManager( 132 | model=speaker_model, data=self.data_df 133 | ) 134 | 135 | return embedding_manager.generateFeatureDf() 136 | 137 | def _generate_openSmile_features( 138 | self, feature_selector_type, label_type, feature_count 139 | ): 140 | smile_manager = SmileFeatureManager(self.data_df) 141 | 142 | if label_type == "binary": 143 | return smile_manager.generateFeatureDf( 144 | feature_selector_type, label_type, feature_count 145 | ) 146 | 147 | if label_type == "multiclass": 148 | return smile_manager.generateFeatureDf( 149 | feature_selector_type, label_type, feature_count 150 | ) 151 | 152 | def _generate_cadence_features(self): 153 | cadence_manager = CadenceModelManager(self.data_df) 154 | ( 155 | cad_feature_df, 156 | cad_feature_cols, 157 | scalar, 158 | ) = cadence_manager.run_cadence_feature_extraction_pipeline(fill_na=-1) 159 | 160 | return cad_feature_df, cad_feature_cols 161 | 162 | ################################################################################# 163 | ################################# Train Predict ################################# 164 | ################################################################################# 165 | 166 | def train_predict_using_models( 167 | self, 168 | models=["logreg", "random_forest"], 169 | run_tags=None, 170 | run_name_prefix=None, 171 | create_df_artifact=False, 172 | label_type="label", 173 | ): 174 | # run train eval loop 175 | for model_type in models: 176 | for feature_method, feature_data in self.feature_store.items(): 177 | # condition to skip certain feature methods for certain label types 178 | if ( 179 | label_type == "multiclass_label" 180 | and feature_method == "openSmile_binary" 181 | ): 182 | continue 183 | if label_type == "label" and feature_method == "openSmile_multiclass": 184 | continue 185 | 186 | # generate mlflow run details 187 | run_tags, run_name = self._generate_mlflow_run_details( 188 | run_tags, run_name_prefix, model_type, label_type, feature_method 189 | ) 190 | 191 | # start mlflow run 192 | with mlflow.start_run(tags=run_tags, run_name=run_name) as run: 193 | # instantiate model and perform trainPredict 194 | model = ModelManager( 195 | model_name=model_type, 196 | data=feature_data[0], 197 | feature_cols=feature_data[1], 198 | merge_train_dev=True, 199 | ) 200 | 201 | model.trainPredict(label_col=label_type) 202 | 203 | # mlflow logging 204 | self._log_mlflow_run_details(run, model, create_df_artifact) 205 | 206 | # end mlflow run 207 | mlflow.end_run() 208 | 209 | print( 210 | "Finished run: " 211 | + run.info.run_name 212 | + "with feature method: " 213 | + feature_method 214 | ) 215 | 216 | #### private methods for train predict #### 217 | 218 | def _generate_mlflow_run_details( 219 | self, run_tags, run_name_prefix, model_type, label_type, feature_method 220 | ): 221 | # tag details 222 | _run_tags = copy.deepcopy(run_tags) 223 | _run_tags.update( 224 | { 225 | "feature_method": feature_method, 226 | "label_type": label_type, 227 | "selected_architectures": self.fake_cols, 228 | } 229 | ) 230 | 231 | # run name 232 | if ( 233 | (run_tags is not None) 234 | and ("laundered" in run_tags.keys()) 235 | and 
(run_tags["laundered"] == 1) 236 | ): 237 | _run_name = ( 238 | run_name_prefix 239 | + "_" 240 | + feature_method 241 | + "_" 242 | + label_type 243 | + "_" 244 | + model_type 245 | + "_laundered" 246 | ) 247 | else: 248 | _run_name = ( 249 | run_name_prefix 250 | + "_" 251 | + feature_method 252 | + "_" 253 | + label_type 254 | + "_" 255 | + model_type 256 | ) 257 | 258 | return _run_tags, _run_name 259 | 260 | def _log_mlflow_run_details(self, run, model, create_df_artifact) -> None: 261 | ##### update tags ##### 262 | mlflow.set_tag("estimator_name", type(model.model).__name__) 263 | 264 | ##### 1) mlflow log model with schema i.e. signature ##### 265 | signature = mlflow.models.signature.infer_signature( 266 | model.X_train, model.y_train 267 | ) 268 | mlflow.sklearn.log_model( 269 | model.model, "model_" + run.info.run_name, signature=signature 270 | ) 271 | 272 | ##### 2) mlflow log model params ##### 273 | mlflow.log_params(model.model.get_params()) 274 | 275 | ##### 3) mlflow log model artifacts ##### 276 | ## train_dev_test data 277 | if create_df_artifact: 278 | data_path = "/home/ubuntu/data/temp/data.csv" 279 | model.data.to_csv(data_path, index=False) 280 | mlflow.log_artifact(data_path) 281 | os.remove(data_path) 282 | 283 | ##### 4) mlflow log model metrics ##### 284 | # save class accuracies independently 285 | for key, value in model.class_accuracy.items(): 286 | mlflow.log_metric(str(key) + "_accuracy", value) 287 | 288 | # save aggregate accuracy 289 | if len(self.fake_cols) > 1: 290 | agg_accuracy = 0 291 | for key, value in model.class_accuracy.items(): 292 | if key in self.fake_cols: 293 | agg_accuracy += value 294 | # compute average accuracy for fake classes 295 | agg_accuracy = agg_accuracy / len(self.fake_cols) 296 | mlflow.log_metric("fake_accuracy", agg_accuracy) 297 | 298 | # save aggregate accuracy 299 | mlflow.log_metric("accuracy", model.accuracy) 300 | 301 | # save log loss 302 | mlflow.log_metric("log_loss", model.log_loss_value) 303 | 304 | # save eer score 305 | if model.eer_score is not None: 306 | mlflow.log_metric("eer_score", model.eer_score) 307 | 308 | # save eer threshold 309 | if model.eer_threshold is not None: 310 | mlflow.log_metric("eer_threshold", model.eer_threshold) 311 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single- and Multi-Speaker Cloned Voice Detection: From Perceptual to Learned Features 2 | 3 | 4 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/LICENSE) 5 | [![Python 3.8.0](https://img.shields.io/badge/python-3.8.0-blue.svg)](https://www.python.org/downloads/release/python-380/) 6 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 7 | 8 | This is the repository for the paper titled [Single and Multi Speaker Cloned Voice Detection: From Perceptual to Learned Features](https://arxiv.org/abs/2307.07683) submitted to the 2023 IEEE International Workshop on Information Forensics and Security (WIFS 2023). 9 | 10 | The provided source code includes implementations of both the single-speaker and multi-speaker pipelines. However, please note that the dataset used in the experiments is not included in this repository. 
To replicate the experiments, you would need to create an analogous experimental dataset with cloned voices using different voice cloning architectures or providers. 11 | 12 | The repository does provide code for data generation and adversarial laundering, specifically tailored for an example provider called ElevenLabs. You would need to generate features from the analogous dataset and save them to disk, and to modify the relevant data handling code so that the pipeline is compatible with your new dataset. 13 | 14 | Please refer to the repository and the paper for more detailed instructions on how to use the code and conduct the experiments; a minimal usage sketch is also provided at the end of this README. 15 | 16 | # Folder Structure 17 | 18 | The repository is structured as follows: 19 | 20 | | Folder | File | Description | 21 | |-----------|------------|---------------------------------------------------| 22 | |__Experiment Pipeline__| 23 | | `/src/` |[`run_pipeline_ljspeech.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/run_pipeline_ljspeech.py)| Runs the pipeline for single-voice (LJSpeech) experiments| 24 | | `/src/` |[`run_pipeline_multivoice.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/run_pipeline_multivoice.py)| Runs the pipeline for multivoice experiments| 25 | | `/src/packages/` | [`ExperimentPipeline.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ExperimentPipeline.py) | Class for running the experiment pipeline and logging results| 26 | | `/src/packages/` | [`ModelManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ModelManager.py) |Class for managing the final classification models | 27 | |__Feature Generation__| 28 | | `/src/packages/` | [`AudioEmbeddingsManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/AudioEmbeddingsManager.py) | Class for managing learned features generated using [NVIDIA TitaNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speaker_recognition/models.html)| 29 | | `/src/packages/` | [`SmileFeatureManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureManager.py) | Class for managing spectral features generated using [openSMILE](https://audeering.github.io/opensmile-python/usage.html)| 30 | | `/src/packages/` | [`SmileFeatureGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureGenerator.py) | Class for generating spectral features and saving to disk for collections of audio files| 31 | | `/src/packages/` | [`SmileFeatureSelector.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SmileFeatureSelector.py) | Class for selecting spectral features using `sklearn.feature_selection` | 32 | | `/src/packages/` | [`CadenceModelManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/CadenceModelManager.py) | Class for managing perceptual features generated using handcrafted techniques| 33 | | `/src/packages/` | [`CadenceUtils.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/CadenceUtils.py) | Utility functions used by `CadenceModelManager` for generating features | 34 | | `/src/packages/` | [`BayesSearch.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/BayesSearch.py) | A class that implements Bayesian Hyperparameter Optimization for the perceptual model | 35 | 
`/src/packages/` | [`SavedFeatureLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/SavedFeatureLoader.py) | Helper function for loading generated features saved to disk during experiments| 36 | |__Data Loaders__| 37 | | `/src/packages/` | [`LJDataLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/LJDataLoader.py) | Class for loading and handling the LJSpeech data for experiments| 38 | | `/src/packages/` | [`TIMITDataLoader.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/TIMITDataLoader.py) | Class for loading and handling the TIMIT data for multi-voice experiments| 39 | |__Data Generation__| 40 | | `/src/packages/` | [`BaseDeepFakeGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/BaseDeepFakeGenerator.py) | Base class for processing the data used for voice cloning | 41 | | `/src/packages/` | [`ElevenLabsDeepFakeGenerator.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/ElevenLabsDeepFakeGenerator.py)| Class used to generate deepfakes using the ElevenLabs API | 42 | | `/src/packages/` | [`AudioManager.py`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/src/packages/AudioManager.py) | Class for resampling audio files and performing adversarial laundering | 43 | |__Misc__| 44 | | `.` | [`README.md`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/README.md) | Provides an overview of the project| 45 | | `.` | [`conda_requirements.txt`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/conda_requirements.txt) | Dependencies for creating the `conda` environment| 46 | | `.` | [`pip_requirements.txt`](https://github.com/audio-df-ucb/ClonedVoiceDetection/blob/main/pip_requirements.txt) | Dependencies installed with `pip`| 47 | 48 | # Data 49 | 50 | An overview of the real and synthetic datasets used in our single-speaker (top) and multi-speaker (bottom) evaluations. The 91,700 WaveFake samples correspond to 13,100 samples for each of seven different vocoder architectures, hence the larger number of clips and longer total duration. 51 | 52 | ### Single-speaker 53 | 54 | | **Type** | **Name** | **Clips (#)** | **Duration (sec)** | 55 | |:--------:|:--------:|:-------------:|:------------------:| 56 | | Real | LJSpeech | 13,100 | 86,117 | 57 | | Synthetic | WaveFake | 91,700 | 603,081 | 58 | | Synthetic | ElevenLabs | 13,077 | 78,441 | 59 | | Synthetic | Uberduck | 13,094 | 83,322 | 60 | 61 | ### Multi-speaker 62 | 63 | | **Type** | **Name** | **Clips (#)** | **Duration (sec)** | 64 | |:--------:|:--------:|:-------------:|:------------------:| 65 | | Real | TIMIT | 4,620 | 14,192 | 66 | | Synthetic | ElevenLabs | 5,499 | 15,413 | 67 | 68 | ### Publicly Available Data 69 | 70 | 1. The LJ Speech 1.1 Dataset -- [Data](https://keithito.com/LJ-Speech-Dataset/) 71 | 2. WaveFake: A Data Set to Facilitate Audio Deepfake Detection -- [Paper](https://arxiv.org/abs/2111.02813), [Data](https://zenodo.org/record/5642694) 72 | 3. TIMIT Acoustic-Phonetic Continuous Speech Corpus -- [Data](https://catalog.ldc.upenn.edu/LDC93S1) 73 | 74 | ### Commercial Voice Cloning Tools 75 | 76 | 1. ElevenLabs (EL) -- https://beta.elevenlabs.io/ 77 | 2. 
UberDuck (UD) -- https://app.uberduck.ai/ 78 | 79 | # Results 80 | 81 | ### Single-speaker 82 | 83 | Accuracies for a personalized, single-speaker classification of unlaundered audio (top) and audio subject to adversarial laundering in the form of additive noise and transcoding (bottom). Dataset corresponds to ElevenLabs (EL), UberDuck (UD), and WaveFake (WF); Model corresponds to a linear (L) or non-linear (NL) classifier, configured either as a single-classifier (real vs. synthetic) or as a multi-classifier (real vs. specific synthesis architecture). Accuracy (%) is reported separately for synthetic and real audio, and the equal error rate (EER) is also reported for the single-classifiers (a brief sketch of how the EER can be computed appears after the table). 84 | 85 | 86 | | | | Synthetic Accuracy (%) | | | Real Accuracy (%) | | | EER (%) | | | 87 | |----------|--------|:----------------------:|-----|-----|:-----------------:|-----|-----|:-------:|-----|-----| 88 | | **Dataset** | **Model** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | 89 | |__Unlaundered__| 90 | |Binary| 91 | | EL | single (L) | 100.0 | 99.2 | 78.2 | 100.0 | 99.9 | 72.5 | 0.0 | 0.5 | 24.9 | 92 | | | single (NL) | 100.0 | 99.9 | 82.2 | 100.0 | 100.0 | 80.4 | 0.0 | 0.1 | 18.6 | 93 | | UD | single (L) | 99.8 | 98.9 | 51.9 | 99.9 | 98.9 | 54.0 | 0.1 | 1.1 | 47.2 | 94 | | | single (NL) | 99.7 | 99.2 | 54.4 | 99.9 | 99.0 | 56.5 | 0.2 | 0.9 | 44.5 | 95 | | WF | single (L) | 96.5 | 78.4 | 57.8 | 97.1 | 82.3 | 45.6 | 3.3 | 19.7 | 48.5 | 96 | | | single (NL) | 94.5 | 87.6 | 50.3 | 96.7 | 90.2 | 52.7 | 4.4 | 11.2 | 48.6 | 97 | | EL+UD | single (L) | 99.7 | 94.8 | 63.4 | 99.9 | 97.1 | 60.3 | 0.2 | 4.2 | 37.9 | 98 | | | single (NL) | 99.7 | 99.2 | 57.3 | 99.9 | 99.6 | 69.0 | 0.2 | 0.8 | 37.6 | 99 | | EL+UD+WF | single (L) | 93.2 | 79.7 | 58.4 | 98.7 | 93.0 | 57.6 | 3.6 | 15.9 | 42.1 | 100 | | | single (NL) | 91.2 | 90.6 | 53.1 | 99.0 | 94.1 | 64.7 | 4.1 | 7.9 | 41.6 | 101 | |Multiclass| 102 | | EL+UD | multi (L) | 99.9 | 96.6 | 61.0 | 100.0 | 94.6 | 35.7 | - | - | - | 103 | | | multi (NL) | 99.7 | 98.3 | 65.6 | 100.0 | 97.2 | 43.2 | - | - | - | 104 | | EL+UD+WF | multi (L) | 98.8 | 80.2 | 45.1 | 97.3 | 64.3 | 22.9 | - | - | - | 105 | | | multi (NL) | 98.1 | 94.2 | 48.6 | 96.3 | 84.4 | 27.6 | - | - | - | 106 | |__Laundered__| 107 | |Binary| 108 | | EL | single (L) | 95.5 | 94.3 | 61.1 | 94.5 | 92.6 | 65.2 | 4.9 | 6.7 | 36.6 | 109 | | | single (NL) | 96.0 | 96.2 | 70.4 | 95.4 | 95.6 | 69.6 | 4.1 | 4.1 | 30.1 | 110 | | UD | single (L) | 95.4 | 81.1 | 61.4 | 91.8 | 84.3 | 44.7 | 6.3 | 17.3 | 46.7 | 111 | | | single (NL) | 95.4 | 86.8 | 52.9 | 93.3 | 86.1 | 55.9 | 5.5 | 13.6 | 45.6 | 112 | | WF | single (L) | 87.6 | 60.7 | 59.6 | 85.0 | 70.4 | 42.5 | 13.9 | 34.4 | 49.4 | 113 | | | single (NL) | 83.6 | 77.1 | 51.4 | 85.6 | 76.7 | 53.9 | 15.3 | 23.1 | 47.3 | 114 | | EL+UD | single (L) | 95.2 | 79.1 | 54.0 | 91.7 | 78.4 | 59.8 | 6.2 | 21.3 | 43.1 | 115 | | | single (NL) | 94.8 | 86.1 | 55.2 | 93.3 | 90.0 | 62.4 | 6.0 | 12.0 | 41.4 | 116 | | EL+UD+WF | single (L) | 83.7 | 70.9 | 50.6 | 88.6 | 72.9 | 59.7 | 13.2 | 28.2 | 44.8 | 117 | | | single (NL) | 83.4 | 79.2 | 53.0 | 90.7 | 85.1 | 60.7 | 12.5 | 17.9 | 43.6 | 118 | |Multiclass| 119 | | EL+UD | multi (L) | 94.2 | 85.6 | 50.9 | 91.0 | 77.1 | 29.1 | - | - | - | 120 | | | multi (NL) | 94.5 | 91.7 | 53.2 | 90.3 | 82.9 | 41.3 | - | - | - | 121 | | EL+UD+WF | multi (L) | 89.8 | 65.4 | 35.3 | 83.1 | 44.3 | 26.2 | - | - | - | 122 | | | multi (NL) | 88.8 | 78.8 | 39.8 | 82.1 | 63.0 | 28.6 | - | - | - | 
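The equal error rate (EER) reported for the single-classifiers is the operating point at which the two error rates (synthetic audio classified as real, and real audio classified as synthetic) are equal. For reference only, below is a minimal sketch of how an EER and its threshold can be computed from binary classifier scores using `numpy` and `scikit-learn`; it is an illustrative example with made-up labels and scores, not necessarily the exact implementation used in `ModelManager.py`.

```python
import numpy as np
from sklearn.metrics import roc_curve


def compute_eer(y_true, scores):
    """Estimate the equal error rate (EER) and its decision threshold.

    y_true: 1 for synthetic audio (positive class), 0 for real audio.
    scores: classifier scores for the synthetic class.
    """
    fpr, tpr, thresholds = roc_curve(y_true, scores, pos_label=1)
    fnr = 1.0 - tpr  # false negative rate at each threshold
    idx = np.nanargmin(np.abs(fnr - fpr))  # threshold where FPR and FNR cross
    eer = (fpr[idx] + fnr[idx]) / 2.0
    return eer, thresholds[idx]


# toy usage with fabricated scores (for illustration only)
labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
scores = np.array([0.05, 0.20, 0.45, 0.60, 0.40, 0.70, 0.85, 0.95])
eer, threshold = compute_eer(labels, scores)
print(f"EER: {eer:.3f} at threshold {threshold:.3f}")
```

Averaging the false-positive and false-negative rates at their crossing point is a common convention when the ROC curve is only sampled at discrete thresholds; an EER of 0% in the tables above corresponds to a threshold that separates real and synthetic audio perfectly.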
123 | 124 | ### Multi-speaker 125 | 126 | Accuracies for a non-personalized, multi-speaker classification of unlaundered audio. Dataset corresponds to ElevenLabs (EL); Model corresponds to a linear (L) or non-linear (NL) classifier, configured either as a single-classifier (real vs. synthetic) or as a multi-classifier (real vs. specific synthesis architecture). Accuracy (%) is reported separately for synthetic and real audio, and the equal error rate (EER) is also reported for the single-classifiers. 127 | 128 | 129 | | | | Synthetic Accuracy (%) | | | Real Accuracy (%) | | | EER (%) | | | 130 | |----------|--------|:----------------------:|-----|-----|:-----------------:|-----|-----|:-------:|-----|-----| 131 | | **Dataset** | **Model** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | **Learned** | **Spectral** | **Perceptual** | 132 | | EL | single (L) | 100.0 | 94.2 | 83.8 | 99.9 | 98.3 | 86.9 | 0.0 | 3.0 | 1.3 | 133 | | | single (NL) | 92.3 | 96.3 | 82.2 | 100.0 | 99.7 | 87.7 | 0.1 | 1.6 | 1.4 | 134 | 135 | 136 | # Research Group 137 | 138 | * Sarah Barrington1 -- 139 | * Romit Barua1 -- 140 | * Gautham Koorma1 -- 141 | * Hany Farid1,2 -- 142 | 143 | School of Information1 and Electrical Engineering and Computer Sciences2 at the University of California, Berkeley 144 | 145 | This work was partially funded by a [grant from the UC Berkeley Center for Long-Term Cybersecurity (CLTC)](https://cltc.berkeley.edu/publication/digital-fingerprinting-to-protect-against-deepfakes/), an [award for open-source innovation from the Digital Public Goods Alliance and United Nations Development Program](https://digitalpublicgoods.net/information-pollution/), and an unrestricted gift from Meta. 146 | 147 | # Citation 148 | 149 | Please cite the following paper if you use this code: 150 | 151 | ``` 152 | @misc{barrington2023single, 153 | title={Single and Multi-Speaker Cloned Voice Detection: From Perceptual to Learned Features}, 154 | author={Sarah Barrington and Romit Barua and Gautham Koorma and Hany Farid}, 155 | year={2023}, 156 | eprint={2307.07683}, 157 | archivePrefix={arXiv}, 158 | primaryClass={cs.SD} 159 | } 160 | ``` 161 | --------------------------------------------------------------------------------