├── .gitattributes
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── BenchmarkDataset.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── BenchmarkDatasetCreator
│   ├── __init__.py
│   ├── dataset.py
│   ├── folders.py
│   └── metadata.py
├── BenchmarkDatasetCreator_app
│   ├── Home.py
│   ├── help_dictionary.py
│   └── pages
│       ├── 1_Project_creator.py
│       ├── 2_Metadata_creator.py
│       └── 3_Dataset_creator.py
├── LICENSE.txt
├── README.md
├── SelectionTable
│   ├── MD02_truth_selections.txt
│   └── MD03_truth_selections.txt
├── docs
│   ├── DependenciesMapping.txt
│   ├── HowToInstall
│   │   └── HowToInstall_Mac.txt
│   ├── bioacoustics_species_list.txt
│   └── illustrations
│       ├── method_schematic.png
│       ├── method_schematicV3.jpeg
│       ├── method_schematicV2.png
│       └── method_schematicV2_zoom.png
├── examples
│   ├── CreateBenchmarkDataset.ipynb
│   └── CreateBenchmarkDataset.py
└── requirements.txt

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


# macOS system file
*.DS_Store
.idea

# Keynote docs
*.key
*.mp4


# Python requirements file

# Data folder
benchmark_data/
lea_help/

# Temporary files
*.tmp
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.idea/BenchmarkDataset.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/BenchmarkDatasetCreator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leabouffaut/BenchmarkDatasetCreator/bdb3f1f46056d9f8fe21e948d330a22506638d03/BenchmarkDatasetCreator/__init__.py
--------------------------------------------------------------------------------
/BenchmarkDatasetCreator/dataset.py:
--------------------------------------------------------------------------------
# Benchmark Dataset Creator functions
#
# Léa Bouffaut, Ph.D. -- K. Lisa Yang Center for Conservation Bioacoustics, Cornell University
# lea.bouffaut@cornell.edu

import os
import sys
import shutil

import numpy as np
# from scipy import signal
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm


# ---------------------------
# User interaction functions
def query_yes_no(question, default="yes"):  # TODO moved to create_folders_functions -- delete
    """
    Ask a yes/no question via input() and return the answer.

    Inputs:
        - question: A string that is presented to the user.
        - default: The presumed answer if the user just hits Enter. It must be "yes" (the default),
          "no", or None (meaning an answer is required from the user).

    Return value:
        - True for "yes" or False for "no".
    """
    # Dictionary mapping valid yes/no responses to boolean values
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # Set the prompt based on the default answer
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        # Raise a ValueError for an invalid default answer
        raise ValueError("invalid default answer: '%s'" % default)

    # Loop until a valid response is provided
    while True:
        # Display the question and prompt the user for a response
        sys.stdout.write(question + prompt)
        choice = input().lower()
        # If the default answer is provided and the response is empty, return the corresponding boolean value
        if default is not None and choice == "":
            return valid[default]
        # If the user's choice is in the valid responses, return the corresponding boolean value
        elif choice in valid:
            return valid[choice]
        else:
            # If the response is invalid, prompt the user to respond with 'yes' or 'no'
            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
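
# A minimal usage sketch (illustrative): in a terminal session, the prompt and a
# possible interaction would look like this:
#
#     if query_yes_no("Delete data?", default="yes"):
#         print("Deleting...")
#
#     Delete data? [Y/n] n
#     (returns False, so nothing is deleted)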

def path_print(start_path):  # TODO moved to create_folders_functions -- delete
    """
    Prints the contents of the folder designated by start_path.
    """
    # Iterate through the directory tree starting from 'start_path'
    for root, dirs, files in os.walk(start_path):

        # Determine the depth of the current directory relative to 'start_path'
        level = root.replace(start_path, '').count(os.sep)

        # Calculate the indentation for displaying the directory structure
        indent = ' ' * 4 * level

        # Print the name of the current directory
        print('{}{}/'.format(indent, os.path.basename(root)))

        # Calculate the indentation for displaying files within the directory
        sub_indent = ' ' * 4 * (level + 1)

        # Iterate through the files in the current directory
        for f in files:
            # Print the name of each file within the directory
            print('{}{}'.format(sub_indent, f))


def check_export_settings(export_settings):
    """
    Checks the completeness of the export settings provided by the user.

    Inputs:
        - export_settings: A dictionary that should contain the audio export settings:
          'Project ID', 'Deployment ID', and the nested entries 'Digital sampling'
          ('Audio duration (s)', 'fs (Hz)', 'Bit depth'), 'Selections' ('Export label',
          'Split export selections'), and 'Export folders' ('Export folder').

    Raises:
        - ValueError: If any required field in wanted_fields_dict is missing from the
          export_settings dictionary.
    """
    wanted_fields_dict = {
        'Project ID': None,
        'Deployment ID': None,
        'Digital sampling': {
            'Audio duration (s)': None,
            'fs (Hz)': None,
            'Bit depth': None
        },
        'Selections': {
            'Export label': None,
            'Split export selections': None
        },
        'Export folders': {
            'Export folder': None
        }
    }

    missing = []

    # Go through the wanted fields
    for field, value in wanted_fields_dict.items():
        if field not in export_settings:
            missing.append(field)
        elif isinstance(value, dict):  # If the value is a dictionary, check its subfields
            for subfield in value:
                if subfield not in export_settings[field]:
                    missing.append(f"{field} -> {subfield}")

    if missing:
        raise ValueError(f"Error: Missing field(s) in export_settings: {missing}")
    else:
        print("All required fields are filled")
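
# A sketch of a complete export_settings dictionary (keys match the checks above;
# values are purely illustrative):
#
#     export_settings = {
#         'Project ID': 'SBNMS',
#         'Deployment ID': '01',
#         'Digital sampling': {'Audio duration (s)': 300, 'fs (Hz)': 2000, 'Bit depth': 24},
#         'Selections': {'Export label': 'Tag', 'Split export selections': [True, 1.0]},
#         'Export folders': {'Export folder': 'benchmark_data'},
#     }
#     check_export_settings(export_settings)  # prints "All required fields are filled"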

def check_selection_tab(selection_table_path):
    """
    Checks the validity of a selection table path.

    Inputs:
        - selection_table_path: A string representing the path to a selection table file or folder.

    Raises:
        - ValueError: If selection_table_path is not a valid path to an existing folder or file.
    """
    # Test if selection_table_path is a file
    if os.path.isfile(selection_table_path):
        print("selection_table_path is a File")

    # Test if selection_table_path is a folder and count the .txt files in that folder
    elif os.path.isdir(selection_table_path):
        filelist = [file for file in os.listdir(selection_table_path) if file.endswith(".txt")]

        print(f"selection_table_path is a Folder with {len(filelist)} .txt Files")

    # Otherwise, raise an error for an invalid selection_table_path
    else:
        raise ValueError("Please provide a valid path to an existing folder or file.")


def get_bitdepth(bit_depth):
    """
    Get the soundfile subtype matching the user-input bit depth. Only FLAC files are supported.

    Inputs:
        - bit_depth: 'Bit depth' integer.

    Outputs:
        - bit_depth: The corresponding soundfile bit-depth subtype for the export settings.
    """
    authorized_user_bit_depth = [8, 16, 24]
    sf_flac_bit_depth = ['PCM_S8', 'PCM_16', 'PCM_24']  # This is only valid for FLAC files;
    # run sf.available_subtypes('WAV') to get the bit-depth
    # formats supported for wav files

    bit_depth = sf_flac_bit_depth[authorized_user_bit_depth.index(bit_depth)]
    return bit_depth


def get_print_fs(fs_original):
    """
    Take note of the sampling frequency for the file naming system.

    Input:
        - fs_original: Original sampling frequency (Hz).

    Output:
        - fs_original_print: Sampling frequency formatted for printing, e.g. '2kHz' or '500Hz'.
    """
    if fs_original >= 1000:
        fs_original_print = str(int(np.floor(fs_original / 1000))) + 'kHz'
    else:
        fs_original_print = str(int(fs_original)) + 'Hz'

    return fs_original_print


# ------------------------
# Data checking functions
def check_bitdepth(bit_depth):
    """
    Checks that the user-input bit depth is a supported value, based on the formats supported
    by soundfile.write for FLAC files.

    Inputs:
        - bit_depth: 'Bit depth' integer.

    Raises:
        - ValueError: If the specified bit depth is not supported.
    """

    # List of authorized bit-depth user inputs and corresponding soundfile FLAC bit depths
    authorized_user_bit_depth = [8, 16, 24]
    sf_flac_bit_depth = ['PCM_S8', 'PCM_16', 'PCM_24']

    # Test if the specified bit depth is supported
    if bit_depth not in authorized_user_bit_depth:
        # Raise an error message if the specified bit depth is not supported
        raise ValueError(
            f"Error: Non-supported bit depth, please select one of the following values:\n ...{authorized_user_bit_depth}")


def check_selection_table(df):
    """
    Checks whether all of the required fields are present in the selection table.

    Inputs:
        - df: The DataFrame of the selection table.

    Output:
        - Printed text confirming that the required fields are present.

    Raises:
        - ValueError: If any required field is missing.
    """

    # List of all required fields in the selection table
    wanted_fields = ['Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)', 'File Offset (s)',
                     'Begin Path']

    # Check if each required field is present in the selection table, and if not, add it to the 'missing' list
    missing = []
    for item in wanted_fields:
        if item not in df.columns:
            missing.append(item)

    # Raise an error if any required fields are missing
    if missing:
        raise ValueError(f'Error: The following field(s) is missing from the selection table: {", ".join(missing)}')
    else:
        print('All required fields are in the selection table')


def check_selection_table_folder(df):
    """
    Checks whether all of the required fields are present in a selection table, returning the
    missing fields instead of raising (used when iterating over a folder of tables).

    Inputs:
        - df: The DataFrame of the selection table.

    Returns:
        - missing: List of required fields absent from the selection table (empty if complete).
    """

    # List of all required fields in the selection table
    wanted_fields = ['Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)', 'File Offset (s)',
                     'Begin Path']

    # Check if each required field is present in the selection table, and if not, add it to the 'missing' list
    missing = []
    for item in wanted_fields:
        if item not in df.columns:
            missing.append(item)

    return missing
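
# For example (illustrative values), a single-row DataFrame with the required Raven
# columns passes check_selection_table without raising:
#
#     df = pd.DataFrame([{
#         'Begin Time (s)': 10.0, 'End Time (s)': 12.5,
#         'Low Freq (Hz)': 50.0, 'High Freq (Hz)': 250.0,
#         'File Offset (s)': 10.0, 'Begin Path': '/data/audio/file01.wav',
#     }])
#     check_selection_table(df)  # prints 'All required fields are in the selection table'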

# ----------------------------------------------
# Manipulate existing selection tables functions


def load_selection_table(selection_table_path):
    """
    Load one or multiple selection table(s) from a file or folder. It takes tab-separated
    Raven Pro 1.6 selection tables (.txt).

    Inputs:
        - selection_table_path: A string representing the path to a selection table file or folder.

    Returns:
        - selection_table_df: A pandas DataFrame containing the loaded selection table.

    This function loads the selection table from the provided selection_table_path, which can be either a
    file or a folder containing multiple selection table files. If selection_table_path points to a file,
    the function reads the file using pandas.read_csv(). If selection_table_path points to a folder, the
    function iterates through all '.txt' files in the folder, reads each file, and concatenates the data
    into a single DataFrame.

    The function also checks if all necessary fields are present in the selection table(s) and raises a
    ValueError if any field is missing. If all required fields are present, it prints a message confirming
    their presence.
    """

    # If selection_table_path is a file
    if os.path.isfile(selection_table_path):
        selection_table_df = pd.read_csv(selection_table_path, sep='\t')

        # Check if all necessary fields are present
        check_selection_table(selection_table_df)

    # If selection_table_path is a folder
    elif os.path.isdir(selection_table_path):
        # Get the list of .txt files
        seltab_list = [file for file in os.listdir(selection_table_path) if file.endswith('.txt')]

        # Create an empty dictionary for missing fields
        missing = {}
        for ff in seltab_list:
            # Open the selection table
            selection_table_df_temp = pd.read_csv(os.path.join(selection_table_path, ff), sep='\t')

            # Check that all the files have the same fields
            missing_file = check_selection_table_folder(selection_table_df_temp)

            # Add the file and its missing fields to the dictionary if missing_file is not empty
            if missing_file:
                missing[ff] = missing_file

            # If no entries are missing and this is the first selection table, create the output big selection table
            elif (not missing_file) & ('selection_table_df' not in locals()):
                selection_table_df = selection_table_df_temp

            # If no entries are missing and selection_table_df exists
            elif (not missing_file) & ('selection_table_df' in locals()):
                # selection_table_df = selection_table_df.append(selection_table_df_temp)
                selection_table_df = pd.concat([selection_table_df, selection_table_df_temp], ignore_index=True)

        # If all required fields are in
        if not missing:
            print('All required fields are in the selection tables')

        else:
            # Raise an error indicating missing fields in the selection tables
            error_msg = 'Error: The following field(s) is missing from the selection table:\n'
            for keys, value in missing.items():
                error_msg += f'--> in {keys}, the field(s) {value} are missing\n'
            raise ValueError(error_msg)

    else:
        # Raise an error for an invalid selection_table_path
        raise ValueError("Please provide a valid path to an existing folder or file.")

    return selection_table_df
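
# Usage sketch (paths from this repository's SelectionTable folder):
#
#     seltab = load_selection_table('SelectionTable/MD02_truth_selections.txt')  # single table
#     seltab = load_selection_table('SelectionTable/')  # merges all .txt tables in the folder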

def get_number_clips(list_audio_files, clip_duration):
    """
    Reads the durations of all audio files in the given list and compares them to the desired
    clip duration.

    Inputs:
        - list_audio_files: A list of audio files with their full paths.
        - clip_duration: The chosen export clip duration (s).

    Outputs:
        - number_clip: The number of export clips per audio file.
    """

    # Get the duration of each file and calculate the associated number of non-overlapping clips
    file_duration = []
    number_clip = []
    for file in list_audio_files:
        fdur = librosa.get_duration(path=file)
        number_clip.append(int(np.floor(fdur / clip_duration)))
        file_duration.append(fdur)

    # Check if all files have the same number of clips
    unique_number_clip = list(set(number_clip))

    if len(unique_number_clip) == 1:  # If all files have the same number of clips
        # Print the information about the number of non-overlapping clips
        print(f'All files can be divided into {unique_number_clip[0]} x {clip_duration}-s clips')
    else:  # If different files yield different numbers of clips
        # Print the mismatched numbers of clips
        print(f'Mismatched number of clips: {unique_number_clip}')

    # Return the list containing the number of clips for each file
    return number_clip


def update_labels(selection_table_df, labels_dict, label_key):
    """
    Updates labels in the selection table based on the provided labels dictionary.

    Inputs:
        - selection_table_df: DataFrame containing the selection table.
        - labels_dict: Dictionary mapping old labels to new labels.
        - label_key: Name of the field for the label column.

    Outputs:
        - Updated selection table with the new labels.
    """

    # Swap the labels in the selection table
    for old_label in labels_dict.keys():
        # Test if the original label is present in the selection table
        if old_label in selection_table_df[label_key].unique():
            # Replace the original label with the new label
            selection_table_df[label_key].replace(old_label, labels_dict[old_label], inplace=True)
        else:
            # Print a message if the original label is not found in the selection table
            print(f'Skipping: Original label {old_label} not found in the selection table')

    # Check the uniqueness of labels after swapping
    unique_labels = selection_table_df[label_key].unique()
    print('New unique label list:')
    for lab in unique_labels:
        print(lab)

    # Return the modified selection table
    return selection_table_df
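
# Usage sketch (illustrative labels): harmonize annotation labels before export,
# e.g. mapping two spellings onto a single class name:
#
#     labels_dict = {'mn': 'MW', 'humpback': 'MW'}  # old label -> new label
#     seltab = update_labels(seltab, labels_dict, label_key='Tag')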

# -----------------------
# Write outputs functions

def save_audioclip(audiofile, export_settings, export_filename, start_clip, bit_depth, channel):
    """
    Loads, resamples, and saves a mono FLAC clip extracted from audiofile, starting at
    start_clip (s), with the duration, sampling frequency, and bit depth specified in
    export_settings.
    """
    # Test if the export audio file already exists; otherwise, create it
    if not os.path.exists(os.path.join(export_settings['Export folders']['Audio export folder'], export_filename + '.flac')):

        # Load and resample the audio
        x_clip, fs = librosa.load(audiofile, offset=start_clip,
                                  duration=export_settings['Digital sampling']['Audio duration (s)'],
                                  sr=export_settings['Digital sampling']['fs (Hz)'], mono=False, res_type='soxr_vhq')
        # Test if x is multi-channel
        nb_ch = x_clip.ndim
        # Keep the wanted channel
        if nb_ch > 1:
            x_clip = x_clip[channel, :]

        # Save the clip
        sf.write(os.path.join(export_settings['Export folders']['Audio export folder'], export_filename + '.flac'),
                 x_clip, fs, bit_depth)


def write_selection_table(filename, entry, export_label='Tag'):
    """
    Creates a selection table if needed, appends an entry, and saves it.

    Inputs:
        - filename: Selection table file name with full path and an extension.
        - entry: Line to write in the selection table.
        - export_label: Name of the label column in the selection table (str). Default is 'Tag'.

    Outputs:
        - Saved selection table.
    """

    header = ['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)',
              'Begin File', 'Original Begin Time (s)', export_label]

    # If the file doesn't exist yet, add the header
    if not os.path.exists(filename):
        with open(filename, 'w') as f:
            f.write('\t'.join(header) + '\n')

    # Get the number of entries in the selection table
    # (with only the header present, count = 0, so the next selection number is count + 1)
    with open(filename, 'r') as f:
        for count, line in enumerate(f):
            pass
    entry[0] = count + 1

    # Convert any non-string entries to strings
    for ind in range(len(entry)):
        if not isinstance(entry[ind], str):
            entry[ind] = str(entry[ind])

    # Append the entry to the table
    with open(filename, 'a') as f:
        f.write('\t'.join(entry) + '\n')
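
# Example (illustrative values): append one annotation row; 'entry' follows the header
# order above, with entry[0] as a placeholder that is overwritten with the selection number:
#
#     entry = [0, 'Spectrogram', 1, 5.2, 8.7, 50.0, 250.0, 'clip0001.flac', 605.2, 'MW']
#     write_selection_table('annotations/clip0001.txt', entry, export_label='Tag')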

def write_annotation_csv(filename, entry, export_label='Tag'):
    """
    Creates a recap annotation CSV if needed, appends entries, and saves it in the format of
    https://doi.org/10.5281/zenodo.7079380.

    Inputs:
        - filename: Selected file name with full path and an extension.
        - entry: Line to write in the selection table.
        - export_label: Name of the label column in the selection table (str). Default is 'Tag'.

    Outputs:
        - One annotation table for the entire project.
    """

    header = ['Filename', 'Start Time (s)', 'End Time (s)', 'Low Freq (Hz)', 'High Freq (Hz)', export_label]

    # If the file doesn't exist yet, add the header
    if not os.path.exists(filename):
        with open(filename, 'w') as f:
            f.write('\t'.join(header) + '\n')

    # Keep only the fields of 'entry' that fit this header; the incoming entry is ordered as
    # [0 = 'Selection', 1 = 'View', 2 = 'Channel', 3 = 'Begin Time (s)', 4 = 'End Time (s)',
    #  5 = 'Low Freq (Hz)', 6 = 'High Freq (Hz)', 7 = 'Begin File', 8 = 'Original Begin Time (s)', 9 = 'Tag']
    entry = [entry[7], "{:.2f}".format(float(entry[3])), "{:.2f}".format(float(entry[4])),
             entry[5], entry[6], entry[9]]

    # Append the entry to the table
    with open(filename, 'a') as f:
        f.write('\t'.join(entry) + '\n')


def map_audio_selection(filename, audio_filename, selection_filename):
    """
    Creates a recap CSV matching audio file names and selection table names, appends entries,
    and saves it.

    Inputs:
        - filename: Selected file name with full path and an extension.
        - audio_filename: Selected audio file name with full path and an extension.
        - selection_filename: Corresponding annotation file name with full path and an extension.

    Outputs:
        - One mapping CSV table for the entire project.
    """

    # If the file doesn't exist yet, create it (no header)
    if not os.path.exists(filename):
        with open(filename, 'w') as f:
            pass

    # Append the association to the table
    with open(filename, 'a') as f:
        f.write('\t'.join([audio_filename, selection_filename]) + '\n')

def exports(export_settings, selection_table_af_df, save_sel_dict):
    """
    Create all exports based on the provided export settings, selection table DataFrame, and save
    selection dictionary.

    Inputs:
        - export_settings: Dictionary containing export settings.
        - selection_table_af_df: Selection table imported as a pandas DataFrame.
        - save_sel_dict: Dictionary containing information about the clip to be saved, with the following keys:
          'Selection #', 'fs_original_print', 'Channel', 'Start export clip', 'Bit depth', 'Label key',
          'Begin Time (s)', 'End Time (s)'.
          This variable is created in benchmark_creator.

    This function creates all exports based on the provided export settings, selection table DataFrame, and save
    selection dictionary. It generates filenames for exported audio files, exports audio clips, writes entries in
    the selection table file, writes annotations in a global CSV file, and creates a file association CSV.

    Note: This function assumes the presence of several helper functions such as 'save_audioclip',
    'write_selection_table', 'write_annotation_csv', and 'map_audio_selection'.
    """
    # Get the export audio file name in the format
    # <Project ID>_<Deployment ID>_<original file name>_<original fs>_ch<channel>_<start time>s.flac
    export_filename = (export_settings['Project ID'] + '_' +
                       export_settings['Deployment ID'] + '_' +
                       os.path.splitext(
                           os.path.basename(selection_table_af_df['Begin Path'].iloc[save_sel_dict['Selection #']]))[
                           0] + '_' +
                       str(save_sel_dict['fs_original_print']) + '_' + 'ch' + "{:02d}".format(
                           save_sel_dict['Channel'] + 1) + '_' +
                       "{:04d}".format(int(np.floor(save_sel_dict['Start export clip']))) + 's')

    # Export audio
    audiofile = selection_table_af_df['Begin Path'].iloc[save_sel_dict['Selection #']]
    save_audioclip(audiofile, export_settings, export_filename, save_sel_dict['Start export clip'],
                   save_sel_dict['Bit depth'], save_sel_dict['Channel'])

    # Create/fill the selection table for this clip with the format
    # ['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)', 'Low Freq (Hz)',
    #  'High Freq (Hz)', 'Begin File', 'Original Begin Time (s)', 'Tag']
    selection = [0,  # Placeholder, updated when the entry is added to the selection table file
                 'Spectrogram',  # All selections are on the Spectrogram
                 1,  # We create monochannel audio so everything is on channel 1
                 save_sel_dict['Begin Time (s)'] - save_sel_dict['Start export clip'],
                 save_sel_dict['End Time (s)'] - save_sel_dict['Start export clip'],
                 selection_table_af_df['Low Freq (Hz)'].iloc[save_sel_dict['Selection #']],
                 selection_table_af_df['High Freq (Hz)'].iloc[save_sel_dict['Selection #']],
                 export_filename + '.flac',
                 selection_table_af_df['File Offset (s)'].iloc[save_sel_dict['Selection #']],
                 selection_table_af_df[save_sel_dict['Label key']].iloc[save_sel_dict['Selection #']]]

    # Write in the selection table (.txt)
    write_selection_table(os.path.join(export_settings['Export folders']['Annotation export folder'], export_filename + '.txt'),
                          selection, export_label=export_settings['Selections']['Export label'])

    # Write in the global csv file (.csv)
    write_annotation_csv(export_settings['Export folders']['Annotation CSV file'],
                         selection, export_label=export_settings['Selections']['Export label'])

    # Write in the file association (.csv)
    map_audio_selection(export_settings['Export folders']['Audio-Seltab Map CSV file'],
                        os.path.join(export_settings['Export folders']['Audio export folder'], export_filename + '.flac'),
                        os.path.join(export_settings['Export folders']['Annotation export folder'], export_filename + '.txt'))
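
# Example (illustrative values): with Project ID 'SBNMS', Deployment ID '01', a source
# file '20210814_000000.wav' recorded at 2 kHz, channel 1, and an export clip starting
# at 600 s, the generated audio file name is:
#
#     SBNMS_01_20210814_000000_2kHz_ch01_0600s.flac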

# -------------------
# Benchmark functions

def benchmark_size_estimator(selection_table_df, export_settings, label_key):
    """
    Estimates the benchmark dataset size based on the provided selection table and export settings.

    Inputs:
        - selection_table_df: DataFrame containing the selection table.
        - export_settings: Dictionary containing export settings.
        - label_key: Name of the field for the label column (not used in the size estimate).

    Outputs:
        - Printed estimate of the individual file size and the total benchmark dataset size.

    This function performs the following steps:
    1) Runs checks on the selection table (clip duration fit and bit depth).
    2) Counts the number of audio files that would be created.
    3) Calculates and displays the estimated sizes.

    Note: This function relies on helper functions such as 'get_number_clips' and 'check_bitdepth' for certain
    calculations.
    """

    # 1) Run tests on the selection table

    # List unique audio files in the selection table
    unique_audiofiles = selection_table_df['Begin Path'].unique()

    # Test if the selected export_settings['Audio duration (s)'] can fit in individual audio files
    clip_number = get_number_clips(unique_audiofiles, export_settings['Digital sampling']['Audio duration (s)'])

    # Test if the bit depth is ok
    check_bitdepth(export_settings['Digital sampling']['Bit depth'])

    # 2) Get the number of audio files that will be created
    export_filename_list = []
    count_benchmark_clips = 0

    # Go through each audio file
    for ind_af in range(len(unique_audiofiles)):
        # Load a second of the file to get the metadata
        x, fs_original = librosa.load(unique_audiofiles[ind_af], offset=0.0, duration=1, sr=None, mono=False)

        # Test if x is multi-channel
        nb_ch = x.ndim

        # Go through each channel
        for ch in range(nb_ch):
            # From the selection table, get the subset of selections that correspond to this specific audio file and channel
            selection_table_af_df = selection_table_df[(selection_table_df['Begin Path'] == unique_audiofiles[ind_af])
                                                       & (selection_table_df['Channel'] == ch + 1)]

            # If the selection table dataframe is not empty
            if not selection_table_af_df.empty:
                # For each selection
                for sel in range(len(selection_table_af_df)):
                    # Get begin and end time of the selection
                    begin_time = selection_table_af_df['File Offset (s)'].iloc[sel]
                    end_time = (begin_time + selection_table_af_df['End Time (s)'].iloc[sel]
                                - selection_table_af_df['Begin Time (s)'].iloc[sel])

                    # Check which clip chunks this selection is associated with
                    sel_in_clip_begintime = \
                        np.floor(begin_time / export_settings['Digital sampling']['Audio duration (s)'])
                    sel_in_clip_endtime = \
                        np.floor(end_time / export_settings['Digital sampling']['Audio duration (s)'])

                    # If both begin and end time are in a single clip chunk
                    if sel_in_clip_begintime == sel_in_clip_endtime:

                        # Get the timing of the export clip (s)
                        start_clip = sel_in_clip_begintime * export_settings['Digital sampling']['Audio duration (s)']
                        end_clip = start_clip + export_settings['Digital sampling']['Audio duration (s)']

                        # Get the export audio file name in the format
                        # <Project ID>_<Deployment ID>_<original file name>_ch<channel>_<start time>s
                        export_filename = (export_settings['Project ID'] + '_' +
                                           export_settings['Deployment ID'] + '_' +
                                           os.path.splitext(
                                               os.path.basename(selection_table_af_df['Begin Path'].iloc[sel]))[0]
                                           + '_' + 'ch' + "{:02d}".format(ch + 1) + '_' +
                                           "{:04d}".format(int(np.floor(start_clip))) + 's')

                        # Count the export audio file if it has not been counted already
                        if export_filename not in export_filename_list:
                            export_filename_list.append(export_filename)
                            count_benchmark_clips += 1

    # 3) Calculate the size
    bd = int(export_settings['Digital sampling']['Bit depth'])
    flac_compression = 0.5
    bit_rate = bd * export_settings['Digital sampling']['fs (Hz)']
    audio_file_size_byte = bit_rate * export_settings['Digital sampling'][
        'Audio duration (s)'] * 1 / 8  # 1 channel, 8 bits per byte
    dataset_size_byte = audio_file_size_byte * count_benchmark_clips

    # 4) Display
    print(
        f"File sizes are estimated with a FLAC compression factor of {int(flac_compression * 100)}%, which may vary "
        f"depending on the file.")
    print(f"Estimated file size ... {int(np.round(audio_file_size_byte * 10 ** (-6) * flac_compression))} MB")

    if np.round(dataset_size_byte * 10 ** (-6) * flac_compression) > 999:
        print(
            f" > Estimated Benchmark dataset size ... {int(np.round(dataset_size_byte * 10 ** (-9) * flac_compression))} GB")
    else:
        print(
            f" > Estimated Benchmark dataset size ... {int(np.round(dataset_size_byte * 10 ** (-6) * flac_compression))} MB")
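
# Worked example (illustrative numbers): with a 24-bit depth, fs = 2000 Hz, and 300-s
# clips, the bit rate is 24 * 2000 = 48,000 bits/s, i.e. 48,000 * 300 / 8 = 1.8 MB per
# clip before compression, or ~0.9 MB with the assumed 50% FLAC compression; 1,000
# unique clips would therefore come to roughly 0.9 GB.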

def benchmark_creator(selection_table_df, export_settings, label_key):
    """
    Creates a benchmark dataset based on the provided selection table and export settings.

    Inputs:
        - selection_table_df: DataFrame containing the selection table.
        - export_settings: Dictionary containing export settings.
        - label_key: Name of the field for the label column.

    Outputs:
        - Created benchmark dataset.

    This function performs the following steps:

    1) Lists unique audio files in the selection table.
    2) Retrieves the bit depth from the export settings.
    3) Iterates through each audio file and channel:
        a) Loads a second of the audio file to retrieve metadata.
        b) Determines the original sampling frequency for file naming.
        c) Checks if the audio data is multi-channel.
        d) Filters selections corresponding to the current audio file and channel.
        e) For each selection:
            i) Identifies the clip chunk associated with the selection.
            ii) Creates a dictionary with variables for the export.
            iii) Calls the 'exports' function to export audio and annotation files.
            iv) Handles split annotations if required by the export settings.

    Note: This function relies on helper functions such as 'get_bitdepth', 'get_print_fs', and 'exports' for
    certain calculations and export operations.
    """

    # List unique audio files in the selection table
    unique_audiofiles = selection_table_df['Begin Path'].unique()

    # Get the bit depth
    bit_depth = get_bitdepth(export_settings['Digital sampling']['Bit depth'])

    # Initialize the total number of clips
    tot_clips = 0

    # Go through each audio file
    for ind_af in tqdm(range(len(unique_audiofiles))):

        # Load a second of the file to get the metadata
        x, fs_original = librosa.load(unique_audiofiles[ind_af], offset=0.0, duration=1, sr=None, mono=False)

        # Take note of the sampling frequency for the file naming system
        fs_original_print = get_print_fs(fs_original)

        # Test if x is multi-channel
        nb_ch = x.ndim

        # Go through each channel
        for ch in range(nb_ch):
            # From the selection table, get the subset of selections that correspond to this specific audio file and channel
            selection_table_af_df = selection_table_df[(selection_table_df['Begin Path'] == unique_audiofiles[ind_af])
                                                       & (selection_table_df['Channel'] == ch + 1)]

            # If the selection table dataframe is not empty
            if not selection_table_af_df.empty:
                # For each selection
                for sel in range(len(selection_table_af_df)):
                    # Get begin and end time of the selection
                    begin_time = selection_table_af_df['File Offset (s)'].iloc[sel]
                    end_time = (begin_time + selection_table_af_df['End Time (s)'].iloc[sel]
                                - selection_table_af_df['Begin Time (s)'].iloc[sel])

                    # Check which clip chunks this selection is associated with
                    sel_in_clip_begintime = \
                        np.floor(begin_time / export_settings['Digital sampling']['Audio duration (s)'])
                    sel_in_clip_endtime = \
                        np.floor(end_time / export_settings['Digital sampling']['Audio duration (s)'])

                    # If both begin and end time are in a single clip chunk; this is the default case
                    # and will always be exported
                    if sel_in_clip_begintime == sel_in_clip_endtime:

                        # Get the timing of the export clip (s)
                        start_clip = sel_in_clip_begintime * export_settings['Digital sampling']['Audio duration (s)']
                        end_clip = start_clip + export_settings['Digital sampling']['Audio duration (s)']

                        # Create the dictionary that gathers all of the variables for the exports
                        save_sel_dict = {
                            'Selection #': sel,  # Selection number in the table
                            'fs_original_print': fs_original_print,  # Original sampling frequency
                            'Channel': ch,  # Channel
                            'Start export clip': start_clip,  # Timing of the beginning of the export clip (s)
                            'Bit depth': bit_depth,  # Bit depth, in the correct format
                            'Label key': label_key,  # Key to the label column in the selection table
                            'Begin Time (s)': begin_time,  # Time to start the annotation
                            'End Time (s)': end_time  # Time to end the annotation
                        }

                        # Export everything
                        exports(export_settings, selection_table_af_df, save_sel_dict)
                        tot_clips += 1
                    # When an annotation sits at the limit between two export audio files, keep it
                    # on either/both sides if there is a sufficient duration there (only when
                    # export_settings['Selections']['Split export selections'][0] is True)
                    elif export_settings['Selections']['Split export selections'][0] is True:
                        # Test if the duration before the split is sufficient
                        if abs(sel_in_clip_endtime * export_settings['Digital sampling']['Audio duration (s)'] - begin_time) >= \
                                export_settings['Selections']['Split export selections'][1]:
                            # Get the timing of the export clip (s)
                            start_clip = sel_in_clip_begintime * export_settings['Digital sampling']['Audio duration (s)']
                            end_clip = start_clip + export_settings['Digital sampling']['Audio duration (s)']

                            # Update the begin and end time of the split annotation
                            begin_time = selection_table_af_df['File Offset (s)'].iloc[sel]
                            end_time = end_clip

                            # Create the dictionary that gathers all of the variables for the exports
                            save_sel_dict = {
                                'Selection #': sel,  # Selection number in the table
                                'fs_original_print': fs_original_print,  # Original sampling frequency
                                'Channel': ch,  # Channel
                                'Start export clip': start_clip,  # Timing of the beginning of the export clip (s)
                                'Bit depth': bit_depth,  # Bit depth, in the correct format
                                'Label key': label_key,  # Key to the label column in the selection table
                                'Begin Time (s)': begin_time,  # Time to start the annotation
                                'End Time (s)': end_time  # Time to end the annotation
                            }

                            # Export everything
                            exports(export_settings, selection_table_af_df, save_sel_dict)
                            tot_clips += 1
                        # Test if the duration after the split is sufficient
                        elif abs(end_time - sel_in_clip_endtime * export_settings['Digital sampling']['Audio duration (s)']) >= \
                                export_settings['Selections']['Split export selections'][1]:
                            # Get the timing of the export clip (s)
                            start_clip = sel_in_clip_endtime * export_settings['Digital sampling']['Audio duration (s)']
                            end_clip = start_clip + export_settings['Digital sampling']['Audio duration (s)']

                            # Update the begin and end time of the split annotation
                            begin_time = start_clip
                            end_time = (selection_table_af_df['File Offset (s)'].iloc[sel] +
                                        selection_table_af_df['End Time (s)'].iloc[sel]
                                        - selection_table_af_df['Begin Time (s)'].iloc[sel])

                            # Create the dictionary that gathers all of the variables for the exports
                            save_sel_dict = {
                                'Selection #': sel,  # Selection number in the table
                                'fs_original_print': fs_original_print,  # Original sampling frequency
                                'Channel': ch,  # Channel
                                'Start export clip': start_clip,  # Timing of the beginning of the export clip (s)
                                'Bit depth': bit_depth,  # Bit depth, in the correct format
                                'Label key': label_key,  # Key to the label column in the selection table
                                'Begin Time (s)': begin_time,  # Time to start the annotation
                                'End Time (s)': end_time  # Time to end the annotation
                            }

                            # Export everything
                            exports(export_settings, selection_table_af_df, save_sel_dict)
                            tot_clips += 1
                        else:
                            # If the selection is not contained within an export clip, do not save it, and print
                            printselnb = selection_table_af_df['Selection'].iloc[sel]
                            head, tail = os.path.split(selection_table_af_df['Begin Path'].iloc[sel])
                            print(f'Ignored annotation... Selection # {printselnb}, File {tail}, Channel {ch + 1}, {begin_time}-{end_time} s')

    print(f'Total number of clips: {tot_clips}')
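
# End-to-end sketch of this module (illustrative; see examples/CreateBenchmarkDataset.py
# for a maintained, complete version):
#
#     from BenchmarkDatasetCreator import dataset
#
#     dataset.check_export_settings(export_settings)
#     dataset.check_selection_tab('SelectionTable/')
#     seltab = dataset.load_selection_table('SelectionTable/')
#     dataset.benchmark_size_estimator(seltab, export_settings, label_key='Tag')
#     dataset.benchmark_creator(seltab, export_settings, label_key='Tag')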
--------------------------------------------------------------------------------
/BenchmarkDatasetCreator/folders.py:
--------------------------------------------------------------------------------
# Group of functions supporting the creation of the filing system in 1_Project_creator

# Imports
from contextlib import contextmanager, redirect_stdout
from io import StringIO
import os
import sys
import shutil


# Context manager that redirects stdout to a display function, used to mirror
# printed output into the Streamlit app
@contextmanager
def st_capture(output_func):
    with StringIO() as stdout, redirect_stdout(stdout):
        old_write = stdout.write

        def new_write(string):
            ret = old_write(string)
            output_func(stdout.getvalue())
            return ret

        stdout.write = new_write
        yield
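
# Usage sketch (illustrative): in the Streamlit app, which imports streamlit as st,
# printed output can be mirrored into a placeholder:
#
#     output = st.empty()
#     with st_capture(output.code):
#         path_print(export_folder)  # anything printed here appears in the app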

def query_yes_no(question, default="yes"):
    """
    Ask a yes/no question via input() and return the answer.

    Inputs:
        - question: A string that is presented to the user.
        - default: The presumed answer if the user just hits Enter. It must be "yes" (the default),
          "no", or None (meaning an answer is required from the user).

    Return value:
        - True for "yes" or False for "no".
    """
    # Dictionary mapping valid yes/no responses to boolean values
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # Set the prompt based on the default answer
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        # Raise a ValueError for an invalid default answer
        raise ValueError("invalid default answer: '%s'" % default)

    # Loop until a valid response is provided
    while True:
        # Display the question and prompt the user for a response
        sys.stdout.write(question + prompt)
        choice = input().lower()
        # If the default answer is provided and the response is empty, return the corresponding boolean value
        if default is not None and choice == "":
            return valid[default]
        # If the user's choice is in the valid responses, return the corresponding boolean value
        elif choice in valid:
            return valid[choice]
        else:
            # If the response is invalid, prompt the user to respond with 'yes' or 'no'
            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def path_print(start_path):
    """
    Prints the contents of the folder designated by start_path.
    """
    # Iterate through the directory tree starting from 'start_path'
    for root, dirs, files in os.walk(start_path):

        # Determine the depth of the current directory relative to 'start_path'
        level = root.replace(start_path, '').count(os.sep)

        # Calculate the indentation for displaying the directory structure
        indent = ' ' * 4 * level

        # Print the name of the current directory
        print('{}{}/'.format(indent, os.path.basename(root)))

        # Calculate the indentation for displaying files within the directory
        sub_indent = ' ' * 4 * (level + 1)

        # Iterate through the files in the current directory
        for f in files:
            # Print the name of each file within the directory
            print('{}{}'.format(sub_indent, f))


def create_path(export_dict):
    """
    Function to create the export folders, following this architecture:
        Export folder/
        |... <Project ID>_<Deployment ID>/
        |... |... audio/
        |... |... annotations/

    Only used in Jupyter notebooks & .py files; Streamlit has a different display system.
    Displays a warning if the folders already exist; they can be overwritten based on user input.
    """
    # Construct the paths for the audio and annotations folders based on the export settings
    audio_path = os.path.join(export_dict['Export folder'],
                              export_dict['Project ID'] + '_' +
                              export_dict['Deployment ID'],
                              'audio')
    annot_path = os.path.join(export_dict['Export folder'],
                              export_dict['Project ID'] + '_' +
                              export_dict['Deployment ID'],
                              'annotations')
    export_dict['Audio export folder'] = audio_path
    export_dict['Annotation export folder'] = annot_path

    # If the audio folder does not exist in the path
    if not os.path.exists(audio_path):
        # Create the audio and annotations folders
        os.makedirs(audio_path)
        os.makedirs(annot_path)
        # Update the export settings with the paths
        export_dict['Audio export folder'] = audio_path
        export_dict['Annotation export folder'] = annot_path

    # If the audio folder already exists
    else:
        # Display a warning message
        print('Warning: This folder already exists, data may be deleted: \n')
        path_print(os.path.join(export_dict['Export folder'],
                                export_dict['Project ID'] + '_' +
                                export_dict['Deployment ID']))

        # Ask the user whether to delete existing data
        if query_yes_no('Delete data?', default="yes"):

            # Delete the existing audio and annotations folders
            shutil.rmtree(audio_path)
            shutil.rmtree(annot_path)

            # Recreate the audio and annotations folders
            os.makedirs(audio_path)
            os.makedirs(annot_path)

            # Update the export settings with the new paths
            export_dict['Audio export folder'] = audio_path
            export_dict['Annotation export folder'] = annot_path
        else:
            # Prompt the user to change the export folder path
            print("Please change the export folder path")
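
# Usage sketch (illustrative values): after create_path, export_dict gains the two folder keys:
#
#     export_dict = {'Export folder': 'benchmark_data',
#                    'Project ID': 'SBNMS', 'Deployment ID': '01'}
#     create_path(export_dict)
#     export_dict['Audio export folder']       # -> 'benchmark_data/SBNMS_01/audio'
#     export_dict['Annotation export folder']  # -> 'benchmark_data/SBNMS_01/annotations'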
--------------------------------------------------------------------------------
/BenchmarkDatasetCreator/metadata.py:
--------------------------------------------------------------------------------
import streamlit as st
import datetime as dt
import timezonefinder, pytz
import json


def set_state(i):
    st.session_state.stage = i


# Function definitions
def get_date_time(label, data_dictionary):
    """
    Function to obtain date and time information with the correct formats.

    Inputs:
        - label (str): Label for the date and time entry.
        - data_dictionary (dict): Data dictionary containing deployment information,
          used to infer the local time zone from the deployment position.

    Returns:
        - tuple: A tuple containing the date and time information in UTC and local time formats
          following the ISO-8601 format, e.g.,
            * UTC format: 2010-08-27T23:58:03.3975Z
            * Local time format (UTC-7h): 2023-03-15T10:54:00-07:00
    """

    # Create the UI layout using Streamlit's columns
    col_label, col_unused, col_utc = st.columns([0.2, 0.2, 0.6])

    # Display the label for the date and time entry
    col_label.write(label)

    # Toggle for UTC time
    UTC_bool = col_utc.toggle(
        'UTC',
        value=True,
        help=None,
        key='utc' + label
    )

    # Create the UI layout for date and timezone selection
    col_date, col_tz = st.columns([0.7, 0.74])

    # Date input field
    date = col_date.date_input(
        "Date",
        value=None,
        min_value=dt.datetime(1970, 1, 1),
        max_value=dt.datetime.now(),
        format="YYYY-MM-DD",
        key='date' + label,
    )

    # Get the timezone from latitude and longitude
    tf = timezonefinder.TimezoneFinder()
    default_tz = tf.certain_timezone_at(
        lat=data_dictionary['Deployment']['Position']['Lat.'],
        lng=data_dictionary['Deployment']['Position']['Lon.']
    )

    # Select the local timezone
    if default_tz in pytz.common_timezones:
        local_timezone = col_tz.selectbox(
            'Select local time zone',
            pytz.common_timezones,
            index=pytz.common_timezones.index(default_tz),
            key='tz' + label
        )
    else:
        local_timezone = col_tz.selectbox(
            'Select local time zone',
            pytz.common_timezones,
            key='tz' + label
        )

    # UI layout for time selection
    col_hh, col_mm, col_ss = st.columns(3)

    # Hour input
    time_hh = int(col_hh.number_input(
        'Hour',
        min_value=0,
        max_value=23,
        format='%i',
        step=1,
        key='time_hh' + label
    ))

    # Minute input
    time_mm = int(col_mm.number_input(
        'Minutes',
        min_value=0,
        max_value=59,
        format='%i',
        step=1,
        key='time_mm' + label
    ))

    # Second input (fractional seconds are truncated by the int() cast)
    time_ss = int(col_ss.number_input(
        'Seconds',
        value=float(0),
        min_value=float(0),
        max_value=float(59.9999),
        format='%.4f',
        step=0.0001,
        key='time_ss' + label
    ))

    if date is not None:
        # Assemble the datetime information
        date_time_entry = dt.datetime.combine(
            date, dt.time(hour=time_hh, minute=time_mm, second=time_ss)
        )

        # Convert to UTC or local time based on the toggle
        if UTC_bool:
            tz_entry = pytz.timezone('UTC')
            date_time_utc = tz_entry.localize(date_time_entry)
            date_time_local = date_time_utc.astimezone(pytz.timezone(local_timezone))
        else:
            tz_entry = pytz.timezone(local_timezone)
            date_time_local = tz_entry.localize(date_time_entry)
            date_time_utc = date_time_local.astimezone(pytz.timezone('UTC'))

        # Format the date and time in ISO format
        date_time_local = str(date_time_local.replace(microsecond=0).isoformat())
        date_time_utc = str(date_time_utc.replace(microsecond=0).isoformat()).replace('+00:00', 'Z')
    else:
        st.warning('Please enter a valid date')
        date_time_utc = ''
        date_time_local = ''
    return date_time_utc, date_time_local

def check_dates(date_start_local, date_end_local):
    """
    Function to check whether the end date occurs before the start date.

    Inputs:
        - date_start_local (str): Start date and time in local time format.
        - date_end_local (str): End date and time in local time format.

    Returns:
        - None: This function doesn't return anything; it displays a Streamlit error if the end
          date occurs before the start date.
    """

    # Convert the date strings to datetime objects
    date_start = dt.datetime.fromisoformat(date_start_local)
    date_end = dt.datetime.fromisoformat(date_end_local)

    # Check
    if date_end < date_start:
        st.error('The entered recording end occurs before recording start')


# Adding people and contacts, by the press of a button
def display_input_row(index, authorized_roles):
    """
    Function to display an input row with fields for Role, Name, Affiliation, and Email Address.

    Inputs:
        - index (int): Index of the input row.
        - authorized_roles (list): List of authorized roles for the Role field.

    Returns:
        - None: This function doesn't return anything; it's responsible for displaying the input row UI.

    Side Effects:
        - Displays input fields for Role, Name, Affiliation, and Email Address on the Streamlit app.

    Note:
        - Each input field has a unique key generated based on the index to ensure proper functioning and reactivity.
    """

    # Create four columns for the input fields: Role, Name, Affiliation, Email Address
    role_col, name_col, affiliation_col, email_col = st.columns(4)

    # Add a selectbox for Role and text input fields for Name, Affiliation, and Email Address
    role_col.selectbox('Role', authorized_roles, key=f'role_{index}')
    name_col.text_input('Name', key=f'name_{index}')
    affiliation_col.text_input('Affiliation', key=f'affiliation_{index}')
    email_col.text_input('Email Address', key=f'email_{index}')
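
# Usage sketch (illustrative): render one contact row per entry tracked in session state:
#
#     authorized_roles = ['Principal Investigator', 'Data Manager']  # example roles
#     for index in range(st.session_state.get('n_rows', 1)):
#         display_input_row(index, authorized_roles)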

def test_json_fields(json_data):
    """
    Tests whether a JSON metadata file has the specified fields and validates their formats.

    Inputs:
        - json_data: Path to a JSON metadata file.

    Returns:
        - missing_data: False if all checks pass (the function raises before returning otherwise).

    Raises:
        - ValueError: If any required field is missing in the JSON data or if any field has an invalid format.
    """
    missing_data = False

    with open(json_data, "r") as file:
        json_data = json.load(file)

    print(json_data)

    required_fields = [
        "Original data",
        "Benchmarked data"
    ]

    for field in required_fields:
        if field not in json_data:
            missing_data = True
            raise ValueError(f"Missing field: {field}")

    if "Original data" in json_data:
        original_data = json_data["Original data"]
        original_data_fields = [
            "Project ID",
            "Deployment ID",
            "Data stewardship",
            "Instrument",
            "Deployment",
            "Sampling details",
            "Annotations",
            "Target signals",
            "Annotation protocol"
        ]

        for field in original_data_fields:
            if field not in original_data:
                missing_data = True
                raise ValueError(f"Missing field in 'Original data': {field}")

        if "Sampling details" in original_data:
            sampling_details = original_data["Sampling details"]
            sampling_details_fields = [
                "Time",
                "Digital sampling"
            ]

            for field in sampling_details_fields:
                if field not in sampling_details:
                    missing_data = True
                    raise ValueError(f"Missing field in 'Sampling details': {field}")

            if "Time" in sampling_details:
                time = sampling_details["Time"]
                time_fields = [
                    "UTC Start",
                    "UTC End",
                    "Local Start",
                    "Local End"
                ]

                for field in time_fields:
                    if field not in time:
                        missing_data = True
                        raise ValueError(f"Missing field in 'Time': {field}")

                    # Validate the time format: UTC timestamps must carry the 'Z' suffix,
                    # local timestamps must be ISO-8601 with a UTC offset
                    if field.startswith("UTC"):
                        try:
                            dt.datetime.strptime(time[field], "%Y-%m-%dT%H:%M:%SZ")
                        except ValueError:
                            missing_data = True
                            raise ValueError(f"Invalid format for '{field}': {time[field]}")
                    else:
                        try:
                            dt.datetime.fromisoformat(time[field])
                        except ValueError:
                            missing_data = True
                            raise ValueError(f"Invalid format for '{field}': {time[field]}")

            if "Digital sampling" in sampling_details:
                digital_sampling = sampling_details["Digital sampling"]
                digital_sampling_fields = [
                    "Sample rate (kHz)",
                    "Sample Bits",
                    "Clipping",
                    "Data Modifications"
                ]

                for field in digital_sampling_fields:
                    if field not in digital_sampling:
                        missing_data = True
                        raise ValueError(f"Missing field in 'Digital sampling': {field}")

                    # Validate the sample rate
                    if field == "Sample rate (kHz)" and not isinstance(digital_sampling[field], (int, float)):
                        missing_data = True
                        raise ValueError(f"Invalid format for 'Sample rate (kHz)': {digital_sampling[field]}")

                    # Validate the sample bits
                    if field == "Sample Bits" and digital_sampling[field] not in [8, 16, 24]:
                        missing_data = True
                        raise ValueError(f"Invalid value for 'Sample Bits': {digital_sampling[field]}")

                    # Validate clipping
                    if field == "Clipping" and digital_sampling[field] not in ["Yes", "No"]:
                        missing_data = True
                        raise ValueError(f"Invalid value for 'Clipping': {digital_sampling[field]}")

        if "Data stewardship" in original_data:
            data_stewardship = original_data["Data stewardship"]
            for person in data_stewardship:
                required_person_fields = ["Role", "Name", "Affiliation", "Email Address"]
                for person_field in required_person_fields:
                    if person_field not in person:
                        missing_data = True
                        raise ValueError(f"Missing field in 'Data stewardship' for person: {person_field}")
309 | if "Instrument" in original_data: 310 | instrument = original_data["Instrument"] 311 | required_instrument_fields = ["Type", "Settings"] 312 | for field in required_instrument_fields: 313 | if field not in instrument: 314 | missing_data = True 315 | raise ValueError(f"Missing field in 'Instrument': {field}") 316 | 317 | if "Deployment" in original_data: 318 | deployment = original_data["Deployment"] 319 | required_deployment_fields = ["Position", "Height/depth (m)", "Terrain elevation/water depth (m)"] 320 | for field in required_deployment_fields: 321 | if field not in deployment: 322 | missing_data = True 323 | raise ValueError(f"Missing field in 'Deployment': {field}") 324 | 325 | if "Position" in deployment: 326 | position = deployment["Position"] 327 | if "Lat." not in position or "Lon." not in position: 328 | missing_data = True 329 | raise ValueError("Missing 'Lat.' or 'Lon.' in 'Position'") 330 | if not isinstance(position["Lat."], float) or not isinstance(position["Lon."], float): 331 | missing_data = True 332 | raise ValueError("'Lat.' and 'Lon.' in 'Position' must be floats") 333 | if not (-90.0 <= position["Lat."] <= 90.0): 334 | missing_data = True 335 | raise ValueError("'Lat.' in 'Position' must be between -90.0 and 90.0") 336 | if not (-180.0 <= position["Lon."] <= 180.0): 337 | missing_data = True 338 | raise ValueError("'Lon.' in 'Position' must be between -180.0 and 180.0") 339 | if not round(position["Lat."], 6) == position["Lat."]: 340 | missing_data = True 341 | raise ValueError("'Lat.' in 'Position' must have precision of up to 6 decimal places") 342 | if not round(position["Lon."], 6) == position["Lon."]: 343 | missing_data = True 344 | raise ValueError("'Lon.' in 'Position' must have precision of up to 6 decimal places") 345 | 346 | return missing_data 347 | 348 | 349 | def transform_original_metadata_to_ASA_standard(metadata_dict): 350 | """ 351 | Transforms the original metadata dictionary to the ASA (Acoustical Society of America) standard format. 352 | 353 | Inputs: 354 | - metadata_dict: Original metadata dictionary to be transformed. 355 | 356 | Returns: 357 | - Transformed metadata dictionary in the ASA standard. 358 | """ 359 | 360 | # Global 361 | metadata_dict["ProjectId"] = metadata_dict.pop("Project ID") 362 | metadata_dict["DeploymentId"] = metadata_dict.pop("Deployment ID") 363 | 364 | # Data Stewardship 365 | metadata_dict["DataStewardship"] = metadata_dict.pop("Data stewardship") 366 | # Each person is entered as an element of the metadata_dict["DataStewardship"] list, 367 | # so we need to deal with this slightly differently.
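# Example (illustrative): within each person entry of the DataStewardship list,
#   {"Email Address": "jd000@cornell.edu"} becomes {"EmailAddress": "jd000@cornell.edu"}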
368 | for entry in range(len(metadata_dict["DataStewardship"])): 369 | metadata_dict["DataStewardship"][entry]["EmailAddress"] = \ 370 | metadata_dict["DataStewardship"][entry].pop("Email Address") 371 | 372 | # Deployment 373 | metadata_dict['Deployment']["ElevationInstrument_m"] = metadata_dict['Deployment'].pop( 374 | "Height/depth (m)") 375 | metadata_dict['Deployment']["Elevation_m"] = metadata_dict[ 376 | 'Deployment'].pop("Terrain elevation/water depth (m)") 377 | 378 | # Sampling details 379 | metadata_dict["SamplingDetails"] = metadata_dict.pop("Sampling details") 380 | 381 | # Sampling details - Time 382 | metadata_dict["SamplingDetails"]["Timestamp"] = metadata_dict["SamplingDetails"].pop("Time") 383 | metadata_dict["SamplingDetails"]["Timestamp"]["StartUTC"] = \ 384 | metadata_dict["SamplingDetails"]["Timestamp"].pop("UTC Start") 385 | metadata_dict["SamplingDetails"]["Timestamp"]["EndUTC"] = \ 386 | metadata_dict["SamplingDetails"]["Timestamp"].pop("UTC End") 387 | metadata_dict["SamplingDetails"]["Timestamp"]["StartLocal"] = \ 388 | metadata_dict["SamplingDetails"]["Timestamp"].pop("Local Start") 389 | metadata_dict["SamplingDetails"]["Timestamp"]["EndLocal"] = \ 390 | metadata_dict["SamplingDetails"]["Timestamp"].pop("Local End") 391 | 392 | # Sampling details - Digital sampling 393 | metadata_dict["SamplingDetails"]["DigitalSampling"] = metadata_dict["SamplingDetails"].pop( 394 | "Digital sampling") 395 | metadata_dict["SamplingDetails"]["DigitalSampling"]["SampleRate_kHz"] = \ 396 | metadata_dict["SamplingDetails"]["DigitalSampling"].pop("Sample rate (kHz)") 397 | metadata_dict["SamplingDetails"]["DigitalSampling"]["SampleBits"] = \ 398 | metadata_dict["SamplingDetails"]["DigitalSampling"].pop("Sample Bits") 399 | metadata_dict["SamplingDetails"]["DigitalSampling"]["DataModifications"] = \ 400 | metadata_dict["SamplingDetails"]["DigitalSampling"].pop("Data Modifications") 401 | 402 | # Annotations 403 | metadata_dict["Annotations"]["TargetSignals"] = metadata_dict["Annotations"].pop("Target signals") 404 | metadata_dict["Annotations"]["NonTargetSignals"] = metadata_dict["Annotations"].pop( 405 | "Non-target signals") 406 | metadata_dict["Annotations"]["AnnotationProtocol"] = metadata_dict["Annotations"].pop( 407 | "Annotation protocol") 408 | return metadata_dict 409 | 410 | 411 | def transform_export_metadata_to_ASA_standard(export_metadata_dict): 412 | """ 413 | Transforms the export metadata dictionary to the ASA (Acoustical Society of America) standard format. 414 | 415 | Inputs: 416 | - export_metadata_dict: Export metadata dictionary to be transformed. 417 | 418 | Returns: 419 | - Transformed metadata dictionary in the ASA standard.
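Example (illustrative, mirroring the renames below): 'Digital sampling' becomes 'DigitalSampling',
with 'Audio duration (s)' -> 'NewAudioDuration_s', 'fs (Hz)' -> 'NewSampleRate_kHz'
and 'Bit depth' -> 'NewSampleBits'.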
420 | """ 421 | 422 | export_metadata_dict["ProjectId"] = export_metadata_dict.pop("Project ID") 423 | export_metadata_dict["DeploymentId"] = export_metadata_dict.pop("Deployment ID") 424 | 425 | export_metadata_dict['SignalProcessing']=export_metadata_dict.pop('Signal Processing') 426 | 427 | export_metadata_dict["DigitalSampling"] = export_metadata_dict.pop("Digital sampling") 428 | export_metadata_dict["DigitalSampling"]["NewAudioDuration_s"] = export_metadata_dict["DigitalSampling"].pop( 429 | "Audio duration (s)") 430 | export_metadata_dict["DigitalSampling"]["NewSampleRate_kHz"] = export_metadata_dict["DigitalSampling"].pop( 431 | "fs (Hz)") 432 | export_metadata_dict["DigitalSampling"]["NewSampleBits"] = export_metadata_dict["DigitalSampling"].pop("Bit depth") 433 | 434 | export_metadata_dict["Selections"] = export_metadata_dict.pop("Selections") 435 | export_metadata_dict["Selections"]["ExportLabel"] = export_metadata_dict["Selections"].pop("Export label") 436 | export_metadata_dict["Selections"]["SplitExportSelections_bool_s"] = export_metadata_dict["Selections"].pop( 437 | "Split export selections") 438 | export_metadata_dict["Annotations"]["UsedLabelList"] = export_metadata_dict["Annotations"].pop( 439 | "Used Label List") 440 | 441 | export_metadata_dict["ExportFolders"] = export_metadata_dict.pop("Export folders") 442 | export_metadata_dict["ExportFolders"]["ExportFolder"] = export_metadata_dict["ExportFolders"].pop("Export folder") 443 | export_metadata_dict["ExportFolders"]["AudioExportFolder"] = export_metadata_dict["ExportFolders"].pop( 444 | "Audio export folder") 445 | export_metadata_dict["ExportFolders"]["AnnotationExportFolder"] = export_metadata_dict["ExportFolders"].pop( 446 | "Annotation export folder") 447 | export_metadata_dict["ExportFolders"]["MetadataFolder"] = export_metadata_dict["ExportFolders"].pop( 448 | "Metadata folder") 449 | export_metadata_dict["ExportFolders"]["MetadataFileJSON"] = export_metadata_dict["ExportFolders"].pop( 450 | "Metadata file") 451 | export_metadata_dict["ExportFolders"]["AnnotationFileCSV"] = export_metadata_dict["ExportFolders"].pop( 452 | "Annotation CSV file") 453 | export_metadata_dict["ExportFolders"]["Audio-SeltabMapFileCSV"] = export_metadata_dict["ExportFolders"].pop( 454 | "Audio-Seltab Map CSV file") 455 | 456 | return export_metadata_dict 457 | -------------------------------------------------------------------------------- /BenchmarkDatasetCreator_app/Home.py: -------------------------------------------------------------------------------- 1 | # Import convention 2 | import streamlit as st 3 | import sys 4 | import os 5 | sys.path.insert(1, '.'+ os.sep) 6 | 7 | st.set_page_config( 8 | page_title='Benchmark Dataset Creator', 9 | ) 10 | st.title('Benchmark Dataset Creator') 11 | 12 | # Add image 13 | #print(os.path.exists('.'+os.sep+ os.path.join('docs', 'illustrations', '‎method_schematicV2.png'))) 14 | #st.image('.' 
+ os.sep + os.path.join('docs', 'illustrations', '‎method_schematicV2.png'), 15 | # channels="RGB", output_format="auto") 16 | 17 | st.write('Welcome to the Benchmark Dataset Creator') 18 | 19 | link_to_metadata = "pages" + os.sep + "1_Project_creator.py" 20 | st.page_link(link_to_metadata, label=":green[Start with Project Creator]", icon="➡️") -------------------------------------------------------------------------------- /BenchmarkDatasetCreator_app/help_dictionary.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import soundfile as sf 3 | 4 | metadata = { 5 | 'Project ID': # Becomes ProjectId in the standard 6 | "The name of your project. This entry will be used to keep track of the origin of the data, " 7 | "as a part of the folder architecture and file naming. Please do not end this entry with / or \ and " 8 | "avoid spaces", 9 | 'Deployment ID': # Becomes DeploymentId in the standard 10 | "A number used to help distinguish groups of deployments", 11 | 'Data stewardship': { # Becomes DataStewardship in the standard 12 | 'General': "Information and contact details for the people/institutions/groups that contributed to this dataset. " 13 | "Show and fill in the entry fields by pressing the 'Add co-creator' button", 14 | 'Role': '', 15 | 'Name': '', 16 | 'Affiliation': '', 17 | 'Email Address': '', # Becomes EmailAddress 18 | 'Permits': "(Optional) Use this field to report permits and permissions granted by state and non-state actors " 19 | "for the data collection, e.g., Permit n°XXXXX, obtained from XXXXX.", 20 | 'DOI': "(Optional) DOI of an associated publication.", 21 | }, 22 | 'Instrument': { 23 | 'General': "Information on the recording equipment", 24 | 'Type': 'Select the recording equipment used to collect the original data.', 25 | 'Settings': 'If Other is selected as the recording equipment, please add details ' 26 | 'about the recording setup in this field; otherwise, (Optional) details ' 27 | 'on instrument settings, e.g., gain, recording schedule, etc.', 28 | }, 29 | 'Deployment': { 30 | 'General': "Information on the deployment location and conditions. " 31 | "The Latitude and Longitude (°) entries take 6 decimals to enable sub-meter " 32 | "precision at all latitudes. The map is used as a visual tool to verify " 33 | "user entry.", 34 | 'Position': { 35 | 'Lat.': '', 36 | 'Lon.': '', 37 | }, 38 | 'Height/depth (m)': # becomes ElevationInstrument_m to follow the standard 39 | 'Terrestrial: recorder height is reported relative to ground level. ' 40 | 'Aquatic/marine: recorder depth is reported relative to the surface', 41 | 'Terrain elevation/water depth (m)': # becomes Elevation_m to follow the standard 42 | 'Terrain elevation and water depth relative to sea level reported at the ' 43 | 'position of the recorder', 44 | 'Env. context': '(Optional) Description of the environmental context, e.g., vegetation, weather, ocean-bottom ' 45 | 'type, sea state, etc.', 46 | }, 47 | 'Sampling details': { 48 | 'General': 'Information on the recording sampling. Times are entered either in local time or UTC.', 49 | 'Time': { # Becomes Timestamp in the standard 50 | 'UTC Start': '', 51 | 'UTC End': '', 52 | 'Local Start': '', 53 | 'Local End': '', 54 | }, 55 | 'Digital sampling': { 56 | 'Sample rate (kHz)': # Becomes SampleRate_kHz in the standard 57 | 'Recording sampling frequency',
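# Illustrative example for the entry above: data recorded at 2 kHz is entered as 2 (kHz), not 2000 (Hz)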
58 | 'Sample Bits': # Becomes SampleBits in the standard 59 | 'Select the bit depth of the original data. The bit depth determines the number of possible ' 60 | 'amplitude values we can record for each audio sample; for SWIFT units, it is set to 16 bits and ' 61 | 'for Rockhopper to 24 bits.', 62 | 'Clipping': 'Was there any clipping in the data? Clipping is a form of waveform distortion that ' 63 | 'occurs when an amplifier is pushed beyond its maximum limit (e.g., the source signal is too ' 64 | 'loud), driving it into overdrive. In that case, the output voltage is pushed to its maximum value.', 65 | 'Data Modifications': '(Optional) Were the data modified, e.g., resampled, normalized, filtered, etc.?', 66 | }, 67 | }, 68 | 'Annotations': { 69 | 'General': "Fill in information about the annotation protocol", 70 | 'Target signals': { 71 | 'Kind': 'SpeciesID: the labels are produced at the species level,\n' 72 | 'CallID: the labels are produced at the call level', 73 | }, 74 | 'Non-target signals': { 75 | 'Noise': '', 76 | 'Bio': '', 77 | 'Anthro': '', 78 | 'Geo': '', 79 | }, 80 | 'Annotation protocol': "(Optional) Details on the annotation protocol, e.g., number of analysts, rules, " 81 | "verification protocol, etc.", 82 | } 83 | } 84 | 85 | folder = { 86 | 'Project ID': "The name of your project. This entry will be used to keep track of the origin of the data, " 87 | "as a part of the folder architecture and file naming. Please do not end this entry with / or \ and " 88 | "avoid spaces", 89 | 'Deployment ID': "A number used to help distinguish groups of deployments", 90 | 'Export folder': "Export folder is where the data and metadata will be saved.", 91 | 'Audio export folder': '', # Created without user input - Path to export audio 92 | 'Annotation export folder': '', # Created without user input - Path to export annotations 93 | 'Metadata folder': '', # Created without user input - Path to export Metadata 94 | 'Metadata file': '', # Created without user input - Full path + name of the Metadata file 95 | 'Annotation CSV file': '', # Created without user input - Full path + name of the recap CSV annotation file 96 | 'Audio-Seltab Map CSV file': '', # Created without user input - Full path + name of the CSV audio-annotation 97 | # association file 98 | 99 | } 100 | 101 | url = "https://docs.google.com/spreadsheets/d/1ScxYST26QIGE2d_ovEI1NtyPDmpWeMHJJ2LEu4nFwOw/edit?usp=sharing" 102 | 103 | export = { 104 | 'Digital sampling': { 105 | 'Audio duration (s)': "Set the chosen export audio file duration for the Benchmark dataset in minutes. Our " 106 | "recommendation is to set it to encompass the vocalization(s) of interest but also some " 107 | "context. What is the minimum duration that would represent the signal's repetition or " 108 | "call/cue rate (with several annotations)?", 109 | 'fs (Hz)': 'The sampling frequency is to be set, at a minimum, at double the maximum frequency of the signals of ' 110 | 'interest. If relevant, BirdNET uses fs = 48 kHz.', 111 | 'Bit depth': 'The bit depth determines the number of possible amplitude values we can record for each audio ' 112 | 'sample; for SWIFT units, it is set to 16 bits and for Rockhopper to 24 bits.', 113 | },
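# Illustrative example for 'fs (Hz)' above: for signals of interest up to 20 kHz, choose fs >= 40 kHz; BirdNET users would pick 48 kHz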
114 | 'Selections': { 115 | 'Export label': "Defines the name of the label column for the created export Raven selection tables", 116 | 'Split export selections': { 117 | 'General': 118 | "Split export selections specifies how to handle a selection that sits at the junction " 119 | "between two export audio files. [Recommended] If you have hundreds or even tens of " 120 | "selections of your target signals, we recommend keeping this parameter set to " 121 | "false. [Other] This parameter can be handy if, for example, you selected long periods " 122 | "of background noise (long compared to the annotations of signals of interest) that " 123 | "could be split across two audio export files. In that case, you can set the minimum " 124 | "duration to something longer than your signals of interest or to 3 s if you plan to " 125 | "work with BirdNET. Another use case is if you have a very tight selection around your " 126 | "signal of interest (in time) and want even a very small portion of that signal to be " 127 | "labeled.", 128 | 'Minimum duration (s)': 129 | "Specify the minimum duration to report an annotation in the selection table, in seconds", 130 | }, 131 | 'Path': 132 | "(1) a complete path to a selection table if dealing with a single " 133 | "audio file in total or a project with multiple audio files, e.g. " 134 | "`'SelectionTable/MD02_truth_selections.txt'`; " 135 | "(2) a path to a folder if dealing with one selection table associated" 136 | " with a single audio file, e.g., `'SelectionTables/'`", 137 | 'Label': "User-defined label key, should be a column title in the displayed Selection table", 138 | 'Label editor': { 139 | 'Help': '💡 To update existing labels, edit the `New labels` column.', 140 | 'Label list': "Look up the [Yang Center species list](%s) for existing standardized labels and add yours " 141 | "to the list!" % url 142 | }, 143 | 'Annotations': { 144 | 'Label Key': '', # Becomes LabelKey 145 | 'Used Label List': "", # Becomes UsedLabelList 146 | } 147 | 148 | }, 149 | 'Export folder': "Export folder is where the data will be saved.", 150 | 151 | } 152 | 153 | benchmark_creator_info = { 154 | 'Method': { 155 | 'Software': 'Dataset and metadata created using the Benchmark Dataset Creator', 156 | 'url': 'https://github.com/leabouffaut/BenchmarkDatasetCreator', 157 | 'Author': 'Léa Bouffaut, Ph.D.', 158 | 'Institution': 'K. Lisa Yang Center for Conservation Bioacoustics, Cornell University', 159 | 'Release': 'dev', 160 | 'Date': 'April 2024' 161 | }, 162 | 'Signal Processing': { 163 | 'Resampling': f'The data is loaded and resampled as requested using Librosa {librosa.__version__}, using the ' 164 | f'`soxr_vhq` protocol. See https://librosa.org/doc/main/generated/librosa.resample.html for the ' 165 | f'documentation.', 166 | 'AudioWrite': f'The data is saved with the wanted bit depth using Soundfile {sf.__version__}. ' 167 | f'See https://python-soundfile.readthedocs.io/en/0.11.0/index.html?highlight=write#soundfile.write' 168 | f' for the documentation.' 169 | }, 170 | 'Annotations': { 171 | 'Standard': url, 172 | }, 173 | } -------------------------------------------------------------------------------- /BenchmarkDatasetCreator_app/pages/1_Project_creator.py: -------------------------------------------------------------------------------- 1 | # Streamlit app page 1, Project creator 2 | # 3 | # This page is associated with a series of functions, in folders.py 4 | # The text help for streamlit user inputs is integrated in help_dictionary.py in 5 | # the folder dictionary 6 | # 7 | # >> This page creates the export_folder_dictionary 8 | # saved in st.session_state.export_folder_dictionary 9 | import sys 10 | import os 11 | 12 | import streamlit as st 13 | import shutil 14 | 15 | sys.path.insert(1, '.' + os.sep)
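# Note: the sys.path line above makes the repository root importable so that the
# BenchmarkDatasetCreator packages resolve when the app is launched with
# `streamlit run` from the repository root (assumed working directory)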
16 | from BenchmarkDatasetCreator_app import help_dictionary as hd 17 | from BenchmarkDatasetCreator import folders, metadata 18 | 19 | # Page title (tab and page), Header 20 | st.set_page_config( 21 | page_title='Benchmark Dataset Creator: Project', 22 | ) 23 | st.title('Benchmark Dataset Creator') 24 | st.header('Create project') 25 | 26 | # Create a "staged" version of the program so not all shows up at once 27 | if 'stage' not in st.session_state: 28 | st.session_state.stage = 0 29 | 30 | 31 | def set_state(i): 32 | st.session_state.stage = i 33 | 34 | 35 | # 1) Collect info on project & project ID 36 | if st.session_state.stage >= 0: 37 | st.subheader('Project information') 38 | 39 | # Create the dictionary to store the information on the export folders 40 | export_folder_dictionary = { 41 | 'Export folder': st.text_input( 42 | 'Export folder', 43 | value="e.g., benchmark_data", 44 | type="default", 45 | help=hd.folder['Export folder'], 46 | label_visibility="visible"), 47 | 48 | 'Project ID': st.text_input( 49 | 'Project ID', 50 | value="e.g., 2013_UnivMD_Maryland_71485", 51 | type="default", 52 | help=hd.folder['Project ID'], 53 | label_visibility="visible"), 54 | 55 | 'Deployment ID': "{:02d}".format(st.number_input( 56 | 'Deployment ID', 57 | value=int(1.0), 58 | min_value=int(1), 59 | max_value=None, 60 | format='%02d', 61 | step=1, 62 | help=hd.folder['Deployment ID'], 63 | label_visibility="visible")) 64 | } 65 | 66 | st.button('Create Export folders', on_click=metadata.set_state, args=[1]) 67 | 68 | # 2) Construct paths for audio and annotations folders based on export settings 69 | if st.session_state.stage >= 1: 70 | # <Export folder>/<'Project ID'>_<'Deployment ID'>/audio/ 71 | # <Export folder>/<'Project ID'>_<'Deployment ID'>/annotations/ 72 | 73 | # path names 74 | audio_path = os.path.join(export_folder_dictionary['Export folder'], 75 | export_folder_dictionary['Project ID'] + '_' + 76 | export_folder_dictionary['Deployment ID'], 77 | 'audio') 78 | annot_path = os.path.join(export_folder_dictionary['Export folder'], 79 | export_folder_dictionary['Project ID'] + '_' + 80 | export_folder_dictionary['Deployment ID'], 81 | 'annotations') 82 | metadata_path = os.path.join(export_folder_dictionary['Export folder'], 83 | export_folder_dictionary['Project ID'] + '_' + 84 | export_folder_dictionary['Deployment ID']) 85 | 86 | export_folder_dictionary['Audio export folder'] = audio_path 87 | export_folder_dictionary['Annotation export folder'] = annot_path 88 | export_folder_dictionary['Metadata folder'] = metadata_path 89 | 90 | # Metadata, annotation csv and audio-selection table map names 91 | metadata_filename = \ 92 | os.path.join(export_folder_dictionary['Metadata folder'], 93 | export_folder_dictionary['Project ID'] + '_' + \ 94 | export_folder_dictionary['Deployment ID'] + \ 95 | '_metadata.json') 96 | 97 | annotation_csv_filename = \ 98 | os.path.join(export_folder_dictionary['Metadata folder'], 99 | export_folder_dictionary['Project ID'] + '_' + \ 100 | export_folder_dictionary['Deployment ID'] 101 | + '_annotations.csv') 102 | audio_sel_map_csv_filename = \ 103 | os.path.join(export_folder_dictionary['Metadata folder'], 104 | export_folder_dictionary['Project ID'] + '_' + \ 105 | export_folder_dictionary['Deployment ID'] 106 | + '_audio_seltab_map.csv') 107 | 108 | export_folder_dictionary['Metadata file'] = metadata_filename 109 | export_folder_dictionary['Annotation CSV file'] = annotation_csv_filename 110 | export_folder_dictionary['Audio-Seltab Map CSV file'] = audio_sel_map_csv_filename
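# Resulting layout (illustrative, using the app's placeholder Project ID and Deployment ID "01"):
#   benchmark_data/2013_UnivMD_Maryland_71485_01/audio/
#   benchmark_data/2013_UnivMD_Maryland_71485_01/annotations/
#   benchmark_data/2013_UnivMD_Maryland_71485_01/2013_UnivMD_Maryland_71485_01_metadata.json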
111 | 112 | 113 | # Create directories 114 | # Option 1 -- The audio folder does not exist in path 115 | if not os.path.exists(audio_path): 116 | # Create the audio and annotations folders 117 | os.makedirs(audio_path) 118 | os.makedirs(annot_path) 119 | 120 | st.success(':white_check_mark: New folders created!') 121 | st.session_state.stage = 2 122 | 123 | # Option 2 -- the audio folder already exists 124 | else: 125 | # Display a warning message 126 | st.write('Warning: This folder already exists, data may be deleted: \n') 127 | 128 | output = st.empty() 129 | with folders.st_capture(output.code): 130 | folders.path_print(os.path.join(export_folder_dictionary['Export folder'], 131 | export_folder_dictionary['Project ID'] + '_' + 132 | export_folder_dictionary['Deployment ID'])) 133 | 134 | col1, col2, col3, col4 = st.columns([0.2, 0.2, 0.4, 0.3]) 135 | # Ask the user whether to delete existing data 136 | if col1.button('Delete data', help=None, on_click=set_state, args=[2]): 137 | # Delete existing audio and annotations folders 138 | #shutil.rmtree(audio_path) 139 | #shutil.rmtree(annot_path) 140 | shutil.rmtree(metadata_path) 141 | 142 | # Recreate audio and annotations folders 143 | os.makedirs(audio_path) 144 | os.makedirs(annot_path) 145 | 146 | st.success(':white_check_mark: Data successfully deleted & new folders created!') 147 | 148 | if col2.button('Abort', help=None, on_click=set_state, args=[1]): 149 | # Prompt the user to change the export folder path 150 | output = st.empty() 151 | with folders.st_capture(output.code): 152 | raise ValueError("Please change the export folder path") 153 | 154 | if st.session_state.stage >= 2: 155 | # Show the info on the sidebar 156 | st.sidebar.subheader('Project settings') 157 | st.sidebar.write('Export folder') 158 | st.sidebar.success(export_folder_dictionary['Export folder']) 159 | st.sidebar.write('Project ID') 160 | st.sidebar.success(export_folder_dictionary['Project ID']) 161 | st.sidebar.write('Deployment ID') 162 | st.sidebar.success(export_folder_dictionary['Deployment ID']) 163 | 164 | # Save 165 | st.session_state.export_folder_dictionary = export_folder_dictionary 166 | 167 | # Activate next session state 168 | st.session_state.stage = 3 169 | link_to_metadata = "pages" + os.sep + "2_Metadata_creator.py" 170 | st.page_link(link_to_metadata, label=":green[Continue to Metadata Creator]", icon="➡️") 171 | 172 | # Option for people who already have a metadata file in the correct format 173 | json_data = st.checkbox('I already have a metadata file in the correct format') 174 | if json_data: 175 | st.write('In construction, please use the Metadata Creator') 176 | #st.text_input('Path to metadata JSON file') 177 | # 178 | #if st.button('Verify metadata', help=None): 179 | # 180 | # missing_data = metadata.test_json_fields(json_data) 181 | # # TODO implement a test to check if all metadata fields are present 182 | # if missing_data: # Call this function with your JSON data 183 | # st.write('JSON file does not contain all the necessary fields, please use the Metadata Creator') 184 | # else: 185 | # # Activate next session state and get link to data creator 186 | # st.session_state.stage = 9 187 | # link_to_dataset = "pages" + os.sep + "3_Dataset_creator.py" 188 | # st.page_link(link_to_dataset, label=":green[Continue to Dataset Creator]", icon="➡️") 189 | -------------------------------------------------------------------------------- /BenchmarkDatasetCreator_app/pages/2_Metadata_creator.py:
-------------------------------------------------------------------------------- 1 | # Streamlit app page 2, Metadata input 2 | # This page is associated with a series of functions, in metadata.py 3 | # The text help for streamlit user inputs is integrated in help_dictionary.py in the metadata dict 4 | 5 | # Creates original_data_dictionary 6 | 7 | # Imports 8 | import json 9 | import os 10 | import sys 11 | import copy 12 | 13 | 14 | import pandas as pd 15 | import streamlit as st 16 | 17 | sys.path.insert(1, '.' + os.sep) 18 | from BenchmarkDatasetCreator_app import help_dictionary as hd 19 | from BenchmarkDatasetCreator import metadata 20 | 21 | # Page title (tab and page), Header 22 | st.set_page_config( 23 | page_title='Benchmark Dataset Creator: Metadata', 24 | ) 25 | st.title('Benchmark Dataset Creator') 26 | st.header('Collect metadata') 27 | 28 | # Retrieve data from previous page 29 | if not hasattr(st.session_state, 'export_folder_dictionary'): 30 | st.error('Project information missing') 31 | link_to_project = "pages" + os.sep + "1_Project_creator.py" 32 | st.page_link(link_to_project, label=":white[Go to Project creator]", icon="➡️") 33 | else: 34 | export_folder_dictionary = st.session_state.export_folder_dictionary 35 | 36 | # Show the info on the sidebar 37 | st.sidebar.subheader('Project settings') 38 | st.sidebar.write('Project ID') 39 | st.sidebar.success(export_folder_dictionary['Project ID']) 40 | st.sidebar.write('Deployment ID') 41 | st.sidebar.success(export_folder_dictionary['Deployment ID']) 42 | st.sidebar.write('Export folder') 43 | st.sidebar.success(export_folder_dictionary['Export folder']) 44 | 45 | # Initialize the data saving dict 46 | original_data_dictionary = {} 47 | 48 | # 1) Collect info on project & project ID 49 | 50 | # Create the dictionary to store the information 51 | original_data_dictionary = { 52 | 'Project ID': export_folder_dictionary['Project ID'], 53 | 'Deployment ID': export_folder_dictionary['Deployment ID'], 54 | } 55 | 56 | if st.session_state.stage >= 3: 57 | # 2) Collect info on Data owners/curators 58 | # TODO: Add terms for local/indigenous partners 59 | # TODO: Land acknowledgment 60 | st.subheader('Data stewardship', 61 | help=hd.metadata['Data stewardship']['General']) 62 | 63 | # Create list of authorized roles (based on Zenodo) 64 | authorized_roles = ['Contact person', 'Data collector', 'Data analyst', 'Dataset curator', 'Distributor', 65 | 'Hosting institution', 'Principal Investigator', 'Rights holder', 'Sponsor'] 66 | 67 | # Check if 'rows' is not in the session state and initialize it to 0 68 | if 'rows' not in st.session_state: 69 | st.session_state['rows'] = 0 70 | 71 | 72 | # Add rows 73 | def increase_rows(): 74 | """ 75 | Function to increase the number of rows when the "Add person" button is clicked 76 | """ 77 | st.session_state['rows'] += 1 78 | 79 | 80 | # Button to add a new person; calls the increase_rows function when clicked 81 | st.button('Add co-creator', on_click=increase_rows) 82 | 83 | # Loop through the number of rows and display input fields for each person 84 | for i in range(st.session_state['rows']): 85 | metadata.display_input_row(i, authorized_roles) 86 | 87 | # Display the entered information for each person as an interactive DataFrame 88 | # Create a list to store the entered data 89 | people_data = [] 90 | 91 | # Loop through the rows and append the entered data to the list 92 | for i in range(st.session_state['rows']): 93 | person_data = { 94 | 'Role': st.session_state[f'role_{i}'], 95 | 
'Name': st.session_state[f'name_{i}'], 96 | 'Affiliation': st.session_state[f'affiliation_{i}'], 97 | 'Email Address': st.session_state[f'email_{i}'] 98 | } 99 | people_data.append(person_data) 100 | 101 | # Create a DataFrame from the collected data 102 | people_df = pd.DataFrame(people_data) 103 | 104 | # Display the DataFrame 105 | st.write('Entered dataset co-creators') 106 | st.dataframe(people_df, use_container_width=True, hide_index=True) 107 | 108 | original_data_dictionary['Data stewardship'] = { 109 | # Information about permits 110 | 'Permits': st.text_area( 111 | 'Permit # or Permission and permitting authority', 112 | placeholder=hd.metadata['Data stewardship']['Permits'], 113 | height=None, max_chars=None, key=None, 114 | label_visibility="visible"), 115 | 116 | # Optional associated publication DOI 117 | 'DOI': st.text_input( 118 | 'Associated publication (DOI)', 119 | value="https://doi.org/XX.XXXXX", 120 | type="default", 121 | help=hd.metadata['Data stewardship']['DOI'], 122 | label_visibility="visible")} 123 | 124 | st.button('Next', key='Next2', help=None, on_click=metadata.set_state, args=[4]) 125 | 126 | # 3) Add information on the instrumentation 127 | if st.session_state.stage >= 4: 128 | # Save the previous data 129 | # Transform the dictionary into the wanted format 130 | original_data_dictionary['Data stewardship'] = people_data 131 | 132 | st.subheader('Instrumentation', 133 | help=hd.metadata['Instrument']['General']) 134 | 135 | # Create two columns for app display 136 | instrumentation_col, settings_col = st.columns(2) 137 | 138 | # List of authorized recording equipment + sort + add "Other" at the end 139 | authorized_instruments = ['Cornell - SwiftOne', 'Cornell - Swift', 140 | 'Cornell - Rockhopper', 'Cornell - MARU', 141 | 'Open Acoustic Devices - AudioMoth', 142 | 'Open Acoustic Devices - HydroMoth', 143 | "Ocean Instruments - SoundTrap ST600 STD", 144 | "Ocean Instruments - SoundTrap ST600 HF", 145 | "Scripps - HARP", "Wildlife Acoustics - Song Meter SM4", 146 | "Wildlife Acoustics - Song Meter Mini 2", 147 | "Wildlife Acoustics - Song Meter Micro", 148 | "Wildlife Acoustics - Song Meter Micro 2", 149 | "Wildlife Acoustics - Song Meter SM4BAT FS", 150 | "Wildlife Acoustics - Song Meter Mini Bat2"] 151 | authorized_instruments.sort() 152 | authorized_instruments.append("Other") 153 | 154 | # Add the user inputs to the dictionary 155 | original_data_dictionary['Instrument'] = { 156 | 'Type': instrumentation_col.selectbox( 157 | 'Select recording equipment', 158 | authorized_instruments, 159 | help=hd.metadata['Instrument']['Type']), 160 | 'Settings': settings_col.text_area( 161 | 'Details on instrument settings', 162 | placeholder=hd.metadata['Instrument']['Settings'], 163 | height=None, max_chars=None, key=None, 164 | label_visibility="visible") 165 | } 166 | st.button('Next', key='Next3', help=None, on_click=metadata.set_state, args=[5]) 167 | 168 | # 4) Add information about the deployment 169 | if st.session_state.stage >= 5: 170 | st.subheader('Deployment', 171 | help=hd.metadata['Deployment']['General']) 172 | 173 | # Create two columns with different width for app display 174 | deployment_input_col, map_col = st.columns([0.3, 0.7]) 175 | 176 | # Get user inputs 177 | original_data_dictionary['Deployment'] = { 178 | 'Position': { 179 | 'Lat.': float( 180 | deployment_input_col.number_input( 181 | 'Recorder latitude (°)', 182 | value=42.478327, 183 | min_value=-90.0, 184 | max_value=90.0, 185 | format='%.6f', 186 | step=0.000001, 187 |
label_visibility="visible")), 188 | 'Lon.': float( 189 | deployment_input_col.number_input( 190 | 'Recorder longitude (°)', 191 | value=-76.450438, 192 | min_value=-180.0, 193 | max_value=180.0, 194 | format='%.6f', 195 | step=0.000001, 196 | # help="Enter Longitude", 197 | label_visibility="visible")), 198 | }, 199 | 'Height/depth (m)': int( 200 | deployment_input_col.number_input('Recorder height/depth (m)', 201 | value=10, 202 | min_value=0, 203 | max_value=None, 204 | format='%i', 205 | step=1, 206 | help=hd.metadata['Deployment']['Height/depth (m)'], 207 | label_visibility="visible")), 208 | 'Terrain elevation/water depth (m)': int( 209 | deployment_input_col.number_input( 210 | 'Elevation/water depth (m)', 211 | value=10, 212 | min_value=0, 213 | max_value=None, 214 | format='%i', 215 | step=1, 216 | help=hd.metadata['Deployment']['Terrain elevation/water depth (m)'], 217 | label_visibility="visible")), 218 | 'Env. context': deployment_input_col.text_area( 219 | 'Details on environmental context', 220 | placeholder=hd.metadata['Deployment']['Env. context'], 221 | label_visibility="visible", 222 | height=143) 223 | } 224 | 225 | # Show map for the user to check their entry 226 | df_map = pd.DataFrame({ 227 | 'lat': [original_data_dictionary['Deployment']['Position']['Lat.']], 228 | 'lon': [original_data_dictionary['Deployment']['Position']['Lon.']] 229 | }) 230 | map_col.map(df_map, size=5, zoom=15) 231 | st.button('Next', key='Next4', help=None, on_click=metadata.set_state, args=[6]) 232 | 233 | # 5) Enter sampling details 234 | if st.session_state.stage >= 6: 235 | st.subheader('Sampling details', 236 | help=hd.metadata['Sampling details']['General']) 237 | 238 | # Declare the dictionary structure for sampling details 239 | original_data_dictionary['Sampling details'] = { 240 | 'Time': '', 241 | 'Digital sampling': '', 242 | } 243 | # Get the start and end time in both local time and UTC 244 | start_date_time_utc, start_date_time_local = \ 245 | metadata.get_date_time('Recording start', original_data_dictionary) 246 | 247 | end_date_time_utc, end_date_time_local = \ 248 | metadata.get_date_time('Recording end', original_data_dictionary) 249 | 250 | # If the dates are filled 251 | if (start_date_time_local is not None and end_date_time_local is not None) and \ 252 | (start_date_time_local != '' and end_date_time_local != ''): 253 | # Check the dates make sense: 254 | metadata.check_dates(start_date_time_local, end_date_time_local) 255 | 256 | # Fill times in the dictionary 257 | original_data_dictionary['Sampling details']['Time'] = { 258 | 'UTC Start': start_date_time_utc, 259 | 'UTC End': end_date_time_utc, 260 | 'Local Start': start_date_time_local, 261 | 'Local End': end_date_time_local} 262 | 263 | # Get the information on the digital sampling 264 | st.write('Digital sampling') 265 | 266 | # Create two columns with different width for app display 267 | digital_sampling_col, data_mod_col = st.columns([0.5, 0.5]) 268 | 269 | # Values for bit depth 270 | authorized_bit_depths = [8, 16, 24] 271 | 272 | # User inputs for all digital sampling 273 | original_data_dictionary['Sampling details']['Digital sampling'] = { 274 | 'Sample rate (kHz)': float(digital_sampling_col.number_input( 275 | 'Sample rate (kHz)', 276 | value=1.000, 277 | min_value=0.100, 278 | max_value=None, 279 | format='%.3f', 280 | step=1.000, 281 | help= 282 | hd.metadata['Sampling details']['Digital sampling'][ 283 | 'Sample rate (kHz)'], 284 | label_visibility="visible")), 285 | 286 | 'Sample Bits': 
int(digital_sampling_col.selectbox( 287 | 'Bit depth', 288 | authorized_bit_depths, 289 | index=1, 290 | help=hd.metadata['Sampling details']['Digital sampling'][ 291 | 'Sample Bits'])), 292 | 293 | 'Clipping': digital_sampling_col.radio( 294 | 'Clipping', 295 | ['Yes', 'No', 'Don\'t know'], 296 | horizontal=True, 297 | index=None, 298 | help=hd.metadata['Sampling details']['Digital sampling'][ 299 | 'Clipping']), 300 | 'Data Modifications': data_mod_col.text_area( 301 | 'Data Modifications', 302 | placeholder= 303 | hd.metadata['Sampling details']['Digital sampling'][ 304 | 'Data Modifications'], 305 | label_visibility="visible", 306 | height=185) 307 | } 308 | 309 | st.button('Next', key='Next5', help=None, on_click=metadata.set_state, args=[7]) 310 | 311 | # 6) Get information on the annotation protocol 312 | if st.session_state.stage >= 7: 313 | st.subheader('Annotations', 314 | help=hd.metadata['Annotations']['General']) 315 | # Add columns 316 | annotation_questions_col, annotation_protocol_col = st.columns([0.5, 0.5]) 317 | 318 | # About the target signals 319 | annotation_questions_col.write('Target signals') 320 | 321 | # Authorized annotation types 322 | authorized_annotations = ['SpeciesID', 'CallID'] 323 | 324 | # Initialize annotations section of the dictionary 325 | original_data_dictionary['Annotations'] = { 326 | 'Target signals': '', 327 | 'Non-target signals': '', 328 | 'Annotation protocol': '' 329 | } 330 | 331 | original_data_dictionary['Annotations']['Target signals'] = { 332 | 'Kind': annotation_questions_col.radio( 333 | 'Annotation type', 334 | authorized_annotations, 335 | horizontal=True, 336 | index=None, 337 | help=hd.metadata['Annotations']['Target signals']['Kind'] 338 | ), 339 | } 340 | 341 | # About non-target signals 342 | annotation_protocol_col.write('Non-target signals') 343 | 344 | # Authorized answers 345 | yes_no = ['Yes', 'No'] 346 | 347 | # noinspection PyTypedDict 348 | original_data_dictionary['Annotations']['Non-target signals'] = { 349 | 'Noise': annotation_protocol_col.radio( 350 | 'Does the dataset contain a background noise class?', 351 | yes_no, 352 | index=None, 353 | horizontal=True), 354 | 'Bio': '', 355 | 'Anthro': '', 356 | 'Geo': '', 357 | } 358 | 359 | st.markdown(""" 360 | 365 | """, unsafe_allow_html=True) 366 | annotation_protocol_col.markdown( 367 | '

Does the dataset contain selections with unique labels for:
', 368 | unsafe_allow_html=True) 369 | 370 | original_data_dictionary['Annotations']['Non-target signals']['Bio'] = \ 371 | annotation_protocol_col.radio( 372 | ':heavy_minus_sign: Other biological sounds (e.g., insect chorus, un-IDed call types, etc.)?', 373 | yes_no, 374 | index=None, 375 | horizontal=True, 376 | help='') 377 | original_data_dictionary['Annotations']['Non-target signals']['Anthro'] = \ 378 | annotation_protocol_col.radio( 379 | ':heavy_minus_sign: Anthropogenic sounds (e.g., ship noise, piling, vehicles, chainsaws, etc.)?', 380 | yes_no, 381 | index=None, 382 | horizontal=True, 383 | help='') 384 | original_data_dictionary['Annotations']['Non-target signals']['Geo'] = \ 385 | annotation_protocol_col.radio( 386 | ':heavy_minus_sign: Geophysical sounds (e.g., thunder, heavy rain, earthquakes, etc.)?', 387 | yes_no, 388 | index=None, 389 | horizontal=True, 390 | help='') 391 | 392 | 393 | 394 | # Optional, free field for annotation protocol 395 | original_data_dictionary['Annotations']['Annotation protocol'] = \ 396 | annotation_questions_col.text_area( 397 | 'Annotation protocol', 398 | placeholder=hd.metadata['Annotations']['Annotation protocol'], 399 | label_visibility="visible", 400 | height=254) 401 | 402 | st.button('Submit', key='Submit', help=None, on_click=metadata.set_state, args=[8]) 403 | 404 | # 7) Submit button to write JSON file 405 | if st.session_state.stage >= 8: 406 | dict_oj = copy.deepcopy(original_data_dictionary) 407 | metadata_save = { 408 | 'Original data': metadata.transform_original_metadata_to_ASA_standard(dict_oj), 409 | 'Benchmarked data': '' 410 | } 411 | with open(export_folder_dictionary['Metadata file'], 'w') as fp: 412 | json.dump(metadata_save, fp, indent=4) 413 | 414 | # Metadata announcement 415 | meta_txt_col, meta_check_col = st.columns(2) 416 | meta_txt_col.success('Metadata successfully created!') 417 | 418 | if meta_txt_col.button('Show metadata'): 419 | st.write('The metadata is saved at:', export_folder_dictionary['Metadata file']) 420 | st.write('Here is a preview: ') 421 | st.json(original_data_dictionary) 422 | 423 | # Save 424 | st.session_state.export_folder_dictionary = export_folder_dictionary 425 | st.session_state.original_data_dictionary = original_data_dictionary 426 | 427 | # Show on sidebar 428 | st.sidebar.write('Metadata file') 429 | st.sidebar.success(export_folder_dictionary['Metadata file']) 430 | 431 | 432 | 433 | # Activate next session state 434 | st.session_state.stage = 9 435 | link_to_dataset = "pages" + os.sep + "3_Dataset_creator.py" 436 | st.page_link(link_to_dataset, label=":green[Continue to Dataset Creator]", icon="➡️") -------------------------------------------------------------------------------- /BenchmarkDatasetCreator_app/pages/3_Dataset_creator.py: -------------------------------------------------------------------------------- 1 | # Streamlit app page 3, Dataset creator 2 | # This page is associated with a series of functions, in dataset.py 3 | # The text help for streamlit user inputs is integrated in help_dictionary.py 4 | # Imports 5 | import sys 6 | import os 7 | import streamlit as st 8 | import pandas as pd 9 | import json 10 | import copy 11 | 12 | sys.path.insert(1, '.' + os.sep)
13 | from BenchmarkDatasetCreator_app import help_dictionary as hd 14 | from BenchmarkDatasetCreator import dataset, folders, metadata 15 | 16 | 17 | # Titles 18 | st.set_page_config( 19 | page_title='Benchmark Dataset Creator: Dataset', 20 | ) 21 | st.title('Benchmark Dataset Creator') 22 | st.header('Create benchmark dataset') 23 | 24 | # Retrieve data from previous page 25 | if not hasattr(st.session_state, 'export_folder_dictionary'): 26 | st.error('Project information missing') 27 | link_to_project = "pages" + os.sep + "1_Project_creator.py" 28 | st.page_link(link_to_project, label=":white[Go to Project creator]", icon="➡️") 29 | 30 | elif not hasattr(st.session_state, 'original_data_dictionary'): 31 | st.error('Metadata missing') 32 | link_to_project = "pages" + os.sep + "2_Metadata_creator.py" 33 | st.page_link(link_to_project, label=":white[Go to Metadata creator]", icon="➡️") 34 | 35 | else: 36 | export_folder_dictionary = st.session_state.export_folder_dictionary 37 | original_data_dictionary = st.session_state.original_data_dictionary 38 | 39 | # Show the info on the sidebar 40 | st.sidebar.subheader('Project settings') 41 | st.sidebar.write('Export folder') 42 | st.sidebar.success(export_folder_dictionary['Export folder']) 43 | st.sidebar.write('Project ID') 44 | st.sidebar.success(export_folder_dictionary['Project ID']) 45 | st.sidebar.write('Deployment ID') 46 | st.sidebar.success(export_folder_dictionary['Deployment ID']) 47 | st.sidebar.write('Metadata file') 48 | st.sidebar.success(export_folder_dictionary['Metadata file']) 49 | 50 | # Initialize the data saving variables 51 | label_key = [] 52 | export_settings = {} 53 | 54 | # TODO: Continue editing the species list csv 55 | # TODO: have all of the text in a language-specific file -> https://phrase.com/blog/posts/translate-python-gnu-gettext/ 56 | # TODO: Finalize this piece of code with the new functions 57 | # TODO: add BDC info to the metadata 58 | # (the gettext link above could be a solution) 59 | 60 | # User-defined export settings dictionary 61 | if st.session_state.stage >= 9: 62 | st.subheader('Export settings selection') 63 | 64 | # Needed variables 65 | authorized_user_fs = ['1 kHz', '2 kHz', '8 kHz', '16 kHz', '32 kHz', '48 kHz', 66 | '96 kHz', '192 kHz', '256 kHz', '384 kHz', '500 kHz'] 67 | authorized_user_bit_depth = ['8 Bits', '16 Bits', '24 Bits'] 68 | 69 | export_settings_user_input = { 70 | #'Original project name': 71 | # st.text_input( 72 | # 'Original project name', 73 | # value="e.g., 2013_UnivMD_Maryland_71485_MD02", 74 | # type="default", 75 | # help="This entry will be used to keep track of the origin of " 76 | # "the data, as a part of the folder architecture and file naming."
77 | # "please do not end this entry by / or \ and avoid spaces", 78 | # label_visibility="visible"), 79 | 80 | 'Audio duration (s)': 81 | st.slider( 82 | 'Audio duration (min)', 83 | min_value=1, max_value=60, value=10, step=1, format='%i', 84 | help=hd.export['Digital sampling']['Audio duration (s)'], 85 | label_visibility="visible") * 60, 86 | 87 | 'fs (Hz)': 88 | st.selectbox( 89 | 'Sampling Frequency', authorized_user_fs, 90 | index=5, 91 | help=hd.export['Digital sampling']['fs (Hz)'], 92 | label_visibility="visible"), 93 | 94 | 'Bit depth': 95 | st.selectbox( 96 | 'Bit depth', authorized_user_bit_depth, 97 | index=2, 98 | help=hd.export['Digital sampling']['Bit depth'], 99 | label_visibility="visible"), 100 | 101 | 'Export label': 102 | st.text_input( 103 | 'Export label', 104 | value="Tags", 105 | type="default", 106 | help=hd.export['Selections']['Export label'], 107 | label_visibility="visible"), 108 | 109 | 'Split export selections': 110 | st.toggle( 111 | 'Split export selections', 112 | value=False, 113 | help=hd.export['Selections']['Split export selections']['General'], 114 | label_visibility="visible")} 115 | 116 | # User-chosen split output 117 | if export_settings_user_input['Split export selections']: 118 | export_settings_user_input['Split export selections'] = [ 119 | export_settings_user_input['Split export selections'], 120 | st.number_input( 121 | 'Minimum duration (s)', 122 | value=float(1.0), 123 | min_value=float(0), 124 | max_value=float( 125 | export_settings_user_input[ 126 | 'Audio duration (s)']), 127 | format='%.1f', 128 | step=0.1, 129 | help=hd.export['Split export selections']['Minimum duration (s)'], 130 | label_visibility="visible") 131 | ] 132 | else: 133 | export_settings_user_input['Split export selections'] = [ 134 | export_settings_user_input['Split export selections'], 0] 135 | 136 | 137 | st.button('Done', help=None, on_click=metadata.set_state, args=[10]) 138 | 139 | if st.session_state.stage >= 10: 140 | # 1) continued, Entries in the correct format 141 | # Create export_settings based on the user input: 142 | export_settings = { 143 | 'Project ID': export_folder_dictionary['Project ID'], 144 | 'Deployment ID': export_folder_dictionary['Deployment ID'], 145 | 'Method': hd.benchmark_creator_info['Method'], 146 | 'Signal Processing': hd.benchmark_creator_info['Signal Processing'], 147 | 'Digital sampling': { 148 | 'Audio duration (s)': export_settings_user_input['Audio duration (s)'], 149 | }, 150 | 151 | 'Selections': { 152 | 'Export label': export_settings_user_input['Export label'], 153 | 'Split export selections': export_settings_user_input['Split export selections'], 154 | }, 155 | 156 | 'Export folders': { 157 | 'Export folder': export_folder_dictionary['Export folder'], 158 | 'Audio export folder': export_folder_dictionary['Audio export folder'], 159 | 'Annotation export folder': export_folder_dictionary['Annotation export folder'], 160 | 'Metadata folder': export_folder_dictionary['Metadata folder'], 161 | 'Metadata file': export_folder_dictionary['Metadata file'], 162 | 'Annotation CSV file': export_folder_dictionary['Annotation CSV file'], 163 | 'Audio-Seltab Map CSV file': export_folder_dictionary['Audio-Seltab Map CSV file'] 164 | }, 165 | } 166 | 167 | # Write fs in the correct format (str to num) 168 | fs_wanted = [1, 2, 8, 16, 32, 48, 96, 192, 256, 384, 500] 169 | export_settings['Digital sampling']['fs (Hz)'] = \ 170 | fs_wanted[authorized_user_fs.index(export_settings_user_input['fs (Hz)'])] * 1000 171 | 172 | # Write fs in the 
correct format (str to num) 173 | bit_depth_wanted = [8, 16, 24] 174 | export_settings['Digital sampling']['Bit depth'] = \ 175 | bit_depth_wanted[authorized_user_bit_depth.index(export_settings_user_input['Bit depth'])] 176 | 177 | # 3) Run check on the user-defined entries and show output 178 | output = st.empty() 179 | with folders.st_capture(output.code): 180 | dataset.check_export_settings(export_settings) 181 | 182 | st.subheader('Load selections') 183 | # User-defined path to selection table(s) 184 | selection_table_path = \ 185 | st.text_input( 186 | 'Path to a selection table or selection table folder', 187 | value="e.g., SelectionTable/MD02_truth_selections.txt", 188 | type="default", 189 | help=hd.export['Selections']['Path'], 190 | label_visibility="visible") 191 | 192 | # 4) Load selection table and show output 193 | output = st.empty() 194 | with folders.st_capture(output.code): 195 | selection_table_df = dataset.load_selection_table(selection_table_path) 196 | 197 | # 5) Run dataset.check_selection_tab and show output of the function 198 | output = st.empty() 199 | with folders.st_capture(output.code): 200 | dataset.check_selection_tab(selection_table_path) 201 | 202 | # 6) Show selection table 203 | col3, col4 = st.columns([3, 1]) 204 | col3.subheader('Uploaded Selection table') 205 | if not selection_table_df.empty: 206 | col3.dataframe(selection_table_df) 207 | 208 | # 7) Ask for user-defined label key, should be in the Selection table keys displayed above 209 | col4.subheader('Label') 210 | label_key = \ 211 | col4.text_input( 212 | 'Selection table label', 213 | value="e.g., Tags", 214 | type="default", 215 | help=hd.export['Selections']['Label'], 216 | label_visibility="visible", 217 | on_change=metadata.set_state, args=[11]), 218 | 219 | if st.session_state.stage >= 11: 220 | label_key = label_key[0] 221 | 222 | # 8) Remove duplicates (e.g., if we have both the spectrogram and waveform view) 223 | selection_table_df = selection_table_df.drop_duplicates(subset='Begin Time (s)', keep="last") 224 | 225 | # 9) Estimate the size of the dataset and show output 226 | st.subheader('Estimate Benchmark Dataset size') 227 | with st.spinner("Estimating the size of the Benchmark dataset..."): 228 | output = st.empty() 229 | with folders.st_capture(output.code): 230 | dataset.benchmark_size_estimator(selection_table_df, export_settings, label_key) 231 | 232 | # 10) Check & update labels 233 | st.subheader('Edit labels (Optional)') 234 | # Get a list of unique labels from the selection table 235 | unique_labels = selection_table_df[label_key].unique() 236 | 237 | # Create a dataframe 238 | remap_label_df = pd.DataFrame({'Original labels': unique_labels, 239 | 'New labels': unique_labels}) 240 | # Show dataframe 241 | col5, col6 = st.columns([1, 1.5]) 242 | new_labels_df = \ 243 | col5.data_editor( 244 | remap_label_df, 245 | num_rows="fixed", 246 | disabled=["Original labels"], 247 | hide_index=True) 248 | col6.write(hd.export['Selections']['Label editor']['Help']) 249 | col6.image( 250 | 'docs/illustrations/‎method_schematicV2_zoom.png', 251 | caption=None, width=None, use_column_width=True, 252 | clamp=False, 253 | channels="RGB", output_format="auto") 254 | 255 | col6.write(hd.export['Selections']['Label editor']['Label list']) 256 | 257 | # Show button to continue to the Benchmark dataset creation 258 | col6.button('Continue', help=None, on_click=metadata.set_state, args=[12]) 259 | 260 | if st.session_state.stage >= 12: 261 | 262 | # Show button for creating the Benchmark dataset 263 | st.button('Create Benchmark
Dataset', help=None, on_click=metadata.set_state, args=[13]) 264 | 265 | if st.session_state.stage >= 13: 266 | # 11) Swap the labels 267 | # We want labels in a dictionary format with Key (old label): Value (new label) 268 | new_labels_dict = new_labels_df.set_index('Original labels')['New labels'].to_dict() 269 | 270 | # Update the selection table 271 | selection_table_df_updated = dataset.update_labels(selection_table_df, new_labels_dict, label_key) 272 | 273 | # Add the new labels to the Metadata dictionary 274 | export_settings['Annotations'] = { 275 | 'LabelKey': label_key, 276 | 'Used Label List': list(new_labels_dict.values()), 277 | 'Standard': hd.benchmark_creator_info['Annotations']['Standard'], 278 | } 279 | 280 | # 12) Write the metadata 281 | dict_oj = copy.deepcopy(original_data_dictionary) 282 | dict_export = copy.deepcopy(export_settings) 283 | 284 | metadata_save = { 285 | 'Original data': metadata.transform_original_metadata_to_ASA_standard(dict_oj), 286 | 'Benchmarked data': metadata.transform_export_metadata_to_ASA_standard(dict_export) 287 | } 288 | 289 | with open(export_folder_dictionary['Metadata file'], 'w') as fp: 290 | json.dump(metadata_save, fp, indent=4) 291 | 292 | # 13) Create the dataset 293 | with st.spinner("Creating the Benchmark dataset..."): 294 | dataset.benchmark_creator(selection_table_df_updated, export_settings, label_key) 295 | 296 | st.success('Benchmark dataset successfully created!') 297 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 2 | Public License 3 | 4 | By exercising the Licensed Rights (defined below), You accept and agree 5 | to be bound by the terms and conditions of this Creative Commons 6 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 7 | ("Public License"). To the extent this Public License may be 8 | interpreted as a contract, You are granted the Licensed Rights in 9 | consideration of Your acceptance of these terms and conditions, and the 10 | Licensor grants You such rights in consideration of benefits the 11 | Licensor receives from making the Licensed Material available under 12 | these terms and conditions. 13 | 14 | 15 | Section 1 -- Definitions. 16 | 17 | a. Adapted Material means material subject to Copyright and Similar 18 | Rights that is derived from or based upon the Licensed Material 19 | and in which the Licensed Material is translated, altered, 20 | arranged, transformed, or otherwise modified in a manner requiring 21 | permission under the Copyright and Similar Rights held by the 22 | Licensor. For purposes of this Public License, where the Licensed 23 | Material is a musical work, performance, or sound recording, 24 | Adapted Material is always produced where the Licensed Material is 25 | synched in timed relation with a moving image. 26 | 27 | b. Adapter's License means the license You apply to Your Copyright 28 | and Similar Rights in Your contributions to Adapted Material in 29 | accordance with the terms and conditions of this Public License. 30 | 31 | c. BY-NC-SA Compatible License means a license listed at 32 | creativecommons.org/compatiblelicenses, approved by Creative 33 | Commons as essentially the equivalent of this Public License. 34 | 35 | d. 
Copyright and Similar Rights means copyright and/or similar rights 36 | closely related to copyright including, without limitation, 37 | performance, broadcast, sound recording, and Sui Generis Database 38 | Rights, without regard to how the rights are labeled or 39 | categorized. For purposes of this Public License, the rights 40 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 41 | Rights. 42 | 43 | e. Effective Technological Measures means those measures that, in the 44 | absence of proper authority, may not be circumvented under laws 45 | fulfilling obligations under Article 11 of the WIPO Copyright 46 | Treaty adopted on December 20, 1996, and/or similar international 47 | agreements. 48 | 49 | f. Exceptions and Limitations means fair use, fair dealing, and/or 50 | any other exception or limitation to Copyright and Similar Rights 51 | that applies to Your use of the Licensed Material. 52 | 53 | g. License Elements means the license attributes listed in the name 54 | of a Creative Commons Public License. The License Elements of this 55 | Public License are Attribution, NonCommercial, and ShareAlike. 56 | 57 | h. Licensed Material means the artistic or literary work, database, 58 | or other material to which the Licensor applied this Public 59 | License. 60 | 61 | i. Licensed Rights means the rights granted to You subject to the 62 | terms and conditions of this Public License, which are limited to 63 | all Copyright and Similar Rights that apply to Your use of the 64 | Licensed Material and that the Licensor has authority to license. 65 | 66 | j. Licensor means the individual(s) or entity(ies) granting rights 67 | under this Public License. 68 | 69 | k. NonCommercial means not primarily intended for or directed towards 70 | commercial advantage or monetary compensation. For purposes of 71 | this Public License, the exchange of the Licensed Material for 72 | other material subject to Copyright and Similar Rights by digital 73 | file-sharing or similar means is NonCommercial provided there is 74 | no payment of monetary compensation in connection with the 75 | exchange. 76 | 77 | l. Share means to provide material to the public by any means or 78 | process that requires permission under the Licensed Rights, such 79 | as reproduction, public display, public performance, distribution, 80 | dissemination, communication, or importation, and to make material 81 | available to the public including in ways that members of the 82 | public may access the material from a place and at a time 83 | individually chosen by them. 84 | 85 | m. Sui Generis Database Rights means rights other than copyright 86 | resulting from Directive 96/9/EC of the European Parliament and of 87 | the Council of 11 March 1996 on the legal protection of databases, 88 | as amended and/or succeeded, as well as other essentially 89 | equivalent rights anywhere in the world. 90 | 91 | n. You means the individual or entity exercising the Licensed Rights 92 | under this Public License. Your has a corresponding meaning. 93 | 94 | 95 | Section 2 -- Scope. 96 | 97 | a. License grant. 98 | 99 | 1. Subject to the terms and conditions of this Public License, 100 | the Licensor hereby grants You a worldwide, royalty-free, 101 | non-sublicensable, non-exclusive, irrevocable license to 102 | exercise the Licensed Rights in the Licensed Material to: 103 | 104 | a. reproduce and Share the Licensed Material, in whole or 105 | in part, for NonCommercial purposes only; and 106 | 107 | b. 
produce, reproduce, and Share Adapted Material for 108 | NonCommercial purposes only. 109 | 110 | 2. Exceptions and Limitations. For the avoidance of doubt, where 111 | Exceptions and Limitations apply to Your use, this Public 112 | License does not apply, and You do not need to comply with 113 | its terms and conditions. 114 | 115 | 3. Term. The term of this Public License is specified in Section 116 | 6(a). 117 | 118 | 4. Media and formats; technical modifications allowed. The 119 | Licensor authorizes You to exercise the Licensed Rights in 120 | all media and formats whether now known or hereafter created, 121 | and to make technical modifications necessary to do so. The 122 | Licensor waives and/or agrees not to assert any right or 123 | authority to forbid You from making technical modifications 124 | necessary to exercise the Licensed Rights, including 125 | technical modifications necessary to circumvent Effective 126 | Technological Measures. For purposes of this Public License, 127 | simply making modifications authorized by this Section 2(a) 128 | (4) never produces Adapted Material. 129 | 130 | 5. Downstream recipients. 131 | 132 | a. Offer from the Licensor -- Licensed Material. Every 133 | recipient of the Licensed Material automatically 134 | receives an offer from the Licensor to exercise the 135 | Licensed Rights under the terms and conditions of this 136 | Public License. 137 | 138 | b. Additional offer from the Licensor -- Adapted Material. 139 | Every recipient of Adapted Material from You 140 | automatically receives an offer from the Licensor to 141 | exercise the Licensed Rights in the Adapted Material 142 | under the conditions of the Adapter's License You apply. 143 | 144 | c. No downstream restrictions. You may not offer or impose 145 | any additional or different terms or conditions on, or 146 | apply any Effective Technological Measures to, the 147 | Licensed Material if doing so restricts exercise of the 148 | Licensed Rights by any recipient of the Licensed 149 | Material. 150 | 151 | 6. No endorsement. Nothing in this Public License constitutes or 152 | may be construed as permission to assert or imply that You 153 | are, or that Your use of the Licensed Material is, connected 154 | with, or sponsored, endorsed, or granted official status by, 155 | the Licensor or others designated to receive attribution as 156 | provided in Section 3(a)(1)(A)(i). 157 | 158 | b. Other rights. 159 | 160 | 1. Moral rights, such as the right of integrity, are not 161 | licensed under this Public License, nor are publicity, 162 | privacy, and/or other similar personality rights; however, to 163 | the extent possible, the Licensor waives and/or agrees not to 164 | assert any such rights held by the Licensor to the limited 165 | extent necessary to allow You to exercise the Licensed 166 | Rights, but not otherwise. 167 | 168 | 2. Patent and trademark rights are not licensed under this 169 | Public License. 170 | 171 | 3. To the extent possible, the Licensor waives any right to 172 | collect royalties from You for the exercise of the Licensed 173 | Rights, whether directly or through a collecting society 174 | under any voluntary or waivable statutory or compulsory 175 | licensing scheme. In all other cases the Licensor expressly 176 | reserves any right to collect such royalties, including when 177 | the Licensed Material is used other than for NonCommercial 178 | purposes. 179 | 180 | 181 | Section 3 -- License Conditions. 
182 | 183 | Your exercise of the Licensed Rights is expressly made subject to the 184 | following conditions. 185 | 186 | a. Attribution. 187 | 188 | 1. If You Share the Licensed Material (including in modified 189 | form), You must: 190 | 191 | a. retain the following if it is supplied by the Licensor 192 | with the Licensed Material: 193 | 194 | i. identification of the creator(s) of the Licensed 195 | Material and any others designated to receive 196 | attribution, in any reasonable manner requested by 197 | the Licensor (including by pseudonym if 198 | designated); 199 | 200 | ii. a copyright notice; 201 | 202 | iii. a notice that refers to this Public License; 203 | 204 | iv. a notice that refers to the disclaimer of 205 | warranties; 206 | 207 | v. a URI or hyperlink to the Licensed Material to the 208 | extent reasonably practicable; 209 | 210 | b. indicate if You modified the Licensed Material and 211 | retain an indication of any previous modifications; and 212 | 213 | c. indicate the Licensed Material is licensed under this 214 | Public License, and include the text of, or the URI or 215 | hyperlink to, this Public License. 216 | 217 | 2. You may satisfy the conditions in Section 3(a)(1) in any 218 | reasonable manner based on the medium, means, and context in 219 | which You Share the Licensed Material. For example, it may be 220 | reasonable to satisfy the conditions by providing a URI or 221 | hyperlink to a resource that includes the required 222 | information. 223 | 3. If requested by the Licensor, You must remove any of the 224 | information required by Section 3(a)(1)(A) to the extent 225 | reasonably practicable. 226 | 227 | b. ShareAlike. 228 | 229 | In addition to the conditions in Section 3(a), if You Share 230 | Adapted Material You produce, the following conditions also apply. 231 | 232 | 1. The Adapter's License You apply must be a Creative Commons 233 | license with the same License Elements, this version or 234 | later, or a BY-NC-SA Compatible License. 235 | 236 | 2. You must include the text of, or the URI or hyperlink to, the 237 | Adapter's License You apply. You may satisfy this condition 238 | in any reasonable manner based on the medium, means, and 239 | context in which You Share Adapted Material. 240 | 241 | 3. You may not offer or impose any additional or different terms 242 | or conditions on, or apply any Effective Technological 243 | Measures to, Adapted Material that restrict exercise of the 244 | rights granted under the Adapter's License You apply. 245 | 246 | 247 | Section 4 -- Sui Generis Database Rights. 248 | 249 | Where the Licensed Rights include Sui Generis Database Rights that 250 | apply to Your use of the Licensed Material: 251 | 252 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 253 | to extract, reuse, reproduce, and Share all or a substantial 254 | portion of the contents of the database for NonCommercial purposes 255 | only; 256 | 257 | b. if You include all or a substantial portion of the database 258 | contents in a database in which You have Sui Generis Database 259 | Rights, then the database in which You have Sui Generis Database 260 | Rights (but not its individual contents) is Adapted Material, 261 | including for purposes of Section 3(b); and 262 | 263 | c. You must comply with the conditions in Section 3(a) if You Share 264 | all or a substantial portion of the contents of the database. 
265 | 266 | For the avoidance of doubt, this Section 4 supplements and does not 267 | replace Your obligations under this Public License where the Licensed 268 | Rights include other Copyright and Similar Rights. 269 | 270 | 271 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 272 | 273 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 274 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 275 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 276 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 277 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 278 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 279 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 280 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 281 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 282 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 283 | 284 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 285 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 286 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 287 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 288 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 289 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 290 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 291 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 292 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 293 | 294 | c. The disclaimer of warranties and limitation of liability provided 295 | above shall be interpreted in a manner that, to the extent 296 | possible, most closely approximates an absolute disclaimer and 297 | waiver of all liability. 298 | 299 | 300 | Section 6 -- Term and Termination. 301 | 302 | a. This Public License applies for the term of the Copyright and 303 | Similar Rights licensed here. However, if You fail to comply with 304 | this Public License, then Your rights under this Public License 305 | terminate automatically. 306 | 307 | b. Where Your right to use the Licensed Material has terminated under 308 | Section 6(a), it reinstates: 309 | 310 | 1. automatically as of the date the violation is cured, provided 311 | it is cured within 30 days of Your discovery of the 312 | violation; or 313 | 314 | 2. upon express reinstatement by the Licensor. 315 | 316 | For the avoidance of doubt, this Section 6(b) does not affect any 317 | right the Licensor may have to seek remedies for Your violations 318 | of this Public License. 319 | 320 | c. For the avoidance of doubt, the Licensor may also offer the 321 | Licensed Material under separate terms or conditions or stop 322 | distributing the Licensed Material at any time; however, doing so 323 | will not terminate this Public License. 324 | 325 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 326 | License. 327 | 328 | 329 | Section 7 -- Other Terms and Conditions. 330 | 331 | a. The Licensor shall not be bound by any additional or different 332 | terms or conditions communicated by You unless expressly agreed. 333 | 334 | b. Any arrangements, understandings, or agreements regarding the 335 | Licensed Material not stated herein are separate from and 336 | independent of the terms and conditions of this Public License. 337 | 338 | 339 | Section 8 -- Interpretation. 340 | 341 | a. 
For the avoidance of doubt, this Public License does not, and 342 | shall not be interpreted to, reduce, limit, restrict, or impose 343 | conditions on any use of the Licensed Material that could lawfully 344 | be made without permission under this Public License. 345 | 346 | b. To the extent possible, if any provision of this Public License is 347 | deemed unenforceable, it shall be automatically reformed to the 348 | minimum extent necessary to make it enforceable. If the provision 349 | cannot be reformed, it shall be severed from this Public License 350 | without affecting the enforceability of the remaining terms and 351 | conditions. 352 | 353 | c. No term or condition of this Public License will be waived and no 354 | failure to comply consented to unless expressly agreed to by the 355 | Licensor. 356 | 357 | d. Nothing in this Public License constitutes or may be interpreted 358 | as a limitation upon, or waiver of, any privileges and immunities 359 | that apply to the Licensor or You, including from the legal 360 | processes of any jurisdiction or authority. 361 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Dataset Creator 2 | Léa Bouffaut, Ph.D. -- K. Lisa Yang Center for Conservation Bioacoustics, Cornell University 3 | 4 | lea.bouffaut@cornell.edu 5 | 6 | ### Motivations and objectives 7 | 8 | Many bioacoustic projects are sitting on a goldmine of already annotated datasets. We want to create a standardized pipeline for creating, storing, sharing and 9 | using data that is flexible and repeatable to train and test AI models for different applications. More details in https://www.overleaf.com/read/yfcgvngmwfbs#e349e7 10 | 11 | This notebook aims to create a benchmark dataset and standardize the following: 12 | * Metadata following ASA standards 13 | * File duration 14 | * Sampling frequency 15 | * Mono channel 16 | * Bit depth 17 | * File name format 18 | * Selection table fields 19 | 20 | It also gives the option to change labels, e.g., to match our standardized label format. 21 | 22 | 23 | It does NOT: 24 | * Filter the audio input beyond what is needed for resampling 25 | * Normalize the audio file amplitude 26 | 27 | 28 | For example, this schematic view presents (top) a Raven Pro project with a selection table associated with several audio files of different lengths, (bottom) the standardized benchmark clips, and associated annotations. Note that annotations at the junction between two export files and those in the remaining audio, which are too short in comparison with the selected export audio file duration, are ignored. 29 | ![‎method_schematicV2](https://github.com/leabouffaut/BenchmarkDatasetCreator/blob/main/docs/illustrations/method_schematicV3.jpeg) 30 | 31 | ### Necessary information in selection tables 32 | This project uses Raven Pro 1.6 selection tables. Selection tables can either be associated with (1) a single audio file or (2) multiple audio files. 
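In case (2), the tables for the different audio files can simply be concatenated, provided they share the same columns. As a quick illustration of the format — Raven selection tables are tab-delimited text files, so they load directly into a pandas DataFrame — here is a minimal sketch; `read_raven_tables` is a hypothetical helper written for this example, and in practice the package's own `load_selection_table` (in `BenchmarkDatasetCreator/dataset.py`) is what the pipeline uses. Make sure the required fields listed below are present.

```python
# Minimal sketch (assumes pandas): Raven Pro selection tables are
# tab-delimited text files. The repository ships its own loader,
# dataset.load_selection_table, which should be preferred; this
# only illustrates the underlying format.
from pathlib import Path

import pandas as pd


def read_raven_tables(path):
    """Read a single Raven selection table, or concatenate all .txt
    tables found in a directory (case 2 above)."""
    path = Path(path)
    files = [path] if path.is_file() else sorted(path.glob('*.txt'))
    return pd.concat(
        (pd.read_csv(f, sep='\t') for f in files), ignore_index=True)


seltab = read_raven_tables('SelectionTable/MD02_truth_selections.txt')
print(seltab[['Begin Time (s)', 'End Time (s)', 'Begin Path']].head())
```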
33 | Selection tables, by default, contain the necessary information to draw a time-frequency box around a call. Please make sure your tables contain the following required fields, including an annotation column and the variables that enable the code to retrieve the audio files: 34 | * 'Begin Time (s)' 35 | * 'End Time (s)' 36 | * 'Low Frequency (Hz)' 37 | * 'High Frequency (Hz)' 38 | * 'Begin Path' 39 | * 'File Offset (s)' 40 | * 'Label'/'Tags'/Other 41 | 42 | The code assumes and checks that each selection table contains all of the aforementioned fields, with a user-defined name for the label column. Note that 'Begin Path' must be valid from your current workstation (Unix and Windows mount servers and write paths differently)! 43 | 44 | ### Labels 45 | The following is our suggested label format: 46 | 47 | `<SPECIES>.<LOCATION>.<CallType>` 48 | 49 | Where 50 | * `<SPECIES>` is a 6-letter combination of the first letters of each word of the species name, e.g., EUBGLA for *Eubalaena glacialis*, 51 | * `<LOCATION>` is a 4-letter combination describing the geographical location of the recorder. 52 | - If underwater, give the cardinal direction and an abbreviation of the ocean/sea, e.g., NWAO for the Northwest Atlantic Ocean, 53 | - If on land, the first two letters specify the region, and the last two letters are the ISO 3166 country code (see https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#ZZ), 54 | 55 | * `<CallType>` is a free-format vocalization descriptor, e.g., Upcall. 56 | 57 | 58 | ### Outputs 59 | Please refer to the [User-defined parameters](#User-defined-parameters) section for details on the output folder architecture. 60 | > [!NOTE] 61 | >This notebook will assist you in creating: 62 | > * Benchmark sound files based on user-input specifications 63 | > * a corresponding Raven selection table for each sound file, 64 | > * a two-column file-matching CSV (as used for Koogu) and, 65 | > * a recap annotation CSV file that will match previous datasets, e.g., https://zenodo.org/records/7525805 66 | 67 | ## How to get started 68 | The Benchmark Dataset Creator is Python-based code that can be run as an app (built with Streamlit) that opens in the browser. The dataset-creating functions can also be run directly in Python code (currently in dev; see the user-defined parameters below). 69 | 70 | ### General workflow to install the app 71 | To get started, we suggest the following steps: 72 | 1) Create a work folder 73 | 2) Create and activate a virtual environment in this folder with Python 3.9 74 | 3) Download this repository and unzip it in the work folder; move the contents of the unzipped folder to the same level as your venv. 75 | 4) Install the packages listed in requirements.txt in the virtual environment 76 | 5) To run the app: streamlit run BenchmarkDatasetCreator_app/Home.py 77 | 6) To close the app, close the terminal or press Ctrl+C 78 | 79 | ### Install and run the app on a Mac 80 | On my Mac (MacBook Pro 2019, Intel), these steps correspond to the following commands: 81 | 1) Open a terminal at the folder (right click > New Terminal at Folder) 82 | 2) Follow the instructions at https://mnzel.medium.com/how-to-activate-python-venv-on-a-mac-a8fa1c3cb511 to create a virtual environment. 83 | On my Mac, I need to use: 84 | > `pip3.9 install --user virtualenv` 85 | > `python3.9 -m venv venv` 86 | 3) Activate the virtual environment 87 | > `source venv/bin/activate` 88 | Now you should see `(venv)` on the left of your terminal (don't close it).
89 | 90 | 4) Download the BenchmarkDatasetCreator from this GitHub repository and place its contents in your work folder, at the same level as the venv folder: 91 | https://github.com/leabouffaut/BenchmarkDatasetCreator/ 92 | 93 | 5) In the `(venv)` environment, install the required packages by entering the following: 94 | > `python3.9 -m pip install -r requirements.txt` 95 | 96 | 6) Now you're all set! Start the application using: 97 | > `streamlit run BenchmarkDatasetCreator_app/Home.py` 98 | 99 | To stop the app, close the terminal or press Ctrl+C 100 | 101 | 102 | After the first installation, to reopen the app: 103 | 1) Activate the virtual environment at the working folder 104 | > `source venv/bin/activate` 105 | 2) Launch the app: 106 | > `streamlit run BenchmarkDatasetCreator_app/Home.py` 107 | 108 | 109 | 110 | 111 | ## User-defined parameters (to run the Benchmark Dataset Creator in a .py file or Jupyter notebook (in dev)) 112 | ```python 113 | export_settings = { 114 | 'Original project name': '2021_CLOCCB_BermudaPlantBank_S1105', 115 | 'Audio duration (s)': 300, 116 | 'fs (Hz)': 8000, 117 | 'Bit depth': 24, 118 | 'Export label': 'Tags', 119 | 'Split export selections': [True, 1], 120 | 'Export folder': 'benchmark_data' 121 | } 122 | ``` 123 | 124 | The block illustrated above is a set of user-defined parameters in the form of a [Python dictionary](https://realpython.com/python-dicts/#defining-a-dictionary) (surrounded by curly braces, entries separated by commas, typical entry: `'key word': 'value'`) used to create the Benchmark dataset. The fields can be filled in any order, but all must be present: 125 | * `Original project name` helps you keep track of the origin of the data; it should be written as a string of characters, which in Python means between quotes, e.g., `'Project'`. This entry is used to create the folder architecture, so please do not end it with "/" or "\" and avoid spaces " ". 126 | * `Audio duration (s)` is the chosen export audio file duration for the Benchmark dataset in seconds. Our recommendation is to set it to encompass the vocalization(s) of interest but also some context. Ask yourself: what is the minimum duration that would represent the signal's repetition or call/cue rate (with several annotations)? 127 | * `fs (Hz)` is the sampling frequency in Hz, to be set to at least double the maximum frequency of the signals of interest. For reference, BirdNET uses fs = 48 kHz (see: [BirdNET Analyzer technical details](https://github.com/kahst/BirdNET-Analyzer?tab=readme-ov-file#technical-details)) 128 | * `Bit depth` determines the number of possible amplitude values we can record for each audio sample; it is set to 16 bits for SWIFT units and to 24 bits for Rockhopper units. 129 | * `Export label` defines the name of the label column for the created export Raven selection tables 130 | * `Split export selections` specifies whether a selection at the junction between two export audio files should be split (True) or not (False). If splitting is selected, a second value must specify the minimum duration, in seconds, for an annotation to be reported in the selection table, e.g., `[True, 3]` or `[False, ]`. If you have hundreds or even tens of selections of your target signals, we recommend setting this parameter to False. Splitting can be handy if, for example, you selected "long" periods of background noise (long compared to the annotations of the signals of interest) that could be split across two audio export files.
In that case, you can set the minimum duration to something longer than your signals of interest, or to 3 s if you plan to work with BirdNET. Another use case is if you have a very tight selection around your signal of interest (in time) and want even a very small portion of that signal to be labeled. 131 | * `Export folder` is where the data will be saved, following this structure (example where `<Original project name>` is 2013_UnivMD_Maryland_71485_MD02) 132 | ``` 133 | Export_folder/ 134 | │ 135 | └───2013_UnivMD_Maryland_71485_MD02/ 136 | │ 2013_UnivMD_Maryland_71485_MD02_metadata.json 137 | │ 2013_UnivMD_Maryland_71485_MD02_annotations.csv 138 | │ 2013_UnivMD_Maryland_71485_MD02_audio_seltab_map.csv 139 | │ 140 | └───audio/ 141 | │ │ <Original project name>_<original file name>_<fs>_<channel>_<start time>s.flac 142 | │ │ 2013_UnivMD_Maryland_71485_MD02_71485MD02_002K_M11_multi_20150626_031500Z_2kHz_ch03_0600s.flac 143 | │ │ ... 144 | │ 145 | └───annotations/ 146 | │ <Original project name>_<original file name>_<fs>_<channel>_<start time>s.txt 147 | │ 2013_UnivMD_Maryland_71485_MD02_71485MD02_002K_M11_multi_20150626_031500Z_2kHz_ch03_0600s.txt 148 | │ ... 149 | ``` 150 | 151 | -------------------------------------------------------------------------------- /SelectionTable/MD02_truth_selections.txt: -------------------------------------------------------------------------------- 1 | Selection View Channel Begin Time (s) End Time (s) Low Freq (Hz) High Freq (Hz) Begin Date Begin Hour Begin Path Begin File File Offset (s) Delta Time (s) SNR NIST Quick (dB) Tag Notes VF2 2 | 1 Spectrogram 1 11 197575.630000000 197576.570000000 78.100 246.900 54 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_064500Z.aif 71485MD02_002K_M11_multi_20150902_064500Z.aif 475.6300 0.9400 16.19 NARW ? v 3 | 2 Spectrogram 1 11 197616.450000000 197617.100000000 80.500 201.100 54 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_064500Z.aif 71485MD02_002K_M11_multi_20150902_064500Z.aif 516.4500 0.6500 15.92 NARW added by KBH v 4 | 3 Spectrogram 1 11 197659.170000000 197660.190000000 109.400 303.100 54 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_064500Z.aif 71485MD02_002K_M11_multi_20150902_064500Z.aif 559.1700 1.0200 14.36 NARW ? v 5 | 4 Spectrogram 1 11 197679.540000000 197680.590000000 118.800 293.800 54 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_064500Z.aif 71485MD02_002K_M11_multi_20150902_064500Z.aif 579.5400 1.0500 14.90 NARW ? v 6 | 5 Spectrogram 1 11 198612.520000000 198613.660000000 96.900 278.100 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_070000Z.aif 71485MD02_002K_M11_multi_20150902_070000Z.aif 612.5200 1.1400 16.48 NARW ?
v 7 | 6 Spectrogram 1 11 199420.070000000 199420.860000000 114.900 275.900 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_071500Z.aif 71485MD02_002K_M11_multi_20150902_071500Z.aif 520.0700 0.7900 12.39 NARW Added by KBH v 8 | 7 Spectrogram 1 11 199442.540000000 199443.350000000 97.700 241.400 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_071500Z.aif 71485MD02_002K_M11_multi_20150902_071500Z.aif 542.5400 0.8100 15.06 NARW Added by KBH v 9 | 8 Spectrogram 1 11 199463.930000000 199464.620000000 114.900 252.900 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_071500Z.aif 71485MD02_002K_M11_multi_20150902_071500Z.aif 563.9300 0.6900 13.09 NARW Added by KBH v 10 | 9 Spectrogram 1 11 200157.620000000 200158.620000000 120.700 287.400 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_073000Z.aif 71485MD02_002K_M11_multi_20150902_073000Z.aif 357.6200 1.0000 16.03 NARW Added by KBH v 11 | 10 Spectrogram 1 11 200181.930000000 200182.930000000 103.100 275.000 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_073000Z.aif 71485MD02_002K_M11_multi_20150902_073000Z.aif 381.9300 1.0000 18.49 NARW ? v 12 | 11 Spectrogram 1 11 200248.580000000 200249.490000000 78.100 253.100 55 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_073000Z.aif 71485MD02_002K_M11_multi_20150902_073000Z.aif 448.5800 0.9100 19.22 NARW ? v 13 | 12 Spectrogram 1 11 201797.360000000 201798.300000000 90.600 275.000 56 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_080000Z.aif 71485MD02_002K_M11_multi_20150902_080000Z.aif 197.3600 0.9400 17.03 NARW ? v 14 | 13 Spectrogram 1 11 239679.880000000 239681.500000000 98.000 223.000 66 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_183000Z.aif 71485MD02_002K_M11_multi_20150902_183000Z.aif 279.8800 1.6200 13.02 NARW added by KBH v 15 | 14 Spectrogram 1 11 239728.010000000 239729.120000000 114.900 296.900 66 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_183000Z.aif 71485MD02_002K_M11_multi_20150902_183000Z.aif 328.0100 1.1100 15.59 NARW ? v 16 | 15 Spectrogram 1 11 240391.100000000 240392.390000000 90.600 265.600 66 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150902/71485MD02_002K_M11_multi_20150902_184500Z.aif 71485MD02_002K_M11_multi_20150902_184500Z.aif 91.1000 1.2900 11.94 NARW ? 
v 17 | 16 Spectrogram 1 3 12422.497000000 12425.527000000 75.000 231.200 3 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150626/71485MD02_002K_M11_multi_20150626_031500Z.aif 71485MD02_002K_M11_multi_20150626_031500Z.aif 722.4970 3.0300 10.75 NARW v 18 | 17 Spectrogram 1 8 26923.640000000 26925.266000000 105.300 236.800 7 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150626/71485MD02_002K_M11_multi_20150626_071500Z.aif 71485MD02_002K_M11_multi_20150626_071500Z.aif 823.6400 1.6260 12.59 NARW v 19 | 18 Spectrogram 1 5 26923.751000000 26925.672000000 46.100 243.400 7 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150626/71485MD02_002K_M11_multi_20150626_071500Z.aif 71485MD02_002K_M11_multi_20150626_071500Z.aif 823.7510 1.9210 12.01 NARW v 20 | 19 Spectrogram 1 8 30970.751000000 30972.266000000 65.800 315.800 8 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150626/71485MD02_002K_M11_multi_20150626_083000Z.aif 71485MD02_002K_M11_multi_20150626_083000Z.aif 370.7510 1.5150 12.36 NARW v 21 | 20 Spectrogram 1 2 96267.465000000 96269.941000000 98.700 289.500 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_023000Z.aif 71485MD02_002K_M11_multi_20150724_023000Z.aif 867.4650 2.4760 16.05 NARW v 22 | 21 Spectrogram 1 9 96272.675000000 96274.152000000 92.100 250.000 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_023000Z.aif 71485MD02_002K_M11_multi_20150724_023000Z.aif 872.6750 1.4770 14.55 NARW v 23 | 22 Spectrogram 1 8 96277.219000000 96278.512000000 92.100 263.200 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_023000Z.aif 71485MD02_002K_M11_multi_20150724_023000Z.aif 877.2190 1.2930 10.94 NARW v 24 | 23 Spectrogram 1 2 96365.675000000 96367.226000000 105.300 223.700 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_024500Z.aif 71485MD02_002K_M11_multi_20150724_024500Z.aif 65.6750 1.5510 14.99 NARW v 25 | 24 Spectrogram 1 9 96370.108000000 96372.029000000 72.400 256.600 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_024500Z.aif 71485MD02_002K_M11_multi_20150724_024500Z.aif 70.1080 1.9210 13.36 NARW v 26 | 25 Spectrogram 1 6 96374.135000000 96375.613000000 98.700 236.800 26 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_024500Z.aif 71485MD02_002K_M11_multi_20150724_024500Z.aif 74.1350 1.4780 12.53 NARW v 27 | 26 Spectrogram 1 2 99651.621000000 99653.911000000 78.900 223.700 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 651.6210 2.2900 15.56 
NARW v 28 | 27 Spectrogram 1 9 99657.162000000 99658.825000000 98.700 269.700 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 657.1620 1.6630 11.47 NARW v 29 | 28 Spectrogram 1 2 99674.527000000 99676.670000000 85.500 197.400 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 674.5270 2.1430 18.25 NARW v 30 | 29 Spectrogram 1 9 99679.404000000 99681.140000000 92.100 223.700 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 679.4040 1.7360 13.42 NARW v 31 | 30 Spectrogram 1 2 99730.226000000 99732.111000000 78.900 263.200 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 730.2260 1.8850 17.15 NARW v 32 | 31 Spectrogram 1 9 99735.177000000 99736.655000000 52.600 223.700 27 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_033000Z.aif 71485MD02_002K_M11_multi_20150724_033000Z.aif 735.1770 1.4780 14.68 NARW v 33 | 32 Spectrogram 1 2 101636.014000000 101637.086000000 59.200 210.500 28 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_040000Z.aif 71485MD02_002K_M11_multi_20150724_040000Z.aif 836.0140 1.0720 19.00 NARW v 34 | 33 Spectrogram 1 9 101640.596000000 101641.815000000 59.200 236.800 28 /Volumes/ag-clo-repnas5.ad.cornell.edu-1/projects/2013_UnivMD_Maryland_71485/Sounds/71485_MD02_002K_11CH_AIFF/002K_11CH_20150724/71485MD02_002K_M11_multi_20150724_040000Z.aif 71485MD02_002K_M11_multi_20150724_040000Z.aif 840.5960 1.2190 13.21 NARW v 35 | -------------------------------------------------------------------------------- /docs/DependenciesMapping.txt: -------------------------------------------------------------------------------- 1 | Dependencies map: 2 | 3 | CreateBenchmarkDataset notebook: 4 | - check_export_settings 5 | - check_selection_tab 6 | - create_path 7 | - load_selection_table 8 | - benchmark_size_estimator 9 | - update_labels 10 | - benchmark_creator 11 | 12 | benchmark_creator function: 13 | - get_bitdepth(export_settings): This function is called to retrieve the bit depth from the export settings. 14 | - get_print_fs(fs_original): This function is called to format the original sampling frequency for file naming. 15 | - exports(export_settings, selection_table_af_df, save_sel_dict): This function is called to export audio and annotation files. 16 | 17 | benchmark_size_estimator function: 18 | - get_number_clips(unique_audiofiles, export_settings['Audio duration (s)']): This function is called to determine the number of clips based on the duration of audio files and export settings. 19 | - check_bitdepth(export_settings): This function is called to validate the bit depth specified in the export settings. 
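For intuition, a back-of-the-envelope size estimate can be built from these same quantities. This is illustrative only: the estimator's exact internals may differ, it describes uncompressed audio, and the exported FLAC files compress further.

    n_clips ~ sum over audio files of floor(file_duration_s / clip_duration_s)
    size_bytes ~ n_clips * clip_duration_s * fs_Hz * (bit_depth / 8)

    Example: 100 clips of 300 s at 8 kHz, 24-bit, mono:
    100 * 300 * 8000 * 3 = 720,000,000 bytes, i.e. about 0.72 GB.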
20 | 21 | exports function: 22 | - save_audioclip: This function is called to export the audio clip based on the provided parameters. 23 | - write_selection_table: This function is called to write entries in the selection table file. 24 | - write_annotation_csv: This function is called to write annotations in a global CSV file. 25 | - map_audio_selection: This function is called to create a file association CSV. 26 | 27 | 28 | Modules Imported: 29 | librosa: Used for loading audio files. 30 | os.path: Used for manipulating file paths. 31 | numpy as np: Used for numerical operations. 32 | soundfile as sf: Used for writing audio files. 33 | pandas: Used for working with DataFrames. -------------------------------------------------------------------------------- /docs/HowToInstall/HowToInstall_Mac.txt: -------------------------------------------------------------------------------- 1 | 2 | How to install on a Mac 3 | 4 | 1) Open a terminal at the folder level you want to work in (right click > New Terminal at Folder) 5 | 2) Follow the instructions at https://mnzel.medium.com/how-to-activate-python-venv-on-a-mac-a8fa1c3cb511 to create a virtual environment. 6 | 7 | On my Mac, I need to use: 8 | > pip3.9 install --user virtualenv 9 | > python3.9 -m venv venv 10 | 3) Activate the virtual environment 11 | > source venv/bin/activate 12 | Now you should see (venv) on the left of your terminal (don't close it). 13 | 14 | 4) Download the BenchmarkDatasetCreator from the GitHub repository and place it in your work folder 15 | https://github.com/leabouffaut/BenchmarkDatasetCreator/ 16 | 17 | 5) Move the content of BenchmarkDatasetCreator-main to the same level as your venv 18 | 6) Back in the (venv) environment, install the required packages by entering the following (check that the folder name for the Benchmark Dataset Creator matches yours): 19 | > python3.9 -m pip install -r requirements.txt 20 | 21 | 7) Now you're all set!
Start the application using: 22 | > streamlit run BenchmarkDatasetCreator_app/Home.py 23 | 24 | To stop the app, close the terminal or ctrl+c 25 | 26 | 27 | 28 | After the first installation, to reopen the app: 29 | 1) Activate the virtual environment at the working folder 30 | > source venv/bin/activate 31 | 2) Launch the App: 32 | > streamlit run BenchmarkDatasetCreator_app/Home.py 33 | -------------------------------------------------------------------------------- /docs/bioacoustics_species_list.txt: -------------------------------------------------------------------------------- 1 | Yang Center Bioacoustics species list: 2 | See here https://docs.google.com/spreadsheets/d/1ScxYST26QIGE2d_ovEI1NtyPDmpWeMHJJ2LEu4nFwOw/edit?usp=sharing (Cornell-restricted access, Editor privilege) 3 | -------------------------------------------------------------------------------- /docs/illustrations/method_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leabouffaut/BenchmarkDatasetCreator/bdb3f1f46056d9f8fe21e948d330a22506638d03/docs/illustrations/method_schematic.png -------------------------------------------------------------------------------- /docs/illustrations/method_schematicV3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leabouffaut/BenchmarkDatasetCreator/bdb3f1f46056d9f8fe21e948d330a22506638d03/docs/illustrations/method_schematicV3.jpeg -------------------------------------------------------------------------------- /docs/illustrations/‎method_schematicV2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leabouffaut/BenchmarkDatasetCreator/bdb3f1f46056d9f8fe21e948d330a22506638d03/docs/illustrations/‎method_schematicV2.png -------------------------------------------------------------------------------- /docs/illustrations/‎method_schematicV2_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leabouffaut/BenchmarkDatasetCreator/bdb3f1f46056d9f8fe21e948d330a22506638d03/docs/illustrations/‎method_schematicV2_zoom.png -------------------------------------------------------------------------------- /examples/CreateBenchmarkDataset.py: -------------------------------------------------------------------------------- 1 | # Create Benchmark Dataset Python script 2 | # 3 | # Léa Bouffaut, Ph.D. -- K. Lisa Yang Center for Conservation Bioacoustics, Cornell University 4 | # lea.bouffaut@cornell.edu 5 | # 6 | # e.g. 
runs in PyCharm 7 | 8 | from BenchmarkDatasetCreator import dataset as bc 9 | 10 | # User-defined export settings dictionary 11 | export_settings = { 12 | 'Original project name': '2021_CLOCCB_BermudaPlantBank_S1105', 13 | 'Audio duration (s)': 300, 14 | 'fs (Hz)': 8000, 15 | 'Bit depth': 24, 16 | 'Export label': 'Tags', 17 | 'Split export selections': [True, 1], 18 | 'Export folder': 'benchmark_data' 19 | } 20 | 21 | 22 | # Run checks on the user-defined entries 23 | bc.check_export_settings(export_settings) 24 | 25 | # User-defined path to selection table(s) 26 | selection_table_path = '/Volumes/DCLDE/projects/2022_CLOCCB_IthacaNY_S1112/Atlantic_whales/2021_CLOCCB_BermudaPlantBank_S1105/annotations/' 27 | bc.check_selection_tab(selection_table_path) 28 | 29 | # Create directories 30 | bc.create_path(export_settings) 31 | 32 | # Load selection table 33 | selection_table_df = bc.load_selection_table(selection_table_path) 34 | 35 | if not selection_table_df.empty: 36 | print(selection_table_df) 37 | 38 | # User-defined label key, should be one of the selection table keys displayed above 39 | label_key = 'Call Type' 40 | 41 | # Test selection table and estimate size 42 | # Remove duplicates (e.g., if we have both the spectrogram and waveform view) 43 | selection_table_df = selection_table_df.drop_duplicates(subset='Begin Time (s)', keep="last") 44 | 45 | # Estimate the size of the dataset 46 | bc.benchmark_size_estimator(selection_table_df, export_settings, label_key) 47 | 48 | # Check & update labels 49 | # Get a list of unique labels from the selection table 50 | unique_labels = selection_table_df[label_key].unique() 51 | 52 | # Print the list of unique labels 53 | print('Unique label list:') 54 | for lab in unique_labels: 55 | print(lab) 56 | 57 | # New label dictionary 58 | # Yang Center species list: https://docs.google.com/spreadsheets/d/1ScxYST26QIGE2d_ovEI1NtyPDmpWeMHJJ2LEu4nFwOw/edit?usp=sharing 59 | 60 | new_labels_dict = { 61 | 'NARW': 'EUBGLA.NWAO.Upcall', 62 | 'na': 'BALMUS.NWAO.Dcall', 63 | } 64 | 65 | # Swap the labels 66 | selection_table_df = bc.update_labels(selection_table_df, new_labels_dict, label_key) 67 | 68 | # Create the dataset 69 | import time 70 | start_time = time.time() 71 | 72 | bc.benchmark_creator(selection_table_df, export_settings, label_key) 73 | 74 | print(f'The Benchmark Dataset Creator took {time.time() - start_time} s to run') -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altair==5.2.0 2 | attrs==23.2.0 3 | audioread==3.0.1 4 | blinker==1.7.0 5 | cachetools==5.3.3 6 | certifi==2024.2.2 7 | cffi==1.16.0 8 | charset-normalizer==3.3.2 9 | click==8.1.7 10 | decorator==5.1.1 11 | gitdb==4.0.11 12 | GitPython==3.1.42 13 | h3==3.7.7 14 | idna==3.6 15 | importlib-metadata==7.1.0 16 | importlib-resources==6.4.0 17 | Jinja2==3.1.3 18 | joblib==1.3.2 19 | jsonschema==4.21.1 20 | jsonschema-specifications==2023.12.1 21 | lazy-loader==0.3 22 | librosa==0.10.1 23 | llvmlite==0.41.1 24 | markdown-it-py==3.0.0 25 | MarkupSafe==2.1.5 26 | mdurl==0.1.2 27 | msgpack==1.0.8 28 | numba==0.58.1 29 | numpy==1.24.4 30 | packaging==23.2 31 | pandas==2.0.3 32 | pillow==10.2.0 33 | pkgutil-resolve-name==1.3.10 34 | platformdirs==4.2.0 35 | pooch==1.8.1 36 | protobuf==4.25.3 37 | pyarrow==15.0.2 38 | pycparser==2.21 39 | pydeck==0.8.1b0 40 | pygments==2.17.2 41 | python-dateutil==2.9.0.post0 42 | pytz==2024.1 43 | referencing==0.34.0 44 | requests==2.31.0 45 |
rich==13.7.1 46 | rpds-py==0.18.0 47 | scikit-learn==1.3.2 48 | scipy==1.10.1 49 | six==1.16.0 50 | smmap==5.0.1 51 | soundfile==0.12.1 52 | soxr==0.3.7 53 | streamlit==1.32.2 54 | tenacity==8.2.3 55 | threadpoolctl==3.4.0 56 | timezonefinder==6.5.0 57 | toml==0.10.2 58 | toolz==0.12.1 59 | tornado==6.4 60 | tqdm==4.66.2 61 | typing-extensions==4.10.0 62 | tzdata==2024.1 63 | urllib3==2.2.1 64 | zipp==3.18.1 65 | --------------------------------------------------------------------------------