├── json-to-elan ├── input │ └── .gitkeep ├── requirements.txt ├── .gitignore ├── README.md └── convert.py ├── make-elans-from-wavs ├── wav │ └── .gitkeep ├── flac │ └── .gitkeep ├── .gitignore ├── README.md ├── flac2wav.py ├── flatten.py └── make-elan.py ├── elan-character-spacer ├── input │ ├── .gitkeep │ ├── 1_1_3.eaf │ └── abui_1.eaf ├── output │ ├── .gitkeep │ ├── 1_1_3.eaf │ ├── abui_1.eaf │ └── 1_1_3.pfsx ├── .gitignore ├── README.md └── elan-insert-spaces.py ├── python-tier-selector ├── input │ ├── .gitkeep │ ├── abui_2.eaf │ ├── abui_3.eaf │ ├── abui_4.eaf │ └── abui_1.eaf ├── .gitignore └── select-tiers.py ├── make-elans-from-wavs-and-spreadsheet ├── eaf │ └── .gitkeep ├── input │ ├── .gitkeep │ └── test.xlsx ├── wav │ └── .gitkeep ├── .gitignore ├── README.md └── make-elan.py ├── elan-splitter ├── requirements.txt ├── .gitignore ├── README.md └── split_eafs.py ├── .gitignore ├── elan-to-json ├── input │ ├── abui_1.wav │ ├── abui_2.wav │ ├── abui_3.wav │ ├── abui_4.wav │ ├── abui_2.eaf │ ├── abui_3.eaf │ ├── abui_4.eaf │ └── abui_1.eaf ├── output │ └── elan.json ├── README.md └── elan_to_json.py ├── make-elans-from-text ├── .gitignore ├── README.md ├── requirements.txt └── convert.py ├── make-elans-from-timit ├── .gitignore ├── requirements.txt ├── README.md └── convert.py ├── CONTRIBUTING.md ├── README.md └── LICENSE /json-to-elan/input/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /make-elans-from-wavs/wav/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /elan-character-spacer/input/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- 
/elan-character-spacer/output/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /json-to-elan/requirements.txt: -------------------------------------------------------------------------------- 1 | pympi-ling==1.70.2 2 | -------------------------------------------------------------------------------- /make-elans-from-wavs/flac/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /python-tier-selector/input/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/eaf/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/input/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/wav/.gitkeep: -------------------------------------------------------------------------------- 1 | # keep this dir -------------------------------------------------------------------------------- /elan-splitter/requirements.txt: -------------------------------------------------------------------------------- 1 | pydub 2 | pympi-ling 3 | python-slugify 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | venv 6 | 
__pycache__ 7 | .idea -------------------------------------------------------------------------------- /elan-to-json/input/abui_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_1.wav -------------------------------------------------------------------------------- /elan-to-json/input/abui_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_2.wav -------------------------------------------------------------------------------- /elan-to-json/input/abui_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_3.wav -------------------------------------------------------------------------------- /elan-to-json/input/abui_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_4.wav -------------------------------------------------------------------------------- /make-elans-from-wavs/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea 3 | .DS_Store 4 | flac/* 5 | wav/* 6 | !flac/.gitkeep 7 | !wav/.gitkeep 8 | 9 | -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/input/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/make-elans-from-wavs-and-spreadsheet/input/test.xlsx -------------------------------------------------------------------------------- /elan-splitter/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # 
Thumbnails 4 | ._* 5 | # Source and target data dirs 6 | input/* 7 | !input/.gitkeep 8 | output/* 9 | !output/.gitkeep 10 | -------------------------------------------------------------------------------- /elan-character-spacer/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | # Source and target data dirs 6 | input/* 7 | !input/.gitkeep 8 | output/* 9 | !output/.gitkeep 10 | -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | .idea 3 | .DS_Store 4 | flac/* 5 | wav/* 6 | eaf/* 7 | !flac/.gitkeep 8 | !wav/.gitkeep 9 | !eaf/.gitkeep 10 | input/~* 11 | 12 | -------------------------------------------------------------------------------- /json-to-elan/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | venv 6 | __pycache__ 7 | .idea 8 | all 9 | # Source and target data dirs 10 | input/* 11 | !input/.gitkeep 12 | output/* 13 | !output/.gitkeep 14 | -------------------------------------------------------------------------------- /python-tier-selector/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | venv 6 | __pycache__ 7 | .idea 8 | # Source and target data dirs 9 | input/* 10 | !input/.gitkeep 11 | output/* 12 | !output/.gitkeep 13 | -------------------------------------------------------------------------------- /make-elans-from-text/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | venv 6 | __pycache__ 7 | .idea 8 | all 9 | # Source and target data dirs 10 | input/* 11 | !input/.gitkeep 12 | output/* 13 | 
!output/.gitkeep 14 | -------------------------------------------------------------------------------- /make-elans-from-timit/.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | # Thumbnails 4 | ._* 5 | venv 6 | __pycache__ 7 | .idea 8 | all 9 | # Source and target data dirs 10 | input/* 11 | !input/.gitkeep 12 | output/* 13 | !output/.gitkeep 14 | -------------------------------------------------------------------------------- /elan-to-json/output/elan.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "audio_file_name": "abui_1.wav", 4 | "transcript": "amakaang 你好 di kaai hada muila", 5 | "start_ms": 290, 6 | "stop_ms": 1910, 7 | "speaker_id": "SL" 8 | } 9 | ] -------------------------------------------------------------------------------- /elan-character-spacer/README.md: -------------------------------------------------------------------------------- 1 | # Elan character spacer 2 | 3 | This script will space-separate characters in all words in specified Elan tiers from `input` folder and save modified files to `output` folder. 4 | 5 | Very crude, requires the input and output dirs to already exist. 6 | 7 | May not be UTF-8 safe. -------------------------------------------------------------------------------- /make-elans-from-text/README.md: -------------------------------------------------------------------------------- 1 | Given a folder of `.txt` files containing annotations of audio in `.wav` files, create `.eaf` ELAN files. 2 | 3 | Annotation start-time is 0 and end-time is duration of audio. 
4 | 5 | 6 | ```shell 7 | python3 -m venv venv 8 | source venv/bin/activate 9 | pip install -r requirements.txt 10 | python convert.py 11 | ``` 12 | 13 | 14 | Use optional `--copy_wavs` setting to copy WAV files into the out dir -------------------------------------------------------------------------------- /make-elans-from-text/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | audioread==2.1.9 3 | certifi==2020.12.5 4 | cffi==1.14.5 5 | chardet==4.0.0 6 | decorator==5.0.9 7 | idna==2.10 8 | joblib==1.0.1 9 | librosa==0.8.1 10 | llvmlite==0.36.0 11 | numba==0.53.1 12 | numpy==1.20.3 13 | packaging==20.9 14 | pooch==1.3.0 15 | pycparser==2.20 16 | pympi-ling==1.69 17 | pyparsing==2.4.7 18 | requests==2.25.1 19 | resampy==0.2.2 20 | scikit-learn==0.24.2 21 | scipy==1.6.3 22 | six==1.16.0 23 | SoundFile==0.10.3.post1 24 | threadpoolctl==2.1.0 25 | urllib3==1.26.5 26 | -------------------------------------------------------------------------------- /make-elans-from-timit/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | audioread==2.1.9 3 | certifi==2020.12.5 4 | cffi==1.14.5 5 | chardet==4.0.0 6 | decorator==5.0.9 7 | idna==2.10 8 | joblib==1.0.1 9 | librosa==0.8.1 10 | llvmlite==0.36.0 11 | numba==0.53.1 12 | numpy==1.20.3 13 | packaging==20.9 14 | pooch==1.3.0 15 | pycparser==2.20 16 | pympi-ling==1.69 17 | pyparsing==2.4.7 18 | requests==2.25.1 19 | resampy==0.2.2 20 | scikit-learn==0.24.2 21 | scipy==1.6.3 22 | six==1.16.0 23 | SoundFile==0.10.3.post1 24 | threadpoolctl==2.1.0 25 | urllib3==1.26.5 26 | -------------------------------------------------------------------------------- /make-elans-from-timit/README.md: -------------------------------------------------------------------------------- 1 | Given a folder of `.txt` files containing annotations of audio in `.wav` files, create `.eaf` ELAN files. 
2 | 3 | Annotation start-time is 0 and end-time is duration of audio. 4 | 5 | TIMIT text files are in `start end text` format, e.g. 6 | ``` 7 | 0 49460 Even then, if she took one step forward he could catch her. 8 | 9 | ``` 10 | 11 | 12 | ```shell 13 | python3 -m venv venv 14 | source venv/bin/activate 15 | pip install -r requirements.txt 16 | python convert.py 17 | ``` 18 | 19 | 20 | Use optional `--copy_wavs` setting to copy WAV files into the out dir -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guide 2 | 3 | 1. Create a new folder in the top level 4 | - Use lower case, hyphen-separated words, e.g. `header-extractor` 5 | 6 | 2. Place your script(s) in that folder, e.g. `header-extractor/extract.py` 7 | 8 | 3. Add all dependencies into a `dist` folder, e.g. `header-extractor/dist/some-dependency.py` 9 | 10 | 4. Include a `README.md` in the root of your script folder, e.g. `header-extractor/README.md`, specifying 11 | 1. the usage environment (e.g. Python 3.6 + numpy + Libxml2) 12 | 2. usage instructions (e.g. `python extract.py infile.cha outfile.csv`) 13 | 14 | 5. Ideally, also include dummy or sample files to show a minimal working example of the script. 15 | -------------------------------------------------------------------------------- /make-elans-from-wavs/README.md: -------------------------------------------------------------------------------- 1 | # Make ELANs from WAVs 2 | 3 | Processing scripts to generate Elan files matching nested folders of WAVs. The WAV filenames are used as the annotation value, and written forms of digits are converted to spoken forms. 4 | 5 | Written for the TIDIGITS corpus, so there are some specific naming manipulations in here that you won't need for your own data, including splitting filenames into individual characters. 
E.g., for file `123.wav` the script will create an Elan file `123.eaf` with annotation `one two three`. 6 | 7 | Includes script to convert FLAC audio to WAV. 8 | 9 | This was written to convert the TIDIGITS corpus audio for Elpis-ready format. 10 | 11 | To use it, drop your audio in the flac dir and run `flac2wav.py` to convert FLAC audio to WAV. 12 | 13 | Then run the `make-elan.py` script to generate Elan files. 14 | 15 | If you want files to be in a single directory rather than nested, you can run `flatten.py`, which will move files into a single dir, renaming the files to incorporate the original dir structure in the filenames. 16 | -------------------------------------------------------------------------------- /make-elans-from-wavs-and-spreadsheet/README.md: -------------------------------------------------------------------------------- 1 | # Make ELANs from WAVs 2 | 3 | This is a script to generate ELAN files matching a folder of WAVs. The WAV filenames are used to retrieve annotation value from a spreadsheet. If an audio filename isn't found in the spreadsheet, an ELAN file with blank annotation value will be created. 4 | 5 | ## Requirements 6 | 7 | This script has been written for Python3. 8 | 9 | The spreadsheet must contain at least one column named "File name" and one column named "Transcription". The spreadsheet can contain other columns. 10 | 11 | Put your audio in the wav dir, put your spreadsheet in the input dir. 12 | 13 | ## Usage 14 | 15 | Start a Python virtual environment. 16 | 17 | ```bash 18 | python3 -m venv venv 19 | source ./venv/bin/activate 20 | 21 | ``` 22 | 23 | Install the packages which the script needs. 24 | 25 | ```bash 26 | pip install argparse librosa pympi-ling pandas xlrd 27 | 28 | ``` 29 | 30 | Run the script. 31 | 32 | ```bash 33 | python make-elan.py -a input/test.xlsx -s wav -t eaf 34 | ``` 35 | 36 | 37 | When finished, deactivate the venv. 
#!/usr/bin/python3
import xml.etree.ElementTree as ET
import glob
import os
from pathlib import Path


def spaceMe(file_, tier_name='Phrase', output_dir='output'):
    """
    Space-separate every character of the annotations on one tier of an Elan file.

    Existing spaces are removed first, so the result has exactly one space
    between consecutive characters (word boundaries are not preserved).
    The modified document is written to ``output_dir`` under the same file
    name as the input; files with the same base name overwrite each other.

    :param file_: path (str or path-like) of the ``.eaf`` file to process
    :param tier_name: ``TIER_ID`` of the tier whose annotations are spaced
    :param output_dir: directory the modified file is written to; it is
        created when it does not exist
    """
    print(file_)

    # Keep the conventional "xsi" prefix on the root element; without this
    # ElementTree serialises the schema-instance namespace as "ns0".
    ET.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')

    tree = ET.parse(file_)
    root = tree.getroot()

    for tier in root.iter('TIER'):
        if tier.attrib.get('TIER_ID') != tier_name:
            continue
        for annotation in tier.iter('ANNOTATION_VALUE'):
            # An empty <ANNOTATION_VALUE/> has text None: nothing to space out.
            if not annotation.text:
                continue
            # Drop existing spaces so words do not end up double-spaced.
            source_text = annotation.text.replace(" ", "")
            annotation.text = " ".join(source_text)

            # feedback
            print("done")

    # Save the file to the output dir.  Write real UTF-8 with an XML
    # declaration instead of ElementTree's default (ASCII with numeric
    # character references and no declaration).
    os.makedirs(output_dir, exist_ok=True)
    tree.write(
        os.path.join(output_dir, os.path.basename(file_)),
        encoding="utf-8",
        xml_declaration=True,
    )


def main():
    """Run :func:`spaceMe` on every ``.eaf`` file found under ``./input``."""
    # Sorted so files are processed in a reproducible order.
    for path in sorted(Path('./input').rglob('*.eaf')):
        spaceMe(path)


if __name__ == "__main__":
    main()
# ---- make-elans-from-wavs/flac2wav.py ----
import os
import subprocess


def flac2wav(source_parent_dir, target_parent_dir):
    """
    Convert FLAC audio to WAV with ffmpeg, mirroring the directory layout.

    Every file with a ``.flac`` extension (case-insensitive) found anywhere
    under ``source_parent_dir`` is converted to a ``.wav`` file at the same
    relative location under ``target_parent_dir``.  Target directories are
    created on demand.

    :param source_parent_dir: root directory that is searched for FLAC files
    :param target_parent_dir: root directory the WAV files are written to
    :raises RuntimeError: if the ``ffmpeg`` executable cannot be found
    """
    for dirname, dirnames, filenames in os.walk(source_parent_dir):

        # print path to all subdirectories first.
        for subdirname in dirnames:
            print(os.path.join(dirname, subdirname))

        # Mirror the directory's position relative to the source root, so
        # trees of any depth work (not only parent/gender/child).
        rel_dir = os.path.relpath(dirname, source_parent_dir)
        target_path = os.path.normpath(os.path.join(target_parent_dir, rel_dir))

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Match on the real extension, not on a substring of the name.
            if ext.lower() != '.flac':
                continue

            source_file = os.path.join(dirname, filename)
            target_file = os.path.join(target_path, f'{basename}.wav')
            print(source_file)

            os.makedirs(target_path, exist_ok=True)

            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through untouched.
            command = [
                'ffmpeg', '-hide_banner', '-loglevel', 'warning', '-y',
                '-i', source_file, target_file,
            ]
            try:
                result = subprocess.run(command, check=False)
            except FileNotFoundError as err:
                raise RuntimeError('ffmpeg executable not found on PATH') from err
            if result.returncode != 0:
                # Keep going with the remaining files, but make the failure visible.
                print(f'ffmpeg exited with status {result.returncode} for {source_file}')


if __name__ == "__main__":
    source_parent_dir = 'flac'
    target_parent_dir = 'wav'
    flac2wav(source_parent_dir, target_parent_dir)


# ---- make-elans-from-wavs/flatten.py ----
from os import scandir, chdir
from pathlib import Path
from shutil import move, rmtree


def flatten(root_path: str, absolute_root: str) -> None:
    """
    Flatten a tree of files and give the files the names of the enclosing
    directories separated by underscores.

    Files inside sub-directories are moved up one level at a time, each move
    prefixing the file name with the directory it leaves, until they sit
    directly in ``absolute_root``.  Emptied directories are removed and
    ``.DS_Store`` files are deleted.  Files that already sit directly in
    ``absolute_root`` are left where they are.

    WARNING: operates in place and will destroy existing file structure.

    :param root_path: the local root (at the start this will match the
        absolute root)
    :param absolute_root: the root directory of the tree you want to flatten
    """
    path = Path(root_path)
    is_top_level = root_path == absolute_root

    # Snapshot the directory; the context manager closes the scandir handle.
    with scandir(root_path) as entries:
        stack = set(entries)

    while stack:
        entry = stack.pop()
        entry_path = Path(entry.path)

        if ".DS_Store" in entry.name:
            # May already be gone when the same file was queued twice.
            if entry_path.exists():
                entry_path.unlink()
            continue

        if entry_path.is_dir():
            flatten(entry.path, absolute_root)
            if not is_top_level:
                # The recursive call moved files up into this directory;
                # queue them so they are moved up again.  Files queued twice
                # are harmless: once moved they no longer pass is_file().
                with scandir(root_path) as entries:
                    stack.update(e for e in entries if e.is_file())
            rmtree(entry.path)
        elif not is_top_level and entry_path.is_file():
            # Move the file next to its directory, prefixed with that
            # directory's name.  Skipped at the top level so files are never
            # moved out of the tree that is being flattened.
            target_file = f"{path.parent.resolve()}/{path.name}_{entry.name}"
            move(entry.path, target_file)


if __name__ == "__main__":
    target = "wav"
    # chdir(target)
    flatten(target, target)
The script will build an ELAN file for each JSON file, using that JSON file's annotation data. It expects that your WAV files are named the same as the JSON basename (e.g., if your JSON file is named `audio_1.json`, the ELAN file will end up with a linked media file added for `audio_1.wav`). To open the ELAN file you will need to copy your audio into the output dir. Note that the output dir is erased each time the script is run, so be sure to keep a copy of the audio. 41 | ``` 42 | python convert.py 43 | ``` 44 | 45 | Optionally, you can specify a different input directory for the JSON files, and the output directory to write the ELAN files. If you have WAV files in the input directory, you can choose to copy them into the output too. You can also set a different tier name from the default (which is "default"). Here's an example: 46 | ``` 47 | python convert.py --tier_name Words --input source_files --output elan_files --copy_wavs 48 | ``` 49 | -------------------------------------------------------------------------------- /elan-character-spacer/input/1_1_3.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239 6 | 1 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | hekaai dining ayoku kamar mia muila 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elan Helpers 2 | 3 | Tools and scripts for working with ELAN. See the README file in each script folder for usage instructions. 4 | 5 | ## Elan Character Spacer 6 | 7 | This script will space-separate characters in all words in specified Elan tiers. 8 | 9 | 10 | ## Elan to JSON 11 | 12 | The script reads an Elan file (or a directory including Elan files) and exports a JSON file with the annotations on a selected tier. You can choose which tier by passing a tier name, tier type, or a tier order number as an argument to the script. 13 | 14 | 15 | ## Elan Splitter 16 | 17 | The script processes a directory of audio annotated in Elan, and outputs audio clips and matching-named text files containing the respective annotations. The original files are not altered. Audio clips are determined by the start and end times of annotations on the first tier. You can choose to use another tier by passing a tier name or a different order number (not tier type) as an argument to the script. 18 | 19 | ## JSON to ELAN 20 | 21 | This script will process a folder of JSON files containing annotations, and build ELAN files for each. 22 | 23 | ## Python Tier Selector 24 | 25 | This script doesn't do much on its own, but can be used as a basis for your own processing script. It looks in a folder the user specifies, and compiles a list of all the tiers in the Elan files in that folder. The script offers the user an option to select one or more tiers from the list. From this point you could extend the script to extract all the annotations on the selected tiers, or perhaps write a new Elan file that combines the selected tiers. 26 | 27 | 28 | ## Make ELANs from text 29 | 30 | This script can be used to make ELAN files that are based on a folder of text annotations and WAV audio recordings. 
31 | 32 | 33 | ## Make ELANs from TIMIT 34 | 35 | This script can be used to make ELAN files from the TIMIT dataset. Handy for testing Elpis. 36 | 37 | 38 | ## Make ELANs from WAVs 39 | 40 | Processing scripts to generate Elan files for nested folders of WAVs, using the WAV file names as the annotation values. Perfect for converting a directory of audio files to Elpis-ready format. 41 | -------------------------------------------------------------------------------- /elan-character-spacer/output/1_1_3.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239 6 | 2 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | h e k a a i d i n i n g a y o k u k a m a r m i a m u i l a 18 | 19 | 20 | 21 | 22 | 23 | 24 | something 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
34 | -------------------------------------------------------------------------------- /elan-to-json/input/abui_2.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:4395a55c-130b-4bfd-8c19-2785dfee027b 6 | 2 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | dining ayoku kamar mia mui muila 18 | 19 | 20 | 21 | 22 | 23 | 24 | kamar mia mui muila 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | -------------------------------------------------------------------------------- /elan-to-json/input/abui_3.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239 6 | 2 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | hekaai dining ayoku kamar mia muila 18 | 19 | 20 | 21 | 22 | 23 | 24 | kamar mia muila 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
# ---- make-elans-from-wavs/make-elan.py ----
import os
import re


def make_elan(source_parent_dir, target_parent_dir):
    """
    Make Elan files based on the filenames of WAV files.

    Written for the TIDIGITS corpus, so some things are specific to the name
    formats of that corpus: the last character of the file name is dropped,
    the remaining characters become space-separated tokens, ``o`` and ``z``
    are expanded to ``oh`` and ``zero``, and digits are spelled out.

    For every ``.wav`` file (case-insensitive extension) under
    ``source_parent_dir`` an ``.eaf`` file with one annotation on tier ``tx``,
    running from 0 to the audio duration, is written to the same relative
    location under ``target_parent_dir``.

    :param source_parent_dir: root directory that is searched for WAV files
    :param target_parent_dir: root directory the Elan files are written to
    """
    # Third-party packages are imported here so that this module can still
    # be imported when they are not installed.
    import librosa
    import num2words
    from pympi.Elan import Eaf

    for dirname, dirnames, filenames in os.walk(source_parent_dir):

        # print path to all subdirectories first.
        for subdirname in dirnames:
            print(os.path.join(dirname, subdirname))

        # Mirror the directory's position relative to the source root, so
        # trees of any depth work (not only parent/gender/child).
        rel_dir = os.path.relpath(dirname, source_parent_dir)
        target_path = os.path.normpath(os.path.join(target_parent_dir, rel_dir))

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Match on the real extension, not on a substring of the name.
            if ext.lower() != '.wav':
                continue
            print(dirname, filename)

            if not os.path.exists(target_path):
                print(target_path)
                os.makedirs(target_path)

            # Audio file duration in milliseconds - used as the end timeslot.
            # NOTE(review): newer librosa releases rename the `filename`
            # keyword to `path` - confirm against the installed version.
            duration = int(librosa.get_duration(filename=os.path.join(dirname, filename)) * 1000)

            # Make file annotation from filename (minus the suffix character)
            annotation = " ".join(basename[:-1])
            # These are specific to the TIDIGITS naming convention
            annotation = annotation.replace("o", "oh")
            annotation = annotation.replace("z", "zero")

            text = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

            print(filename, duration, annotation, text)

            # Make elan
            output_eaf = Eaf()
            output_eaf.add_tier('tx')
            output_eaf.insert_annotation('tx', 0, duration, text)
            output_eaf.add_linked_file(os.path.join(target_path, f'{basename}.wav'))

            output_eaf.to_file(os.path.join(target_path, f'{basename}.eaf'))


if __name__ == "__main__":
    source_parent_dir = 'wav'
    target_parent_dir = 'wav'
    make_elan(source_parent_dir, target_parent_dir)


# ---- python-tier-selector/select-tiers.py ----
import glob
from typing import Set


def find_files_by_ext(all_files: Set[str], extensions: Set[str]):
    """
    Return the paths whose extension matches one of the given patterns.

    :param all_files: candidate file paths
    :param extensions: patterns of the form ``"*.ext"`` in lower case; the
        comparison with each path's extension is case-insensitive
    :return: list of matching paths, sorted so the order is reproducible
    """
    return [
        file for file in sorted(all_files)
        if "*" + os.path.splitext(file)[1].lower() in extensions
    ]


def select_tier(tier_names: Set[str]):
    """
    Prompt the user to tick one or more tier names from a checkbox list.

    :param tier_names: tier names offered as choices
    :return: list of the selected tier names (empty when nothing was
        selected or the prompt returned no answer)
    """
    from PyInquirer import prompt

    questions = [
        {
            'type': 'checkbox',
            'name': 'tier',
            'message': 'Choose a tier',
            # Sorted so the options appear in the same order on every run.
            'choices': [{'name': tier_name} for tier_name in sorted(tier_names)]
        }
    ]
    tier_choice = prompt(questions)
    # .get() so a missing answer yields an empty selection, not a KeyError.
    selected = tier_choice.get("tier", [])
    print("Selected tier/s:", selected)
    return selected


def main():
    """
    Read the Elan files from a folder, compile the set of tier names they
    contain, then ask the user to choose one or more of those tiers.
    """
    from PyInquirer import prompt
    from pympi.Elan import Eaf

    # Start by asking where the Elan files are
    # Default is a folder named "input" in the same directory as this script
    input_dir_question = [
        {
            'type': 'input',
            'name': 'input_dir',
            'message': 'Name of folder with Elan files?',
            'default': 'input'
        }
    ]
    input_dir = prompt(input_dir_question).get("input_dir")
    if not input_dir:
        # No answer was given, so there is nothing to search.
        return

    # Get all files from the input directory
    extensions = {"*.eaf"}
    all_files = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))
    input_files = find_files_by_ext(all_files, extensions)

    # Compile tier names for the files in the input dir - using pympi-ling
    tier_names = set()
    for input_file_path in input_files:
        tier_names.update(Eaf(input_file_path).get_tier_names())

    if not tier_names:
        # There is nothing to offer in the checkbox prompt.
        print(f"No tiers found in Elan files under '{input_dir}'")
        return
    select_tier(tier_names)


if __name__ == '__main__':
    main()
4 | 5 | urn:nl-mpi-tools-elan-eaf:4395a55c-130b-4bfd-8c19-2785dfee027b 6 | 2 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | dining ayoku kamar mia mui muila 18 | 19 | 20 | 21 | 22 | 23 | 24 | kamar mia mui muila 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | -------------------------------------------------------------------------------- /python-tier-selector/input/abui_3.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239 6 | 2 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | hekaai dining ayoku kamar mia muila 18 | 19 | 20 | 21 | 22 | 23 | 24 | kamar mia muila 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
35 | -------------------------------------------------------------------------------- /elan-to-json/input/abui_4.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:269a144c-5adb-4b5e-b1b7-ea19b2e8ee04 6 | 3 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | hekaai deina del ong hayei ba 20 | 21 | 22 | 23 | 24 | hepikaai deina botol homi dong yaari 25 | 26 | 27 | 28 | 29 | 30 | 31 | hekaai deina del ong hayei ba 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 |
42 | -------------------------------------------------------------------------------- /python-tier-selector/input/abui_4.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:269a144c-5adb-4b5e-b1b7-ea19b2e8ee04 6 | 3 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | hekaai deina del ong hayei ba 20 | 21 | 22 | 23 | 24 | hepikaai deina botol homi dong yaari 25 | 26 | 27 | 28 | 29 | 30 | 31 | hekaai deina del ong hayei ba 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 |
42 | -------------------------------------------------------------------------------- /python-tier-selector/input/abui_1.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f 6 | 3 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | amakaang di kaai hada muila 20 | 21 | 22 | 23 | 24 | 25 | 26 | amakaang di kaai 27 | 28 | 29 | 30 | 31 | 32 | 33 | add gloss here 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 |
import argparse
import json
import os
import shutil
from pathlib import Path


def _to_ms(seconds: float) -> int:
    """Convert seconds to whole milliseconds, rounding instead of truncating."""
    # int(1.005 * 1000) is 1004 because of float representation; round first.
    return int(round(seconds * 1000))


def _build_eaf(annotation_data, tier_name: str):
    """Create an Eaf holding one annotation per JSON record on *tier_name*."""
    # Imported here so this module can be imported without pympi installed.
    from pympi.Elan import Eaf

    output_eaf = Eaf()
    for annotation in annotation_data:
        start = _to_ms(annotation["timestamp"][0])
        end = _to_ms(annotation["timestamp"][1])
        output_eaf.add_annotation("default", start, end, value=annotation["text"])

    # Annotations are written to the "default" tier, then the tier is renamed
    if tier_name != "default":
        output_eaf.rename_tier("default", tier_name)
    return output_eaf


def make_elans(tier_name: str, input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on JSON data

    Each JSON file is expected to hold a list of records with a "timestamp"
    pair (start and end, in seconds) and a "text" value.

    :param tier_name: The name of the tier to write into
    :param input_dir: Directory name of folder containing JSON (and optionally also matching WAV audio) files
    :param output_dir: Directory name to save EAF files into
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file; os.walk also visits sub-directories, so paths are
    # built from the directory currently being walked rather than input_dir.
    for dirpath, _, filenames in os.walk(input_dir):

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension so "x.json.bak" is not picked up
            if ext.lower() != '.json':
                continue
            print(filename)

            # read the JSON
            with open(Path(dirpath, filename), encoding='utf-8') as json_file:
                annotation_data = json.load(json_file)

            # Make EAF file
            output_eaf = _build_eaf(annotation_data, tier_name)
            output_eaf.add_linked_file(str(Path(output_dir, f'{basename}.wav')))
            output_eaf.to_file(str(Path(output_dir, f'{basename}.eaf')))

            # Copy WAV?
            if copy_wavs:
                source_wav = Path(dirpath, f"{basename}.wav")
                if source_wav.is_file():
                    shutil.copyfile(source_wav, Path(output_dir, f"{basename}.wav"))
                else:
                    print(f"WAV file not found for {filename}, nothing copied")


def main():
    """Parse the command line options, reset the output dir and convert."""
    parser = argparse.ArgumentParser(description='Make ELAN files from JSON data')
    parser.add_argument('-t', '--tier_name', help='Name of the tier', default='default')
    parser.add_argument('-i', '--input_dir', help='Folder of JSON files', default='input')
    parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')
    parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
    parser.add_argument('--no-copy_wavs', help='Do not copy WAV files to output dir (default)', dest='copy_wavs', action='store_false')
    parser.set_defaults(copy_wavs=False)

    args = parser.parse_args()
    tier_name = args.tier_name
    input_dir = args.input_dir
    output_dir = args.output_dir
    copy_wavs = args.copy_wavs

    # Reset the output dir
    print("resetting output dir")
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Go
    print("making elan files")
    make_elans(tier_name, input_dir, output_dir, copy_wavs)


if __name__ == "__main__":
    main()
Note that the tier name or number has to be consistent across the data set; the script cannot handle individual files differently. 4 | 5 | Instructions here are for Mac OSX. 6 | 7 | Requires Python 3. 8 | 9 | ## Installation 10 | 11 | Open Terminal and check what versions of Homebrew and Python you have (if any). If you get a message "command not found: ..." then you need to install that software. 12 | 13 | ``` 14 | brew --version 15 | python --version 16 | python3 --version 17 | ``` 18 | 19 | **Homebrew** 20 | Install Homebrew if needed, following the instructions at https://brew.sh/ 21 | 22 | **Python** 23 | OSX comes with Python 2.7 but we need version 3. If you got a "command not found" error when you ran the `python3 --version` command before, type this into Terminal.
24 | `brew install python` 25 | 26 | If you want to output mp3 files, also install ffmpeg
27 | `brew install ffmpeg` 28 | 29 | 30 | **Splitter** 31 | 32 | Clone this repository (maybe to the Desktop) and cd into this script dir `~/Desktop/elan-helpers/elan-splitter`. 33 | 34 | Start a venv and install the script's dependencies. 35 | ``` 36 | python3 -m venv venv 37 | source venv/bin/activate 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | Make an `input` folder (eg `~/Desktop/elan-helpers/elan-splitter/input`), and put your Elan and audio files into it. 42 | 43 | 44 | ## Usage 45 | 46 | Then you are ready to run the script.
 47 | `python split_eafs.py` 48 | 49 | The output folder should now be populated with clipped audio files and text annotations! 50 | 51 | 52 | 53 | By the way, if you are using WAV audio and don't have ffmpeg installed, you can ignore the warning `RuntimeWarning: Couldn't find ffmpeg or avconv`. They are only needed when converting to mp3 or some other audio formats; WAV format is handled by the Python library. 54 | 55 | 56 | ## Options 57 | 58 | To slice using annotations on a tier named "Words" you can use this command:
59 | `python split_eafs.py -t Words` 60 | 61 | To get annotations from the second tier pass the number as an argument like this:
 62 | `python split_eafs.py -o 2` 63 | 64 | If you want the files it generates to be named with the annotation value, run the script with the `-n` flag.
 65 | `python split_eafs.py -n` 66 | 67 | Add a prefix to the generated files with the `-p` option. The following command, used with an Elan file that has the transcription "dog", will result in generated files named `A111_dog.txt` and `A111_dog.wav`.
 68 | `python split_eafs.py -n -p "A111"` 69 | 70 | To output audio in MP3 format, set the format with the `-f` option. The default output format is WAV.
71 | `python split_eafs.py -f mp3` 72 | 73 | You can combine options! E.g., to get annotations from the second highest tier, write files with annotations as the name, with A111 prefix:
74 | `python split_eafs.py -o 2 -n -p "A111"` 75 | -------------------------------------------------------------------------------- /elan-character-spacer/output/abui_1.eaf: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f 5 | 4 6 |
7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | a m a k a a n g d i k a a i h a d a m u i l a 21 | 22 | 23 | 24 | 25 | 26 | 27 | amakaang di kaai 28 | 29 | 30 | 31 | 32 | 33 | 34 | add gloss here 35 | 36 | 37 | 38 | 39 | 40 | 41 | mememememememe 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 |
-------------------------------------------------------------------------------- /elan-to-json/input/abui_1.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f 6 | 4 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | amakaang 你好 di kaai hada muila 22 | 23 | 24 | 25 | 26 | 27 | 28 | amakaang di kaai 29 | 30 | 31 | 32 | 33 | 34 | 35 | add gloss here 36 | 37 | 38 | 39 | 40 | 41 | 42 | mememememememe 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
54 | -------------------------------------------------------------------------------- /elan-character-spacer/input/abui_1.eaf: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f 6 | 4 7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | amakaang di kaai hada muila 22 | 23 | 24 | 25 | 26 | 27 | 28 | amakaang di kaai 29 | 30 | 31 | 32 | 33 | 34 | 35 | add gloss here 36 | 37 | 38 | 39 | 40 | 41 | 42 | mememememememe 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
#!/usr/bin/python
import argparse
import os
import shutil


def get_annotation(input_dir: str, basename: str) -> str:
    """
    Get annotation from the text file contents

    :param input_dir: Name of the source folder
    :param basename: Base name of the file which contains an annotation
    :return: annotation text
    :raises FileNotFoundError: if there is no ``<basename>.txt`` in input_dir
    """
    with open(os.path.join(input_dir, basename + '.txt'), 'r', encoding='utf-8') as text_file:
        return text_file.read()


def _write_elan(wav_path: str, annotation: str, output_dir: str, basename: str) -> None:
    """Write an EAF with one annotation spanning the whole of the WAV file."""
    # Imported here so this module can be imported without librosa/pympi.
    import librosa
    from pympi.Elan import Eaf

    # Get audio file duration - use this as the EAF annotation's end timeslot
    duration = int(librosa.get_duration(filename=wav_path)*1000)
    print(duration, annotation)

    output_eaf = Eaf()
    # add_annotation is the current name of the deprecated insert_annotation
    output_eaf.add_annotation('default', 0, duration, annotation)
    output_eaf.add_linked_file(os.path.join(output_dir, f'{basename}.wav'))
    output_eaf.to_file(os.path.join(output_dir, f'{basename}.eaf'))


def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file

    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param output_dir: Directory name to save EAF files into
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file; os.walk also visits sub-directories, so paths are
    # built from the directory currently being walked rather than input_dir.
    for dirpath, _, filenames in os.walk(input_dir):

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension so "x.wav.txt" is not treated as audio
            if ext.lower() != '.wav':
                continue
            print(basename)

            # Get annotation from the text file matching on file basename
            try:
                annotation = get_annotation(dirpath, basename)
            except FileNotFoundError:
                print(f'No text file found for {filename}, skipping')
                continue

            # Add any annotation cleaning here

            # Make EAF file
            _write_elan(os.path.join(dirpath, filename), annotation, output_dir, basename)

            # Copy WAV?
            if copy_wavs:
                shutil.copyfile(os.path.join(dirpath, filename), os.path.join(output_dir, filename))
    print('>>> Done')


def main():
    """Parse the command line options, reset the output dir and convert."""
    parser = argparse.ArgumentParser(description='Make ELAN files to match TXT and WAVs')
    parser.add_argument('-i', '--input_dir', help='Folder of TXT and WAV files', default='input')
    parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')

    parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
    parser.add_argument('--no-copy_wavs', help='Do not copy WAV files to output dir (default)', dest='copy_wavs', action='store_false')
    parser.set_defaults(copy_wavs=False)
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir
    copy_wavs = args.copy_wavs

    if copy_wavs:
        print('copying WAVs')
    else:
        print('skip copying WAVs')

    # Reset the output dir (it may not exist yet on a first run)
    print("resetting output dir")
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Go
    print("making elan files")
    make_elans(input_dir, output_dir, copy_wavs)


if __name__ == "__main__":
    main()
#!/usr/bin/python
import argparse
import json
import os
from typing import Dict, List


def get_annotations(spreadsheet: str):
    """
    Get filenames and annotations from a spreadsheet (actually loads all spreadsheet columns)

    :param spreadsheet: Name of the spreadsheet which contains rows of audio filenames and annotations
    :return: JSON format list of objects. Each object corresponds to a row of data in the excel file
    """
    # Imported here so the lookup helper can be used without pandas installed.
    import pandas

    spreadsheet_data = pandas.read_excel(spreadsheet)
    spreadsheet_json = spreadsheet_data.to_json(orient='records')
    annotations = json.loads(spreadsheet_json)
    print('Spreadsheet loaded')
    return annotations


def get_annotation(annotations: List[Dict[str, str]], filename: str) -> str:
    """
    Get annotation for an audio file by looking up filename match in the spreadsheet json

    :param annotations: data from input spreadsheet in JSON format
    :param filename: name of WAV file to get annotation for
    :return: annotation retrieved from the spreadsheet data matching the WAV
        filename; an empty string when there is no matching row or the
        matching row has no transcription
    """
    for record in annotations:
        if record["File name"] == filename:
            value = record["Transcription"]
            # An empty cell arrives as None and numeric cells as numbers;
            # the EAF annotation value has to be text.
            return '' if value is None else str(value)
    return ''


def _write_elan(wav_path: str, annotation: str, target: str, basename: str) -> None:
    """Write an EAF with one 'tx' annotation spanning the whole WAV file."""
    # Imported here so this module can be imported without librosa/pympi.
    import librosa
    from pympi.Elan import Eaf

    # Get audio file duration - use this as the EAF annotation's end timeslot
    duration = int(librosa.get_duration(filename=wav_path)*1000)
    print(os.path.basename(wav_path), duration, annotation)

    output_eaf = Eaf()
    output_eaf.add_tier('tx')
    # add_annotation is the current name of the deprecated insert_annotation
    output_eaf.add_annotation('tx', 0, duration, annotation)
    output_eaf.add_linked_file(os.path.join(target, f'{basename}.wav'))
    output_eaf.to_file(os.path.join(target, f'{basename}.eaf'))


def make_elans(spreadsheet: str, source: str, target: str):
    """
    Make ELAN files based on filenames of WAV files

    :param spreadsheet: Path and file name of the spreadsheet containing WAV filenames and matching annotations
    :param source: Directory name of folder containing WAV audio files
    :param target: Directory name to save EAF files into
    """

    # Read spreadsheet data and convert to JSON format
    print('Loading data from spreadsheet')
    annotations = get_annotations(spreadsheet)

    os.makedirs(target, exist_ok=True)

    # Process each file; os.walk also visits sub-directories, so paths are
    # built from the directory currently being walked rather than source.
    print('Processing WAVs')
    for dirpath, _, filenames in os.walk(source):

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension so "x.wav.bak" is not treated as audio
            if ext.lower() != '.wav':
                continue

            # Get annotation from the source data matching on filename
            annotation = get_annotation(annotations, filename)

            # Add any annotation cleaning here

            # Make EAF file
            _write_elan(os.path.join(dirpath, filename), annotation, target, basename)
    print('>>> Done')


def main():
    """Parse the command line options and build the ELAN files."""
    parser = argparse.ArgumentParser(description='make ELAN files to match WAVs')
    parser.add_argument('-a', '--annotations', help='spreadsheet name', default=os.path.join('input', 'test.xlsx'))
    parser.add_argument('-s', '--source', help='folder of WAVs', default='wav')
    parser.add_argument('-t', '--target', help='folder to save EAFs', default='eaf')
    args = parser.parse_args()

    spreadsheet = args.annotations
    source = args.source
    target = args.target
    make_elans(spreadsheet, source, target)


if __name__ == "__main__":
    main()
#!/usr/bin/python
import argparse
import glob
import os
import shutil


def parse_timit_transcript(text: str, sample_rate: int = 16000):
    """
    Split the contents of a TIMIT transcription file into times and text.

    The file holds ``<start sample> <end sample> <words...>``.

    :param text: Contents of the transcription file
    :param sample_rate: Audio sample rate used to convert samples to time
    :return: tuple of (start in ms, end in ms, annotation text)
    :raises ValueError: if the two leading sample numbers are missing or
        are not integers
    """
    fields = text.split()
    if len(fields) < 2:
        raise ValueError(f'Expected "<start> <end> <text>", got: {text!r}')
    # Both ends are sample offsets, so both are converted to milliseconds
    start_ms = int(fields[0]) * 1000 // sample_rate
    end_ms = int(fields[1]) * 1000 // sample_rate
    return start_ms, end_ms, " ".join(fields[2:])


def make_elans(input_dir: str, output_dir: str, copy_wavs: bool, sample_rate: int = 16000):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file

    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param output_dir: Directory name to save EAF files into
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    :param sample_rate: Sample rate of the audio, used to convert the sample
        offsets in the text files to milliseconds
    """
    # Imported here so this module can be imported without pympi installed.
    from pympi.Elan import Eaf

    # Process each file
    files = glob.glob(f'{input_dir}/**/*.txt', recursive=True)
    print(files)

    for filename in files:

        # e.g. filename input/dr1/fmem0/sa2.txt
        #      filepath input/dr1/fmem0/sa2, subdirname fmem0, basename sa2
        filepath, _ = os.path.splitext(filename)
        basename = os.path.basename(filepath)
        subdirname = os.path.basename(os.path.dirname(filepath))

        # The speaker folder is named <sex: m|f><initials, 3 letters><digit>;
        # everything after the sex letter is used as the participant
        participant = subdirname[1:]

        # Get annotation and its times from the text file
        with open(filename, 'r', encoding='utf-8') as text_file:
            start, end, annotation_text = parse_timit_transcript(text_file.read(), sample_rate)

        # Add any annotation cleaning here

        print(start, end, annotation_text)

        # Make EAF file
        output_name = f'{subdirname}-{basename}'
        output_eaf = Eaf()
        output_eaf.add_tier('default', part=participant)
        output_eaf.add_annotation('default', start, end, annotation_text)
        output_eaf.add_linked_file(os.path.join(output_dir, f'{output_name}.wav'))
        output_eaf.to_file(os.path.join(output_dir, f'{output_name}.eaf'))

        # NOTE(review): the WAV is copied whatever copy_wavs says (the check
        # was commented out in the original) - confirm this is intended.
        shutil.copyfile(f'{filepath}.wav', os.path.join(output_dir, f'{output_name}.wav'))

    print('>>> Done')


def main():
    """Parse the command line options, reset the output dir and convert."""
    parser = argparse.ArgumentParser(description='Make ELAN files to match TXT and WAVs')
    parser.add_argument('-i', '--input_dir', help='Folder of TXT and WAV files', default='input')
    parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')

    parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
    parser.add_argument('--no-copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_false')
    parser.set_defaults(copy_wavs=False)
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir
    copy_wavs = args.copy_wavs

    if copy_wavs:
        print('copying WAVs')
    else:
        print('skip copying WAVs')

    # Reset the output dir (it may not exist yet on a first run)
    print("resetting output dir")
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Go
    print("making elan files")
    make_elans(input_dir, output_dir, copy_wavs)


if __name__ == "__main__":
    main()
Thread 68 | Sender 69 | Recipient 70 | Creation Date 71 | Modification Date 72 | 73 | 74 | AAM-LR Phone level audio segmentation 75 | 76 | 77 | 342,23 78 | 79 | 80 | 200.0 81 | 82 | 83 | 1695 84 | 85 | 86 | 87 | http://lux17.mpi.nl/aamlr/ 88 | 89 | 90 | ///Users/bbb/Desktop/abui-recordings/extra/1_1_3.wav 91 | 92 | 93 | 94 | 515 95 | 96 | 97 | 1695 98 | 99 | 100 | 101 | 160,90,90 102 | 103 | 104 | 105 | Phrase 106 | test 107 | 108 | 109 | 110 | 0.06 111 | 112 | 113 | 114 | 115 | 70 116 | 117 | 118 | Sender 119 | Recipient 120 | 121 | 122 | -------------------------------------------------------------------------------- /elan-splitter/split_eafs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 3 | # Copyright Ben Foley ben@cbmm.io 30 Jan 2018 4 | # 5 | # Split an audio file by the start and end times of annotations on a particular .eaf tier 6 | # Don't worry about 'Parsing unknown version of ELAN spec... ' warnings, 7 | # pympi is looking for v 2.7 or 2.8 of elan schema 8 | 9 | # You can open and save WAV files with pure python. 10 | # For opening and saving non-wav files – like mp3 – you'll need ffmpeg or libav. 
# On OSX, install ffmpeg with `brew install ffmpeg`


# default usage: python3 split_eafs.py

import argparse
import glob
import json
import os
import sys
from pydub import AudioSegment
from pympi.Elan import Eaf
from slugify import slugify


parser = argparse.ArgumentParser(description="This script will slice audio and output text based on ELAN annotations.")
parser.add_argument('-i', '--input_dir', help='Directory of audio and eaf files', type=str, default='./input')
parser.add_argument('-o', '--tier_order', help='Get the annotations from this tier index, eg top tier would be 1', type=int, default='1')
parser.add_argument('-t', '--slice_tier', help='Tier name to use for slicing start end times', type=str, default='default')
parser.add_argument('-w', '--text_tier', help='Tier name to use for annotation text', type=str, default='default')
parser.add_argument('-m', '--silence_marker', help='Skip any annotations on the target language tier with this value', type=str, default='*PUB')
parser.add_argument('-s', '--silence_tier', help='Silence audio when annotations are found on this ref tier', type=str, default='Silence')
parser.add_argument('-a', '--output_audio_dir', help='Directory to save the audio files', type=str, default='./output')
parser.add_argument('-l', '--output_label_dir', help='Directory to save text files', type=str, default='./output')
parser.add_argument('-j', '--output_json', help='File name to output json', type=str, default='./output/annotations.json')
parser.add_argument('-v', '--verbose', help='Verbose output', action='store_true')
parser.add_argument('-n', '--name_with_annotation', help='Name the file with annotation value', action='store_true')
parser.add_argument('-p', '--prefix', help='Prefix the file name with this', type=str, default='')
parser.add_argument('-f', '--output_audio_format', help='Audio format to save media clips (requires ffmpeg)', type=str, default='wav')
args = parser.parse_args()

# Every option above has a default, so reading the attributes cannot fail;
# the settings are kept as module-level names used by the functions below.
input_dir = args.input_dir
tier_order = args.tier_order
slice_tier = args.slice_tier
text_tier = args.text_tier
silence_marker = args.silence_marker
silence_tier = args.silence_tier
output_audio_dir = args.output_audio_dir
output_label_dir = args.output_label_dir
output_json = args.output_json
verbose = args.verbose
name_with_annotation = args.name_with_annotation
prefix = args.prefix
output_audio_format = args.output_audio_format.lower()

# Limit to wav or mp3 outputs
valid_formats = ['mp3', 'wav']
if output_audio_format not in valid_formats:
    raise Exception('Please use wav or mp3 output formats only')

os.makedirs(output_audio_dir, exist_ok=True)
os.makedirs(output_label_dir, exist_ok=True)

if verbose:
    print("tier_order", tier_order)
    print("slice_tier", slice_tier)


def split_audio_by_start_end(input_audio, start, end, fname):
    """
    Export the part of the audio between start and end as its own file.

    :param input_audio: pydub AudioSegment holding the complete recording
    :param start: Clip start in milliseconds
    :param end: Clip end in milliseconds
    :param fname: Output file name without extension
    """
    output = input_audio[start:end]
    output.export(os.path.join(output_audio_dir, fname + "." + output_audio_format), format=output_audio_format)


def write_text(annotation, fname):
    """
    Write an annotation value to a text file in the label output dir.

    :param annotation: Annotation text to save
    :param fname: Output file name without extension
    """
    # Annotations are often non-ASCII, so do not rely on the locale encoding
    with open(os.path.join(output_label_dir, fname + ".txt"), 'w', encoding='utf-8') as f:
        f.write(annotation)


def write_json(annotations_data):
    """
    Save the annotation records as JSON, replacing any existing file.

    :param annotations_data: List of dicts, one per exported annotation
    """
    # ensure_ascii=False emits raw Unicode, so the file has to be UTF-8
    with open(output_json, 'w', encoding='utf-8') as outfile:
        json.dump(annotations_data,
                  outfile,
                  indent=4,
                  separators=(',', ': '),
                  sort_keys=False,
                  ensure_ascii=False)


def read_eaf(ie):
    """
    Slice the WAV that matches an ELAN file into one clip per annotation.

    For every annotation on the text tier that is not skipped, an audio clip
    and a text file are written, and a record is added to the JSON output.
    An annotation is skipped when its value equals the silence marker, or
    when the silence tier has an annotation at its start time.

    :param ie: Path of the input .eaf file; a .wav with the same base name
        must be in the same directory
    :return: False when a required tier is missing, otherwise None
    """
    if verbose:
        print("input file is", ie)

    input_eaf = Eaf(ie)

    # Check if the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)

    # Are we working by slice_tier name or order?
    # NOTE(review): tier_order and slice_tier are only validated and reported
    # here; the annotations themselves are always read from text_tier.
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:
        # Sanity check that the slice_tier num is not greater than the num of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers",
                  file=sys.stderr)
            return False
        if verbose:
            print("using slice_tier by number:", tier_names[tier_order-1], file=sys.stderr)

    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False

    if silence_tier not in tier_names and verbose:
        print('silence tier not found: ' + silence_tier, file=sys.stderr)

    # get the input audio file
    in_dir, name = os.path.split(ie)
    basename, _ = os.path.splitext(name)

    # we can write out mp3 or whatever, still require wav input
    input_audio = AudioSegment.from_wav(os.path.join(in_dir, basename + ".wav"))

    # We can pass in an arg for a ref tier that has silence labels
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        # NOTE(review): the original compared against an undefined name
        # `tier`; text_tier is assumed, as it supplies the annotations whose
        # start times are looked up on the silence tier - confirm.
        if silence_tier_info.get("PARENT_REF") == text_tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier')
        return False

    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))

    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    annotations_data = []
    i = 0
    for ann in annotations:
        start = ann[0]
        end = ann[1]
        annotation = ann[2]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = end - start

        # Check for annotations labelled with a particular symbol on the main tier
        skip = annotation == silence_marker

        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier and input_eaf.get_ref_annotation_at_time(silence_tier, start):
            skip = True

        if skip:
            print('skipping annotation: ' + annotation, start, end)
            continue

        print('processing annotation: ' + annotation, start, end)
        # build the output audio/text filename
        fname = basename + "_" + str(i)
        if name_with_annotation:
            fname = slugify(annotation)

        if prefix != '':
            fname = prefix + '_' + fname
        obj = {
            'audioFileName': os.path.join(".", fname + ".wav"),
            'transcript': annotation,
            'startMs': clip_start,
            'stopMs': clip_end
        }
        if 'PARTICIPANT' in params:
            obj["speakerId"] = speaker_id
        annotations_data.append(obj)
        split_audio_by_start_end(input_audio, start, end, fname)
        write_text(annotation, fname)
        i += 1

    # output the json data for the next step in kaldi pipeline
    write_json(annotations_data)

    if verbose:
        print(annotations_data)


def findFilesByExt(setOfAllFiles, exts):
    """
    Return the paths whose lower-cased extension matches a "*.ext" pattern.

    :param setOfAllFiles: Candidate paths
    :param exts: Set of lower-case patterns such as {"*.eaf"}
    :return: List of the matching paths
    """
    res = []
    for f in setOfAllFiles:
        name, ext = os.path.splitext(f)
        if ("*" + ext.lower()) in exts:
            res.append(f)
    return res


g_exts = ["*.eaf"]
allFilesInDir = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))
input_eafs = findFilesByExt(allFilesInDir, set(g_exts))

for ie in input_eafs:
    read_eaf(ie)
def load_json_file(file_name: str) -> List[Dict[str, str]]:
    """
    Load the JSON document stored in the given file.

    :param file_name: name of the file containing JSON to read from.
    :return: the decoded content of the JSON file, or an empty list when the
             file does not exist or is empty.
    """
    if not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        return []
    with open(file_name, "r", encoding="utf-8") as file_:
        return json.load(file_)


def write_data_to_json_file(data: object = {},
                            output: Union[str, TextIOWrapper, None] = None) -> None:
    """
    Serialise the given Python object (dictionary or list) as indented JSON and
    write it to the given output, which is either a file path (a string) or
    an open text stream such as sys.stdout or sys.stderr.

    :param data: the Python object to be converted to JSON and written.
                 The shared default dictionary is never mutated here.
    :param output: path of the file to write, or a writable text stream.
                   When omitted (None) the JSON is printed to sys.stdout.
    """
    json_data_string = json.dumps(data,
                                  indent=4,
                                  separators=(',', ': '),
                                  sort_keys=False,
                                  ensure_ascii=False)

    if isinstance(output, str):
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # written as UTF-8 explicitly (load_json_file reads it back as UTF-8).
        with open(output, "w", encoding="utf-8") as file:
            file.write(json_data_string)
    else:
        # print() treats file=None as sys.stdout.
        print(json_data_string, file=output, flush=True)


def save_tier_info(input_eaf: Union["Eaf", None] = None,
                   file_name: str = '',
                   tier_types: Union[List[str], None] = None,
                   corpus_tiers_file: str = 'corpus_tiers.json'):
    """
    Append a record of the tiers of one eaf file to the corpus tiers JSON file.

    The record has the form {"file": file_name, "tiers": [{tier_type: tier_names}, ...]}
    and is appended to whatever list is already stored in corpus_tiers_file.

    :param input_eaf: the Eaf object to query; required when tier_types is not empty.
    :param file_name: name stored under the "file" key of the record.
    :param tier_types: linguistic type names whose tier ids should be recorded.
    :param corpus_tiers_file: path of the JSON file that accumulates the records.
    """
    tiers = [{tier_type: input_eaf.get_tier_ids_for_linguistic_type(tier_type)}
             for tier_type in (tier_types or [])]
    corpus_tiers = load_json_file(corpus_tiers_file)
    corpus_tiers.append({"file": file_name, "tiers": tiers})
    write_data_to_json_file(data=corpus_tiers,
                            output=corpus_tiers_file)


def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation Format).
    Transcriptions are read from one tier of the file.

    The tier to read from is chosen as follows:

    * if tier_order is non-zero it is used as a 1-based index into the tier
      names of the file (eg the top tier is order 1);
    * otherwise, if tier_type (eg default-lt) is a linguistic type of the file
      and at least one tier uses it, the first tier of that type is used;
    * in every other case (order out of range, type missing or unused)
      tier_name (eg Phrase) is used as given.

    If the selected tier name is not present, no annotations are returned.
    Elan can have multiple tiers of the same type; only the first is read.

    Each annotation is stored in the following format:
                    {'speaker_id': <only when the tier has a PARTICIPANT>,
                    'audio_file_name': <eaf base name + ".wav">,
                    'transcript': <annotation text>,
                    'start_ms': <annotation start>,
                    'stop_ms': <annotation end>}

    :param input_elan_file: name of input elan file
    :param tier_order: 1-based index of the elan tier to process, 0 to ignore
    :param tier_name: name of the elan tier to process
    :param tier_type: type of the elan tier to process
    :param corpus_tiers_file: currently unused; kept for the optional
                              save_tier_info() corpus analysis step
    :return: a list of dictionaries, where each dictionary is an annotation
    :raises ValueError: if no wav file with the same base name sits next to the eaf file
    """

    print(f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}")

    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, _extension = os.path.splitext(full_file_name)

    # Look for wav file matching the eaf file in same directory
    if not os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        raise ValueError(f"WAV file not found for {full_file_name}. "
                         f"Please put it next to the eaf file in {input_directory}.")
    print("WAV file found for " + file_name, file=sys.stderr)

    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())

    # NOTE: save_tier_info(input_eaf=input_eaf, tier_types=tier_types,
    # file_name=file_name, corpus_tiers_file=corpus_tiers_file) can be called
    # here to keep the tier data handy for future corpus analysis.

    if tier_order:
        # tier_order is 1-index but List indexing is 0-index. Files may not have
        # this many tiers, and a negative order must not wrap around to the end.
        if 1 <= tier_order <= len(tier_names):
            tier_name = tier_names[tier_order - 1]
            print(f"using tier order {tier_order} to get tier name {tier_name}")
        else:
            print("couldn't find a tier")
    elif tier_type in tier_types:
        print(f"found tier type {tier_type}")
        typed_tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
        if typed_tier_names:
            tier_names = typed_tier_names
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            # A linguistic type can be declared without any tier using it;
            # fall back to the requested tier name instead of crashing.
            print(f"no tiers use tier type {tier_type}")
    else:
        print("tier type not found in this file")

    # Annotations are tuples starting with (start, end, text); the times are
    # presumably milliseconds, as the output keys suggest - confirm against pympi.
    annotations: List[tuple] = []
    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)

    annotations_data: List[dict] = []
    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        # Tier parameters hold things like the speaker id (PARTICIPANT)
        parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation_text = annotation[2]
            print(f"annotation {annotation} {start} {end}")
            obj = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation_text,
                "start_ms": start,
                "stop_ms": end
            }
            if "PARTICIPANT" in parameters:
                obj["speaker_id"] = speaker_id
            annotations_data.append(obj)

    return annotations_data


def main():

    """
    Run the entire elan_to_json.py as a command line utility. It extracts information on speaker, audio file,
    transcription etc. from the selected tier of every .eaf file found (recursively) in the input directory
    and writes all annotations, sorted by audio file name, to one JSON file in the output directory.

    Tier can be selected by name, tier order or tier type

    Usage: python3 elan_to_json.py [-h] [-i INPUT_DIR] [-o OUTPUT_DIR] [-j OUTPUT_JSON]
                                   [-r TIER-ORDER] [-n TIER-NAME] [-t TIER-TYPE]

    """

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="This script takes an directory with ELAN files and "
                    "saves the audio and output text in JSON format to a file")
    parser.add_argument("-i", "--input_dir",
                        help="Directory of dirty audio and eaf files",
                        default="./input/")
    parser.add_argument("-o", "--output_dir",
                        help="Output directory",
                        default="./output/")
    parser.add_argument("-j", "--output_json",
                        help="File path to output json",
                        default="elan.json")
    parser.add_argument("-r", "--tier_order",
                        help="Source tier order",
                        type=int,
                        default=0)
    parser.add_argument("-n", "--tier_name",
                        help="Source tier name",
                        default="Phrase")
    parser.add_argument("-t", "--tier_type",
                        help="Source tier type",
                        default="default-lt")
    arguments: argparse.Namespace = parser.parse_args()

    # Build output directory if needed
    os.makedirs(arguments.output_dir, exist_ok=True)

    # Sort the de-duplicated paths so files are processed in a reproducible order
    all_files_in_directory = set(glob.glob(os.path.join(arguments.input_dir, "**"), recursive=True))
    input_elan_files = sorted(file_ for file_ in all_files_in_directory if file_.endswith(".eaf"))

    annotations_data = []

    for input_elan_file in input_elan_files:
        annotations_data.extend(process_eaf(input_elan_file=input_elan_file,
                                            tier_order=arguments.tier_order,
                                            tier_name=arguments.tier_name,
                                            tier_type=arguments.tier_type))

    # Group the output by audio file; the sort is stable, so annotations of one
    # file keep their time order.
    annotations_data.sort(key=lambda x: x["audio_file_name"], reverse=False)

    write_data_to_json_file(data=annotations_data,
                            output=os.path.join(arguments.output_dir, arguments.output_json))


if __name__ == "__main__":
    main()
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------