├── json-to-elan
    ├── input
    │   └── .gitkeep
    ├── requirements.txt
    ├── .gitignore
    ├── README.md
    └── convert.py
├── make-elans-from-wavs
    ├── wav
    │   └── .gitkeep
    ├── flac
    │   └── .gitkeep
    ├── .gitignore
    ├── README.md
    ├── flac2wav.py
    ├── flatten.py
    └── make-elan.py
├── elan-character-spacer
    ├── input
    │   ├── .gitkeep
    │   ├── 1_1_3.eaf
    │   └── abui_1.eaf
    ├── output
    │   ├── .gitkeep
    │   ├── 1_1_3.eaf
    │   ├── abui_1.eaf
    │   └── 1_1_3.pfsx
    ├── .gitignore
    ├── README.md
    └── elan-insert-spaces.py
├── python-tier-selector
    ├── input
    │   ├── .gitkeep
    │   ├── abui_2.eaf
    │   ├── abui_3.eaf
    │   ├── abui_4.eaf
    │   └── abui_1.eaf
    ├── .gitignore
    └── select-tiers.py
├── make-elans-from-wavs-and-spreadsheet
    ├── eaf
    │   └── .gitkeep
    ├── input
    │   ├── .gitkeep
    │   └── test.xlsx
    ├── wav
    │   └── .gitkeep
    ├── .gitignore
    ├── README.md
    └── make-elan.py
├── elan-splitter
    ├── requirements.txt
    ├── .gitignore
    ├── README.md
    └── split_eafs.py
├── .gitignore
├── elan-to-json
    ├── input
    │   ├── abui_1.wav
    │   ├── abui_2.wav
    │   ├── abui_3.wav
    │   ├── abui_4.wav
    │   ├── abui_2.eaf
    │   ├── abui_3.eaf
    │   ├── abui_4.eaf
    │   └── abui_1.eaf
    ├── output
    │   └── elan.json
    ├── README.md
    └── elan_to_json.py
├── make-elans-from-text
    ├── .gitignore
    ├── README.md
    ├── requirements.txt
    └── convert.py
├── make-elans-from-timit
    ├── .gitignore
    ├── requirements.txt
    ├── README.md
    └── convert.py
├── CONTRIBUTING.md
├── README.md
└── LICENSE


/json-to-elan/input/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/wav/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/elan-character-spacer/input/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/elan-character-spacer/output/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/json-to-elan/requirements.txt:
--------------------------------------------------------------------------------
1 | pympi-ling==1.70.2
2 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/flac/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/python-tier-selector/input/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/eaf/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/input/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/wav/.gitkeep:
--------------------------------------------------------------------------------
1 | # keep this dir


--------------------------------------------------------------------------------
/elan-splitter/requirements.txt:
--------------------------------------------------------------------------------
1 | pydub
2 | pympi-ling
3 | python-slugify
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # General
2 | .DS_Store
3 | # Thumbnails
4 | ._*
5 | venv
6 | __pycache__
7 | .idea


--------------------------------------------------------------------------------
/elan-to-json/input/abui_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_1.wav


--------------------------------------------------------------------------------
/elan-to-json/input/abui_2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_2.wav


--------------------------------------------------------------------------------
/elan-to-json/input/abui_3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_3.wav


--------------------------------------------------------------------------------
/elan-to-json/input/abui_4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/elan-to-json/input/abui_4.wav


--------------------------------------------------------------------------------
/make-elans-from-wavs/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | .idea
3 | .DS_Store
4 | flac/*
5 | wav/*
6 | !flac/.gitkeep
7 | !wav/.gitkeep
8 | 
9 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/input/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoEDL/elan-helpers/HEAD/make-elans-from-wavs-and-spreadsheet/input/test.xlsx


--------------------------------------------------------------------------------
/elan-splitter/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | # Source and target data dirs
 6 | input/*
 7 | !input/.gitkeep
 8 | output/*
 9 | !output/.gitkeep
10 | 


--------------------------------------------------------------------------------
/elan-character-spacer/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | # Source and target data dirs
 6 | input/*
 7 | !input/.gitkeep
 8 | output/*
 9 | !output/.gitkeep
10 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/.gitignore:
--------------------------------------------------------------------------------
 1 | venv
 2 | .idea
 3 | .DS_Store
 4 | flac/*
 5 | wav/*
 6 | eaf/*
 7 | !flac/.gitkeep
 8 | !wav/.gitkeep
 9 | !eaf/.gitkeep
10 | input/~*
11 | 
12 | 


--------------------------------------------------------------------------------
/json-to-elan/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | venv
 6 | __pycache__
 7 | .idea
 8 | all
 9 | # Source and target data dirs
10 | input/*
11 | !input/.gitkeep
12 | output/*
13 | !output/.gitkeep
14 | 


--------------------------------------------------------------------------------
/python-tier-selector/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | venv
 6 | __pycache__
 7 | .idea
 8 | # Source and target data dirs
 9 | input/*
10 | !input/.gitkeep
11 | output/*
12 | !output/.gitkeep
13 | 


--------------------------------------------------------------------------------
/make-elans-from-text/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | venv
 6 | __pycache__
 7 | .idea
 8 | all
 9 | # Source and target data dirs
10 | input/*
11 | !input/.gitkeep
12 | output/*
13 | !output/.gitkeep
14 | 


--------------------------------------------------------------------------------
/make-elans-from-timit/.gitignore:
--------------------------------------------------------------------------------
 1 | # General
 2 | .DS_Store
 3 | # Thumbnails
 4 | ._*
 5 | venv
 6 | __pycache__
 7 | .idea
 8 | all
 9 | # Source and target data dirs
10 | input/*
11 | !input/.gitkeep
12 | output/*
13 | !output/.gitkeep
14 | 


--------------------------------------------------------------------------------
/elan-to-json/output/elan.json:
--------------------------------------------------------------------------------
1 | [
2 |     {
3 |         "audio_file_name": "abui_1.wav",
4 |         "transcript": "amakaang 你好 di kaai hada muila",
5 |         "start_ms": 290,
6 |         "stop_ms": 1910,
7 |         "speaker_id": "SL"
8 |     }
9 | ]


--------------------------------------------------------------------------------
/elan-character-spacer/README.md:
--------------------------------------------------------------------------------
1 | # Elan character spacer
2 | 
3 | This script will space-separate characters in all words in specified Elan tiers from `input` folder and save modified files to `output` folder.
4 | 
5 | Very crude, requires the input and output dirs to already exist.
6 | 
7 | May not be UTF-8 safe.


--------------------------------------------------------------------------------
/make-elans-from-text/README.md:
--------------------------------------------------------------------------------
 1 | Given a folder of `.txt` files containing annotations of audio in `.wav` files, create `.eaf` ELAN files.
 2 | 
 3 | Annotation start-time is 0 and end-time is duration of audio.  
 4 | 
 5 | 
 6 | ```shell
 7 | python3 -m venv venv
 8 | source venv/bin/activate
 9 | pip install -r requirements.txt
10 | python convert.py
11 | ```
12 | 
13 | 
14 | Use optional `--copy_wavs` setting to copy WAV files into the out dir


--------------------------------------------------------------------------------
/make-elans-from-text/requirements.txt:
--------------------------------------------------------------------------------
 1 | appdirs==1.4.4
 2 | audioread==2.1.9
 3 | certifi==2020.12.5
 4 | cffi==1.14.5
 5 | chardet==4.0.0
 6 | decorator==5.0.9
 7 | idna==2.10
 8 | joblib==1.0.1
 9 | librosa==0.8.1
10 | llvmlite==0.36.0
11 | numba==0.53.1
12 | numpy==1.20.3
13 | packaging==20.9
14 | pooch==1.3.0
15 | pycparser==2.20
16 | pympi-ling==1.69
17 | pyparsing==2.4.7
18 | requests==2.25.1
19 | resampy==0.2.2
20 | scikit-learn==0.24.2
21 | scipy==1.6.3
22 | six==1.16.0
23 | SoundFile==0.10.3.post1
24 | threadpoolctl==2.1.0
25 | urllib3==1.26.5
26 | 


--------------------------------------------------------------------------------
/make-elans-from-timit/requirements.txt:
--------------------------------------------------------------------------------
 1 | appdirs==1.4.4
 2 | audioread==2.1.9
 3 | certifi==2020.12.5
 4 | cffi==1.14.5
 5 | chardet==4.0.0
 6 | decorator==5.0.9
 7 | idna==2.10
 8 | joblib==1.0.1
 9 | librosa==0.8.1
10 | llvmlite==0.36.0
11 | numba==0.53.1
12 | numpy==1.20.3
13 | packaging==20.9
14 | pooch==1.3.0
15 | pycparser==2.20
16 | pympi-ling==1.69
17 | pyparsing==2.4.7
18 | requests==2.25.1
19 | resampy==0.2.2
20 | scikit-learn==0.24.2
21 | scipy==1.6.3
22 | six==1.16.0
23 | SoundFile==0.10.3.post1
24 | threadpoolctl==2.1.0
25 | urllib3==1.26.5
26 | 


--------------------------------------------------------------------------------
/make-elans-from-timit/README.md:
--------------------------------------------------------------------------------
 1 | Given a folder of `.txt` files containing annotations of audio in `.wav` files, create `.eaf` ELAN files.
 2 | 
 3 | Annotation start-time is 0 and end-time is duration of audio.  
 4 | 
 5 | TIMIT text files are in `start end text` format, e.g.
 6 | ```
 7 | 0 49460 Even then, if she took one step forward he could catch her.
 8 | 
 9 | ```
10 | 
11 | 
12 | ```shell
13 | python3 -m venv venv
14 | source venv/bin/activate
15 | pip install -r requirements.txt
16 | python convert.py
17 | ```
18 | 
19 | 
20 | Use optional `--copy_wavs` setting to copy WAV files into the out dir


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribution guide
 2 | 
 3 | 1. Create a new folder in the top level
 4 | 	- Use lower case, hyphen-separated words, e.g. `header-extractor`
 5 | 
 6 | 2. Place your script(s) in that folder, e.g. `header-extractor/extract.py`
 7 | 
 8 | 3. Add all dependencies into a `dist` folder, e.g. `header-extractor/dist/some-dependency.py`
 9 | 
10 | 4. Include a `README.md` in the root of your script folder, e.g. `header-extractor/README.md`, specifying
11 | 	1. the usage environment (e.g. Python 3.6 + numpy + Libxml2)
12 | 	2. usage instructions (e.g. `python extract.py infile.cha outfile.csv`)
13 | 
14 | 5. Ideally, also include dummy or sample files to show a minimal working example of the script.
15 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/README.md:
--------------------------------------------------------------------------------
 1 | # Make ELANs from WAVs
 2 | 
 3 | Processing scripts to generate Elan files matching nested folders of WAVs. The WAV filenames are used as the annotation value, and written forms of digits are converted to spoken forms.
 4 | 
 5 | Written for the TIDIGITS corpus, so there are some specific naming manipulations in here that you won't need for your own data, including splitting filenames into individual characters. E.g., for file `123.wav` the script will create an Elan file `123.eaf` with annotation `one two three`.
 6 | 
 7 | Includes script to convert FLAC audio to WAV. 
 8 | 
 9 | This was written to convert the TIDIGITS corpus audio to an Elpis-ready format.
10 | 
11 | To use it, drop your audio in the flac dir and run `flac2wav.py` to convert FLAC audio to WAV.
12 | 
13 | Then run the `make-elan.py` script to generate Elan files. 
14 | 
15 | If you want files to be in a single directory rather than nested, you can run `flatten.py`, which will move files into a single dir, renaming the files to incorporate the original dir structure in the filenames.
16 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/README.md:
--------------------------------------------------------------------------------
 1 | # Make ELANs from WAVs and a spreadsheet
 2 | 
 3 | This is a script to generate ELAN files matching a folder of WAVs. The WAV filenames are used to retrieve the annotation value from a spreadsheet. If an audio filename isn't found in the spreadsheet, an ELAN file with a blank annotation value will be created.
 4 | 
 5 | ## Requirements
 6 | 
 7 | This script has been written for Python3.
 8 | 
 9 | The spreadsheet must contain at least one column named "File name" and one column named "Transcription". The spreadsheet can contain other columns.
10 | 
11 | Put your audio in the wav dir, put your spreadsheet in the input dir.
12 | 
13 | ## Usage
14 | 
15 | Start a Python virtual environment.
16 | 
17 | ```bash
18 | python3 -m venv venv
19 | source ./venv/bin/activate
20 | 
21 | ```
22 | 
23 | Install the packages which the script needs.
24 | 
25 | ```bash
26 | pip install argparse librosa pympi-ling pandas xlrd
27 | 
28 | ```
29 | 
30 | Run the script.
31 | 
32 | ```bash
33 | python make-elan.py -a input/test.xlsx -s wav -t eaf 
34 | ```
35 | 
36 | 
37 | When finished, deactivate the venv.
38 | 
39 | ```bash
40 | deactivate
41 | ```


--------------------------------------------------------------------------------
/elan-character-spacer/elan-insert-spaces.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | import xml.etree.ElementTree as ET
 3 | import glob
 4 | import os
 5 | from pathlib import Path
 6 | 
 7 | 
 8 | def spaceMe(file_):
 9 |     print(file_)
10 |     # Which tier?
11 |     tier_name = 'Phrase'
12 |     tree = ET.parse(file_)
13 | 
14 |     root = tree.getroot()
15 | 
16 |     for tier in root.iter('TIER'):
17 |         if tier.attrib['TIER_ID'] == tier_name:
18 |             for annotation in tier.iter('ANNOTATION_VALUE'):
19 |                 # Get the original text
20 |                 # OPTION: use this to not end up with double spaces between words
21 |                 source_text = annotation.text.replace(" ", "")
22 |                 # OR: use this to have double spaces between words
23 |                 # source_text = annotation.text
24 | 
25 |                 insert_spaces = " ".join(source_text)
26 | 
27 |                 # update the annotation
28 |                 annotation.text = str(insert_spaces)
29 | 
30 |                 # feedback
31 |                 print("done")
32 | 
33 |     # Save the file to output dir
34 |     tree.write(os.path.join("output", os.path.basename(file_)))
35 | 
def main():
    """Run spaceMe on every .eaf file found, recursively, under ./input."""
    input_dir = Path('./input')
    for eaf_path in input_dir.rglob('*.eaf'):
        spaceMe(eaf_path)


# Only process files when executed as a script, not when imported.
if __name__ == "__main__":
    main()
42 | 


--------------------------------------------------------------------------------
/elan-to-json/README.md:
--------------------------------------------------------------------------------
 1 | # Elan to JSON
 2 | The script reads an Elan file (or a directory including Elan files) and exports a JSON file with the annotations on a selected tier. You can choose which tier by passing a tier name, tier type, or a tier order number as an argument to the script. Note that the tier selection has to be consistent across the data set.
 3 | 
 4 | ### Sample usage:
 5 | 
 6 | This will write a JSON file containing the annotations on the third tier (from the top) of all `.eaf` files in a folder named `transcriptions`. 
 7 | 
 8 | ```
 9 | python3 elan_to_json.py -i transcriptions -r 3
10 | ```
11 | 
12 | ### Sample output:
13 | 
14 | ```
15 | [
16 |     {
17 |         "audio_file_name": "abui_1.wav",
18 |         "transcript": "amakaang di kaai hada muila",
19 |         "start_ms": 290,
20 |         "stop_ms": 1910,
21 |         "speaker_id": "SL"
22 |     },
23 |     {
24 |         "audio_file_name": "abui_2.wav",
25 |         "transcript": "dining ayoku kamar mia mui muila",
26 |         "start_ms": 890,
27 |         "stop_ms": 2960,
28 |         "speaker_id": "SL"
29 |     },
30 |     {
31 |         "audio_file_name": "abui_3.wav",
32 |         "transcript": "hekaai dining ayoku kamar mia muila",
33 |         "start_ms": 1850,
34 |         "stop_ms": 4140,
35 |         "speaker_id": "SL"
36 |     }
37 | ]
38 | ```
39 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/flac2wav.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def flac2wav(source_parent_dir, target_parent_dir):
 5 |     """
 6 |     Convert flac audio to wav
 7 |     """
 8 | 
 9 |     for dirname, dirnames, filenames in os.walk(source_parent_dir):
10 | 
11 |         # print path to all subdirectories first.
12 |         for subdirname in dirnames:
13 |             print(os.path.join(dirname, subdirname))
14 | 
15 |         # print path to all filenames.
16 |         for filename in filenames:
17 |             if '.flac' in filename:
18 |                 print(filename)
19 |                 parent, gender, child = dirname.split(os.path.sep)
20 |                 basename, ext = os.path.splitext(os.path.basename(filename))
21 |                 print(parent, gender, child, filename)
22 | 
23 |                 source_path = os.path.join(source_parent_dir, gender, child)
24 |                 target_path = os.path.join(target_parent_dir, gender, child)
25 | 
26 |                 if not os.path.exists(target_path):
27 |                     os.makedirs(target_path)
28 | 
29 |                 os.system(f'ffmpeg -hide_banner -loglevel warning -y -i {source_path}/{filename} {target_path}/{basename}.wav')
30 | 
31 | 
# Script entry point: convert everything under ./flac into ./wav.
if __name__ == "__main__":
    flac2wav('flac', 'wav')
36 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/flatten.py:
--------------------------------------------------------------------------------
 1 | from os import scandir, chdir
 2 | from pathlib import Path
 3 | from shutil import move, rmtree
 4 | 
 5 | 
 6 | def flatten(root_path: str, absolute_root: str) -> None:
 7 |     """
 8 |     Flatten a tree of files and give the files the names of the enclosing
 9 |     files separated by underscores.
10 |     WARNING: operates in place and will destroy exiting file structure.
11 |     :param root_path: the local root (at the start this will match the
12 |     absolute root)
13 |     :param absolute_root: the root directory of the tree you want to flatten
14 |     """
15 |     path = Path(root_path)
16 |     stack = set(scandir(root_path))
17 |     while stack:
18 |         entry = stack.pop()
19 |         entry_path = Path(entry.path)
20 |         if ".DS_Store" in entry.name or not entry_path.exists() and entry_path.is_file():
21 |             if entry_path.exists():
22 |                 entry_path.unlink()
23 |             continue
24 |         if entry_path.is_dir():
25 |             flatten(entry.path, absolute_root)
26 |             if root_path != absolute_root:
27 |                 stack = stack.union(filter(lambda x: x.is_file(),
28 |                                         list(scandir(root_path))))
29 |             rmtree(entry.path)
30 |         elif entry_path.exists() and entry_path.is_file():
31 |             target_file = f"{path.parent.resolve()}/{path.name}_{entry.name}"
32 |             move(entry.path, target_file)
33 | 
34 | 
# Script entry point: flatten the ./wav tree in place.
if __name__ == "__main__":
    wav_root = "wav"
    flatten(wav_root, wav_root)
39 | 


--------------------------------------------------------------------------------
/json-to-elan/README.md:
--------------------------------------------------------------------------------
 1 | # JSON to ELAN
 2 | 
 3 | The script reads a JSON file (or folder) and generates an ELAN file to match.
 4 | 
 5 | ## JSON format
 6 | 
 7 | It has been written for the JSON output from Huggingface ASR pipelines. Here's an example of the expected JSON format. 
 8 | 
 9 | ```json
10 | [
11 |     {
12 |         "text": "luanghan",
13 |         "timestamp":
14 |         [
15 |             1.16,
16 |             1.48
17 |         ]
18 |     },
19 |     {
20 |         "text": "ian",
21 |         "timestamp":
22 |         [
23 |             1.56,
24 |             1.7
25 |         ]
26 |     }
27 | ]
28 | ```
29 | 
30 | ## Setup 
31 | 
32 | Create a virtual environment and install the required packages.
33 | ```
34 | python3 -m venv venv
35 | source venv/bin/activate
36 | pip install -r requirements.txt
37 | ```
38 | 
39 | 
40 | Put your JSON files in the `input` directory, then run the script. The script will build an ELAN file for each JSON file, using that JSON file's annotation data. It expects that your WAV files are named the same as the JSON basename (e.g., if your JSON file is named `audio_1.json`, the ELAN file will end up with a linked media file added for `audio_1.wav`). To open the ELAN file you will need to copy your audio into the output dir. Note that the output dir is erased each time the script is run, so be sure to keep a copy of the audio.
41 | ```
42 | python convert.py
43 | ```
44 | 
45 | Optionally, you can specify a different input directory for the JSON files, and the output directory to write the ELAN files. If you have WAV files in the input directory, you can choose to copy them into the output too. You can also set a different tier name from the default (which is "default"). Here's an example:
46 | ```
47 | python convert.py --tier_name Words --input source_files --output elan_files --copy_wavs 
48 | ```
49 | 


--------------------------------------------------------------------------------
/elan-character-spacer/input/1_1_3.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2017-07-06T13:37:00+10:00" FORMAT="2.8" VERSION="2.8" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv2.8.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/neinheim/Documents/GitHub/asr-daan/toy_corpus/data/1_1_3.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./1_1_3.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">1</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="1850"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="4140"/>
11 |     </TIME_ORDER>
12 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
13 |         <ANNOTATION>
14 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts2">
15 |                 <ANNOTATION_VALUE>hekaai dining ayoku kamar mia muila</ANNOTATION_VALUE>
16 |             </ALIGNABLE_ANNOTATION>
17 |         </ANNOTATION>
18 |     </TIER>
19 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
20 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
21 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
22 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
23 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
24 | </ANNOTATION_DOCUMENT>
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Elan Helpers
 2 | 
 3 | Tools and scripts for working with ELAN. See the README file in each script folder for usage instructions.
 4 | 
 5 | ## Elan Character Spacer
 6 | 
 7 | This script will space-separate characters in all words in specified Elan tiers.
 8 | 
 9 | 
10 | ## Elan to JSON
11 | 
12 | The script reads an Elan file (or a directory including Elan files) and exports a JSON file with the annotations on a selected tier. You can choose which tier by passing a tier name, tier type, or a tier order number as an argument to the script.
13 | 
14 | 
15 | ## Elan Splitter
16 | 
17 | The script processes a directory of audio annotated in Elan, and outputs audio clips and matching-named text files containing the respective annotations. The original files are not altered. Audio clips are determined by the start and end times of annotations on the first tier. You can choose to use another tier by passing a tier name or a different order number (not tier type) as an argument to the script.
18 | 
19 | ## JSON to ELAN
20 | 
21 | This script will process a folder of JSON files containing annotations, and build ELAN files for each.
22 | 
23 | ## Python Tier Selector
24 | 
25 | This script doesn't do much on its own, but can be used as a basis for your own processing script. It looks in a folder the user specifies, and compiles a list of all the tiers in the Elan files in that folder. The script offers the user an option to select one or more tiers from the list. From this point you could extend the script to extract all the annotations on the selected tiers, or perhaps write a new Elan file that combines the selected tiers.
26 | 
27 | 
28 | ## Make ELANs from text
29 | 
30 | This script can be used to make ELAN files that are based on a folder of text annotations and WAV audio recordings.
31 | 
32 | 
33 | ## Make ELANs from TIMIT
34 | 
35 | This script can be used to make ELAN files from the TIMIT dataset. Handy for testing Elpis.
36 | 
37 | 
38 | ## Make ELANs from WAVs
39 | 
40 | Processing scripts to generate Elan files for nested folders of WAVs, using the WAV file names as the annotation values. Perfect for converting a directory of audio files to Elpis-ready format.
41 | 


--------------------------------------------------------------------------------
/elan-character-spacer/output/1_1_3.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2020-05-20T13:22:42+10:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/abui-recordings/extra/1_1_3.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="../../../Desktop/abui-recordings/extra/1_1_3.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">2</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="1850"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="2800"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="3910"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="4140"/>
13 |     </TIME_ORDER>
14 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
15 |         <ANNOTATION>
16 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts4">
17 |                 <ANNOTATION_VALUE>h e k a a i d i n i n g a y o k u k a m a r m i a m u i l a</ANNOTATION_VALUE>
18 |             </ALIGNABLE_ANNOTATION>
19 |         </ANNOTATION>
20 |     </TIER>
21 |     <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="test">
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts3">
24 |                 <ANNOTATION_VALUE>something</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
29 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
30 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
31 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
32 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
33 | </ANNOTATION_DOCUMENT>
34 | 


--------------------------------------------------------------------------------
/elan-to-json/input/abui_2.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:53:00+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_2.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_2.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:4395a55c-130b-4bfd-8c19-2785dfee027b</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">2</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="890"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1632"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="2822"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="2960"/>
13 |     </TIME_ORDER>
14 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
15 |         <ANNOTATION>
16 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts4">
17 |                 <ANNOTATION_VALUE>dining ayoku kamar mia mui muila</ANNOTATION_VALUE>
18 |             </ALIGNABLE_ANNOTATION>
19 |         </ANNOTATION>
20 |     </TIER>
21 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts3">
24 |                 <ANNOTATION_VALUE>kamar mia mui muila</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
29 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
30 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
31 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
32 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
33 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
34 | </ANNOTATION_DOCUMENT>
35 | 


--------------------------------------------------------------------------------
/elan-to-json/input/abui_3.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:52:53+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_3.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_3.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">2</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="1850"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="2730"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="4140"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="4630"/>
13 |     </TIME_ORDER>
14 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
15 |         <ANNOTATION>
16 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts3">
17 |                 <ANNOTATION_VALUE>hekaai dining ayoku kamar mia muila</ANNOTATION_VALUE>
18 |             </ALIGNABLE_ANNOTATION>
19 |         </ANNOTATION>
20 |     </TIER>
21 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts4">
24 |                 <ANNOTATION_VALUE>kamar mia muila</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
29 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
30 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
31 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
32 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
33 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
34 | </ANNOTATION_DOCUMENT>
35 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs/make-elan.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import librosa
 3 | import re
 4 | import num2words
 5 | from pympi.Elan import Eaf
 6 | 
 7 | 
 8 | def make_elan(source_parent_dir, target_parent_dir):
 9 |     """
10 |     Make elan files based on filenames of wav files
11 |     Written for the TIDIGITS corpus, so some things are specific to the name formats of that corpus
12 |     """
13 | 
14 |     for dirname, dirnames, filenames in os.walk(source_parent_dir):
15 | 
16 |         # print path to all subdirectories first.
17 |         for subdirname in dirnames:
18 |             print(os.path.join(dirname, subdirname))
19 | 
20 |         # print path to all filenames.
21 |         for filename in filenames:
22 |             if '.wav' in filename:
23 |                 parent, gender, child = dirname.split(os.path.sep)
24 |                 basename, ext = os.path.splitext(os.path.basename(filename))
25 |                 print(parent, gender, child, filename)
26 | 
27 |                 source_path = os.path.join(source_parent_dir, gender, child)
28 |                 target_path = os.path.join(target_parent_dir, gender, child)
29 | 
30 |                 if not os.path.exists(target_path):
31 |                     print(target_path)
32 |                     os.makedirs(target_path)
33 | 
34 |                 # Audio file duration - use this as end timeslot
35 |                 duration = int(librosa.get_duration(filename=os.path.join(source_path, filename))*1000)
36 | 
37 |                 # Make file annotation from filename (minus the suffix)
38 |                 annotation = " ".join([char for char in basename[:-1]])
39 |                 # These are specific to the TIDIGITS naming convention
40 |                 annotation = annotation.replace("o", "oh")
41 |                 annotation = annotation.replace("z", "zero")
42 | 
43 |                 text = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)
44 | 
45 |                 print(filename, duration, annotation, text)
46 | 
47 |                 # Make elan
48 |                 output_eaf = Eaf()
49 |                 output_eaf.add_tier('tx')
50 |                 output_eaf.insert_annotation('tx', 0, duration, text)
51 |                 output_eaf.add_linked_file(os.path.join(target_path, f'{basename}.wav'))
52 | 
53 |                 output_eaf.to_file(os.path.join(target_path, f'{basename}.eaf'))
54 | 
55 | 
if __name__ == "__main__":
    # The EAF files are written next to the WAV files they describe,
    # so the "wav" folder is both the source and the target.
    source_parent_dir = target_parent_dir = 'wav'
    make_elan(source_parent_dir, target_parent_dir)
60 | 


--------------------------------------------------------------------------------
/python-tier-selector/select-tiers.py:
--------------------------------------------------------------------------------
 1 | from PyInquirer import prompt
 2 | import glob
 3 | import os
 4 | from pympi.Elan import Eaf
 5 | from typing import Set
 6 | 
 7 | 
 8 | # Return all files that have matching extension, from a specified directory
 9 | def find_files_by_ext(all_files: Set[str], extensions: Set[str]):
10 |     files = []
11 |     for file in all_files:
12 |         name, ext = os.path.splitext(file)
13 |         if ("*" + ext.lower()) in extensions:
14 |             files.append(file)
15 |     return files
16 | 
17 | 
def select_tier(tier_names: Set[str]):
    """
    Prompt the user to tick one or more tier names from a list of options.

    :param tier_names: The tier names to offer as checkbox choices
    :return: List of the tier names the user selected; empty when there was
             nothing to choose from or the prompt gave no answer
    """
    if not tier_names:
        # A checkbox prompt without any choices cannot be answered
        print("No tiers to choose from")
        return []

    # Sets have no stable order; sort so the options are listed the same way on every run
    tier_names_checkboxes = [{'name': tier_name} for tier_name in sorted(tier_names)]
    questions = [
        {
            'type': 'checkbox',
            'name': 'tier',
            'message': 'Choose a tier',
            'choices': tier_names_checkboxes
        }
    ]
    tier_choice = prompt(questions)
    # NOTE(review): PyInquirer presumably returns an empty dict when the prompt is
    # cancelled (Ctrl+C) - treat a missing answer as "nothing selected"
    selected_tiers = tier_choice.get("tier", [])
    print("Selected tier/s:", selected_tiers)
    return selected_tiers
34 | 
35 | 
def main():
    """
    Read the ELAN files from a folder, compile the set of tier names they
    contain, then ask the user to choose from those tiers.

    Returns early, without opening the tier prompt, when no folder name was
    given or when no tiers were found.
    """
    # Start by asking where the Elan files are
    # Default is a folder named "input" in the same directory as this script
    input_dir_question = [
        {
            'type': 'input',
            'name': 'input_dir',
            'message': 'Name of folder with Elan files?',
            'default': 'input'
        }
    ]
    input_dir_prompt = prompt(input_dir_question)
    if "input_dir" not in input_dir_prompt:
        # NOTE(review): PyInquirer presumably returns an empty dict when the
        # prompt is cancelled - stop here instead of failing with a KeyError
        print("No input folder given")
        return
    input_dir = input_dir_prompt["input_dir"]

    # Get all files from the input directory (recursively), then keep the Elan files
    extensions = set(["*.eaf"])
    all_files = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))
    input_files = find_files_by_ext(all_files, extensions)

    # Compile tier info for the files in the input dir
    tier_names = set()
    for input_file_path in input_files:
        input_file = Eaf(input_file_path)
        # Get the tier names - using pympi-ling - and merge them into the set
        # that is used for the user prompt
        tier_names.update(input_file.get_tier_names())

    if not tier_names:
        # Nothing to offer, so don't open a prompt without choices
        print("No tiers found in folder:", input_dir)
        return
    select_tier(tier_names)
67 | 
68 | 
# Only start the interactive prompts when executed as a script, not on import
if __name__ == "__main__":
    main()
71 | 


--------------------------------------------------------------------------------
/python-tier-selector/input/abui_2.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:53:00+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_2.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_2.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:4395a55c-130b-4bfd-8c19-2785dfee027b</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">2</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="890"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1632"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="2822"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="2960"/>
13 |     </TIME_ORDER>
14 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
15 |         <ANNOTATION>
16 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts4">
17 |                 <ANNOTATION_VALUE>dining ayoku kamar mia mui muila</ANNOTATION_VALUE>
18 |             </ALIGNABLE_ANNOTATION>
19 |         </ANNOTATION>
20 |     </TIER>
21 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts3">
24 |                 <ANNOTATION_VALUE>kamar mia mui muila</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
29 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
30 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
31 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
32 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
33 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
34 | </ANNOTATION_DOCUMENT>
35 | 


--------------------------------------------------------------------------------
/python-tier-selector/input/abui_3.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:52:53+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_3.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_3.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:54aad654-b699-44e0-9ec6-9fba95eb3239</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">2</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="1850"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="2730"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="4140"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="4630"/>
13 |     </TIME_ORDER>
14 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
15 |         <ANNOTATION>
16 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts3">
17 |                 <ANNOTATION_VALUE>hekaai dining ayoku kamar mia muila</ANNOTATION_VALUE>
18 |             </ALIGNABLE_ANNOTATION>
19 |         </ANNOTATION>
20 |     </TIER>
21 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts4">
24 |                 <ANNOTATION_VALUE>kamar mia muila</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
29 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
30 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
31 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
32 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
33 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
34 | </ANNOTATION_DOCUMENT>
35 | 


--------------------------------------------------------------------------------
/elan-to-json/input/abui_4.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:53:41+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_4.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_4.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:269a144c-5adb-4b5e-b1b7-ea19b2e8ee04</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">3</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="660"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="660"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="2570"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="2570"/>
13 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="3950"/>
14 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="6040"/>
15 |     </TIME_ORDER>
16 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
17 |         <ANNOTATION>
18 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts3">
19 |                 <ANNOTATION_VALUE>hekaai deina del ong hayei ba</ANNOTATION_VALUE>
20 |             </ALIGNABLE_ANNOTATION>
21 |         </ANNOTATION>
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts5" TIME_SLOT_REF2="ts6">
24 |                 <ANNOTATION_VALUE>hepikaai deina botol homi dong yaari</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
29 |         <ANNOTATION>
30 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts4">
31 |                 <ANNOTATION_VALUE>hekaai deina del ong hayei ba</ANNOTATION_VALUE>
32 |             </ALIGNABLE_ANNOTATION>
33 |         </ANNOTATION>
34 |     </TIER>
35 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
36 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
37 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
38 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
39 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
40 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
41 | </ANNOTATION_DOCUMENT>
42 | 


--------------------------------------------------------------------------------
/python-tier-selector/input/abui_4.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:53:41+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_4.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_4.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:269a144c-5adb-4b5e-b1b7-ea19b2e8ee04</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">3</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="660"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="660"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="2570"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="2570"/>
13 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="3950"/>
14 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="6040"/>
15 |     </TIME_ORDER>
16 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
17 |         <ANNOTATION>
18 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts3">
19 |                 <ANNOTATION_VALUE>hekaai deina del ong hayei ba</ANNOTATION_VALUE>
20 |             </ALIGNABLE_ANNOTATION>
21 |         </ANNOTATION>
22 |         <ANNOTATION>
23 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts5" TIME_SLOT_REF2="ts6">
24 |                 <ANNOTATION_VALUE>hepikaai deina botol homi dong yaari</ANNOTATION_VALUE>
25 |             </ALIGNABLE_ANNOTATION>
26 |         </ANNOTATION>
27 |     </TIER>
28 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
29 |         <ANNOTATION>
30 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts4">
31 |                 <ANNOTATION_VALUE>hekaai deina del ong hayei ba</ANNOTATION_VALUE>
32 |             </ALIGNABLE_ANNOTATION>
33 |         </ANNOTATION>
34 |     </TIER>
35 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
36 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
37 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
38 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
39 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
40 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
41 | </ANNOTATION_DOCUMENT>
42 | 


--------------------------------------------------------------------------------
/python-tier-selector/input/abui_1.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2019-09-24T10:54:21+01:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/Desktop/test-eaf/abui_1.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_1.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">3</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="290"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="395"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="395"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="1067"/>
13 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="1067"/>
14 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="1910"/>
15 |     </TIME_ORDER>
16 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
17 |         <ANNOTATION>
18 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts6">
19 |                 <ANNOTATION_VALUE>amakaang di kaai hada muila</ANNOTATION_VALUE>
20 |             </ALIGNABLE_ANNOTATION>
21 |         </ANNOTATION>
22 |     </TIER>
23 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
24 |         <ANNOTATION>
25 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts4">
26 |                 <ANNOTATION_VALUE>amakaang di kaai</ANNOTATION_VALUE>
27 |             </ALIGNABLE_ANNOTATION>
28 |         </ANNOTATION>
29 |     </TIER>
30 |     <TIER LINGUISTIC_TYPE_REF="gloss" TIER_ID="gloss@speaker1">
31 |         <ANNOTATION>
32 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts3" TIME_SLOT_REF2="ts5">
33 |                 <ANNOTATION_VALUE>add gloss here</ANNOTATION_VALUE>
34 |             </ALIGNABLE_ANNOTATION>
35 |         </ANNOTATION>
36 |     </TIER>
37 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
38 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
39 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="gloss" TIME_ALIGNABLE="true"/>
40 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
41 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
42 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
43 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
44 | </ANNOTATION_DOCUMENT>
45 | 


--------------------------------------------------------------------------------
/json-to-elan/convert.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pympi.Elan import Eaf
 3 | from typing import List, Dict
 4 | import argparse
 5 | import shutil
 6 | import json
 7 | from pathlib import Path
 8 | 
 9 | 
def make_elans(tier_name: str, input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on JSON data.

    Each JSON file is read as a list of annotations, where every annotation has
    a "timestamp" pair (start, end - multiplied by 1000 to get the millisecond
    values written to the EAF) and a "text" value. ``input_dir`` is searched
    recursively; all EAF files are written flat into ``output_dir``, so JSON
    files that share a name in different sub-folders overwrite each other.

    :param tier_name: The name of the tier to write into
    :param input_dir: Directory name of folder containing JSON (and optionally also matching WAV audio) files
    :param output_dir: Directory name to save EAF files into (must already exist)
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file, reading it from the directory it was actually found in
    for dirpath, _, filenames in os.walk(input_dir):

        # Sorted so the processing order does not depend on the file system
        for filename in sorted(filenames):
            basename, ext = os.path.splitext(filename)
            # Check the real extension; a substring test would also pick up
            # names such as "clip.json.bak"
            if ext.lower() != '.json':
                continue
            print(filename)

            # read the JSON (always UTF-8, independent of the platform default)
            with open(Path(dirpath, filename), encoding='utf-8') as json_file:
                annotation_data = json.load(json_file)
                print(annotation_data)

            # Make EAF file; annotations go onto the tier named "default",
            # which is renamed afterwards if another tier name was requested
            output_eaf = Eaf()
            for annotation in annotation_data:
                # Round rather than truncate: 1.001 * 1000 is 1000.9999999999999
                # in floating point and would otherwise lose a millisecond
                start = int(round(annotation["timestamp"][0] * 1000))
                end = int(round(annotation["timestamp"][1] * 1000))
                print(end, start)
                output_eaf.add_annotation("default", start, end, value=annotation["text"])

            if tier_name != "default":
                output_eaf.rename_tier("default", tier_name)
            # The media link points at the output folder, next to the EAF file
            output_eaf.add_linked_file(str(Path(output_dir, f'{basename}.wav')))
            output_eaf.to_file(str(Path(output_dir, f'{basename}.eaf')))

            # Copy WAV? It is expected next to the JSON file it belongs to
            if copy_wavs:
                shutil.copyfile(Path(dirpath, f"{basename}.wav"), Path(output_dir, f"{basename}.wav"))
48 | 
49 | 
def main(argv=None):
    """
    Command line entry point: parse the options, reset the output folder and
    make the ELAN files.

    The output folder is deleted and re-created on every run. To protect the
    source data, the run is refused when the output folder is the input folder
    or one of its parent folders.

    :param argv: Optional list of command line arguments; defaults to sys.argv[1:]
    :raises SystemExit: On invalid arguments or an unsafe output folder
    """
    parser = argparse.ArgumentParser(description='Make ELAN files from JSON data')
    parser.add_argument('-t', '--tier_name', help='Name of the tier', default='default')
    parser.add_argument('-i', '--input_dir', help='Folder of JSON files', default='input')
    parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')
    parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
    parser.add_argument('--no-copy_wavs', help="Don't copy WAV files to output dir (default)", dest='copy_wavs',
                        action='store_false')
    parser.set_defaults(copy_wavs=False)

    args = parser.parse_args(argv)
    tier_name = args.tier_name
    input_dir = args.input_dir
    output_dir = args.output_dir
    copy_wavs = args.copy_wavs

    # Resetting removes the whole output dir, so make sure that cannot take the
    # input data with it
    resolved_input = Path(input_dir).resolve()
    resolved_output = Path(output_dir).resolve()
    if resolved_output == resolved_input or resolved_output in resolved_input.parents:
        parser.error(f"output_dir '{output_dir}' would be deleted together with input_dir '{input_dir}'; "
                     "choose a separate output folder")

    # Reset the output dir
    print("resetting output dir")
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Go
    print("making elan files")
    make_elans(tier_name, input_dir, output_dir, copy_wavs)
74 | 
75 | 
# Only run the conversion when executed as a script, not on import
if __name__ == '__main__':
    main()
78 | 


--------------------------------------------------------------------------------
/elan-splitter/README.md:
--------------------------------------------------------------------------------
 1 | # Elan Splitter
 2 | 
 3 | The script processes a directory of audio annotated in Elan, and outputs audio clips and matching-named text files containing the respective annotations. The original files are not altered. Audio clips are determined by the start and end times of annotations on the first tier. You can choose to use another tier by passing a tier name or a different order number as an argument to the script (see below for examples). Note that the tier name or number must be consistent across the data set; the script cannot handle individual files differently.
 4 | 
 5 | Instructions here are for Mac OSX.
 6 | 
 7 | Requires Python 3.
 8 | 
 9 | ## Installation
10 | 
11 | Open Terminal and check what versions of Homebrew and Python you have (if any). If you get a message "command not found: ..." then we need to install that software.
12 | 
13 | ```
14 | brew --version
15 | python --version
16 | python3 --version
17 | ```
18 | 
19 | **Homebrew**
20 | Install Homebrew if needed, following the instructions at https://brew.sh/
21 | 
22 | **Python**
23 | OSX comes with Python 2.7 but we need version 3. If you got a "command not found" error when you did the python3 --version command before, type this into Terminal. <br />
24 | `brew install python`
25 | 
26 | If you want to output mp3 files, also install ffmpeg <br />
27 | `brew install ffmpeg`
28 | 
29 | 
30 | **Splitter**
31 | 
32 | Clone this repository (maybe to the Desktop) and cd into this script dir `~/Desktop/elan-helpers/elan-splitter`.
33 | 
34 | Start a venv and install the script's dependencies.
35 | ```
36 | python3 -m venv venv
37 | source venv/bin/activate
38 | pip install -r requirements.txt
39 | ```
40 | 
41 | Make an `input` folder (eg `~/Desktop/elan-helpers/elan-splitter/input`), and put your Elan and audio files into it.
42 | 
43 | 
44 | ## Usage
45 | 
46 | Then you are ready to run the script. <br />
47 | `python split_eafs.py`
48 | 
49 | The output folder should now be populated with clipped audio files and text annotations!
50 | 
51 | 
52 | 
53 | By the way, if you are using WAV audio and don't have ffmpeg installed, you can ignore the warning "RuntimeWarning: Couldn't find ffmpeg or avconv". Those tools are only needed when converting to mp3 or some other audio formats; WAV is handled by the Python library.
54 | 
55 | 
56 | ## Options
57 | 
58 | To slice using annotations on a tier named "Words" you can use this command: <br />
59 | `python split_eafs.py -t Words`
60 | 
61 | To get annotations from the second tier pass the number as an argument like this: <br />
62 | `python split_eafs.py -o 2`
63 | 
64 | If you want the generated files to be named after the annotation value, run the script with the `-n` flag. <br />
65 | `python split_eafs.py -n`
66 | 
67 | Add a prefix to the generated files with the `-p` option. The following command, used with an Elan file that has the transcription "dog", will result in generated files named `A111_dog.txt` and `A111_dog.wav`. <br />
68 | `python split_eafs.py -n -p "A111"`
69 | 
70 | To output audio in MP3 format, set the format type with the `-f` flag. The default output format is WAV. <br />
71 | `python split_eafs.py -f mp3`
72 | 
73 | You can combine options! E.g., to get annotations from the second tier and write files named with the annotation value and an A111 prefix: <br />
74 | `python split_eafs.py -o 2 -n -p "A111"`
75 | 


--------------------------------------------------------------------------------
/elan-character-spacer/output/abui_1.eaf:
--------------------------------------------------------------------------------
 1 | <ANNOTATION_DOCUMENT xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" AUTHOR="" DATE="2020-04-12T12:00:21+10:00" FORMAT="3.0" VERSION="3.0" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 2 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 3 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/sandbox/dev-corpora/abui-elan-tier-test/transcribed/abui_1.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_1.wav" />
 4 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f</PROPERTY>
 5 |         <PROPERTY NAME="lastUsedAnnotationId">4</PROPERTY>
 6 |     </HEADER>
 7 |     <TIME_ORDER>
 8 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="250" />
 9 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="290" />
10 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="395" />
11 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="395" />
12 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="1067" />
13 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="1067" />
14 |         <TIME_SLOT TIME_SLOT_ID="ts7" TIME_VALUE="1910" />
15 |         <TIME_SLOT TIME_SLOT_ID="ts8" TIME_VALUE="1970" />
16 |     </TIME_ORDER>
17 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
18 |         <ANNOTATION>
19 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts7">
20 |                 <ANNOTATION_VALUE>a m a k a a n g d i k a a i h a d a m u i l a</ANNOTATION_VALUE>
21 |             </ALIGNABLE_ANNOTATION>
22 |         </ANNOTATION>
23 |     </TIER>
24 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
25 |         <ANNOTATION>
26 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts3" TIME_SLOT_REF2="ts5">
27 |                 <ANNOTATION_VALUE>amakaang di kaai</ANNOTATION_VALUE>
28 |             </ALIGNABLE_ANNOTATION>
29 |         </ANNOTATION>
30 |     </TIER>
31 |     <TIER LINGUISTIC_TYPE_REF="gloss" TIER_ID="gloss@speaker1">
32 |         <ANNOTATION>
33 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts4" TIME_SLOT_REF2="ts6">
34 |                 <ANNOTATION_VALUE>add gloss here</ANNOTATION_VALUE>
35 |             </ALIGNABLE_ANNOTATION>
36 |         </ANNOTATION>
37 |     </TIER>
38 |     <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="phrase">
39 |         <ANNOTATION>
40 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a4" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts8">
41 |                 <ANNOTATION_VALUE>mememememememe</ANNOTATION_VALUE>
42 |             </ALIGNABLE_ANNOTATION>
43 |         </ANNOTATION>
44 |     </TIER>
45 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true" />
46 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true" />
47 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="gloss" TIME_ALIGNABLE="true" />
48 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision" />
49 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision" />
50 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association" />
51 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In" />
52 | </ANNOTATION_DOCUMENT>


--------------------------------------------------------------------------------
/elan-to-json/input/abui_1.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2020-04-12T12:00:21+10:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/sandbox/dev-corpora/abui-elan-tier-test/transcribed/abui_1.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_1.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">4</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="250"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="290"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="395"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="395"/>
13 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="1067"/>
14 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="1067"/>
15 |         <TIME_SLOT TIME_SLOT_ID="ts7" TIME_VALUE="1910"/>
16 |         <TIME_SLOT TIME_SLOT_ID="ts8" TIME_VALUE="1970"/>
17 |     </TIME_ORDER>
18 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
19 |         <ANNOTATION>
20 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts7">
21 |                 <ANNOTATION_VALUE>amakaang 你好 di kaai hada muila</ANNOTATION_VALUE>
22 |             </ALIGNABLE_ANNOTATION>
23 |         </ANNOTATION>
24 |     </TIER>
25 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
26 |         <ANNOTATION>
27 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts3" TIME_SLOT_REF2="ts5">
28 |                 <ANNOTATION_VALUE>amakaang di kaai</ANNOTATION_VALUE>
29 |             </ALIGNABLE_ANNOTATION>
30 |         </ANNOTATION>
31 |     </TIER>
32 |     <TIER LINGUISTIC_TYPE_REF="gloss" TIER_ID="gloss@speaker1">
33 |         <ANNOTATION>
34 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts4" TIME_SLOT_REF2="ts6">
35 |                 <ANNOTATION_VALUE>add gloss here</ANNOTATION_VALUE>
36 |             </ALIGNABLE_ANNOTATION>
37 |         </ANNOTATION>
38 |     </TIER>
39 |     <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="phrase">
40 |         <ANNOTATION>
41 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a4" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts8">
42 |                 <ANNOTATION_VALUE>mememememememe</ANNOTATION_VALUE>
43 |             </ALIGNABLE_ANNOTATION>
44 |         </ANNOTATION>
45 |     </TIER>
46 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
47 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
48 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="gloss" TIME_ALIGNABLE="true"/>
49 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
50 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
51 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
52 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
53 | </ANNOTATION_DOCUMENT>
54 | 


--------------------------------------------------------------------------------
/elan-character-spacer/input/abui_1.eaf:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <ANNOTATION_DOCUMENT AUTHOR="" DATE="2020-04-12T12:00:21+10:00" FORMAT="3.0" VERSION="3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv3.0.xsd">
 3 |     <HEADER MEDIA_FILE="" TIME_UNITS="milliseconds">
 4 |         <MEDIA_DESCRIPTOR MEDIA_URL="file:///Users/bbb/sandbox/dev-corpora/abui-elan-tier-test/transcribed/abui_1.wav" MIME_TYPE="audio/x-wav" RELATIVE_MEDIA_URL="./abui_1.wav"/>
 5 |         <PROPERTY NAME="URN">urn:nl-mpi-tools-elan-eaf:a680ab14-1485-4eda-a9fb-0e0003430d2f</PROPERTY>
 6 |         <PROPERTY NAME="lastUsedAnnotationId">4</PROPERTY>
 7 |     </HEADER>
 8 |     <TIME_ORDER>
 9 |         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="250"/>
10 |         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="290"/>
11 |         <TIME_SLOT TIME_SLOT_ID="ts3" TIME_VALUE="395"/>
12 |         <TIME_SLOT TIME_SLOT_ID="ts4" TIME_VALUE="395"/>
13 |         <TIME_SLOT TIME_SLOT_ID="ts5" TIME_VALUE="1067"/>
14 |         <TIME_SLOT TIME_SLOT_ID="ts6" TIME_VALUE="1067"/>
15 |         <TIME_SLOT TIME_SLOT_ID="ts7" TIME_VALUE="1910"/>
16 |         <TIME_SLOT TIME_SLOT_ID="ts8" TIME_VALUE="1970"/>
17 |     </TIME_ORDER>
18 |     <TIER LINGUISTIC_TYPE_REF="default-lt" PARTICIPANT="SL" TIER_ID="Phrase">
19 |         <ANNOTATION>
20 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a1" TIME_SLOT_REF1="ts2" TIME_SLOT_REF2="ts7">
21 |                 <ANNOTATION_VALUE>amakaang di kaai hada muila</ANNOTATION_VALUE>
22 |             </ALIGNABLE_ANNOTATION>
23 |         </ANNOTATION>
24 |     </TIER>
25 |     <TIER LINGUISTIC_TYPE_REF="transcription" TIER_ID="transcription@speaker1">
26 |         <ANNOTATION>
27 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a2" TIME_SLOT_REF1="ts3" TIME_SLOT_REF2="ts5">
28 |                 <ANNOTATION_VALUE>amakaang di kaai</ANNOTATION_VALUE>
29 |             </ALIGNABLE_ANNOTATION>
30 |         </ANNOTATION>
31 |     </TIER>
32 |     <TIER LINGUISTIC_TYPE_REF="gloss" TIER_ID="gloss@speaker1">
33 |         <ANNOTATION>
34 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a3" TIME_SLOT_REF1="ts4" TIME_SLOT_REF2="ts6">
35 |                 <ANNOTATION_VALUE>add gloss here</ANNOTATION_VALUE>
36 |             </ALIGNABLE_ANNOTATION>
37 |         </ANNOTATION>
38 |     </TIER>
39 |     <TIER LINGUISTIC_TYPE_REF="default-lt" TIER_ID="phrase">
40 |         <ANNOTATION>
41 |             <ALIGNABLE_ANNOTATION ANNOTATION_ID="a4" TIME_SLOT_REF1="ts1" TIME_SLOT_REF2="ts8">
42 |                 <ANNOTATION_VALUE>mememememememe</ANNOTATION_VALUE>
43 |             </ALIGNABLE_ANNOTATION>
44 |         </ANNOTATION>
45 |     </TIER>
46 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
47 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="transcription" TIME_ALIGNABLE="true"/>
48 |     <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="gloss" TIME_ALIGNABLE="true"/>
49 |     <CONSTRAINT DESCRIPTION="Time subdivision of parent annotation's time interval, no time gaps allowed within this interval" STEREOTYPE="Time_Subdivision"/>
50 |     <CONSTRAINT DESCRIPTION="Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered" STEREOTYPE="Symbolic_Subdivision"/>
51 |     <CONSTRAINT DESCRIPTION="1-1 association with a parent annotation" STEREOTYPE="Symbolic_Association"/>
52 |     <CONSTRAINT DESCRIPTION="Time alignable annotations within the parent annotation's time interval, gaps are allowed" STEREOTYPE="Included_In"/>
53 | </ANNOTATION_DOCUMENT>
54 | 


--------------------------------------------------------------------------------
/make-elans-from-text/convert.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os
 3 | import librosa
 4 | from pympi.Elan import Eaf
 5 | from typing import List, Dict
 6 | import argparse
 7 | import shutil
 8 | 
 9 | 
def get_annotation(input_dir: str, basename: str):
    """
    Get the annotation text from the TXT file that matches an audio file
    :param input_dir: Name of the folder that contains the TXT file
    :param basename: Base name (without extension) of the file which contains an annotation
    :return: annotation text, with leading and trailing whitespace removed
    :raises FileNotFoundError: if there is no `<basename>.txt` in input_dir
    """
    with open(os.path.join(input_dir, basename + '.txt'), 'r', encoding='utf-8') as text_file:
        # Text files normally end with a newline; strip it so that it does not
        # become part of the annotation value written to the EAF file.
        return text_file.read().strip()
21 | 
22 | 
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file.
    Each WAV gets one EAF with a single annotation on the 'default' tier that
    spans the whole recording.
    :param input_dir: Directory name of folder containing TXT and WAV audio files (searched recursively)
    :param output_dir: Directory name to save EAF files into (all files are written flat into this folder)
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file. os.walk descends into sub-folders, so every path must
    # be built from the folder currently being walked, not from input_dir.
    for dirpath, _, filenames in os.walk(input_dir):

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension so names such as "clip.wav.bak" are ignored
            if ext.lower() != '.wav':
                continue
            print(basename)
            wav_path = os.path.join(dirpath, filename)

            # Get audio file duration - use this as the EAF annotation's end timeslot
            duration = int(librosa.get_duration(filename=wav_path)*1000)

            # Get annotation from the text file next to the WAV, matching on file basename
            annotation = get_annotation(dirpath, basename)

            # Add any annotation cleaning here
            # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

            print(duration, annotation)

            # Make EAF file
            output_eaf = Eaf()
            # output_eaf.add_tier('default')
            output_eaf.insert_annotation('default', 0, duration, annotation)
            # Link to the audio under the name it will have in the output dir
            output_eaf.add_linked_file(os.path.join(output_dir, filename))
            output_eaf.to_file(os.path.join(output_dir, f'{basename}.eaf'))

            # Copy WAV?
            if copy_wavs:
                shutil.copyfile(wav_path, os.path.join(output_dir, filename))
    print('>>> Done')
60 | 
61 | 
def main():
    """
    Command line entry point: read the options, empty the output folder, then
    build one EAF per WAV/TXT pair found in the input folder.
    Exits with an argparse error when the input and output folders are the
    same, because the output folder is deleted before conversion starts.
    """
    parser = argparse.ArgumentParser(description='Make ELAN files to match TXT and WAVs')
    parser.add_argument('-i', '--input_dir', help='Folder of TXT and WAV files', default='input')
    parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')

    parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
    parser.add_argument('--no-copy_wavs', help='Do not copy WAV files to output dir (default)', dest='copy_wavs', action='store_false')
    parser.set_defaults(copy_wavs=False)
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir
    copy_wavs = args.copy_wavs

    # The output dir is wiped below; refuse to wipe the source data
    if os.path.abspath(input_dir) == os.path.abspath(output_dir):
        parser.error('output_dir must be different from input_dir, the output folder is emptied before conversion')

    if copy_wavs:
        print('copying WAVs')
    else:
        print('skip copying WAVs')

    # Reset the output dir (it may not exist yet on a first run)
    print("resetting output dir")
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Go
    print("making elan files")
    make_elans(input_dir, output_dir, copy_wavs)
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     main()
93 | 


--------------------------------------------------------------------------------
/make-elans-from-wavs-and-spreadsheet/make-elan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os
 3 | import librosa
 4 | from pympi.Elan import Eaf
 5 | from typing import List, Dict
 6 | import argparse
 7 | import pandas
 8 | import json
 9 | 
10 | 
def get_annotations(spreadsheet: str):
    """
    Load the whole spreadsheet (every column, not only filename and annotation)
    and hand it back as plain Python data.
    :param spreadsheet: Name of the spreadsheet which contains rows of audio filenames and annotations
    :return: list of dicts decoded from the sheet's JSON "records" export, one dict per row of the excel file
    """
    frame = pandas.read_excel(spreadsheet)
    # Round-trip through JSON so the rows come back as built-in Python types
    rows = json.loads(frame.to_json(orient='records'))
    print('Spreadsheet loaded')
    return rows
22 | 
23 | 
def get_annotation(annotations: List[Dict[str, str]], filename: str):
    """
    Look up the transcription that belongs to one audio file.
    :param annotations: rows of the input spreadsheet, each a dict with "File name" and "Transcription" keys
    :param filename: name of WAV file to get annotation for
    :return: "Transcription" of the first row whose "File name" equals filename, or '' when no row matches
    """
    # Lazy generator: rows after the first match are never inspected
    matches = (row["Transcription"] for row in annotations if row["File name"] == filename)
    return next(matches, '')
37 | 
38 | 
def make_elans(spreadsheet: str, source: str, target: str):
    """
    Make ELAN files based on filenames of WAV files.
    Each WAV gets one EAF with a single annotation on the 'tx' tier that spans
    the whole recording; the annotation text is looked up in the spreadsheet
    by WAV file name.
    :param spreadsheet: Path and file name of the spreadsheet containing WAV filenames and matching annotations
    :param source: Directory name of folder containing WAV audio files (searched recursively)
    :param target: Directory name to save EAF files into (created if it does not exist)
    """

    # Read spreadsheet data and convert to JSON format
    print('Loading data from spreadsheet')
    annotations = get_annotations(spreadsheet)

    os.makedirs(target, exist_ok=True)

    # Process each file. os.walk descends into sub-folders, so the audio path
    # must be built from the folder currently being walked, not from source.
    print('Processing WAVs')
    for dirpath, _, filenames in os.walk(source):

        for filename in filenames:
            basename, ext = os.path.splitext(filename)
            # Compare the real extension so names such as "clip.wav.txt" are ignored
            if ext.lower() != '.wav':
                continue

            # Get audio file duration - use this as the EAF annotation's end timeslot
            duration = int(librosa.get_duration(filename=os.path.join(dirpath, filename))*1000)

            # Get annotation from the source data matching on filename
            annotation = get_annotation(annotations, filename)

            # Add any annotation cleaning here
            # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

            print(filename, duration, annotation)

            # Make EAF file
            output_eaf = Eaf()
            output_eaf.add_tier('tx')
            output_eaf.insert_annotation('tx', 0, duration, annotation)
            output_eaf.add_linked_file(os.path.join(target, filename))
            output_eaf.to_file(os.path.join(target, f'{basename}.eaf'))
    print('>>> Done')
77 | 
78 | 
def main():
    """
    Command line entry point: read the spreadsheet path and the WAV/EAF folder
    names from the arguments (each has a default) and run the conversion.
    """
    cli = argparse.ArgumentParser(description='make ELAN files to match WAVs')
    cli.add_argument('-a', '--annotations', help='spreadsheet name', default=os.path.join('input', 'test.xlsx'))
    cli.add_argument('-s', '--source', help='folder of WAVs', default='wav')
    cli.add_argument('-t', '--target', help='folder to save EAFs', default='eaf')
    options = cli.parse_args()

    make_elans(options.annotations, options.source, options.target)
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     main()
94 | 


--------------------------------------------------------------------------------
/make-elans-from-timit/convert.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import os
  3 | # import librosa
  4 | from pympi.Elan import Eaf
  5 | from typing import List, Dict
  6 | import argparse
  7 | import shutil
  8 | import glob
  9 | 
 10 | 
 11 | def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
 12 |     """
 13 |     Make ELAN files based on filenames of WAV files and annotation from matching text file
 14 |     :param input_dir: Directory name of folder containing TXT and WAV audio files
 15 |     :param  output_dir: Directory name to save EAF files into
 16 |     :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
 17 |     """
 18 |     # Process each file
 19 |     files = glob.glob(f'{input_dir}/**/*.txt', recursive=True)
 20 |     print(files)
 21 | 
 22 |     for filename in files:
 23 | 
 24 |         filepath, ext = os.path.splitext(filename)
 25 |         basename = os.path.splitext(os.path.basename(filepath))[0]
 26 |         subdirname = os.path.basename(os.path.dirname(filepath))
 27 | 
 28 |         sex = subdirname[0]
 29 |         participant = subdirname[1:]
 30 | 
 31 |         # SEX :== m | f
 32 |         # SPEAKER_ID :== <INITIALS><DIGIT>
 33 |         # INITIALS :== speaker initials, 3 letters
 34 |         # DIGIT :== number 0-9 to differentiate speakers with identical initials
 35 | 
 36 |         # print(filename)     # input/dr1/fmem0/sa2.txt
 37 |         # print(filepath)     # input/dr1/fmem0/sa2
 38 |         # print(subdirname)   # fmem0
 39 |         # print(basename)     # sa2
 40 |         # print(ext)          # txt
 41 | 
 42 |         # Get audio file duration - use this as the EAF annotation's end timeslot
 43 |         # duration = int(librosa.get_duration(filename=os.path.join(input_dir, filename))*1000)
 44 | 
 45 |         # Get annotation from the text file matching on file basename
 46 |         with open(filename, 'r', encoding='utf-8') as text_file:
 47 |             annotation = text_file.read()
 48 |         annotation_split = annotation.split()
 49 |         start = int(annotation_split[0])
 50 |         duration = int(annotation_split[1])
 51 |         # convert audio samples to seconds to ms
 52 |         duration = int(duration/16000*1000)
 53 |         annotation_text = " ".join(annotation_split[2:])
 54 | 
 55 |         # Add any annotation cleaning here
 56 |         # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)
 57 | 
 58 |         print(start, duration, annotation_text)
 59 | 
 60 |         # Make EAF file
 61 |         output_eaf = Eaf()
 62 |         output_eaf.add_tier('default', part=participant)
 63 |         output_eaf.add_annotation('default', start, duration, annotation_text)
 64 |         output_eaf.add_linked_file(os.path.join(output_dir, f'{subdirname}-{basename}.wav'))
 65 |         output_eaf.to_file(os.path.join(output_dir, f'{subdirname}-{basename}.eaf'))
 66 | 
 67 |         # Copy WAV?
 68 |         # if copy_wavs:
 69 |         shutil.copyfile(f'{filepath}.wav', os.path.join(output_dir, f'{subdirname}-{basename}.wav'))
 70 | 
 71 |     print('>>> Done')
 72 | 
 73 | 
 74 | def main():
 75 |     parser = argparse.ArgumentParser(description='Make ELAN files to match TXT and WAVs')
 76 |     parser.add_argument('-i', '--input_dir', help='Folder of TXT and WAV files', default='input')
 77 |     parser.add_argument('-o', '--output_dir', help='Folder to save EAFs', default='output')
 78 | 
 79 |     parser.add_argument('--copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_true')
 80 |     parser.add_argument('--no-copy_wavs', help='Copy WAV files to output dir', dest='copy_wavs', action='store_false')
 81 |     parser.set_defaults(copy_wavs=False)
 82 |     args = parser.parse_args()
 83 | 
 84 |     input_dir = args.input_dir
 85 |     output_dir = args.output_dir
 86 |     copy_wavs = args.copy_wavs
 87 | 
 88 |     if copy_wavs:
 89 |         print('copying WAVs')
 90 |     else:
 91 |         print('skip copying WAVs')
 92 | 
 93 |     # Reset the output dir
 94 |     print("resetting output dir")
 95 |     shutil.rmtree(output_dir)
 96 |     os.makedirs(output_dir)
 97 | 
 98 |     # Go
 99 |     print("making elan files")
100 |     make_elans(input_dir, output_dir, copy_wavs)
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     main()
105 | 


--------------------------------------------------------------------------------
/elan-character-spacer/output/1_1_3.pfsx:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <preferences version="1.1"
  3 |     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/Prefs_v1.1.xsd">
  4 |     <pref key="FrameSize">
  5 |         <Object class="java.awt.Dimension">1280,697</Object>
  6 |     </pref>
  7 |     <pref key="LayoutManager.SelectedTabIndex">
  8 |         <Int>7</Int>
  9 |     </pref>
 10 |     <pref key="LayoutManager.VisibleMultiTierViewer">
 11 |         <String>mpi.eudico.client.annotator.viewer.TimeLineViewer</String>
 12 |     </pref>
 13 |     <pref key="TimeLineViewer.FontSize">
 14 |         <Int>24</Int>
 15 |     </pref>
 16 |     <pref key="MultiTierViewer.ActiveTierName">
 17 |         <String>Phrase</String>
 18 |     </pref>
 19 |     <pref key="SelectionEndTime">
 20 |         <Long>1695</Long>
 21 |     </pref>
 22 |     <pref key="SignalViewer.ZoomLevel">
 23 |         <Float>200.0</Float>
 24 |     </pref>
 25 |     <pref key="LayoutManager.CurrentMode">
 26 |         <Int>1</Int>
 27 |     </pref>
 28 |     <prefGroup key="CommentViewer.Columns">
 29 |         <pref key="Sender">
 30 |             <Int>0</Int>
 31 |         </pref>
 32 |         <pref key="Comment">
 33 |             <Int>75</Int>
 34 |         </pref>
 35 |         <pref key="Creation Date">
 36 |             <Int>75</Int>
 37 |         </pref>
 38 |         <pref key="Tier">
 39 |             <Int>75</Int>
 40 |         </pref>
 41 |         <pref key="Modification Date">
 42 |             <Int>75</Int>
 43 |         </pref>
 44 |         <pref key="Recipient">
 45 |             <Int>0</Int>
 46 |         </pref>
 47 |         <pref key="Start Time">
 48 |             <Int>75</Int>
 49 |         </pref>
 50 |         <pref key="Initials">
 51 |             <Int>75</Int>
 52 |         </pref>
 53 |         <pref key="End Time">
 54 |             <Int>75</Int>
 55 |         </pref>
 56 |         <pref key="Thread">
 57 |             <Int>75</Int>
 58 |         </pref>
 59 |     </prefGroup>
 60 |     <prefGroup key="TierFonts"/>
 61 |     <prefList key="CommentViewer.Columns.Order">
 62 |         <String>Start Time</String>
 63 |         <String>End Time</String>
 64 |         <String>Tier</String>
 65 |         <String>Initials</String>
 66 |         <String>Comment</String>
 67 |         <String>Thread</String>
 68 |         <String>Sender</String>
 69 |         <String>Recipient</String>
 70 |         <String>Creation Date</String>
 71 |         <String>Modification Date</String>
 72 |     </prefList>
 73 |     <pref key="ActiveRecognizerName">
 74 |         <String>AAM-LR Phone level audio segmentation</String>
 75 |     </pref>
 76 |     <pref key="FrameLocation">
 77 |         <Object class="java.awt.Point">342,23</Object>
 78 |     </pref>
 79 |     <pref key="TimeLineViewer.ZoomLevel">
 80 |         <Float>200.0</Float>
 81 |     </pref>
 82 |     <pref key="SelectionBeginTime">
 83 |         <Long>1695</Long>
 84 |     </pref>
 85 |     <prefGroup key="AAM-LR Phone level audio segmentation">
 86 |         <pref key="base_url">
 87 |             <String>http://lux17.mpi.nl/aamlr/</String>
 88 |         </pref>
 89 |         <pref key="source_audio">
 90 |             <String>///Users/bbb/Desktop/abui-recordings/extra/1_1_3.wav</String>
 91 |         </pref>
 92 |     </prefGroup>
 93 |     <pref key="TimeScaleBeginTime">
 94 |         <Long>515</Long>
 95 |     </pref>
 96 |     <pref key="MediaTime">
 97 |         <Long>1695</Long>
 98 |     </pref>
 99 |     <prefGroup key="TierColors">
100 |         <pref key="test">
101 |             <Object class="java.awt.Color">160,90,90</Object>
102 |         </pref>
103 |     </prefGroup>
104 |     <prefList key="MultiTierViewer.TierOrder">
105 |         <String>Phrase</String>
106 |         <String>test</String>
107 |     </prefList>
108 |     <prefGroup key="IndividualPlayerVolumes">
109 |         <pref key="1_1_3.wav">
110 |             <Float>0.06</Float>
111 |         </pref>
112 |     </prefGroup>
113 |     <prefGroup key="TierHighlightColors"/>
114 |     <pref key="LayoutManager.SplitPaneDividerLocation">
115 |         <Int>70</Int>
116 |     </pref>
117 |     <prefList key="CommentViewer.Columns.Hidden">
118 |         <String>Sender</String>
119 |         <String>Recipient</String>
120 |     </prefList>
121 | </preferences>
122 | 


--------------------------------------------------------------------------------
/elan-splitter/split_eafs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | #
  3 | # Copyright Ben Foley ben@cbmm.io 30 Jan 2018
  4 | #
  5 | # Split an audio file by the start and end times of annotations on a particular .eaf tier
  6 | # Don't worry about 'Parsing unknown version of ELAN spec... ' warnings,
  7 | # pympi is looking for v 2.7 or 2.8 of elan schema
  8 | 
  9 | # You can open and save WAV files with pure python.
 10 | # For opening and saving non-wav files – like mp3 – you'll need ffmpeg or libav.
 11 | # On OSX, install ffmpeg with `brew install ffmpeg`
 12 | 
 13 | 
 14 | # default usage: python3 split_eafs.py
 15 | 
 16 | import argparse
 17 | import glob
 18 | import json
 19 | import os
 20 | import sys
 21 | from pydub import AudioSegment
 22 | from pympi.Elan import Eaf
 23 | from slugify import slugify
 24 | 
 25 | 
 26 | parser = argparse.ArgumentParser(description="This script will slice audio and output text based on ELAN annotations.")
 27 | parser.add_argument('-i', '--input_dir', help='Directory of audio and eaf files', type=str, default='./input')
 28 | parser.add_argument('-o', '--tier_order', help='Get the annotations from this tier index, eg top tier would be 1', type=int, default='1')
 29 | parser.add_argument('-t', '--slice_tier', help='Tier name to use for slicing start end times', type=str, default='default')
 30 | parser.add_argument('-w', '--text_tier', help='Tier name to use for annotation text', type=str, default='default')
 31 | parser.add_argument('-m', '--silence_marker', help='Skip any annotations on the target language tier with this value', type=str, default='*PUB')
 32 | parser.add_argument('-s', '--silence_tier', help='Silence audio when annotations are found on this ref tier', type=str, default='Silence')
 33 | parser.add_argument('-a', '--output_audio_dir', help='Directory to save the audio files', type=str, default='./output')
 34 | parser.add_argument('-l', '--output_label_dir', help='Directory to save text files', type=str, default='./output')
 35 | parser.add_argument('-j', '--output_json', help='File name to output json', type=str, default='./output/annotations.json')
 36 | parser.add_argument('-v', '--verbose', help='Verbose output', action='store_true')
 37 | parser.add_argument('-n', '--name_with_annotation', help='Name the file with annotation value', action='store_true')
 38 | parser.add_argument('-p', '--prefix', help='Prefix the file name with this', type=str, default='')
 39 | parser.add_argument('-f', '--output_audio_format', help='Audio format to save media clips (requires ffmpeg)', type=str, default='wav')
 40 | args = parser.parse_args()
 41 | try:
 42 |     input_dir = args.input_dir
 43 |     tier_order = args.tier_order
 44 |     slice_tier = args.slice_tier
 45 |     text_tier = args.text_tier
 46 |     silence_marker = args.silence_marker
 47 |     silence_tier = args.silence_tier
 48 |     output_audio_dir = args.output_audio_dir
 49 |     output_label_dir = args.output_label_dir
 50 |     output_json = args.output_json
 51 |     verbose = args.verbose
 52 |     name_with_annotation = args.name_with_annotation
 53 |     prefix = args.prefix
 54 |     output_audio_format = args.output_audio_format.lower()
 55 | except Exception:
 56 |     parser.print_help()
 57 |     sys.exit(0)
 58 | 
 59 | # Limit to wav or mp3 outputs
 60 | valid_formats = ['mp3', 'wav']
 61 | if output_audio_format not in valid_formats:
 62 |     raise Exception('Please use wav or mp3 output formats only')
 63 | 
 64 | if not os.path.exists(output_audio_dir):
 65 |     os.makedirs(output_audio_dir)
 66 | if not os.path.exists(output_label_dir):
 67 |     os.makedirs(output_label_dir)
 68 | 
 69 | if verbose:
 70 |     print("tier_order", tier_order)
 71 |     print("slice_tier", slice_tier)
 72 | 
 73 | 
 74 | def split_audio_by_start_end(input_audio, start, end, fname):
 75 |     output = input_audio[start:end]
 76 |     output.export(os.path.join(output_audio_dir, fname + "." + output_audio_format), format=output_audio_format)
 77 | 
 78 | 
 79 | def write_text(annotation, fname):
 80 |     f = open(os.path.join(output_label_dir, fname + ".txt"), 'w')
 81 |     f.write(annotation)
 82 |     f.close()
 83 | 
 84 | 
 85 | def write_json(annotations_data):
 86 |     with open(output_json, 'w') as outfile:
 87 |         json.dump(annotations_data, 
 88 |                   outfile, 
 89 |                   indent=4, 
 90 |                   separators=(',', ': '), 
 91 |                   sort_keys=False,
 92 |                   ensure_ascii=False)
 93 | 
 94 | 
 95 | def read_eaf(ie):
 96 | 
 97 |     if verbose:
 98 |         print("input file is", ie)
 99 | 
100 |     input_eaf = Eaf(ie)
101 | 
102 |     # Check if the tiers we have been given exist
103 |     tier_names = list(input_eaf.get_tier_names())
104 |     if verbose:
105 |         print("tier_names", tier_names, file=sys.stderr)
106 | 
107 |     # Are we working by slice_tier name or order?
108 |     if slice_tier != "default" :
109 |         if verbose:
110 |             print("using slice_tier by name:", slice_tier, file=sys.stderr)
111 |     else:
112 | 
113 |         # Sanity check that the slice_tier num is not greater than the num of tiers
114 |         if tier_order > len(tier_names):
115 |             print("Error: tier number is greater than the number of tiers",
116 |                   file=sys.stderr)
117 |             return False
118 |         if verbose:
119 |             print("using slice_tier by number:", tier_names[tier_order-1], file=sys.stderr)
120 | 
121 |     if slice_tier not in tier_names:
122 |         print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
123 |         return False
124 | 
125 |     if silence_tier not in tier_names:
126 |         if verbose:
127 |             print('silence tier not found: ' + silence_tier, file=sys.stderr)
128 | 
129 |     # get the input audio file
130 |     inDir, name = os.path.split(ie)
131 |     basename, ext = os.path.splitext(name)
132 | 
133 |     # we can write out mp3 or whatever, still require wav input
134 |     ia = os.path.join(inDir, basename + ".wav")
135 |     input_audio = AudioSegment.from_wav(ia)
136 | 
137 |     # We can pass in an arg for a ref tier that has silence labels
138 |     check_silence_ref_tier = False
139 |     if silence_tier in tier_names:
140 |         silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
141 |         if silence_tier_info.get("PARENT_REF") == tier:
142 |             check_silence_ref_tier = True
143 | 
144 |     # Get annotation values, start and end times, and speaker id
145 |     if text_tier not in tier_names:
146 |         print('Error: missing text tier')
147 |         return False
148 | 
149 |     annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))
150 | 
151 |     params = input_eaf.get_parameters_for_tier(text_tier)
152 |     if 'PARTICIPANT' in params:
153 |         speaker_id = params['PARTICIPANT']
154 | 
155 |     annotations_data = []
156 |     i = 0
157 |     for ann in annotations:
158 |         skip = False
159 |         ref_annotation = []
160 |         start = ann[0]
161 |         end = ann[1]
162 |         # output new values, not the original clip start end times
163 |         clip_start = 0
164 |         clip_end = ann[1] - ann[0]
165 |         annotation = ann[2]
166 | 
167 |         # Check for annotations labelled with a particular symbol on the main tier
168 |         if annotation == silence_marker:
169 |             skip = True
170 | 
171 |         # Check for existence of an annotation in ref tier to silence
172 |         # Annotation value doesn't matter
173 |         if check_silence_ref_tier:
174 |             ref_annotation = input_eaf.get_ref_annotation_at_time(silence_tier, start)
175 |             if len(ref_annotation) is True:
176 |                 skip = True
177 | 
178 |         if skip is True:
179 |             print('skipping annotation: ' + annotation, start, end)
180 |         else:
181 |             print('processing annotation: ' + annotation, start, end)
182 |             # build the output audio/text filename
183 |             fname = basename + "_" + str(i)
184 |             if name_with_annotation:
185 |                 fname = slugify(annotation)
186 | 
187 |             if prefix != '':
188 |                 fname = prefix + '_' + fname
189 |             obj = {
190 |                 'audioFileName': os.path.join(".", fname + ".wav"),
191 |                 'transcript': annotation,
192 |                 'startMs': clip_start,
193 |                 'stopMs': clip_end
194 |             }
195 |             if 'PARTICIPANT' in params:
196 |                 obj["speakerId"] = speaker_id
197 |             annotations_data.append(obj)
198 |             split_audio_by_start_end(input_audio, start, end, fname)
199 |             write_text(annotation, fname)
200 |             i += 1
201 |     # output the json data for the next step in kaldi pipeline
202 |     write_json(annotations_data)
203 | 
204 | 
205 |     if verbose:
206 |         print(annotations_data)
207 | 
208 | 
def findFilesByExt(setOfAllFiles, exts):
    """
    Keep the paths whose extension appears in `exts`.

    :param setOfAllFiles: iterable of file paths
    :param exts: collection of glob-style patterns such as "*.eaf"
    :return: list of matching paths, in iteration order

    The extension is lower-cased before the lookup, so "x.EAF" matches "*.eaf".
    """
    def is_wanted(path):
        suffix = os.path.splitext(path)[1]
        return ("*" + suffix.lower()) in exts

    return [path for path in setOfAllFiles if is_wanted(path)]
216 | 
217 | 
# Collect every .eaf file below the input directory (searched recursively).
g_exts = ["*.eaf"]
allFilesInDir = set(glob.glob(os.path.join(input_dir, "**"), recursive=True))

# Sets have no defined order; sort so files are processed in the same order on
# every run. This matters because read_eaf() rewrites the JSON output each time.
input_eafs = sorted(findFilesByExt(allFilesInDir, set(g_exts)))

for ie in input_eafs:
    read_eaf(ie)
224 | 


--------------------------------------------------------------------------------
/elan-to-json/elan_to_json.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """
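Convert the annotations in ELAN (.eaf) files to a single JSON file.

All .eaf files under the input (corpus) directory are collected with a recursive glob
(recursive search is fine for now, as long as we don't need numpy).
Pass in the corpus path; an error is raised if the WAV file matching an .eaf file
isn't found next to it in the corpus directory.

Usage: python3 elan_to_json.py [-h] [-i INPUT_DIR] [-o OUTPUT_DIR] [-j OUTPUT_JSON]
                               [-r TIER_ORDER] [-n TIER_NAME] [-t TIER_TYPE]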
  8 | 
  9 | Copyright: University of Queensland, 2020
 10 | Contributors:
 11 |               Nicholas Lambourne - (The University of Queensland, 2018)
 12 |               Ben Foley - (The University of Queensland, 2020)
 13 | 
 14 | Derived from the Elpis `elan_to_json.py` script
 15 | """
 16 | 
 17 | import argparse
 18 | import glob
 19 | import json
 20 | import os
 21 | import sys
 22 | from _io import TextIOWrapper
 23 | from pympi.Elan import Eaf
 24 | from typing import List, Dict, Tuple, Union
 25 | 
 26 | 
 27 | def load_json_file(file_name: str) -> List[Dict[str, str]]:
 28 |     """
 29 |     Given a filename (parameter) containing JSON, load and
 30 |     return the a list of python dictionaries with containing the same information.
 31 |     :param file_name: name of file containing JSON to read from.
 32 |     :return a Python dictionary with the contents of the JSON file.
 33 |     """
 34 |     data = []
 35 |     if os.path.exists(file_name) and os.path.getsize(file_name) > 0:
 36 |         with open(file_name, "r", encoding="utf-8") as file_:
 37 |             data = json.load(file_)
 38 |     return data
 39 | 
 40 | 
 41 | def write_data_to_json_file(data: object = {}, output: Union[str, TextIOWrapper] = []) -> None:
 42 |     """
 43 |     Writes the given Python dictionary (or list) object to a JSON file at the the given
 44 |     output location (which can either be a file - specified as a string, or
 45 |     directed to an output like sys.stdout or sys.stderr).
 46 |     :param data: the Python dictionary to be converted to JSON and written.
 47 |     :param output: the file to write the dictionary contents to.
 48 |     """
 49 |     json_data_string = json.dumps(data,
 50 |                                   indent=4,
 51 |                                   separators=(',', ': '),
 52 |                                   sort_keys=False,
 53 |                                   ensure_ascii=False)
 54 | 
 55 |     if isinstance(output, str):
 56 |         with open(output, "w") as file:
 57 |             file.write(json_data_string)
 58 |     else:
 59 |         print(json_data_string, file=output, flush=True)
 60 | 
 61 | 
 62 | def save_tier_info(input_eaf: Eaf = None,
 63 |                   file_name: str = '',
 64 |                   tier_types: List = [],
 65 |                   corpus_tiers_file: str = 'corpus_tiers.json'):
 66 |     tiers = []
 67 |     for tier_type in tier_types:
 68 |         tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
 69 |         tiers.append( { tier_type: tier_names } )
 70 |     file_data = {"file": file_name, "tiers": tiers}
 71 |     corpus_tiers = load_json_file(corpus_tiers_file)
 72 |     corpus_tiers.append(file_data)
 73 |     write_data_to_json_file(data=corpus_tiers,
 74 |                             output=corpus_tiers_file)
 75 | 
 76 | 
 77 | def process_eaf(input_elan_file: str = '',
 78 |                 tier_order: int = 0,
 79 |                 tier_name: str = '',
 80 |                 tier_type: str = '',
 81 |                 corpus_tiers_file: str = '') -> List[dict]:
 82 |     """
 83 |     Method to process a particular tier in an eaf file (ELAN Annotation Format).
 84 |     Transcriptions are read from an elan file tier.
 85 |     Tiers are nodes from the tree structure in the .eaf file.
 86 |     The tier to read from is determined by tier order (eg top tier would be order 1),
 87 |     tier type (eg default-lt) or tier name (eg Phrase).
 88 |     If tier type is used, the first tier matching this type is used.
 89 |     Elan can have multiple tiers of same type, future work would support reading data
 90 |     from multiple tiers of the selected type.
 91 | 
 92 |     It stores the transcriptions in the following format:
 93 |                     {'speaker_id': <speaker_id>,
 94 |                     'audio_file_name': <file_name>,
 95 |                     'transcript': <transcription_label>,
 96 |                     'start_ms': <start_time_in_milliseconds>,
 97 |                     'stop_ms': <stop_time_in_milliseconds>}
 98 | 
 99 |     :param input_elan_file: name of input elan file
100 |     :param tier_order: index of the elan tier to process
101 |     :param tier_type:  type of the elan tier to process
102 |     :param tier_name:  name of the elan tier to process
103 |     :return: a list of dictionaries, where each dictionary is an annotation
104 |     """
105 | 
106 |     print(f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}")
107 | 
108 |     # Get paths to files
109 |     input_directory, full_file_name = os.path.split(input_elan_file)
110 |     file_name, extension = os.path.splitext(full_file_name)
111 | 
112 |     # Look for wav file matching the eaf file in same directory
113 |     if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
114 |         print("WAV file found for " + file_name, file=sys.stderr)
115 |     else:
116 |         raise ValueError(f"WAV file not found for {full_file_name}. "
117 |                          f"Please put it next to the eaf file in {input_directory}.")
118 | 
119 |     # Get tier data from Elan file
120 |     input_eaf = Eaf(input_elan_file)
121 |     tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
122 |     tier_names: List[str] = list(input_eaf.get_tier_names())
123 | 
124 |     # Keep this data handy for future corpus analysis
125 |     # save_tier_info(input_eaf=input_eaf,
126 |     #               tier_types=tier_types,
127 |     #               file_name=file_name,
128 |     #               corpus_tiers_file=corpus_tiers_file)
129 | 
130 |     # Get annotations and parameters (things like speaker id) on the target tier
131 |     annotations: List[Tuple[str, str, str]] = []
132 |     annotations_data: List[dict] = []
133 | 
134 |     # First try using tier order to get tier name
135 |     if tier_order:
136 |         # Watch out for files that may not have this many tiers
137 |         # tier_order is 1-index but List indexing is 0-index
138 |         try:
139 |             tier_name = tier_names[tier_order-1]
140 |             print(f"using tier order {tier_order} to get tier name {tier_name}")
141 |         except IndexError:
142 |             print("couldn't find a tier")
143 |             pass
144 |     else:
145 |         # else use tier type to get a tier name
146 |         if tier_type in tier_types:
147 |             print(f"found tier type {tier_type}")
148 |             tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
149 |             tier_name = tier_names[0]
150 |             if tier_name:
151 |                 print(f"found tier name {tier_name}")
152 |         else:
153 |             print("tier type not found in this file")
154 | 
155 |     if tier_name in tier_names:
156 |         print(f"using tier name {tier_name}")
157 |         annotations = input_eaf.get_annotation_data_for_tier(tier_name)
158 | 
159 |     if annotations:
160 |         print(f"annotations {annotations}")
161 |         annotations = sorted(annotations)
162 |         parameters: Dict[str,str] = input_eaf.get_parameters_for_tier(tier_name)
163 |         print(f"parameters {parameters}")
164 |         speaker_id: str = parameters.get("PARTICIPANT", "")
165 | 
166 |     for annotation in annotations:
167 |         start: str = annotation[0]
168 |         end: str = annotation[1]
169 |         annotation_text: str = annotation[2]
170 |         print(f"annotation {annotation} {start} {end}")
171 |         obj = {
172 |             "audio_file_name": f"{file_name}.wav",
173 |             "transcript": annotation_text,
174 |             "start_ms": start,
175 |             "stop_ms": end
176 |         }
177 |         if "PARTICIPANT" in parameters:
178 |             obj["speaker_id"] = speaker_id
179 |         annotations_data.append(obj)
180 | 
181 |     return annotations_data
182 | 
183 | 
def main():

    """
    Run the entire elan_to_json.py as a command line utility. It extracts information on speaker, audio file,
    transcription etc. from the given tier of every .eaf file found (recursively) in the input directory,
    and writes all annotations to one JSON file in the output directory.

    Tier can be selected by name, tier order or tier type

    Usage: python3 elan_to_json.py [-h] [-i INPUT_DIR] [-o OUTPUT_DIR] [-j OUTPUT_JSON]
     [-r TIER-ORDER] [-n TIER-NAME] [-t TIER-TYPE]

    python3 elan_to_json.py

    """

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
                            description="This script takes an directory with ELAN files and "
                                        "saves the audio and output text in JSON format to a file")
    parser.add_argument("-i", "--input_dir",
                        help="Directory of dirty audio and eaf files",
                        default="./input/")
    parser.add_argument("-o", "--output_dir",
                        help="Output directory",
                        default="./output/")
    parser.add_argument("-j", "--output_json",
                        help="File path to output json",
                        default="elan.json")
    parser.add_argument("-r", "--tier_order",
                        help="Source tier order",
                        type=int,
                        default=0)
    parser.add_argument("-n", "--tier_name",
                        help="Source tier name",
                        default="Phrase")
    parser.add_argument("-t", "--tier_type",
                        help="Source tier type",
                        default="default-lt")
    arguments: argparse.Namespace = parser.parse_args()

    # Build output directory if needed (exist_ok avoids a check-then-create race)
    os.makedirs(arguments.output_dir, exist_ok=True)

    # Sort the de-duplicated paths: glob/set order is arbitrary, and the sort by
    # audio file name below is stable, so a fixed input order keeps the output
    # reproducible even when two directories contain files with the same name.
    all_files_in_directory = set(glob.glob(os.path.join(arguments.input_dir, "**"), recursive=True))
    input_elan_files = sorted(file_ for file_ in all_files_in_directory if file_.endswith(".eaf"))

    annotations_data = []

    for input_elan_file in input_elan_files:
        annotations_data.extend(process_eaf(input_elan_file=input_elan_file,
                                            tier_order=arguments.tier_order,
                                            tier_name=arguments.tier_name,
                                            tier_type=arguments.tier_type))

    # Group the records by audio file name in the JSON output
    annotations_data.sort(key=lambda x: x["audio_file_name"], reverse=False)

    write_data_to_json_file(data=annotations_data,
                            output=os.path.join(arguments.output_dir, arguments.output_json))
242 | 
243 | 
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
246 | 
247 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------