├── asrtoolkit
    ├── alignment
    │   ├── __init__.py
    │   ├── preprocess_txt.py
    │   ├── initialize_logger.py
    │   ├── preprocess_gk_json.py
    │   ├── align.py
    │   ├── align_utils.py
    │   └── aligned_doc.py
    ├── file_utils
    │   ├── __init__.py
    │   ├── common_file_operations.py
    │   ├── script_input_validation.py
    │   └── name_cleaners.py
    ├── data_handlers
    │   ├── __init__.py
    │   ├── greenkey.py
    │   ├── data_handlers_common.py
    │   ├── webvtt_common.py
    │   ├── txt.py
    │   ├── srt.py
    │   ├── vtt.py
    │   ├── stm.py
    │   ├── html.py
    │   ├── rttm.py
    │   └── json.py
    ├── data_structures
    │   ├── __init__.py
    │   ├── formatting.py
    │   ├── exemplar.py
    │   ├── segment.py
    │   ├── time_aligned_text.py
    │   ├── audio_file.py
    │   └── corpus.py
    ├── degrade_audio_file.py
    ├── convert_transcript.py
    ├── __init__.py
    ├── split_audio_file.py
    ├── align_json.py
    ├── extract_excel_spreadsheets.py
    ├── split_corpus.py
    ├── combine_audio_files.py
    ├── prepare_audio_corpora.py
    ├── wer.py
    ├── deformatting_utils.py
    └── clean_formatting.py
├── samples
    ├── simple_test.txt
    ├── simple_test.stm
    ├── FinancialStatementFY18Q4.xlsx
    ├── invalid.stm
    ├── simple_test.json
    └── BillGatesTEDTalk_intentionally_poor_transcription.txt
├── .flake8
├── tests
    ├── small-test-file.mp3
    ├── run_tests.sh
    ├── small-test-file.stm
    ├── no_speaker.json
    ├── utils.py
    ├── test_xlsx_extraction.py
    ├── docker_test.sh
    ├── test_alignment.py
    ├── test_split_audio_file.py
    ├── test_wer.py
    ├── test_initialization.py
    ├── test_remove_invalid_lines.py
    ├── test_split_corpus.py
    ├── test_conversion.py
    └── test_clean_up.py
├── MANIFEST.in
├── requirements.txt
├── NOTICE
├── .gitignore
├── pyproject.toml
├── Pipfile
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    └── CONTRIBUTING.md
├── Dockerfile
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── setup.py
├── README.md
├── LICENSE
└── .pylintrc
/asrtoolkit/alignment/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
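The tree above lists `wer.py` among the toolkit's top-level scripts. As a rough illustration of the metric that script computes (word-level Levenshtein distance over reference length), here is a stdlib-only sketch; it is not the toolkit's implementation, which relies on the `editdistance` package listed in `requirements.txt`:

```python
def word_error_rate(reference, hypothesis):
    """
    Word-level Levenshtein distance divided by reference word count, in percent.
    Illustrative sketch only; assumes a non-empty reference string.
    """
    ref_words, hyp_words = reference.split(), hypothesis.split()
    # prev[j] holds the edit distance between the first i-1 reference words
    # and the first j hypothesis words (classic two-row dynamic program).
    prev = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, 1):
        cur = [i] + [0] * len(hyp_words)
        for j, hyp_word in enumerate(hyp_words, 1):
            cur[j] = min(
                prev[j] + 1,                           # deletion
                cur[j - 1] + 1,                        # insertion
                prev[j - 1] + (ref_word != hyp_word),  # substitution or match
            )
        prev = cur
    return 100.0 * prev[-1] / len(ref_words)


# One deletion plus one substitution against a five-word reference -> 40.0
print(word_error_rate("testing testing one two three", "testing one too three"))  # 40.0
```

The real `wer.py` also supports formatting-aware comparison (see `clean_formatting.py`) and a character-level counterpart, `cer`.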
/asrtoolkit/file_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/greenkey.py: -------------------------------------------------------------------------------- 1 | json.py -------------------------------------------------------------------------------- /samples/simple_test.txt: -------------------------------------------------------------------------------- 1 | Testing testing 1 2 3. -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E501,W503,E203 -------------------------------------------------------------------------------- /samples/simple_test.stm: -------------------------------------------------------------------------------- 1 | unknown 1 UnknownSpeaker 0.00 3.20 testing testing one two three -------------------------------------------------------------------------------- /tests/small-test-file.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/finos/greenkey-asrtoolkit/HEAD/tests/small-test-file.mp3 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include asrtoolkit *.py 2 | include *.md 3 | include LICENSE 4 | include 
requirements.txt 5 | -------------------------------------------------------------------------------- /samples/FinancialStatementFY18Q4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/finos/greenkey-asrtoolkit/HEAD/samples/FinancialStatementFY18Q4.xlsx -------------------------------------------------------------------------------- /tests/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | test_dir="$(dirname "${BASH_SOURCE[0]}")" 3 | python3 -m pytest -vv --doctest-modules $test_dir/../ 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | editdistance 3 | fire 4 | num2words 5 | pytest 6 | regex 7 | termcolor 8 | tqdm 9 | webvtt-py 10 | xlrd<=1.2.0 11 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | GreenKey ASRtoolkit - FINOS 2 | Copyright 2019 GreenKey Technologies 3 | 4 | This product includes software developed at the Fintech Open Source Foundation (https://www.finos.org/). 
5 | -------------------------------------------------------------------------------- /tests/small-test-file.stm: -------------------------------------------------------------------------------- 1 | small-test-file 1 gk_speaker 0.765 2.881 one two three four five 2 | small-test-file 1 gk_speaker 5.008 6.966 six seven eight nine ten 3 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/data_handlers_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for common utils for file/data handlers 4 | """ 5 | 6 | separator = "\n" 7 | 8 | 9 | def header(): 10 | " Returns header " 11 | return "" 12 | 13 | 14 | def footer(): 15 | " Returns footer " 16 | return "" 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Python egg metadata, regenerated from source files by setuptools. 
8 | /*.egg-info 9 | __pycache__ 10 | 11 | 12 | # osx 13 | .DS_Store 14 | 15 | # PyCharm 16 | .idea/ 17 | 18 | # IPython Notebooks 19 | .ipynb 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | 3 | line-length = 88 4 | target-version = ['py38'] 5 | include = '\.py(x|i)?$' 6 | 7 | 8 | [tool.isort] 9 | 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 88 16 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | beautifulsoup4 = "*" 8 | editdistance = "*" 9 | num2words = "*" 10 | pytest = "*" 11 | termcolor = "*" 12 | tqdm = "*" 13 | webvtt-py = "*" 14 | 15 | [dev-packages] 16 | pandas = "*" 17 | yapf = "*" 18 | 19 | [requires] 20 | -------------------------------------------------------------------------------- /asrtoolkit/file_utils/common_file_operations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple wrapper for general file functions 4 | """ 5 | 6 | 7 | def make_list_of_dirs(input_dir_list): 8 | """ 9 | Make an entire list of directories 10 | """ 11 | import os 12 | 13 | for this_dir in input_dir_list: 14 | os.makedirs(this_dir, exist_ok=True) 15 | -------------------------------------------------------------------------------- /tests/no_speaker.json: -------------------------------------------------------------------------------- 1 | { 2 | "segments": [ 3 | { 4 | "final": true, 5 | "words": [ 6 | { "word": "yeah", "start": 0.54, "length": 0.18, "confidence": 0.53 }, 7 | { 
"word": "i", "start": 1.44, "length": 0.27, "confidence": 0.62 }, 8 | { "word": "uh", "start": 1.71, "length": 0.12, "confidence": 0.44 } 9 | ], 10 | "transcript": "yeah i uh", 11 | "startTimeSec": 0.0, 12 | "endTimeSec": 2.0, 13 | "speakerInfo": "", 14 | "confidence": 1.0 15 | } 16 | ] 17 | } 18 | 19 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Helper functions to find the test and sample directories 4 | """ 5 | import os 6 | 7 | 8 | def get_test_dir(input_file): 9 | """ 10 | >>> get_test_dir(__file__).endswith("tests") 11 | True 12 | """ 13 | return os.path.dirname(os.path.realpath(input_file)) 14 | 15 | 16 | def get_sample_dir(input_file): 17 | """ 18 | >>> get_sample_dir(__file__).endswith("samples") 19 | True 20 | """ 21 | return os.path.join( 22 | os.path.dirname(os.path.dirname(os.path.realpath(input_file))), "samples" 23 | ) 24 | -------------------------------------------------------------------------------- /tests/test_xlsx_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test xlsx extraction 4 | """ 5 | 6 | import os 7 | 8 | from asrtoolkit.extract_excel_spreadsheets import proc_input_dir_to_corpus 9 | 10 | from utils import get_test_dir 11 | 12 | test_dir = get_test_dir(__file__) 13 | 14 | 15 | def test_excel_conversion(): 16 | " execute single test " 17 | 18 | proc_input_dir_to_corpus("samples", f"{test_dir}/corpus") 19 | assert os.path.exists(f"{test_dir}/corpus/FinancialStatementFY18Q4.txt") 20 | 21 | 22 | if __name__ == "__main__": 23 | import sys 24 | 25 | import pytest 26 | 27 | pytest.main(sys.argv) 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /tests/docker_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # Run this from the head of git repo 4 | 5 | files_to_cleanup="corpus split-corpus split file_conversion_test.txt file_conversion_test.stm json_to_stm_test_2.stm json_to_txt_test.txt json_to_stm_test_1.stm stm_to_srt_test.srt stm_to_vtt_test.vtt stm_to_html_test.html stm_to_txt_test.txt BillGatesTEDTalk_aligned.stm __pycache__ good*" 6 | 7 | docker run --rm -it -w /work \ 8 | --user $(id -u):$(id -g) --userns=host \ 9 | -e LOG_LEVEL=${LOG_LEVEL:-DEBUG} \ 10 | -v $PWD:/work asrtoolkit:${TAG:-latest} \ 11 | bash -c 'tests/run_tests.sh' 12 | (cd tests/ && rm -r $files_to_cleanup) 13 | 14 | # Then tests formatting as in CI 15 | python3 -m flake8 --ignore=E501,W504,W503,W291,F401,E741,E302,E126,E402,E251 . 
16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:buster-slim 2 | LABEL maintainer="Matthew Goldey " \ 3 | organization="Green Key Technologies " 4 | 5 | # APT INSTALLS 6 | RUN apt update && \ 7 | apt install -y python3-dev libsox-fmt-mp3 wget curl build-essential sox && \ 8 | apt-get clean autoclean && \ 9 | apt-get autoremove -y && \ 10 | rm -rf /usr/share/doc /var/lib/apt/lists/* && \ 11 | curl https://bootstrap.pypa.io/get-pip.py | python3 && \ 12 | wget https://storage.googleapis.com/gkt-external/sample_audio_files.tar.gz && tar -xvzf sample_audio_files.tar.gz 13 | 14 | WORKDIR /asrtoolkit 15 | COPY . /asrtoolkit 16 | 17 | RUN \ 18 | python3 -m pip install .[dev] && \ 19 | python3 -m pip install "requests>=2.18.4" 20 | -------------------------------------------------------------------------------- /tests/test_alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test to ensure that alignment can work 4 | """ 5 | 6 | from asrtoolkit import align_json, time_aligned_text 7 | from utils import get_sample_dir, get_test_dir 8 | 9 | test_dir = get_test_dir(__file__) 10 | sample_dir = get_sample_dir(__file__) 11 | 12 | 13 | def test_simple_alignment(): 14 | align_json( 15 | f"{sample_dir}/BillGatesTEDTalk.txt", 16 | f"{sample_dir}/BillGatesTEDTalk.json", 17 | f"{test_dir}/BillGatesTEDTalk_aligned.stm", 18 | ) 19 | 20 | aligned_transcript = time_aligned_text(f"{test_dir}/BillGatesTEDTalk_aligned.stm") 21 | assert len(aligned_transcript.segments) == 104 22 | 23 | 24 | if __name__ == "__main__": 25 | import sys 26 | 27 | import pytest 28 | 29 | pytest.main(sys.argv) 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior: 12 | 1. Go to '...' 13 | 2. Click on '....' 14 | 3. Execute '....' 15 | 4. See error 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | 20 | **Screenshots** 21 | If applicable, add screenshots to help explain your problem. 22 | 23 | **Desktop (please complete the following information):** 24 | - OS: [e.g. iOS] 25 | - Browser [e.g. chrome, safari] 26 | - Version [e.g. 22] 27 | - Python version 28 | - Using docker or pip package? 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /samples/invalid.stm: -------------------------------------------------------------------------------- 1 | Valid 2 | unknown 1 UnknownSpeaker 0.00 3.20 testing testing one two three or maybe 10 4 3 | unknown chan UnknownSpeaker 0.00 3.20 testing testing one two three 4 | unknown 2 UnknownSpeaker 0.00 3.20 testing testing, one two three! 5 | 6 | Invalid 7 | wpdofin aposifn209r8n3- 938r1n-30912n- -asdf- -13r12412mmv. 
8 | unknown 1 UnknownSpeaker 0.00 3.20 31 testing testing one two three 9 | unknown 1 UnknownSpeaker 0.00 3.20 testing testing one two three 10 | unknown 1 typo UnknownSpeaker 0.00 3.20 testing testing one two three 11 | unknown 2 UnknownSpeaker 0.00 3.20 testing testing one two three 12 | unknown 2 UnknownSpeaker 0.00 3.20 testing testing one two three 13 | unknown 2 UnknownSpeaker 9.00 3.20 testing testing one two three 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | language: python 3 | dist: xenial 4 | python: 5 | - "3.6" 6 | - "3.6-dev" # 3.6 development branch 7 | - "3.7" 8 | - "3.7-dev" # 3.7 development branch 9 | - "3.8" 10 | - "3.8-dev" # 3.8 development branch 11 | # command to install dependencies 12 | before_install: 13 | - sudo apt-get update 14 | - sudo apt-get install -y sox libsox-fmt-mp3 15 | install: 16 | - python3 -m pip install .[dev] 17 | - python3 -m pip install -r requirements.txt 18 | # command to run tests 19 | script: 20 | # validate black formatting 21 | - python3 -m black --check . 22 | # validate imports are sorted 23 | - python3 -m isort --check-only . 24 | # check for undefined variables (F821 is an excellent error for a linter to catch) 25 | # add errors to ignore here as desired 26 | - python3 -m flake8 . 
27 | # run test suite 28 | - python -m pytest -v --doctest-modules 29 | -------------------------------------------------------------------------------- /asrtoolkit/degrade_audio_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script for degrading audio files to G711 audio quality 4 | """ 5 | 6 | import logging 7 | 8 | from fire import Fire 9 | 10 | from asrtoolkit.data_structures.audio_file import degrade_audio 11 | from asrtoolkit.file_utils.script_input_validation import valid_input_file 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | def degrade_all_files(*audio_files): 17 | """ 18 | Degrade all audio files given as arguments (in place by default) 19 | """ 20 | for file_name in audio_files: 21 | if valid_input_file(file_name, ["mp3", "sph", "wav", "au", "raw"]): 22 | degrade_audio(file_name) 23 | else: 24 | LOGGER.error("Invalid input file %s", file_name) 25 | 26 | 27 | def cli(): 28 | Fire(degrade_all_files) 29 | 30 | 31 | if __name__ == "__main__": 32 | cli() 33 | -------------------------------------------------------------------------------- /tests/test_split_audio_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test audio file splitter 4 | """ 5 | import os 6 | 7 | from asrtoolkit.split_audio_file import split_audio_file 8 | from utils import get_test_dir 9 | 10 | test_dir = get_test_dir(__file__) 11 | 12 | 13 | def test_split_audio_file(): 14 | """ 15 | Test audio file splitter 16 | """ 17 | split_audio_file( 18 | f"{test_dir}/small-test-file.mp3", 19 | f"{test_dir}/small-test-file.stm", 20 | f"{test_dir}/split", 21 | ) 22 | assert set(os.listdir(f"{test_dir}/split")) == { 23 | "small_test_file_seg_00001.stm", 24 | "small_test_file_seg_00000.mp3", 25 | "small_test_file_seg_00001.mp3", 26 | "small_test_file_seg_00000.stm", 27 | } 28 | 29 | 30 | if __name__ == "__main__": 31 | import 
sys 32 | 33 | import pytest 34 | 35 | pytest.main(sys.argv) 36 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/webvtt_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for common utils for WEBVTT files 4 | 5 | This expects a segment from class derived in convert_text 6 | """ 7 | 8 | import logging 9 | import re 10 | 11 | from asrtoolkit.data_structures.segment import segment 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | non_transcript_marks = re.compile(r"\[[A-Za-z0-9]{1,}\]") 15 | 16 | 17 | def read_caption(caption): 18 | """ 19 | Parses caption object to return a segment object 20 | """ 21 | seg = None 22 | 23 | try: 24 | start = caption.start_in_seconds 25 | stop = caption.end_in_seconds 26 | 27 | text = re.sub(non_transcript_marks, lambda v: "", caption.text.strip()).strip() 28 | 29 | seg = segment({"start": start, "stop": stop, "text": text}) 30 | except Exception as exc: 31 | seg = None 32 | LOGGER.exception(exc) 33 | 34 | return seg if seg and seg.validate() else None 35 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/preprocess_txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | import os 5 | 6 | from asrtoolkit.clean_formatting import clean_up 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | def parse_transcript(transcript, speaker="unknown", gender="male", token_idx_offset=0): 12 | """ 13 | Given reference transcript (and ideally speaker and gender), 14 | Return a list of tokenized word objects 15 | """ 16 | 17 | if os.path.exists(transcript): 18 | transcript = open(transcript).read() 19 | else: 20 | LOGGER.info( 21 | "Transcript is not a file location; assuming it is raw text instead." 
22 | ) 23 | 24 | clean_lattice = [] 25 | for i, token in enumerate(clean_up(transcript).split()): 26 | word_dict = dict( 27 | token=token, token_idx=token_idx_offset + i, speaker=speaker, gender=gender 28 | ) 29 | clean_lattice.append(word_dict) 30 | return clean_lattice 31 | -------------------------------------------------------------------------------- /asrtoolkit/convert_transcript.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python class for converting file formats used in Automatic Speech Recognition 4 | """ 5 | 6 | import logging 7 | import sys 8 | 9 | from fire import Fire 10 | 11 | from asrtoolkit.file_utils.script_input_validation import assign_if_valid 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | def check_input_file_validity(input_file): 17 | if input_file is None: 18 | LOGGER.error("Invalid input file %s", input_file) 19 | sys.exit(1) 20 | 21 | 22 | def convert(input_file, output_file): 23 | """ 24 | Convert between text file formats (supported formats are stm, json, srt, vtt, txt, and html) 25 | 26 | Validates lines of transcript before writing new file. 
27 | STM files are unformatted (eg 10 -> ten) 28 | """ 29 | check_input_file_validity(input_file) 30 | input_file = assign_if_valid(input_file) 31 | input_file.write(output_file) 32 | 33 | 34 | def cli(): 35 | Fire(convert) 36 | 37 | 38 | if __name__ == "__main__": 39 | cli() 40 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading TXT files 4 | 5 | This expects a segment from class derived in convert_text 6 | """ 7 | 8 | # do not delete - needed in time_aligned_text 9 | from asrtoolkit.data_handlers.data_handlers_common import footer, header, separator 10 | from asrtoolkit.data_structures.segment import segment 11 | 12 | 13 | def format_segment(seg): 14 | """ 15 | Formats a segment assuming it's an instance of class segment with text element 16 | """ 17 | return seg.formatted_text if getattr(seg, "formatted_text") else seg.text 18 | 19 | 20 | def read_in_memory(input_data): 21 | """ 22 | Reads input text 23 | """ 24 | segments = [] 25 | for line in input_data.splitlines(): 26 | segments.append(segment({"text": line.strip()})) 27 | return segments 28 | 29 | 30 | def read_file(file_name): 31 | """ 32 | Reads a TXT file 33 | """ 34 | segments = [] 35 | with open(file_name, encoding="utf-8") as f: 36 | segments = read_in_memory(f.read()) 37 | return segments 38 | 39 | 40 | __all__ = [header, footer, separator] 41 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/formatting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Holds all formatting utilities 4 | """ 5 | 6 | 7 | def std_float(number, num_decimals=2): 8 | """ 9 | Print a number to string with n digits after the decimal point 10 | (default = 2) 11 | """ 12 | return 
"{0:.{1:}f}".format(float(number), num_decimals) 13 | 14 | 15 | def timestamp_to_seconds(timestamp): 16 | """ 17 | Convert a timestamp to seconds 18 | """ 19 | parts = timestamp.split(":") 20 | return std_float(float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2]), 3) 21 | 22 | 23 | def seconds_to_timestamp(seconds): 24 | """ 25 | Convert from seconds to a timestamp 26 | """ 27 | minutes, seconds = divmod(float(seconds), 60) 28 | hours, minutes = divmod(minutes, 60) 29 | return "%02d:%02d:%06.3f" % (hours, minutes, seconds) 30 | 31 | 32 | def clean_float(input_float): 33 | """ 34 | Return float in seconds (even if it was a timestamp originally) 35 | """ 36 | return ( 37 | timestamp_to_seconds(input_float) 38 | if ":" in str(input_float) 39 | else std_float(input_float) 40 | ) 41 | -------------------------------------------------------------------------------- /tests/test_wer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test wer calculation 4 | """ 5 | 6 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 7 | from asrtoolkit.wer import cer, wer 8 | from utils import get_sample_dir 9 | 10 | sample_dir = get_sample_dir(__file__) 11 | 12 | 13 | def test_conversion_wer(): 14 | " execute single test " 15 | 16 | reference_file = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm") 17 | transcript_file = time_aligned_text( 18 | f"{sample_dir}/BillGatesTEDTalk_intentionally_poor_transcription.txt" 19 | ) 20 | 21 | # test fixed precision output of wer calculation 22 | assert ( 23 | "{:5.3f}".format(wer(reference_file.text(), transcript_file.text(), True)) 24 | == "3.332" 25 | ) 26 | 27 | 28 | def test_non_ascii(): 29 | """ 30 | Test WER for non-ascii characters 31 | """ 32 | 33 | ref = """정답입니다 34 | 정답 이에요 35 | 삐약""" 36 | hyp = """정답 입 니다 37 | 정답 이에요 38 | 하이륭""" 39 | 40 | assert wer(ref, hyp) == 100.0 41 | 42 | assert cer(ref, hyp) == 100.0 / 3.0 43 | 44 | 45 
| if __name__ == "__main__": 46 | import sys 47 | 48 | import pytest 49 | 50 | pytest.main(sys.argv) 51 | -------------------------------------------------------------------------------- /asrtoolkit/file_utils/script_input_validation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple wrapper for validating script inputs 4 | """ 5 | 6 | from os.path import isfile 7 | 8 | from asrtoolkit.file_utils.name_cleaners import get_extension 9 | 10 | VALID_EXTENSIONS = ["json", "srt", "stm", "vtt", "txt", "html"] 11 | 12 | 13 | def valid_input_file(file_name, valid_extensions=[]): 14 | """ 15 | tests that a file exists and that the extension is one asrtoolkit scripts can accept 16 | >>> import os 17 | >>> module_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) 18 | >>> valid_input_file(f"{module_path}/setup.py") 19 | False 20 | >>> valid_input_file(f"{module_path}/setup.py", ["py"]) 21 | True 22 | >>> valid_input_file(f"{module_path}/requirements.txt", ["txt"]) 23 | True 24 | """ 25 | return isfile(file_name) and get_extension(file_name) in ( 26 | valid_extensions if valid_extensions else VALID_EXTENSIONS 27 | ) 28 | 29 | 30 | def assign_if_valid(file_name): 31 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 32 | 33 | " returns a time_aligned_text object if valid else None" 34 | return time_aligned_text(file_name) if valid_input_file(file_name) else None 35 | -------------------------------------------------------------------------------- /asrtoolkit/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import logging 3 | 4 | from num2words import base 5 | from pkg_resources import get_distribution 6 | 7 | from asrtoolkit.clean_formatting import clean_up 8 | from asrtoolkit.convert_transcript import convert 9 | from asrtoolkit.data_structures.audio_file import 
audio_file, combine_audio 10 | from asrtoolkit.data_structures.corpus import corpus 11 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 12 | from asrtoolkit.file_utils.name_cleaners import ( 13 | basename, 14 | get_extension, 15 | sanitize, 16 | strip_extension, 17 | ) 18 | from asrtoolkit.wer import cer, wer 19 | 20 | LOGGER = logging.getLogger(__name__) 21 | 22 | __version__ = get_distribution("asrtoolkit").version 23 | __all__ = [ 24 | audio_file, 25 | base, 26 | basename, 27 | cer, 28 | clean_up, 29 | combine_audio, 30 | convert, 31 | corpus, 32 | get_extension, 33 | sanitize, 34 | strip_extension, 35 | time_aligned_text, 36 | wer, 37 | ] 38 | 39 | try: 40 | from asrtoolkit.align_json import align_json 41 | 42 | __all__.append(align_json) 43 | except ImportError: 44 | # Catch ImportError to let doctests pass 45 | LOGGER.info( 46 | "Unable to import alignment utilities due to missing development package requirements" 47 | ) 48 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/srt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading/writing SRT files 4 | 5 | This expects a segment from class derived in convert_text 6 | """ 7 | 8 | from webvtt import WebVTT 9 | 10 | # do not delete - needed in time_aligned_text 11 | from asrtoolkit.data_handlers.data_handlers_common import footer, header, separator 12 | from asrtoolkit.data_handlers.webvtt_common import read_caption 13 | from asrtoolkit.data_structures.formatting import seconds_to_timestamp 14 | 15 | 16 | def format_segment(seg): 17 | """ 18 | Formats a segment assuming it's an instance of class segment with elements 19 | filename, channel, speaker, start and stop times, label, and text 20 | """ 21 | 22 | ret_str = "1\n{:} --> {:}\n".format( 23 | seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop) 24 | ).replace(".", ",") 25
| ret_str += "{:}\n\n".format(seg.formatted_text if seg.formatted_text else seg.text) 26 | 27 | return ret_str 28 | 29 | 30 | def read_file(file_name): 31 | """ Reads an SRT file """ 32 | 33 | data = WebVTT.from_srt(file_name) 34 | captions = data.captions 35 | 36 | segments = [] 37 | for caption in captions: 38 | seg = read_caption(caption) 39 | if seg is not None: 40 | segments.append(seg) 41 | 42 | return segments 43 | 44 | 45 | __all__ = [header, footer, separator] 46 | -------------------------------------------------------------------------------- /asrtoolkit/split_audio_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script for splitting audio files using a transcript with start/stop times 4 | """ 5 | 6 | import logging 7 | import sys 8 | 9 | from fire import Fire 10 | 11 | from asrtoolkit.data_structures.audio_file import audio_file 12 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 13 | from asrtoolkit.file_utils.script_input_validation import valid_input_file 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | 17 | 18 | def split_audio_file(source_audio_file, source_transcript, target_directory="split"): 19 | """ 20 | Split source audio file into segments denoted by transcript file 21 | into target_directory 22 | Results in stm and sph files in target directory 23 | """ 24 | source_audio = audio_file(source_audio_file) 25 | transcript = time_aligned_text(source_transcript) 26 | source_audio.split(transcript, target_directory) 27 | 28 | 29 | def validate_transcript(transcript): 30 | """ 31 | Exit if invalid transcript 32 | """ 33 | if not valid_input_file(transcript): 34 | LOGGER.error("Invalid transcript file %s", transcript) 35 | sys.exit(1) 36 | 37 | 38 | def validate_audio_file(source_audio_file): 39 | if not valid_input_file(source_audio_file, ["mp3", "sph", "wav", "au", "raw"]): 40 | LOGGER.error("Invalid audio file %s", source_audio_file) 41 
| sys.exit(1) 42 | 43 | 44 | def cli(): 45 | Fire(split_audio_file) 46 | 47 | 48 | if __name__ == "__main__": 49 | cli() 50 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/initialize_logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This file should be imported/called before any other modules that use any logging. 4 | If any other modules that use logging are imported before this file, 5 | then any calls to the logging module will not be formatted properly until after this module is loaded. 6 | """ 7 | import logging 8 | import os 9 | import sys 10 | 11 | # Get logging level from the environment, if it is set 12 | LOG_LEVEL_ENV = os.environ.get("LOG_LEVEL", "INFO").upper() 13 | 14 | 15 | def initialize_logger(stream=sys.stderr): 16 | """ 17 | Set logging level to the level specified in environment, or "INFO" if no valid value specified. 18 | """ 19 | # Note, do NOT do any logging in this function until the "basicConfig" is set! 20 | unable_to_set_user_log_level = False 21 | 22 | try: 23 | logging_level = getattr(logging, LOG_LEVEL_ENV) 24 | except AttributeError: 25 | unable_to_set_user_log_level = True 26 | logging_level = logging.INFO 27 | 28 | logging.basicConfig( 29 | level=logging_level, 30 | format="%(levelname)-8s - %(asctime)s - %(name)s - %(message)s", 31 | stream=stream, 32 | ) 33 | 34 | logging.info( 35 | 'Logger initialized! 
Threshold set to "%s".', 36 | logging.getLevelName(logging_level), 37 | ) 38 | 39 | if unable_to_set_user_log_level: 40 | logging.warning( 41 | "User attempted to set log level to %s, which is not a valid log level!", 42 | LOG_LEVEL_ENV, 43 | ) 44 | -------------------------------------------------------------------------------- /asrtoolkit/align_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Forced alignment tools CLI 4 | """ 5 | 6 | from fire import Fire 7 | 8 | from asrtoolkit.alignment import preprocess_gk_json, preprocess_txt 9 | from asrtoolkit.alignment.align import align 10 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 11 | from asrtoolkit.file_utils.name_cleaners import basename, sanitize, strip_extension 12 | 13 | 14 | def align_json(ref_txt, json_file, filename=None): 15 | """ 16 | CLI for forced alignment tools 17 | Using a reference txt file and a hypothesis gk json 18 | file, this time-aligns the reference txt file 19 | and outputs an STM file 20 | Input 21 | ref_txt, str - reference text file containing ground truth 22 | json_file, str - hypothesis gk JSON file 23 | filename, str - output STM filename 24 | """ 25 | 26 | ref_tokens = preprocess_txt.parse_transcript(ref_txt) 27 | gk_json = preprocess_gk_json.preprocess_transcript(json_file) 28 | segments = align(gk_json, ref_tokens) 29 | 30 | if filename is None: 31 | filename = basename(sanitize(strip_extension(ref_txt))) + ".stm" 32 | 33 | # fix segment filename and speaker 34 | for seg in segments: 35 | seg.filename = strip_extension(filename) 36 | seg.speaker = strip_extension(filename) + "UnknownSpeaker" 37 | 38 | output = time_aligned_text() 39 | output.segments = segments 40 | output.write(filename) 41 | 42 | 43 | def cli(): 44 | Fire(align_json) 45 | 46 | 47 | if __name__ == "__main__": 48 | cli() 49 | -------------------------------------------------------------------------------- 
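The default STM filename in `align_json` above is derived by stripping the extension, replacing hyphens and spaces with underscores, and dropping the directory. The sketch below re-implements the relevant `name_cleaners` helpers with the standard library only; these are simplified stand-ins for `asrtoolkit.file_utils.name_cleaners`, and the input path is hypothetical:

```python
import os


def strip_extension(file_name):
    # drop the final ".ext" (mirrors name_cleaners.strip_extension)
    return ".".join(file_name.split(".")[:-1]) if file_name else ""


def sanitize(file_name, chars_to_replace="- "):
    # replace hyphens and spaces in the basename with underscores,
    # leaving the directory part untouched
    for c in chars_to_replace:
        file_name = os.path.join(
            os.path.dirname(file_name), os.path.basename(file_name).replace(c, "_")
        )
    return file_name


ref_txt = "data/my call-2021.txt"  # hypothetical reference transcript path
stm_name = os.path.basename(sanitize(strip_extension(ref_txt))) + ".stm"
print(stm_name)  # my_call_2021.stm
```

Sanitizing before writing matters because the sphere/STM tooling downstream expects filenames without spaces or hyphens, so transcript and audio names stay in sync.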
/asrtoolkit/data_handlers/vtt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading/writing WEBVTT files 4 | 5 | This expects a segment from class derived in convert_text 6 | """ 7 | 8 | from webvtt import WebVTT 9 | 10 | # do not delete - needed for time_aligned_text 11 | from asrtoolkit.data_handlers.data_handlers_common import footer, separator 12 | from asrtoolkit.data_handlers.webvtt_common import read_caption 13 | from asrtoolkit.data_structures.formatting import seconds_to_timestamp 14 | 15 | 16 | def header(): 17 | " Returns header - see https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API for detailed use " 18 | return "WEBVTT - This file produced by GreenKey's ASRToolkit.\n\n" 19 | 20 | 21 | def format_segment(seg): 22 | """ 23 | Formats a segment assuming it's an instance of class segment with elements 24 | filename, channel, speaker, start and stop times, label, and text 25 | """ 26 | 27 | ret_str = "{:} --> {:}".format( 28 | seconds_to_timestamp(seg.start), seconds_to_timestamp(seg.stop) 29 | ) 30 | ret_str += " <{:}>".format(seg.channel) 31 | ret_str += " <v {:}>".format(seg.speaker) 32 | ret_str += "\n{:}\n".format(seg.formatted_text if seg.formatted_text else seg.text) 33 | 34 | return ret_str 35 | 36 | 37 | def read_file(file_name): 38 | """ Reads a VTT file """ 39 | 40 | data = WebVTT.read(file_name) 41 | captions = data.captions 42 | 43 | segments = [] 44 | for caption in captions: 45 | seg = read_caption(caption) 46 | if seg is not None: 47 | segments.append(seg) 48 | 49 | return segments 50 | 51 | 52 | __all__ = [header, footer, separator] 53 | -------------------------------------------------------------------------------- /tests/test_initialization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test time_aligned_text initialization 4 | """ 5 | 6 | import hashlib 7 | import json 8 | 9 | from
asrtoolkit.data_structures.time_aligned_text import time_aligned_text 10 | 11 | from utils import get_sample_dir, get_test_dir 12 | 13 | test_dir = get_test_dir(__file__) 14 | sample_dir = get_sample_dir(__file__) 15 | 16 | 17 | def test_json_initialization(): 18 | " execute single test " 19 | 20 | input_dict = json.load(open(f"{sample_dir}/BillGatesTEDTalk.json")) 21 | text_object = time_aligned_text(input_dict) 22 | 23 | reference_sha = hashlib.sha1( 24 | open(f"{sample_dir}/BillGatesTEDTalk_transcribed.stm", "r", encoding="utf8") 25 | .read() 26 | .encode() 27 | ).hexdigest() 28 | text_object.write(f"{test_dir}/file_conversion_test.stm") 29 | new_sha = hashlib.sha1( 30 | open(f"{test_dir}/file_conversion_test.stm", "r", encoding="utf8") 31 | .read() 32 | .encode() 33 | ).hexdigest() 34 | assert reference_sha == new_sha 35 | 36 | 37 | def test_txt_initialization(): 38 | " execute single test " 39 | 40 | input_dict = json.load(open(f"{sample_dir}/BillGatesTEDTalk.json")) 41 | text = time_aligned_text(input_dict) 42 | text.file_extension = "txt" 43 | 44 | text_object = time_aligned_text(text.__str__()) 45 | 46 | reference_sha = hashlib.sha1( 47 | open(f"{sample_dir}/BillGatesTEDTalk_transcribed.txt", "r", encoding="utf8") 48 | .read() 49 | .encode() 50 | ).hexdigest() 51 | text_object.write(f"{test_dir}/file_conversion_test.txt") 52 | new_sha = hashlib.sha1( 53 | open(f"{test_dir}/file_conversion_test.txt", "r", encoding="utf8") 54 | .read() 55 | .encode() 56 | ).hexdigest() 57 | assert reference_sha == new_sha 58 | 59 | 60 | if __name__ == "__main__": 61 | import sys 62 | 63 | import pytest 64 | 65 | pytest.main(sys.argv) 66 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/preprocess_gk_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | 5 | from asrtoolkit.clean_formatting import clean_up 6 | 7 | 8 | def 
preprocess_transcript(input_file): 9 | """ 10 | Given a str file path to gk json output 11 | Return a list of tokenized word objects 12 | """ 13 | data = json.load(open(input_file, "r+", encoding="utf-8")) 14 | segment_times = [ 15 | (segment["startTimeSec"], segment["endTimeSec"]) for segment in data["segments"] 16 | ] 17 | lattice_segments = [segment["words"] for segment in data["segments"]] 18 | transcript_segments = [segment["transcript"] for segment in data["segments"]] 19 | 20 | # start token count (increment for included tokens) 21 | token_idx = 0 22 | clean_lattice = [] 23 | changed = [] 24 | for seg_id, seg_text in enumerate(transcript_segments): 25 | seg_lattice = lattice_segments[seg_id] 26 | start_time, end_time = segment_times[seg_id] 27 | for i, token in enumerate(seg_text.split()): 28 | clean_token = clean_up(token).strip() 29 | if clean_token: 30 | word_dict = seg_lattice[i] 31 | assert word_dict["word"] == token 32 | 33 | if word_dict["word"] != clean_token: 34 | changed.append([seg_id, i, token, clean_token]) 35 | 36 | # times (audio-aligned) 37 | start, duration = start_time + word_dict["start"], word_dict["length"] 38 | end = start + duration 39 | 40 | # generate token-level dict for each cleaned token retained 41 | word_dict = dict( 42 | gk_token=clean_token, 43 | gk_token_idx=token_idx, 44 | seg_id=seg_id, 45 | start=start, 46 | end=end, 47 | duration=duration, 48 | ) 49 | # adding cleaned word + metadata to clean_lattice 50 | clean_lattice.append(word_dict) 51 | token_idx += 1 52 | return clean_lattice 53 | -------------------------------------------------------------------------------- /asrtoolkit/file_utils/name_cleaners.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple wrapper for name-cleaning functions 4 | """ 5 | 6 | import os 7 | 8 | 9 | def get_extension(file_name): 10 | """ 11 | Returns file extension 12 | >>> get_extension("foo.txt") 13 | 'txt' 14 | """ 
15 | return file_name.split(".")[-1] 16 | 17 | 18 | def basename(file_name): 19 | """ 20 | Returns basename of a file without the preceding directory 21 | """ 22 | return os.path.basename(file_name) 23 | 24 | 25 | def strip_extension(file_name): 26 | """ 27 | Returns file without extension 28 | """ 29 | return ".".join(file_name.split(".")[:-1]) if file_name else "" 30 | 31 | 32 | def sanitize(file_name, chars_to_replace="- ", silent=True): 33 | """ 34 | replace input characters with underscores if present in file name 35 | >>> sanitize("-asdflkj_ .tmp") 36 | '_asdflkj__.tmp' 37 | """ 38 | 39 | def replace_char(input_char, file_name): 40 | " replace specific char if present " 41 | if input_char in file_name and not silent: 42 | print( 43 | "replacing '{:}'s with underscores in sph file output - ".format(input_char) 44 | + "check to make sure your audio files and transcript files match" 45 | ) 46 | return os.path.join( 47 | os.path.dirname(file_name), basename(file_name).replace(input_char, "_") 48 | ) 49 | 50 | for c in chars_to_replace: 51 | file_name = replace_char(c, file_name) 52 | 53 | return file_name 54 | 55 | 56 | def sanitize_hyphens(file_name, silent=True): 57 | """ 58 | replace hyphens with underscores if present in file name 59 | """ 60 | return sanitize(file_name, "-", silent=silent) 61 | 62 | 63 | def generate_segmented_file_name(target_dir, file_name, iseg): 64 | """ 65 | Take a target location, a current location, and a segment number and generate a target filename 66 | """ 67 | return sanitize_hyphens( 68 | target_dir 69 | + os.sep 70 | + basename(strip_extension(file_name)) 71 | + "_seg_{:05d}.".format(iseg) 72 | + file_name.split(".")[-1] 73 | ) 74 | -------------------------------------------------------------------------------- /tests/test_remove_invalid_lines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test invalid line removal 4 | """ 5 | 6 | from asrtoolkit.convert_transcript
import convert 7 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 8 | from utils import get_sample_dir, get_test_dir 9 | 10 | test_dir = get_test_dir(__file__) 11 | sample_dir = get_sample_dir(__file__) 12 | 13 | 14 | EXPECTED_UNFORMATTED_TRANSCRIPTS = ( 15 | "testing testing one two three or maybe ten four", 16 | "testing testing one two three", 17 | "testing testing one two three", 18 | ) 19 | 20 | EXPECTED_FORMATTED_TRANSCRIPTS = ( 21 | "testing testing one two three or maybe 10 4", 22 | "testing testing one two three", 23 | "testing testing, one two three!", 24 | ) 25 | 26 | 27 | def validate_sample(ext, expected_transcripts, out_segments): 28 | base_output = f"{test_dir}/good" 29 | convert(f"{sample_dir}/invalid.stm", base_output + ext) 30 | validated_transcript = time_aligned_text(base_output + ext) 31 | assert len(validated_transcript.segments) == out_segments 32 | for seg, expected_text in zip(validated_transcript.segments, expected_transcripts): 33 | assert seg.text == expected_text 34 | 35 | 36 | def test_stm_to_stm(): 37 | " Test stm to stm validation " 38 | validate_sample(".stm", EXPECTED_UNFORMATTED_TRANSCRIPTS, 3) 39 | 40 | 41 | def test_stm_to_json(): 42 | " Test stm to gk json validation " 43 | validate_sample(".json", EXPECTED_FORMATTED_TRANSCRIPTS, 3) 44 | 45 | 46 | def test_stm_to_vtt(): 47 | " Test stm to vtt validation " 48 | validate_sample(".vtt", EXPECTED_FORMATTED_TRANSCRIPTS, 3) 49 | 50 | 51 | def test_stm_to_html(): 52 | " Test stm to html validation " 53 | validate_sample(".html", EXPECTED_FORMATTED_TRANSCRIPTS, 3) 54 | 55 | 56 | def test_stm_to_srt(): 57 | " Test stm to srt validation " 58 | validate_sample(".srt", EXPECTED_FORMATTED_TRANSCRIPTS, 3) 59 | 60 | 61 | def test_stm_to_txt(): 62 | " Test stm to txt validation " 63 | validate_sample(".txt", EXPECTED_FORMATTED_TRANSCRIPTS, 3) 64 | 65 | 66 | if __name__ == "__main__": 67 | import sys 68 | 69 | import pytest 70 | 71 | pytest.main(sys.argv) 72
| -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/stm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading STM files 4 | 5 | Expected file format is derived from http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/infmts.htm#stm_fmt_name_0 6 | 7 | This expects a segment from class derived in convert_text 8 | """ 9 | 10 | from asrtoolkit.clean_formatting import clean_up 11 | 12 | # leave in place for other imports 13 | from asrtoolkit.data_handlers.data_handlers_common import footer, header, separator 14 | from asrtoolkit.data_structures.segment import segment 15 | 16 | 17 | def footer(): 18 | " Returns footer with trailing line break " 19 | return "\n" 20 | 21 | 22 | def format_segment(seg): 23 | """ 24 | :param seg: segment object 25 | :return str: text for a particular STM line (see segment __str__ method) 26 | Formats a segment assuming it's an instance of class segment with elements 27 | filename, channel, speaker, start and stop times, label, and text 28 | """ 29 | # clean_up used to unformat stm file text 30 | return " ".join( 31 | [ 32 | str(getattr(seg, _)) 33 | for _ in ("filename", "channel", "speaker", "start", "stop", "label") 34 | ] 35 | + [clean_up(seg.text)] 36 | ) 37 | 38 | 39 | def parse_line(line): 40 | """ 41 | :param line: str; a single line of an stm file 42 | :return: segment object if STM file line contains accurately formatted data; else None 43 | """ 44 | data = line.strip().split() 45 | 46 | seg = None 47 | if len(data) > 6: 48 | filename, channel, speaker, start, stop, label = data[:6] 49 | text = " ".join(data[6:]) 50 | seg = segment( 51 | { 52 | "filename": filename, 53 | "channel": channel, 54 | "speaker": speaker, 55 | "start": start, 56 | "stop": stop, 57 | "label": label, 58 | "text": text, 59 | } 60 | ) 61 | return seg if (seg is not None) and seg.validate() else None 62 | 63 | 64 | def 
read_file(file_name): 65 | """ 66 | Reads an STM file, skipping any gap lines 67 | :return: list of segment objects 68 | """ 69 | segments = [] 70 | with open(file_name, encoding="utf-8") as f: 71 | for line in f: 72 | seg = parse_line(line) 73 | if seg is not None: 74 | segments.append(seg) 75 | return segments 76 | 77 | 78 | __all__ = [header, footer, separator] 79 | -------------------------------------------------------------------------------- /asrtoolkit/extract_excel_spreadsheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | python extract_excel_spreadsheets foldername 4 | 5 | Parses all spreadsheets in an input folder, extracts text, 6 | and formats it into a target corpus folder for language model training 7 | """ 8 | 9 | import argparse 10 | import os 11 | from glob import glob 12 | 13 | import pandas as pd 14 | 15 | from asrtoolkit.clean_formatting import clean_up 16 | from asrtoolkit.file_utils.name_cleaners import basename, sanitize, strip_extension 17 | 18 | 19 | def clean_line(line): 20 | "clean up a line and test for empty values" 21 | return clean_up( 22 | " ".join(map(lambda val: str(val) if not pd.isnull(val) else "", line)) 23 | ) 24 | 25 | 26 | def dump_sheet(output_file, sheet): 27 | "dump a sheet from a list of spreadsheets into a file" 28 | output_file.write("\n".join(clean_line(line) for line in sheet if clean_line(line))) 29 | 30 | 31 | def extract_xlsx(filename, target_folder): 32 | """ 33 | For an excel spreadsheet, extract to a text file 34 | """ 35 | working_excel_data_structure = pd.ExcelFile(filename) 36 | raw_name = sanitize(strip_extension(basename(filename))) 37 | 38 | with open("".join([target_folder, "/", raw_name, ".txt"]), "a+") as output_file: 39 | for sheet in working_excel_data_structure.sheet_names: 40 | dump_sheet(output_file, working_excel_data_structure.parse(sheet).values) 41 | 42 | 43 | def proc_input_dir_to_corpus(input_dir, output_dir): 44 
| """ 45 | Take an input dir of excel spreadsheets and process it to an output corpus dir of text files 46 | """ 47 | os.makedirs(output_dir, exist_ok=True) 48 | 49 | for spreadsheet in glob(input_dir + "/*.xlsx") + glob(input_dir + "/*.xls"): 50 | extract_xlsx(spreadsheet, output_dir) 51 | 52 | 53 | def main(): 54 | parser = argparse.ArgumentParser( 55 | description="convert a folder of excel spreadsheets to a corpus of text files" 56 | ) 57 | parser.add_argument( 58 | "--input-folder", 59 | default="./", 60 | type=str, 61 | help="input folder of excel spreadsheets ending in .xls or .xlsx", 62 | ) 63 | parser.add_argument( 64 | "--output-corpus", 65 | default="corpus", 66 | type=str, 67 | help="output folder for storing text corpus", 68 | ) 69 | args = parser.parse_args() 70 | proc_input_dir_to_corpus(args.input_folder, args.output_corpus) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for handling HTML file io 4 | 5 | Note that the structure may change without notice and break backwards compatibility 6 | This expects a segment from class derived in convert_text 7 | """ 8 | 9 | from bs4 import BeautifulSoup 10 | 11 | # do not delete - needed in time_aligned_text 12 | from asrtoolkit.data_handlers.data_handlers_common import separator 13 | from asrtoolkit.data_structures.segment import segment 14 | 15 | 16 | def table_header(text, width): 17 | " make a table header with input width " 18 | return '{:}'.format(width, text) 19 | 20 | 21 | def table_delimiter(text): 22 | " make a table delimiter element " 23 | return '{:}'.format(text) 24 | 25 | 26 | def header(): 27 | " Returns html header " 28 | 29 | widths = [10, 8, 82] 30 | 31 | return ( 32 | "\n" 33 | + "".join( 34 | table_header(t, w) 35 | for t, w in zip( 36 | 
["[Start time - End time]", "Speaker", "Transcript"], widths 37 | ) 38 | ) 39 | + "\n" 40 | ) 41 | 42 | 43 | def footer(): 44 | " Returns html footer " 45 | return "
\n" 46 | 47 | 48 | def format_segment(seg): 49 | """ 50 | Formats a segment assuming it's an instance of class segment with elements 51 | filename, channel, speaker, start and stop times, label, and text 52 | """ 53 | 54 | return ( 55 | "" 56 | + "".join( 57 | table_delimiter(t) 58 | for t in [ 59 | "[{:} - {:}]".format(seg.start, seg.stop), 60 | seg.speaker, 61 | seg.formatted_text if seg.formatted_text else seg.text, 62 | ] 63 | ) 64 | + "\n" 65 | ) 66 | 67 | 68 | def parse_line(line): 69 | " parse a single line of an html file" 70 | cols = line.findAll("td") 71 | seg = None 72 | if cols: 73 | start_stop, speaker, text = [[val for val in col.children][0] for col in cols] 74 | start, stop = start_stop[1:-1].split(" - ") 75 | seg = segment({"speaker": speaker, "start": start, "stop": stop, "text": text}) 76 | seg = seg if seg.validate() else None 77 | return seg 78 | 79 | 80 | def read_file(file_name): 81 | """ 82 | Reads an HTML file, skipping any gap lines 83 | """ 84 | soup = BeautifulSoup(open(file_name).read(), "html.parser") 85 | table = soup.find("table", {}) 86 | 87 | segments = [_ for _ in map(parse_line, table.findAll("tr")) if _] 88 | 89 | return segments 90 | 91 | 92 | __all__ = [header, footer, separator] 93 | -------------------------------------------------------------------------------- /tests/test_split_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test audio file splitter 4 | """ 5 | import os 6 | import shutil 7 | from os.path import join as pjoin 8 | 9 | from asrtoolkit.data_structures.corpus import corpus 10 | from asrtoolkit.split_corpus import split_corpus 11 | from utils import get_sample_dir, get_test_dir 12 | 13 | test_dir = get_test_dir(__file__) 14 | sample_dir = get_sample_dir(__file__) 15 | 16 | 17 | def setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars): 18 | """ Setup fake corpus for testing """ 19 | os.makedirs(orig_dir, exist_ok=True) 20 | 
os.makedirs(trn_dir, exist_ok=True) 21 | os.makedirs(dev_dir, exist_ok=True) 22 | for i in range(n_exemplars): 23 | shutil.copy( 24 | f"{test_dir}/small-test-file.mp3", 25 | pjoin(orig_dir, "file-{:02d}.mp3".format(i)), 26 | ) 27 | shutil.copy( 28 | f"{test_dir}/small-test-file.stm", 29 | pjoin(orig_dir, "file-{:02d}.stm".format(i)), 30 | ) 31 | 32 | 33 | def validate_split(directory, inds): 34 | """ Validate the files were split as expected """ 35 | assert set(os.listdir(directory)) == { 36 | "file-{:02d}.{}".format(i, ext) for ext in ["sph", "stm"] for i in inds 37 | } 38 | 39 | 40 | def test_split_corpus(): 41 | """ Test corpus splitter """ 42 | n_exemplars = 10 43 | corpus_dir = f"{test_dir}/split-corpus" 44 | 45 | orig_dir = pjoin(corpus_dir, "orig") 46 | split_dir = pjoin(corpus_dir, "splits") 47 | trn_dir = pjoin(split_dir, "train") 48 | dev_dir = pjoin(split_dir, "dev") 49 | 50 | setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars) 51 | orig_corpus = corpus({"location": orig_dir}) 52 | split_corpus( 53 | orig_dir, 54 | split_dir=split_dir, 55 | split_name="dev", 56 | split_words=19, 57 | min_split_segs=1, 58 | leftover_data_split_name="train", 59 | rand_seed=1337, 60 | ) 61 | 62 | # Make sure we didn't destroy input data 63 | final_corpus = corpus({"location": orig_dir}) 64 | assert orig_corpus.validate() == 1 65 | assert final_corpus.validate() == 1 66 | orig_hashes = [_.hash() for _ in orig_corpus.exemplars] 67 | final_hashes = [_.hash() for _ in final_corpus.exemplars] 68 | assert all(h in final_hashes for h in orig_hashes) 69 | 70 | # Make sure correct number of words present in data split 71 | dev_corpus = corpus({"location": dev_dir}) 72 | assert sum(e.count_words() for e in dev_corpus.exemplars) == 20 73 | assert dev_corpus.validate() 74 | 75 | 76 | if __name__ == "__main__": 77 | import sys 78 | 79 | import pytest 80 | 81 | pytest.main(sys.argv) 82 | -------------------------------------------------------------------------------- 
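The word-budget arithmetic exercised by `test_split_corpus` above (ten 5-word files with `split_words=19` yields a 20-word dev split) can be illustrated with a standalone sketch; `split_by_words` is a hypothetical, simplified stand-in for the idea behind `corpus.split`, not the actual implementation:

```python
import random


def split_by_words(exemplars, split_words, rand_seed=None):
    """Partition (name, n_words) pairs into a split meeting a word budget,
    plus the leftovers. Simplified sketch, not corpus.split itself."""
    rng = random.Random(rand_seed)
    pool = list(exemplars)
    rng.shuffle(pool)  # reproducible when rand_seed is fixed
    chosen, leftover, total = [], [], 0
    for exemplar in pool:
        if total < split_words:
            # keep whole exemplars until the budget is met or exceeded
            chosen.append(exemplar)
            total += exemplar[1]
        else:
            leftover.append(exemplar)
    return chosen, leftover


# mirrors the test above: ten 5-word files, 19-word budget -> 20-word split
files = [("file-{:02d}".format(i), 5) for i in range(10)]
dev, train = split_by_words(files, split_words=19, rand_seed=1337)
assert sum(n_words for _, n_words in dev) == 20
```

Because whole exemplars are kept, the split overshoots the budget by at most one file's word count, which is why the test asserts 20 words for a 19-word target.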
/asrtoolkit/data_handlers/rttm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading/writing RTTM files 4 | 5 | This expects a segment from class derived in convert_text 6 | 7 | See https://catalog.ldc.upenn.edu/docs/LDC2004T12/RTTM-format-v13.pdf 8 | for the RTTM file format standard, copied below with minor edits for clarity 9 | 10 | RTTM files store object attributes in white-space separated fields 11 | 12 | ``` 13 | Field 1 2 3 4 5 6 7 8 9 14 | type file chnl tbeg tdur ortho stype name conf 15 | ``` 16 | where 17 | file is the waveform file base name (i.e., without path names or extensions). 18 | chnl is the waveform channel (e.g., “1” or “2”). 19 | tbeg is the beginning time of the object, in seconds, measured from the start time of the file. If there is no beginning time, use tbeg = “”. 20 | tdur is the duration of the object, in seconds. If there is no duration, use tdur = “”. 21 | stype is the subtype of the object. If there is no subtype, use stype = “”. 22 | ortho is the orthographic rendering (spelling) of the object for STT object types. If there is no orthographic representation, use ortho = “”. 23 | name is the name of the speaker. name must uniquely specify the speaker within the scope of the file. If name is not applicable or if no claim is being made as to the identity of the speaker, use name = “”. 24 | conf is the confidence (probability) that the object information is correct. If conf is not available, use conf = “”. 
25 | 26 | """ 27 | 28 | # do not delete - needed for time_aligned_text 29 | from asrtoolkit.data_handlers.data_handlers_common import footer, separator 30 | from asrtoolkit.data_structures.segment import segment 31 | from asrtoolkit.data_structures.formatting import clean_float 32 | 33 | def header(): 34 | "Header for rttm files is empty" 35 | return "" 36 | 37 | 38 | def format_segment(seg): 39 | """ 40 | Formats a segment assuming it's an instance of class segment with elements 41 | filename, channel, speaker, start and stop times, label, and text 42 | """ 43 | return f"SPEAKER {seg.filename} {seg.channel} {seg.start} {clean_float(float(seg.stop)-float(seg.start))} <NA> <NA> {seg.speaker} <NA> <NA>" 44 | 45 | 46 | def read_file(file_name): 47 | """ Reads an RTTM file """ 48 | 49 | segments = [] 50 | with open(file_name) as data: 51 | for line in data: 52 | _, filename, channel, start, duration, _, _, speaker, _, _ = line.split() 53 | seg = segment( 54 | **dict( 55 | filename=filename, 56 | channel=channel, 57 | start=start, 58 | stop=float(start) + float(duration), 59 | speaker=speaker, 60 | ) 61 | ) 62 | segments.append(seg) 63 | return segments 64 | 65 | 66 | __all__ = [header, footer, separator] 67 | -------------------------------------------------------------------------------- /asrtoolkit/split_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Create train, dev, and test splits from a folder of ASR data 5 | """ 6 | 7 | import logging 8 | import os 9 | import sys 10 | from random import seed 11 | 12 | from fire import Fire 13 | 14 | from asrtoolkit.data_structures.corpus import corpus 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | 19 | def log_corpus_creation(corp, name): 20 | """ had to make this function to satisfy code climate """ 21 | LOGGER.info( 22 | "Created %s split with %d words using %d files with %d segments", 23 | corp.location, 24 | sum(map(lambda x: x.n_words,
corp.exemplars)), 25 | len(corp.exemplars), 26 | corp.calculate_number_of_segments(), 27 | ) 28 | 29 | 30 | def perform_split( 31 | corpus_to_split, 32 | split_dir, 33 | split_name, 34 | split_words, 35 | min_split_segs, 36 | leftover_data_split_name, 37 | ): 38 | leftover_corpus, new_corpus = corpus_to_split.split(split_words, min_split_segs) 39 | 40 | new_corpus.prepare_for_training(os.path.join(split_dir, split_name)) 41 | log_corpus_creation(new_corpus, split_name) 42 | 43 | leftover_corpus.prepare_for_training( 44 | os.path.join(split_dir, leftover_data_split_name) 45 | ) 46 | log_corpus_creation(leftover_corpus, leftover_data_split_name) 47 | 48 | 49 | def split_corpus( 50 | in_dir, 51 | split_dir, 52 | split_name="split", 53 | split_words=1000, 54 | min_split_segs=10, 55 | leftover_data_split_name="orig", 56 | rand_seed=None, 57 | ): 58 | """ 59 | Splits an ASR corpus directory based on number of words outputting splits in split_dir. 60 | At least 1000 words is recommended for dev or tests splits to make WER calculations significant ~0.1% 61 | Invalid files, such as empty files, will not be included in data splits. 62 | 63 | Set rand_seed for reproducible splits 64 | """ 65 | seed(rand_seed) 66 | 67 | c = corpus({"location": in_dir}) 68 | LOGGER.debug("%d exemplars before validating them", len(c.exemplars)) 69 | valid_exemplars, total_words = c.count_exemplar_words() 70 | c.exemplars = valid_exemplars 71 | LOGGER.debug("%d exemplars after validating them", len(valid_exemplars)) 72 | 73 | if min_split_segs > c.calculate_number_of_segments(): 74 | LOGGER.error( 75 | "Not enough valid segments in corpus, %d, to make a split with %d segments. 
Reduce min_split_segs or get more data", 76 | c.calculate_number_of_segments(), 77 | min_split_segs, 78 | ) 79 | sys.exit(1) 80 | 81 | perform_split( 82 | c, split_dir, split_name, split_words, min_split_segs, leftover_data_split_name 83 | ) 84 | 85 | 86 | def cli(): 87 | Fire(split_corpus) 88 | 89 | 90 | if __name__ == "__main__": 91 | cli() 92 | -------------------------------------------------------------------------------- /tests/test_conversion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test file conversion using samples 4 | """ 5 | import os 6 | import hashlib 7 | 8 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 9 | from utils import get_sample_dir, get_test_dir 10 | 11 | test_dir = get_test_dir(__file__) 12 | sample_dir = get_sample_dir(__file__) 13 | 14 | 15 | def test_stm_to_txt_conversion(): 16 | " execute stm to txt test " 17 | 18 | transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm") 19 | convert_and_test_it_loads(transcript, f"{test_dir}/stm_to_txt_test.txt") 20 | 21 | 22 | def test_stm_to_html_conversion(): 23 | " execute stm to html test " 24 | 25 | transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm") 26 | convert_and_test_it_loads(transcript, f"{test_dir}/stm_to_html_test.html") 27 | 28 | 29 | def test_stm_to_vtt_conversion(): 30 | " execute stm to vtt test " 31 | 32 | transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm") 33 | convert_and_test_it_loads(transcript, f"{test_dir}/stm_to_vtt_test.vtt") 34 | 35 | 36 | def test_stm_to_srt_conversion(): 37 | " execute stm to srt test " 38 | 39 | transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm") 40 | convert_and_test_it_loads(transcript, f"{test_dir}/stm_to_srt_test.srt") 41 | 42 | 43 | def test_json_to_stm_conversion(): 44 | " execute json to stm tests " 45 | 46 | transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.json") 
47 | convert_and_test_it_loads(transcript, f"{test_dir}/json_to_stm_test_1.stm") 48 | 49 | transcript = time_aligned_text(f"{sample_dir}/simple_test.json") 50 | convert_and_test_it_loads(transcript, f"{test_dir}/json_to_stm_test_2.stm") 51 | 52 | 53 | def test_json_to_txt_conversion(): 54 | " execute json to txt test " 55 | 56 | transcript = time_aligned_text(f"{sample_dir}/simple_test.json") 57 | convert_and_test_it_loads(transcript, f"{test_dir}/json_to_txt_test.txt") 58 | 59 | 60 | def test_json_to_rttm_conversion(): 61 | """ 62 | execute json to rttm test 63 | """ 64 | transcript = time_aligned_text(f"{sample_dir}/simple_test.json") 65 | convert_and_test_it_loads(transcript, f"{test_dir}/json_to_rttm_test.rttm") 66 | 67 | 68 | def test_json_to_rttm_conversion_without_speaker(): 69 | """ 70 | execute json to rttm test 71 | """ 72 | transcript = time_aligned_text(f"{test_dir}/no_speaker.json") 73 | 74 | convert_and_test_it_loads(transcript, f"{test_dir}/no_speaker.rttm") 75 | 76 | 77 | def convert_and_test_it_loads(transcript_obj, output_filename): 78 | """ 79 | Tests that conversion works 80 | Tests that the file can reload 81 | Removes transitory file 82 | """ 83 | transcript_obj.write(output_filename) 84 | time_aligned_text(output_filename) 85 | 86 | os.remove(output_filename) 87 | 88 | 89 | if __name__ == "__main__": 90 | import sys 91 | 92 | import pytest 93 | 94 | pytest.main(sys.argv) 95 | -------------------------------------------------------------------------------- /samples/simple_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "duration": 3.2, 3 | "transcription_time": 5.1, 4 | "version": "3.5.1", 5 | "timestamp": 1541178463841, 6 | "audioScore": 2.97, 7 | "audioComponents": { 8 | "snr": 52.0, 9 | "clipping": 0.0, 10 | "peaksCount": 19.0, 11 | "sampleRate": 48000.0, 12 | "signalRange": 2852.0, 13 | "meanFrequency": 1411.48, 14 | "meanAmplitude": 1974.57 15 | }, 16 | "segments": [ 17 | { 18 | 
"transcript": "testing testing one two three", 19 | "words": [ 20 | { 21 | "word": "testing", 22 | "start": 0.24, 23 | "length": 0.93, 24 | "confidence": 0.94 25 | }, 26 | { 27 | "word": "testing", 28 | "start": 1.2, 29 | "length": 0.6, 30 | "confidence": 1.0 31 | }, 32 | { 33 | "word": "one", 34 | "start": 1.8, 35 | "length": 0.21, 36 | "confidence": 1.0 37 | }, 38 | { 39 | "word": "two", 40 | "start": 2.01, 41 | "length": 0.18, 42 | "confidence": 0.96 43 | }, 44 | { 45 | "word": "three", 46 | "start": 2.19, 47 | "length": 0.75, 48 | "confidence": 1.0 49 | } 50 | ], 51 | "confidence": 0.98, 52 | "model": "greenkey_svt_lstm", 53 | "status": 0, 54 | "time_elapsed": 5.1, 55 | "boundary": "long", 56 | "startTimeSec": 0, 57 | "endTimeSec": 3.2, 58 | "channel": 1, 59 | "genderInfo": { 60 | "confidence": 0.0 61 | }, 62 | "audioScore": [ 63 | 2.97, 64 | { 65 | "snr": 52.0, 66 | "clipping": 0.0, 67 | "peaksCount": 19.0, 68 | "sampleRate": 48000.0, 69 | "signalRange": 2852.0, 70 | "meanFrequency": 1411.48, 71 | "meanAmplitude": 1974.57 72 | } 73 | ], 74 | "formatted_transcript": "Testing testing 1 2 3.", 75 | "formatted_entities": [ 76 | { 77 | "word": "Testing", 78 | "start": 0.24, 79 | "length": 0.93, 80 | "confidence": 0.94 81 | }, 82 | { 83 | "word": "testing", 84 | "start": 1.2, 85 | "length": 0.6, 86 | "confidence": 1.0 87 | }, 88 | { 89 | "word": "1 2 3.", 90 | "start": 1.8, 91 | "length": 1.14, 92 | "confidence": 0.99 93 | } 94 | ], 95 | "punctuated_transcript": "Testing testing 1 2 3.", 96 | "sentiment": { 97 | "polarity": 0.0, 98 | "subjectivity": 0.0 99 | } 100 | } 101 | ], 102 | "cloud_influence": 0.0, 103 | "progress": 100.0, 104 | "insights": { 105 | "keyTerms": [], 106 | "keyPhrases": [], 107 | "averageSentiment": { 108 | "polarity": 0.0, 109 | "subjectivity": 0.0 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at voice@finos.org. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/exemplar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Stores exemplar class for corpus management 5 | """ 6 | import os 7 | 8 | from asrtoolkit.clean_formatting import clean_up 9 | from asrtoolkit.file_utils.name_cleaners import basename, strip_extension 10 | 11 | 12 | class exemplar(object): 13 | """ 14 | Create an exemplar class to pair one audio file with one transcript file 15 | """ 16 | 17 | audio_file = None 18 | transcript_file = None 19 | 20 | def __init__(self, *args, **kwargs): 21 | " Instantiate using input args and kwargs " 22 | for dictionary in args: 23 | if isinstance(dictionary, dict): 24 | for key in dictionary: 25 | setattr(self, key, dictionary[key]) 26 | for key in kwargs: 27 | setattr(self, key, kwargs[key]) 28 | 29 | def validate(self): 30 | """ 31 | Validates exemplar object by constraining that the filenames before the 32 | extension are the same 33 | """ 34 | 35 | audio_filename = basename(strip_extension(self.audio_file.location)) 36 | transcript_filename = basename(strip_extension(self.transcript_file.location)) 37 | 38 | # Audio and transcript filename must match 39 | # Audio file must not be empty 40 | # Transcript file must not be empty 41 | valid = ( 42 | audio_filename == transcript_filename 43 | and os.path.getsize(self.audio_file.location) 44 | and os.path.getsize(self.transcript_file.location) 45 | ) 46 | # This returns an integer corresponding to the output of the last condition, not a boolean. 
47 | # That's just how `and` works in Python 48 | 49 | return bool(valid) 50 | 51 | def count_words(self, clean_func=clean_up): 52 | """ Count words in an exemplar after cleaning it """ 53 | return ( 54 | len(clean_func(self.transcript_file.text()).split()) 55 | if self.validate() 56 | else 0 57 | ) 58 | 59 | def prepare_for_training(self, target, sample_rate=16000, nested=False): 60 | """ 61 | Prepare one exemplar for training, 62 | returning a new exemplar object with updated file locations 63 | and a resampled audio_file 64 | """ 65 | if nested: 66 | af_target_file = os.path.join( 67 | target, "sph", basename(self.audio_file.location) 68 | ) 69 | tf_target_file = os.path.join( 70 | target, "stm", basename(self.transcript_file.location) 71 | ) 72 | else: 73 | af_target_file = os.path.join(target, basename(self.audio_file.location)) 74 | tf_target_file = os.path.join( 75 | target, basename(self.transcript_file.location) 76 | ) 77 | 78 | af = self.audio_file.prepare_for_training( 79 | af_target_file, 80 | sample_rate=sample_rate, 81 | ) 82 | 83 | tf = self.transcript_file.write(tf_target_file) 84 | 85 | return ( 86 | exemplar({"audio_file": af, "transcript_file": tf}) 87 | if all([af, tf]) 88 | else None 89 | ) 90 | 91 | def hash(self): 92 | """ 93 | Returns combined hash of two files 94 | """ 95 | return self.audio_file.hash() + self.transcript_file.hash() 96 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Creates asrtoolkit 4 | """ 5 | from setuptools import find_packages, setup 6 | 7 | with open("README.md") as f: 8 | long_description = f.read() 9 | 10 | 11 | def install_deps(): 12 | """ 13 | Reads requirements.txt and preprocesses it 14 | to be fed into setuptools.
15 | 16 | This is the only way (we found) 17 | to reuse requirements.txt in setup.py 18 | when using dependencies from private GitHub repositories. 19 | 20 | Links must be appended with `-{StringWithAtLeastOneNumber}` 21 | or something like that, so e.g. `-9231` works as well as 22 | `1.1.0`. This suffix is ignored by setuptools, but has to be there. 23 | 24 | Warnings: 25 | to make pip respect the links, you have to use the 26 | `--process-dependency-links` switch, e.g.: 27 | `pip install --process-dependency-links {git-url}` 28 | 29 | Returns: 30 | list of packages and dependency links. 31 | """ 32 | default = open("requirements.txt", "r").readlines() 33 | new_pkgs = [] 34 | links = [] 35 | for resource in default: 36 | if "http" in resource: 37 | pkg = resource.split("#")[-1] 38 | links.append(resource.strip() + "-9876543210") 39 | new_pkgs.append(pkg.replace("egg=", "").rstrip()) 40 | else: 41 | new_pkgs.append(resource.strip()) 42 | return new_pkgs, links 43 | 44 | 45 | pkgs, new_links = install_deps() 46 | 47 | setup( 48 | name="asrtoolkit", 49 | version="0.2.5-alpha1", 50 | description="The GreenKey ASRToolkit provides tools for automatic speech recognition (ASR) file conversion and corpora organization.", 51 | long_description=long_description, 52 | long_description_content_type="text/markdown", 53 | url="http://github.com/finos-voice/greenkey-asrtoolkit", 54 | author="Matthew Goldey", 55 | author_email="mgoldey@greenkeytech.com", 56 | install_requires=pkgs, 57 | extras_require={ 58 | "dev": [ 59 | "black==20.8b1", 60 | "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm", 61 | "flake8", 62 | "isort==5.7.0", 63 | "numpy>=1.17.0", 64 | "pandas", 65 | "pytest", 66 | "spacy==2.2.0", 67 | "srsly<2.0.0,>=0.1.0", 68 | "textacy<0.11.0", 69 | ] 70 | }, 71 | dependency_links=new_links, 72 | keywords="asr speech recognition greenkey gk word error rate", 73 |
entry_points={ 74 | "console_scripts": [ 75 | "align_json=asrtoolkit.align_json:cli", 76 | "clean_formatting=asrtoolkit.clean_formatting:cli", 77 | "combine_audio_files=asrtoolkit.combine_audio_files:main", 78 | "convert_transcript=asrtoolkit.convert_transcript:cli", 79 | "degrade_audio_file=asrtoolkit.degrade_audio_file:cli", 80 | "extract_excel_spreadsheets=asrtoolkit.extract_excel_spreadsheets:main", 81 | "prepare_audio_corpora=asrtoolkit.prepare_audio_corpora:cli", 82 | "split_audio_file=asrtoolkit.split_audio_file:cli", 83 | "wer=asrtoolkit.wer:cli", 84 | ] 85 | }, 86 | license="Apache v2", 87 | packages=find_packages(), 88 | zip_safe=True, 89 | classifiers=[ 90 | "Programming Language :: Python :: 3", 91 | "Operating System :: OS Independent", 92 | ], 93 | ) 94 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/segment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Class for holding a segment 4 | 5 | """ 6 | import json 7 | import logging 8 | 9 | from asrtoolkit.data_structures.formatting import clean_float 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | class segment(object): 15 | """ 16 | Class for holding segment-specific information 17 | Segment objects correspond to the dicts under the key 'segments' 18 | in the ASR-generated transcript (lattice) 19 | - the fields included below are shared across 'segments' 20 | but 'segments' may contain many other fields (e.g. sentiment) depending 21 | on the text processing pipeline selected.
22 | """ 23 | 24 | # refer to some file if possible 25 | filename = "unknown" 26 | # by default, use channel 1 27 | channel = "1" 28 | # need a speaker id 29 | speaker = "UnknownSpeaker" 30 | # start at beginning of file 31 | start = clean_float(0.0) 32 | # this should extend to the length of the file or the segment 33 | stop = clean_float(0.0) 34 | 35 | # Arbitrarily choose a default gender since 36 | # unknown does not play well with some programs 37 | # which digest ASR output 38 | label = "<o,f0,male>" 39 | # text to be populated from read class 40 | text = "" 41 | # text for printing out to fancy output formats 42 | formatted_text = "" 43 | # confidence in accuracy of text 44 | confidence = 1.0 45 | 46 | def __init__(self, *args, **kwargs): 47 | """ 48 | Stores and initializes filename, channel, speaker, start & stop times, 49 | label, and formatted and unformatted text fields. 50 | - Unmodified ASR transcripts are unformatted text. 51 | - Raw Chat data is formatted text; 52 | `clean_up` from asrtoolkit.clean_formatting is used to 53 | convert it to unformatted text 54 | Note: `channel` (as currently defined) applies only to audio input 55 | - all chat data will retain default value of '1' 56 | 57 | >>> seg = segment({"text":"this is a test"}) 58 | 59 | """ 60 | for dictionary in [_ for _ in args if isinstance(_, dict)]: 61 | for key in dictionary: 62 | setattr(self, key, dictionary[key]) 63 | for key in kwargs: 64 | setattr(self, key, kwargs[key]) 65 | 66 | def __str__(self, data_handler=None): 67 | """ 68 | Returns the string corresponding to TXT format by default 69 | >>> seg = segment({"text":"this is a test"}) 70 | >>> print(seg) 71 | this is a test 72 | """ 73 | ret_str = data_handler.format_segment(self) if data_handler else self.text 74 | 75 | return ret_str 76 | 77 | def validate(self): 78 | """ 79 | Checks common failure cases to determine whether a segment is valid 80 | """ 81 | valid = ( 82 | self.speaker != "inter_segment_gap" 83 | and self.text 84 | and self.text !=
"ignore_time_segment_in_scoring" 85 | and self.label in ["<o,f0,male>", "<o,f0,female>", "<o,f0,unknown>"] 86 | ) 87 | 88 | try: 89 | self.start = clean_float(self.start) 90 | self.stop = clean_float(self.stop) 91 | valid = valid and float(self.start) < float(self.stop) 92 | except Exception as exc: 93 | valid = False 94 | print(exc) 95 | 96 | if not valid: 97 | LOGGER.error( 98 | """Skipping segment due to validation error. 99 | Please note that this invalidates WER calculations based on the entire file. 100 | Segment: %s""", 101 | json.dumps(self.__dict__), 102 | ) 103 | 104 | if "-" in self.filename: 105 | self.filename = self.filename.replace("-", "_") 106 | print("Please rename audio file to replace hyphens with underscores") 107 | 108 | return valid 109 | 110 | 111 | if __name__ == "__main__": 112 | import doctest 113 | 114 | doctest.testmod(raise_on_error=True, verbose=True) 115 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to GreenKey ASRtoolkit 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct; please follow it in all your interactions with the project. 7 | 8 | # Contributor License Agreement (CLA) 9 | A CLA is a document that specifies how a project is allowed to use your 10 | contribution; they are commonly used in many open source projects.
11 | 12 | **_All_ contributions to _all_ projects hosted by [FINOS](https://www.finos.org/) 13 | must be made with a 14 | [Foundation CLA](https://finosfoundation.atlassian.net/wiki/spaces/FINOS/pages/83034172/Contribute) 15 | in place, and there are [additional legal requirements](https://finosfoundation.atlassian.net/wiki/spaces/FINOS/pages/75530375/Legal+Requirements) 16 | that must also be met.** 17 | 18 | _NOTE:_ Commits and pull requests to FINOS repositories will only be accepted from those contributors with an active, executed Individual Contributor License Agreement (ICLA) with FINOS OR who are covered under an existing and active Corporate Contribution License Agreement (CCLA) executed with FINOS. Commits from individuals not covered under an ICLA or CCLA will be flagged and blocked by the FINOS Clabot tool. Please note that some CCLAs require individuals/employees to be explicitly named on the CCLA. 19 | 20 | As a result, PRs submitted to the GreenKey ASRtoolkit project cannot be accepted until you have a CLA in place with the Foundation. 21 | 22 | *Need an ICLA? Unsure if you are covered under an existing CCLA? Email [help@finos.org](mailto:help@finos.org)* 23 | 24 | # Contributing Issues 25 | 26 | ## Prerequisites 27 | 28 | * [ ] Have you [searched for duplicates](https://github.com/finos-voice/greenkey-asrtoolkit/issues?utf8=%E2%9C%93&q=)? A simple search for exception error messages or a summary of the unexpected behaviour should suffice. 29 | * [ ] Are you running the latest version? 30 | * [ ] Are you sure this is a bug or missing capability? 31 | 32 | ## Raising an Issue 33 | * Create your issue [here](https://github.com/finos-voice/greenkey-asrtoolkit/issues/new). 34 | * New issues contain two templates in the description: bug report and enhancement request. Please pick the most appropriate for your issue, **then delete the other**. 35 | * Please also tag the new issue with either "Bug" or "Enhancement". 
36 | * Please use [Markdown formatting](https://help.github.com/categories/writing-on-github/) 37 | liberally to assist in readability. 38 | * [Code fences](https://help.github.com/articles/creating-and-highlighting-code-blocks/) for exception stack traces and log entries, for example, massively improve readability. 39 | 40 | # Contributing Pull Requests (Code & Docs) 41 | To make review of PRs easier, please: 42 | 43 | * Please make sure your PRs will merge cleanly - PRs that don't are unlikely to be accepted. 44 | * For code contributions, follow the existing code layout. 45 | * For documentation contributions, follow the general structure, language, and tone of the [existing docs](https://github.com/finos-voice/greenkey-asrtoolkit/wiki). 46 | * Keep commits small and cohesive - if you have multiple contributions, please submit them as independent commits (and ideally as independent PRs too). 47 | * Reference issue #s if your PR has anything to do with an issue (even if it doesn't address it). 48 | * Minimise non-functional changes (e.g. whitespace). 49 | * Ensure all new files include a header comment block containing the [Apache License v2.0 and your copyright information](http://www.apache.org/licenses/LICENSE-2.0#apply). 50 | * If necessary (e.g. due to 3rd party dependency licensing requirements), update the [NOTICE file](https://github.com/finos-voice/greenkey-asrtoolkit/blob/master/NOTICE) with any new attribution or other notices 51 | 52 | ## Commit and PR Messages 53 | 54 | * **Reference issues, wiki pages, and pull requests liberally!** 55 | * Use the present tense ("Add feature" not "Added feature") 56 | * Use the imperative mood ("Move button left..." 
not "Moves button left...") 57 | * Limit the first line to 72 characters or less 58 | -------------------------------------------------------------------------------- /asrtoolkit/combine_audio_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script for combining audio files using their transcript files with start/stop times 4 | """ 5 | import argparse 6 | import logging 7 | import operator 8 | import os 9 | import sys 10 | from functools import reduce 11 | 12 | from asrtoolkit.data_structures.audio_file import audio_file, combine_audio 13 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 14 | from asrtoolkit.file_utils.name_cleaners import strip_extension 15 | from asrtoolkit.file_utils.script_input_validation import valid_input_file 16 | 17 | LOGGER = logging.getLogger(__name__) 18 | 19 | 20 | def check_transcript(transcript): 21 | if valid_input_file(transcript): 22 | return time_aligned_text(input_data=transcript) 23 | else: 24 | LOGGER.error("Invalid transcript file {}".format(transcript)) 25 | sys.exit(1) 26 | 27 | 28 | def check_audio_file(audio_file_name): 29 | if valid_input_file(audio_file_name, ["mp3", "sph", "wav", "au", "raw"]): 30 | return audio_file(audio_file_name) 31 | else: 32 | LOGGER.error("Invalid audio file {}".format(audio_file_name)) 33 | sys.exit(1) 34 | 35 | 36 | def check_transcript_segment(segment): 37 | if not hasattr(segment, "start"): 38 | LOGGER.error( 39 | "Transcript segment doesn't include the start time, segment: {}".format( 40 | segment 41 | ) 42 | ) 43 | sys.exit(1) 44 | 45 | 46 | def combine_transcripts(transcripts, output_file_name): 47 | # Get one list of segments 48 | out_transcript = reduce(operator.add, transcripts) 49 | out_transcript.location = os.path.join( 50 | strip_extension(output_file_name) + "." 
+ out_transcript.file_extension 51 | ) 52 | out_transcript.write(out_transcript.location) 53 | 54 | 55 | def main(): 56 | """ 57 | Combine audio files using their transcript files 58 | """ 59 | parser = argparse.ArgumentParser( 60 | description="""Combine audio files using segments from their transcript files. For this utility, transcript files must contain start/stop times. 61 | Lists of transcripts and audio files must be ordered identically, meaning the first audio file's 62 | transcript is the first transcript. 63 | Note: transcripts from each file are not checked for overlapping time intervals when they are combined and sorted. 64 | """ 65 | ) 66 | parser.add_argument( 67 | "--output_file", default="output.wav", required=True, help="Name of output file" 68 | ) 69 | parser.add_argument( 70 | "--audio_files", 71 | metavar="audio_files", 72 | type=str, 73 | nargs="+", 74 | required=True, 75 | help="List of input audio files", 76 | ) 77 | parser.add_argument( 78 | "--transcripts", 79 | metavar="transcripts", 80 | type=str, 81 | nargs="+", 82 | required=True, 83 | help="List of transcripts", 84 | ) 85 | parser.add_argument( 86 | "--renormalize", 87 | default=False, 88 | action="store_true", 89 | help="Renormalize files to undo sox normalizing by 1/num_audio_files. 
Useful when combined audio has little overlap", 90 | ) 91 | # Sending audio through tanh could be helpful if there are significant transient audio signals 92 | 93 | args = parser.parse_args() 94 | if len(args.audio_files) != len(args.transcripts): 95 | LOGGER.error( 96 | "The number of audio files, {}, must be equal to the number of transcripts, {}".format( 97 | len(args.audio_files), len(args.transcripts) 98 | ) 99 | ) 100 | sys.exit(1) 101 | 102 | [check_audio_file(_) for _ in args.audio_files] 103 | transcripts = [check_transcript(_) for _ in args.transcripts] 104 | 105 | [ 106 | check_transcript_segment(_) 107 | for transcript in transcripts 108 | for _ in transcript.segments 109 | ] 110 | 111 | combine_transcripts(transcripts, args.output_file) 112 | combine_audio(args.audio_files, args.output_file, args.renormalize) 113 | 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /asrtoolkit/data_handlers/json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for reading/writing gk JSON files 4 | """ 5 | 6 | import json 7 | import logging 8 | 9 | from asrtoolkit.data_structures.segment import segment 10 | from asrtoolkit.file_utils.name_cleaners import sanitize 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | separator = ",\n" 14 | 15 | 16 | def header(): 17 | " Returns the JSON header opening the segments list " 18 | return '{\n"segments":[' 19 | 20 | 21 | def footer(): 22 | " Returns the JSON footer closing the segments list " 23 | return "]}\n" 24 | 25 | 26 | def format_segment(seg): 27 | """ 28 | Formats a segment assuming it's an instance of class segment with elements 29 | filename, channel, speaker, start and stop times, label, and text 30 | 31 | :param: seg: segment object 32 | :return: str: JSON-encoded dict whose key/val pairs contain 'segment'-level information 33 | """ 34 | output_dict = {} 35 | output_dict["speakerInfo"] = seg.speaker 36 |
output_dict["startTimeSec"] = float(seg.start) 37 | output_dict["endTimeSec"] = float(seg.stop) 38 | output_dict["genderInfo"] = {"gender": seg.label.split(",")[-1].replace(">", "")} 39 | output_dict["transcript"] = seg.text 40 | output_dict["confidence"] = seg.confidence 41 | 42 | if len(seg.formatted_text) > 0: 43 | output_dict["formatted_transcript"] = seg.formatted_text 44 | 45 | return json.dumps(output_dict, ensure_ascii=True) 46 | 47 | 48 | def parse_segment(input_seg): 49 | """ 50 | Creates an asrtoolkit segment object from an input gk segment 51 | :param: input_seg: dict (segment-level dict: input_data['segments'][i] 52 | -> dict with keys 'channel', 'startTimeSec' etc mapping to attributes 53 | :return: asrtoolkit segment object 54 | """ 55 | extracted_dict = {} 56 | 57 | def assign_if_present( 58 | value, dict_key=None, interior_key=None, proc_val=lambda val: val 59 | ): 60 | """ 61 | This transforms gk segment data into a dictionary for input 62 | into the asrtoolkit segment object 63 | 64 | Assigns value to extracted_dict object if present in input_seg 65 | 66 | :param value: key from the inside of gk segment 67 | :param dict_key: key to which value should be assigned 68 | :param interior_key: sometimes values are nested under this 69 | :param proc_val: function formatting the value 70 | 71 | """ 72 | dict_key = value if dict_key is None else dict_key 73 | ret_val = None 74 | if value in input_seg and interior_key and interior_key in input_seg[value]: 75 | ret_val = proc_val(input_seg[value][interior_key]) 76 | elif value in input_seg and not interior_key: 77 | ret_val = proc_val(input_seg[value]) 78 | if ret_val not in {"", None}: 79 | extracted_dict[dict_key] = ret_val 80 | 81 | seg = None 82 | try: 83 | assign_if_present("channel") 84 | assign_if_present("startTimeSec", "start") 85 | assign_if_present("stopTimeSec", "stop") 86 | assign_if_present("endTimeSec", "stop") 87 | assign_if_present("transcript", "text") 88 | 
assign_if_present("corrected_transcript", "text") 89 | assign_if_present("formatted_transcript", "formatted_text") 90 | assign_if_present("punctuated_transcript", "formatted_text") 91 | assign_if_present("speakerInfo", "speaker", proc_val=sanitize) 92 | assign_if_present( 93 | "genderInfo", "label", "gender", lambda gender: "<o,f0,{:}>".format(gender) 94 | ) 95 | assign_if_present("confidence", "confidence") 96 | 97 | seg = segment(extracted_dict) 98 | 99 | except Exception as exc: 100 | LOGGER.exception(exc) 101 | 102 | return seg if seg and seg.validate() else None 103 | 104 | 105 | def read_in_memory(input_data): 106 | """ 107 | Reads input json objects 108 | 109 | :param: input_data: dict with key 'segments' 110 | input_data['segments']: List[Dict]; 111 | - segment_dicts contain key/val pairs that map to `segment` attributes 112 | NB that labels of mapped key-attribute pairs may differ 113 | for example, segment['startTimeSec'] -> segment.start 114 | 115 | :return: list of segment objects 116 | applies `parse_segment` function to each dict in input_data['segments'] 117 | 118 | """ 119 | segments = [_ for _ in map(parse_segment, input_data["segments"]) if _ is not None] 120 | return segments 121 | 122 | 123 | def read_file(file_name): 124 | """ 125 | Reads a JSON file, skipping any bad segments 126 | """ 127 | with open(file_name, encoding="utf-8") as f: 128 | input_json = json.load(f) 129 | segments = read_in_memory(input_json) 130 | return segments 131 | -------------------------------------------------------------------------------- /asrtoolkit/prepare_audio_corpora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script for preparing SPH, STM files into training, testing, and development sets from a set of corpora directories 4 | 5 | If present, train, test, dev sets will be used from the individual corpora 6 | """ 7 | import json 8 | import logging 9 | 10 | from fire import Fire 11 | 12 | from
asrtoolkit.data_structures.corpus import corpus 13 | from asrtoolkit.file_utils.common_file_operations import make_list_of_dirs 14 | 15 | LOGGER = logging.getLogger() 16 | 17 | data_dirs = ["test", "train", "dev"] 18 | 19 | 20 | def auto_split_corpora(corpora, min_size=50): 21 | """ 22 | Given a dict of corpora, automatically split into train/test/dev sets if not already split 23 | """ 24 | all_ready = all( 25 | corpora[data_dir].validate() if data_dir in corpora else False 26 | for data_dir in data_dirs 27 | ) 28 | 29 | # dump extra data into training data by default 30 | if "unsorted" in corpora: 31 | corpora["train"] += corpora["unsorted"] 32 | if not all_ready: 33 | LOGGER.warning( 34 | "Not all training corpora were prepared. Automatically shuffling into training, testing, development sets" 35 | ) 36 | 37 | # first pass, populate train directory 38 | corpora["train"] += corpora["dev"] + corpora["test"] 39 | 40 | # pick a file from training set to be dev set such that it contains min_size segments 41 | corpora["dev"], corpora["train"] = corpora["train"][:1], corpora["train"][1:] 42 | while ( 43 | corpora["dev"].calculate_number_of_segments() < min_size 44 | and corpora["train"].validate() > 0 45 | ): 46 | corpora["dev"], corpora["train"] = ( 47 | (corpora["dev"] + corpora["train"][:1]), 48 | corpora["train"][1:], 49 | ) 50 | 51 | # pick 20% for testing 52 | split_index = len(corpora["train"].exemplars) * 4 // 5 53 | corpora["test"] = corpora["train"][split_index:] 54 | corpora["train"] = corpora["train"][:split_index] 55 | 56 | # ensure no duplicates 57 | corpora["train"] -= corpora["test"] 58 | corpora["test"] -= corpora["dev"] 59 | corpora["train"] -= corpora["dev"] 60 | 61 | if ( 62 | corpora["dev"].calculate_number_of_segments() < min_size 63 | or corpora["train"].calculate_number_of_segments() < min_size 64 | ): 65 | 66 | # throw error 67 | raise Exception("Error - insufficient data - please add more and try again") 68 | 69 | else: 70 | return corpora 71 | 72 | 73 | def
get_corpus(loc): 74 | """ returns corpus for input location """ 75 | return corpus({"location": loc}) 76 | 77 | 78 | def prep_all_for_training(corpora, target_dir, nested, sample_rate=16000): 79 | """ 80 | prepare all corpora for training and return logs of what was where 81 | """ 82 | return { 83 | data_dir: corpora[data_dir].prepare_for_training( 84 | target_dir + "/" + data_dir, nested, sample_rate 85 | ) 86 | for data_dir in data_dirs 87 | } 88 | 89 | 90 | def gather_all_corpora(corpora_dirs): 91 | """ 92 | Finds all existing corpora and gathers into a dictionary 93 | """ 94 | 95 | corpora = { 96 | data_dir: get_corpus(corpus_dir + "/" + data_dir) 97 | for corpus_dir in corpora_dirs 98 | for data_dir in data_dirs 99 | } 100 | 101 | corpora["unsorted"] = corpus() 102 | for unsorted_corpus in list(map(get_corpus, corpora_dirs)): 103 | corpora["unsorted"] += unsorted_corpus 104 | return corpora 105 | 106 | 107 | def prepare_audio_corpora( 108 | *corpora, target_dir="input-data", nested=False, min_train_dev_segments=50 109 | ): 110 | """ 111 | Copy and organize specified corpora into a target directory. 112 | Training, testing, and development sets will be created automatically if not already defined. 
113 | 114 | Input 115 | corpora, strs - name of one or more directories to combine into `target-dir` 116 | target-dir, str - target directory where corpora should be organized 117 | nested, bool (default False) - if present/True, store in stm and sph subdirectories 118 | min_train_dev_segments int - enforces a minimum number of speech segments in train and dev splits 119 | """ 120 | 121 | make_list_of_dirs( 122 | [ 123 | target_dir + "/" + data_dir + subdirectory 124 | for data_dir in data_dirs 125 | for subdirectory in (["/stm/", "/sph/"] if nested else ["/"]) 126 | ] 127 | ) 128 | 129 | corpora = gather_all_corpora(corpora) 130 | corpora = auto_split_corpora(corpora, min_size=min_train_dev_segments) 131 | 132 | log = prep_all_for_training(corpora, target_dir, nested) 133 | with open(target_dir + "/corpora.json", "w") as f: 134 | f.write(json.dumps(log)) 135 | 136 | 137 | def cli(): 138 | Fire(prepare_audio_corpora) 139 | 140 | 141 | if __name__ == "__main__": 142 | cli() 143 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/time_aligned_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Class for holding time_aligned text 4 | """ 5 | 6 | import hashlib 7 | import importlib 8 | import os 9 | 10 | from asrtoolkit.file_utils.name_cleaners import ( 11 | generate_segmented_file_name, 12 | sanitize_hyphens, 13 | ) 14 | 15 | 16 | class time_aligned_text(object): 17 | """ 18 | Class for storing time-aligned text and converting between formats 19 | """ 20 | 21 | location = "" 22 | segments = [] 23 | file_extension = None 24 | 25 | def __init__(self, input_data=None): 26 | """ 27 | Instantiates a time_aligned text object 28 | If 'input_data' is a string, it tries to find the appropriate file. 
29 | 30 | >>> transcript = time_aligned_text() 31 | """ 32 | if ( 33 | input_data is not None 34 | and isinstance(input_data, str) 35 | and os.path.exists(input_data) 36 | ): 37 | self.read(input_data) 38 | elif input_data is not None and type(input_data) in [str, dict]: 39 | self.file_extension = "txt" if isinstance(input_data, str) else "json" 40 | data_handler = importlib.import_module( 41 | "asrtoolkit.data_handlers.{:}".format(self.file_extension) 42 | ) 43 | self.segments = data_handler.read_in_memory(input_data) 44 | 45 | def hash(self): 46 | """ 47 | Returns a sha1 hash of the file 48 | """ 49 | if self.location: 50 | with open(self.location) as f: 51 | return hashlib.sha1(f.read().encode()).hexdigest() 52 | else: 53 | return hashlib.sha1("".encode()).hexdigest() 54 | 55 | def __str__(self): 56 | """ 57 | Returns string representation of segments formatted per the corresponding file_extension 58 | By default, use the extension of the file you loaded 59 | >>> transcript = time_aligned_text() 60 | >>> print(transcript.__str__()=="") 61 | True 62 | """ 63 | data_handler = importlib.import_module( 64 | "asrtoolkit.data_handlers.{:}".format( 65 | self.file_extension if self.file_extension else "txt" 66 | ) 67 | ) 68 | return "\n".join(_.__str__(data_handler) for _ in self.segments) 69 | 70 | def __add__(self, other): 71 | """ 72 | Add two transcripts 73 | Set the location after adding if you want to save this! 
74 | """ 75 | new_segments = self.segments + other.segments 76 | 77 | # Sort the segments by their start time then stop time 78 | new_segments.sort(key=lambda s: (float(s.start), float(s.stop))) 79 | 80 | out_transcript = time_aligned_text() 81 | out_transcript.file_extension = self.file_extension 82 | out_transcript.segments = new_segments 83 | return out_transcript 84 | 85 | def text(self): 86 | """ 87 | Returns unformatted text from all segments 88 | """ 89 | data_handler = importlib.import_module( 90 | "asrtoolkit.data_handlers.{:}".format("txt") 91 | ) 92 | return " ".join(_.__str__(data_handler) for _ in self.segments) 93 | 94 | def read(self, file_name): 95 | """ Read a file using class-specific read function """ 96 | self.file_extension = file_name.split(".")[-1] 97 | self.location = file_name 98 | data_handler = importlib.import_module( 99 | "asrtoolkit.data_handlers.{:}".format(self.file_extension) 100 | ) 101 | self.segments = data_handler.read_file(file_name) 102 | 103 | def write(self, file_name): 104 | """ 105 | Output to file using segment-specific __str__ function 106 | """ 107 | file_extension = file_name.split(".")[-1] if "." 
in file_name else "stm" 108 | 109 | file_name = sanitize_hyphens(file_name) 110 | 111 | data_handler = importlib.import_module( 112 | "asrtoolkit.data_handlers.{:}".format(file_extension) 113 | ) 114 | with open(file_name, "w", encoding="utf-8") as f: 115 | f.write(data_handler.header()) 116 | f.writelines( 117 | data_handler.separator.join( 118 | seg.__str__(data_handler) for seg in self.segments 119 | ) 120 | ) 121 | f.write(data_handler.footer()) 122 | 123 | # return back new object in case we are updating a list in place 124 | return time_aligned_text(file_name) 125 | 126 | def split(self, target_dir): 127 | """ 128 | Split transcript into many pieces based on valid segments of transcript 129 | """ 130 | os.makedirs(target_dir, exist_ok=True) 131 | for iseg, seg in enumerate(self.segments): 132 | new_seg = time_aligned_text() 133 | new_seg.file_extension = self.file_extension 134 | new_seg.location = generate_segmented_file_name( 135 | target_dir, self.location, iseg 136 | ) 137 | new_seg.segments = [seg] 138 | new_seg.write(new_seg.location) 139 | 140 | 141 | if __name__ == "__main__": 142 | import doctest 143 | 144 | doctest.testmod() 145 | -------------------------------------------------------------------------------- /asrtoolkit/wer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python functions for computing the word error rate (WER) metric for Automatic Speech Recognition files 4 | """ 5 | 6 | import re 7 | 8 | import editdistance 9 | from fire import Fire 10 | 11 | from asrtoolkit.clean_formatting import clean_up 12 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 13 | from asrtoolkit.file_utils.script_input_validation import assign_if_valid 14 | 15 | # defines global regex for tagged noises and silence 16 | re_tagged_nonspeech = re.compile(r"[\[<][A-Za-z #]*[\]>]") 17 | 18 | # define tokenization approach to handle spaces, tabs, and other space characters 19 | 
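Aside (not part of wer.py): the whitespace tokenizer defined here feeds `get_wer_components`, which divides the token-level edit distance by the reference length. A minimal self-contained sketch of that computation, with a hand-rolled Levenshtein standing in for the `editdistance` package (`simple_wer` and `edit_distance` are illustrative names, not toolkit functions):

```python
import re

# whitespace tokenization, mirroring wer.py
tokenization = re.compile(r"\s+")


def edit_distance(ref, hyp):
    """Minimal Levenshtein distance (stand-in for editdistance.eval)."""
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        curr = [i]
        for j, h in enumerate(hyp, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (r != h)))  # substitution
        prev = curr
    return prev[-1]


def simple_wer(ref_string, hyp_string):
    """WER = token edit distance over reference length, as a percentage."""
    ref = tokenization.split(ref_string)
    hyp = tokenization.split(hyp_string)
    return 100 * edit_distance(ref, hyp) / max(1, len(ref))


print(simple_wer("this is a cat", "this is a dog"))  # 25.0
```

The real `wer()` below additionally standardizes both transcripts (removing tagged non-speech and formatting) before tokenizing.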
tokenization = re.compile(r"\s+") 20 | 21 | # defines global regex to remove these nsns 22 | nonsilence_noises = [ 23 | "noise", 24 | "um", 25 | "ah", 26 | "er", 27 | "umm", 28 | "uh", 29 | "mm", 30 | "mn", 31 | "mhm", 32 | "mnh", 33 | "huh", 34 | "hmm", 35 | ] 36 | re_nonsilence_noises = re.compile(r"\b({})\b".format("|".join(nonsilence_noises))) 37 | 38 | 39 | def remove_nonsilence_noises(input_text): 40 | """ 41 | Removes nonsilence noises from a transcript 42 | """ 43 | return re.sub(re_nonsilence_noises, "", input_text) 44 | 45 | 46 | def get_wer_components(ref_string, hyp_string): 47 | """ 48 | Helper function that takes as input a reference string and a hypothesis string. 49 | Splits the strings by space, computes the WER formula numerator and denominator 50 | and returns both. 51 | 52 | >>> get_wer_components("this is a cat", "this is a dog") 53 | (1, 4) 54 | >>> get_wer_components(['a','b','c'], ['a','b','d']) 55 | (1, 3) 56 | """ 57 | 58 | # apply tokenization if given as a string 59 | ref = tokenization.split(ref_string) if isinstance(ref_string, str) else ref_string 60 | hyp = tokenization.split(hyp_string) if isinstance(hyp_string, str) else hyp_string 61 | 62 | WER_numerator = editdistance.eval(ref, hyp) 63 | WER_denominator = max(1, len(ref)) 64 | 65 | return WER_numerator, WER_denominator 66 | 67 | 68 | def standardize_transcript(input_transcript, remove_nsns=False): 69 | """ 70 | Given an input transcript or time_aligned_text object, 71 | remove non-speech events 72 | [optionally] remove non-silence noises 73 | 74 | >>> standardize_transcript("this is a test") 75 | 'this is a test' 76 | >>> standardize_transcript("this is um a test") 77 | 'this is um a test' 78 | >>> standardize_transcript("this is um a test", remove_nsns=True) 79 | 'this is a test' 80 | """ 81 | 82 | # accept time_aligned_text objects but use their output text 83 | input_transcript = ( 84 | input_transcript.text() 85 | if isinstance(input_transcript, time_aligned_text) 86 | else 
input_transcript 87 | ) 88 | 89 | # remove tagged noises and other non-speech events 90 | input_transcript = re.sub(re_tagged_nonspeech, " ", input_transcript) 91 | 92 | if remove_nsns: 93 | input_transcript = remove_nonsilence_noises(input_transcript) 94 | 95 | # clean punctuation, etc. 96 | input_transcript = clean_up(input_transcript) 97 | 98 | return input_transcript 99 | 100 | 101 | def wer(ref, hyp, remove_nsns=False): 102 | """ 103 | Calculate word error rate between two string or time_aligned_text objects 104 | >>> wer("this is a cat", "this is a dog") 105 | 25.0 106 | """ 107 | 108 | # standardize input string 109 | ref, hyp = map(lambda t: standardize_transcript(t, remove_nsns), (ref, hyp)) 110 | 111 | # calculate WER with helper function 112 | WER_numerator, WER_denominator = get_wer_components(ref, hyp) 113 | 114 | return 100 * WER_numerator / WER_denominator 115 | 116 | 117 | def cer(ref, hyp, remove_nsns=False): 118 | """ 119 | Calculate character error rate between two strings or time_aligned_text objects 120 | >>> cer("this cat", "this bad") 121 | 25.0 122 | """ 123 | 124 | # standardize and convert string to a list of characters 125 | ref, hyp = map( 126 | list, 127 | map( 128 | lambda transcript: standardize_transcript(transcript, remove_nsns), 129 | (ref, hyp), 130 | ), 131 | ) 132 | 133 | # calculate CER with helper function 134 | CER_numerator, CER_denominator = get_wer_components(ref, hyp) 135 | 136 | return 100 * CER_numerator / CER_denominator 137 | 138 | 139 | def compute_wer(reference_file, transcript_file, char_level=False, ignore_nsns=False): 140 | """ 141 | Compares a reference and transcript file and calculates word error rate (WER) between these two files 142 | If --char-level is given, compute CER instead 143 | If --ignore-nsns is given, ignore non silence noises 144 | """ 145 | 146 | # read files from arguments 147 | ref = assign_if_valid(reference_file) 148 | hyp = assign_if_valid(transcript_file) 149 | 150 | if ref is None or hyp 
is None: 151 | print( 152 | "Error with an input file. Please check all files exist and are accepted by ASRToolkit" 153 | ) 154 | elif char_level: 155 | print("CER: {:5.3f}%".format(cer(ref, hyp, ignore_nsns))) 156 | else: 157 | print("WER: {:5.3f}%".format(wer(ref, hyp, ignore_nsns))) 158 | 159 | 160 | def cli(): 161 | Fire(compute_wer) 162 | 163 | 164 | if __name__ == "__main__": 165 | cli() 166 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/audio_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for holding information about an audio file and doing basic conversions 4 | """ 5 | 6 | import hashlib 7 | import logging 8 | import os 9 | import subprocess 10 | 11 | from asrtoolkit.file_utils.name_cleaners import ( 12 | generate_segmented_file_name, 13 | sanitize_hyphens, 14 | strip_extension, 15 | ) 16 | from asrtoolkit.file_utils.script_input_validation import valid_input_file 17 | 18 | LOGGER = logging.getLogger() 19 | 20 | 21 | def cut_utterance( 22 | source_audio_file, target_audio_file, start_time, end_time, sample_rate=16000 23 | ): 24 | """ 25 | source_audio_file: str, path to file 26 | target_audio_file: str, path to file 27 | start_time: float or str 28 | end_time: float or str 29 | sample_rate: int, default 16000; audio sample rate in Hz 30 | 31 | uses sox to segment source_audio_file to create target_audio_file that 32 | contains audio from start_time to end_time 33 | with audio sample rate set to sample_rate 34 | """ 35 | subprocess.call( 36 | "sox -V1 {} -r {} -b 16 -c 1 {} trim {} ={}".format( 37 | source_audio_file, 38 | sample_rate, 39 | target_audio_file, 40 | start_time, 41 | end_time, 42 | ), 43 | shell=True, 44 | ) 45 | 46 | 47 | def degrade_audio(source_audio_file, target_audio_file=None): 48 | """ 49 | Degrades audio to typical G711 level. 50 | Useful if models need to target this audio quality. 
51 | """ 52 | 53 | valid_input_file(source_audio_file, ["mp3", "sph", "wav", "au", "raw"]) 54 | 55 | target_audio_file = ( 56 | source_audio_file if target_audio_file is None else target_audio_file 57 | ) 58 | 59 | # downsample to 8 kHz a-law 60 | tmp1 = ".".join(source_audio_file.split(".")[:-1]) + "_tmp1.wav" 61 | subprocess.call( 62 | "sox -V1 {} -r 8000 -e a-law {}".format(source_audio_file, tmp1), 63 | shell=True, 64 | ) 65 | 66 | # convert to u-law 67 | tmp2 = ".".join(source_audio_file.split(".")[:-1]) + "_tmp2.wav" 68 | subprocess.call( 69 | "sox -V1 {} --rate 8000 -e u-law {}".format(tmp1, tmp2), 70 | shell=True, 71 | ) 72 | 73 | # resample to 16 kHz signed 16-bit mono 74 | subprocess.call( 75 | "sox -V1 {} --rate 16000 -e signed -b 16 --channel 1 {}".format( 76 | tmp2, target_audio_file 77 | ), 78 | shell=True, 79 | ) 80 | os.remove(tmp1) 81 | os.remove(tmp2) 82 | 83 | 84 | def combine_audio(audio_files, output_file, gain=False): 85 | """ 86 | Combine audio files with possible renormalization to 0dB 87 | """ 88 | gain_str = "" 89 | if gain: 90 | gain_str = "gain -n 0" 91 | subprocess.call( 92 | "sox -V1 -m {} {} {}".format(" ".join(audio_files), output_file, gain_str), 93 | shell=True, 94 | ) 95 | 96 | 97 | class audio_file(object): 98 | """ 99 | Create an audio_file object for 100 | - storing location 101 | - retrieving a unique hash 102 | - resampling for training 103 | - splitting into segments given an STM file 104 | """ 105 | 106 | def __init__(self, location=""): 107 | """ 108 | Populate file location info 109 | """ 110 | self.location = None 111 | if not os.path.exists(location): 112 | raise FileNotFoundError('Could not find file at "{}"'.format(location)) 113 | self.location = location 114 | 115 | def hash(self): 116 | """ 117 | Returns a sha1 hash of the file 118 | """ 119 | if self.location: 120 | with open(self.location, "rb") as f: 121 | return hashlib.sha1(f.read()).hexdigest() 122 | else: 123 | return hashlib.sha1("".encode()).hexdigest() 124 | 125 | def 
prepare_for_training(self, file_name, sample_rate=16000): 126 | """ 127 | Converts to single channel (from channel 1) audio file 128 | in SPH file format 129 | Returns audio_file object on success, else None 130 | """ 131 | if file_name.split(".")[-1] != "sph": 132 | LOGGER.warning( 133 | "Forcing training data to use SPH file format for %s", file_name 134 | ) 135 | file_name = strip_extension(file_name) + ".sph" 136 | 137 | file_name = sanitize_hyphens(file_name) 138 | 139 | # return None if error code given, otherwise return audio_file object 140 | output_file = ( 141 | audio_file(file_name) 142 | if not subprocess.call( 143 | "sox -V1 {} {} rate {} remix -".format( 144 | self.location, file_name, sample_rate 145 | ), 146 | shell=True, 147 | ) 148 | else None 149 | ) 150 | 151 | return output_file 152 | 153 | def split(self, transcript, target_dir): 154 | """ 155 | Split audio file and transcript into many pieces based on 156 | valid segments of transcript 157 | """ 158 | 159 | os.makedirs(target_dir, exist_ok=True) 160 | for iseg, seg in enumerate(transcript.segments): 161 | cut_utterance( 162 | self.location, 163 | generate_segmented_file_name(target_dir, self.location, iseg), 164 | seg.start, 165 | seg.stop, 166 | ) 167 | transcript.split(target_dir) 168 | 169 | return 170 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/align.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | 5 | import en_core_web_sm 6 | 7 | # Third Party 8 | from spacy.tokens import Doc as spacy_doc 9 | from textacy.extract import ngrams, noun_chunks 10 | from toolz.sandbox.core import pluck 11 | 12 | from asrtoolkit.alignment.align_utils import ( 13 | Extractor, 14 | dict_to_segments, 15 | word_lattice_to_lines, 16 | ) 17 | from asrtoolkit.alignment.aligned_doc import AlignedDoc 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | NLP = 
en_core_web_sm.load() 21 | 22 | 23 | class WhitespaceTokenizer: 24 | """ 25 | Text (List of Tokens) to Spacy Docs 26 | Tokenizer input - list of words 27 | returned spacy Doc tokens same as initial word list 28 | equivalent to splitting on white space 29 | """ 30 | 31 | def __init__(self, NLP): 32 | """NLP: loaded Spacy model""" 33 | self.vocab = NLP.vocab 34 | 35 | def __call__(self, word_list): 36 | """word_list: list of words; returns Doc""" 37 | return spacy_doc(self.vocab, words=word_list, spaces=[True] * len(word_list)) 38 | 39 | 40 | def init_spacy_document( 41 | lattice, 42 | token_key="token", 43 | ): 44 | """ 45 | Given List of word-level dicts, initializes Spacy Doc 46 | using WhitespaceTokenizer 47 | :param: lattice: List[Dict]; 48 | inner dicts contain token-level information for each token in input text 49 | :param: token_key: key in Dicts containing tokens in text 50 | :return: spacy Doc 51 | (1) text: ' '.join(dict['token']) 52 | (2) metadata: doc.user_data = lattice (word-level dicts) 53 | - Scribe output: word-level time offsets 54 | - reference transcript: speaker id and gender 55 | 56 | * Note: spacy model must be loaded and bound to NLP object 57 | NLP = spacy.load('en_core_web_sm') 58 | """ 59 | NLP.make_doc = WhitespaceTokenizer(NLP) 60 | word_list = list(pluck(token_key, lattice)) 61 | doc = NLP(word_list) 62 | doc.user_data = lattice 63 | assert len(doc) == len( 64 | word_list 65 | ), "document has different number of tokens {} from word_list {}".format( 66 | len(doc), len(word_list) 67 | ) 68 | return doc 69 | 70 | 71 | def select_extractors(use_unigrams=False): 72 | """ 73 | Extractors For Alignment 74 | :return: List of Extractors objects to use for text-text alignment 75 | note: ngram extractors below filter out stopwords and number words/symbols 76 | """ 77 | noun_chunk_extractor = Extractor( 78 | lambda doc: list(filter(lambda x: len(x) > 3, list(noun_chunks(doc)))) 79 | ) 80 | tetragram_extractor = Extractor( 81 | lambda doc: 
list(ngrams(doc, 4, filter_stops=True, filter_nums=True)) 82 | ) 83 | trigram_extractor = Extractor( 84 | lambda doc: list(ngrams(doc, 3, filter_stops=True, filter_nums=True)) 85 | ) 86 | bigram_extractor = Extractor( 87 | lambda doc: list(ngrams(doc, 2, filter_stops=False, filter_nums=False)) 88 | ) 89 | unigram_extractor = Extractor( 90 | lambda doc: list(ngrams(doc, 1, filter_stops=False, filter_nums=False)) 91 | ) 92 | 93 | extractor_list = [ 94 | noun_chunk_extractor, 95 | tetragram_extractor, 96 | trigram_extractor, 97 | bigram_extractor, 98 | ] 99 | 100 | if use_unigrams: 101 | extractor_list.append(unigram_extractor) 102 | 103 | return extractor_list 104 | 105 | 106 | def lattices_to_aligned_doc( 107 | time_annotated_tokens, 108 | reference_tokens, 109 | extractors, 110 | num_extractions, 111 | ): 112 | """ 113 | From two Lists of Word-Dicts, returns AlignedDoc (already aligned) 114 | :param time_annotated_tokens: List[Dict]; dicts with token-specific data 115 | including, at minimum, token start time and duration 116 | :param reference_tokens: List[Dict]; dicts with token-specific data 117 | including speaker(id) and gender 118 | :return: AlignedDoc object; contains 119 | end-product of text-text alignment and 120 | methods to retrieve matches with and without token-level metadata 121 | """ 122 | # Create Spacy Docs from List of Tokens + Metadata 123 | hyp = init_spacy_document(lattice=time_annotated_tokens, token_key="gk_token") 124 | ref = init_spacy_document(lattice=reference_tokens, token_key="token") 125 | return AlignedDoc(hyp, ref, extractors, num_extractions) 126 | 127 | 128 | def get_segments_from_alignments( 129 | aligned_token_lattice, 130 | reference_doc, 131 | token_idx="token_idx", 132 | max_duration=15, 133 | ): 134 | """ 135 | Uses Aligned Tokens to Segment Reference Text into STM Lines 136 | :param aligned_token_lattice: List of Dicts, 137 | each containing word-level information; 138 | all tokens are paired and contain data requisite 
for STM file 139 | :param max_duration: int; maximum duration of STM line audio 140 | :return: segments: List[Segment], 141 | each Segment object contains data for one STM Line 142 | """ 143 | line_dict = word_lattice_to_lines( 144 | word_lattice=aligned_token_lattice, MAX_DURATION=max_duration 145 | ) 146 | segments = dict_to_segments( 147 | line_dict=line_dict, doc=reference_doc, token_idx=token_idx 148 | ) 149 | 150 | return line_dict, segments 151 | 152 | 153 | def align( 154 | gk_json, 155 | ref_tokens, 156 | num_extractions=4, 157 | max_duration=15, 158 | ): 159 | """ 160 | Given a gk_json and ref_tokens, apply extractors and align 161 | 162 | Using num_extractions extractors 163 | and aligning up to max_duration second segments 164 | """ 165 | extractor_list = select_extractors() 166 | 167 | # Align data 168 | aligned_doc = lattices_to_aligned_doc( 169 | time_annotated_tokens=gk_json, 170 | reference_tokens=ref_tokens, 171 | extractors=extractor_list, 172 | num_extractions=num_extractions, 173 | ) 174 | 175 | # combine hyp token metadata with aligned ref metadata 176 | token_lattice = aligned_doc.get_token_metadata() 177 | 178 | # dict & segment objects with lines under max_duration between matched tokens (w/ same speaker) 179 | line_dict, all_segments = get_segments_from_alignments( 180 | token_lattice, 181 | aligned_doc.ref, 182 | token_idx="token_idx", 183 | max_duration=max_duration, 184 | ) 185 | 186 | min_duration = 3.0 187 | segments = [seg for seg in all_segments if seg.stop - seg.start > min_duration] 188 | 189 | return segments 190 | -------------------------------------------------------------------------------- /tests/test_clean_up.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Test clean_up formatting removal 4 | """ 5 | 6 | from asrtoolkit.clean_formatting import clean_up 7 | 8 | 9 | def test_clean_up(): 10 | " execute suite of tests " 11 | 12 | tests = [ 13 | ("1.05", "one point 
zero five"), 14 | ("105.", "one hundred and five"), 15 | ("105.", "one hundred and five"), 16 | ("They dollars and three cents.", "they dollars and three cents"), 17 | ( 18 | "The machine is in the garden of Mr. MacGregor.", 19 | "the machine is in the garden of mr macgregor", 20 | ), 21 | ("This may be a problem.", "this may be a problem"), 22 | ( 23 | "Yeah th those are my finest percentages.", 24 | "yeah th those are my finest percentages", 25 | ), 26 | ("Please press five after the tone.", "please press five after the tone"), 27 | ("Six distinct.", "six distinct"), 28 | ("ABC trades at -3.2%.", "a b c trades at negative three point two percent"), 29 | ( 30 | "My 2017 report shows the 5th best earnings.", 31 | "my two thousand and seventeen report shows the fifth best earnings", 32 | ), 33 | ("This is the 9th of November.", "this is the ninth of november"), 34 | ( 35 | "No work has been completed since Tuesday the 16th of September.", 36 | "no work has been completed since tuesday the sixteenth of september", 37 | ), 38 | ( 39 | "That is what Leah Bradley said last week about the Indians on TV.", 40 | "that is what leah bradley said last week about the indians on t v", 41 | ), 42 | ( 43 | "I've never done anything that I'd regret.", 44 | "i 've never done anything that i 'd regret", 45 | ), 46 | ( 47 | "He needs 1.375%.", 48 | "he needs one point three hundred and seventy five percent", 49 | ), 50 | ( 51 | "I heard Mr. McDonald has $6.23", 52 | "i heard mr mcdonald has six dollars and twenty three cents", 53 | ), 54 | ("Yes this is Mr. 
MacAllen.", "yes this is mr macallen"), 55 | ("Don't break dollars.", "don 't break dollars"), 56 | ("This is the best one.", "this is the best one"), 57 | ("London has five theatres.", "london has five theatres"), 58 | ("Hundreds of cats.", "hundreds of cats"), 59 | ("Thousands of cats.", "thousands of cats"), 60 | ("Millions and billions of cats.", "millions and billions of cats"), 61 | ("Good evening Larry.", "good evening larry"), 62 | ("You have two choices Neo.", "you have two choices neo"), 63 | ("This one or that one.", "this one or that one"), 64 | ("7 8 9 ...", "seven eight nine"), 65 | ("Two partridges in one pear tree.", "two partridges in one pear tree"), 66 | ("2s 3s 4s.", "twos threes fours"), 67 | ("I am -5 on the bonds.", "i am negative five on the bonds"), 68 | ("This is your bus terminus seven.", "this is your bus terminus seven"), 69 | ("5th of March.", "fifth of march"), 70 | ("Hundreds.", "hundreds"), 71 | ("ABC.", "a b c"), 72 | ("A.B.C.", "a b c"), 73 | ("ABC", "a b c"), 74 | ("A one sauce.", "a one sauce"), 75 | ("This is Prof. Charles Xavier.", "this is prof charles xavier"), 76 | ("Welcome to the island of Dr. 
Moreau.", "welcome to the island of dr moreau"), 77 | ("7th Sept against the third of May.", "seventh sept against the third of may"), 78 | ( 79 | "Who is on first and what is on second.", 80 | "who is on first and what is on second", 81 | ), 82 | ("2001 a space Odyssey.", "two thousand and one a space odyssey"), 83 | ( 84 | "I'm selling my car for one trillion.", 85 | "i 'm selling my car for one trillion", 86 | ), 87 | ( 88 | "I'm selling my car for one trillion.", 89 | "i 'm selling my car for one trillion", 90 | ), 91 | ( 92 | "I would recommend selling 2s 10s here.", 93 | "i would recommend selling twos tens here", 94 | ), 95 | ("129.6%.", "one hundred and twenty nine point six percent"), 96 | ( 97 | "5.3% and then 129.6%.", 98 | "five point three percent and then one hundred and twenty nine point six percent", 99 | ), 100 | ( 101 | "70.3% and coming around 129.6%.", 102 | "seventy point three percent and coming around one hundred and twenty nine point six percent", 103 | ), 104 | ( 105 | "Dr. Joseph owes $12.5 billion.", 106 | "dr joseph owes twelve point five billion dollars", 107 | ), 108 | ( 109 | "Replacements for things like Dr. for drive should only happen where necessary.", 110 | "replacements for things like dr for drive should only happen where necessary", 111 | ), 112 | ("100.", "one hundred"), 113 | ("115.", "one hundred and fifteen"), 114 | ("125.", "one hundred and twenty five"), 115 | ("140.", "one hundred and forty"), 116 | ("1000.", "one thousand"), 117 | ("1 2 3 4 5 6 7 8 9 10.", "one two three four five six seven eight nine ten"), 118 | ( 119 | "his license plate is a. c, f seven...five ! zero", 120 | "his license plate is a c f seven five zero", 121 | ), 122 | ("Q2", "q two"), 123 | ( 124 | "from our website at www.take2games.com.", 125 | "from our website at www take two games dot com", 126 | ), 127 | ("NBA 2K18", "n b a two k eighteen"), 128 | ("launched WWE 2K 18", "launched w w e two k eighteen"), 129 | ( 130 | "released L.A. 
Noire, the The VR Case Files for the HTC VIVE system", 131 | "released l a noire the the v r case files for the h t c v i v e system", 132 | ), 133 | ( 134 | "Total net bookings were $654 million,", 135 | "total net bookings were six hundred and fifty four million dollars", 136 | ), 137 | ( 138 | "net booking which grew 6% to $380 million.", 139 | "net booking which grew six percent to three hundred and eighty million dollars", 140 | ), 141 | ( 142 | "to $25 dollars or $0.21 per share price.", 143 | "to twenty five dollars dollars or zero dollars and twenty one cents per share price", 144 | ), 145 | ("year-over-year", "year over year"), 146 | ("HTC VIVE", "h t c v i v e"), 147 | ] 148 | 149 | for test in tests: 150 | input_string = test[0] 151 | result = clean_up(input_string) 152 | assert result == test[1] 153 | 154 | 155 | if __name__ == "__main__": 156 | import sys 157 | 158 | import pytest 159 | 160 | pytest.main(sys.argv) 161 | -------------------------------------------------------------------------------- /asrtoolkit/deformatting_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | De-Formatting functions used in clean_formatting 5 | """ 6 | 7 | import num2words 8 | 9 | 10 | def contains_digit(input_string): 11 | """ 12 | check if string contains digit 13 | >>> contains_digit("5") 14 | True 15 | >>> contains_digit("5a") 16 | True 17 | >>> contains_digit("cat") 18 | False 19 | """ 20 | return any(_.isdigit() for _ in input_string) 21 | 22 | 23 | def ordinal_to_string(input_string): 24 | """ 25 | convert strings '1st', '2nd', '3rd', ... 
to a string/word with chars a-z 26 | >>> ordinal_to_string("4th") 27 | 'fourth' 28 | """ 29 | 30 | def has_ordinal(input_string): 31 | " checks if input_string has ordinal " 32 | return contains_digit(input_string) and ( 33 | input_string[-2:] == "st" 34 | or input_string[-2:] == "nd" 35 | or input_string[-2:] == "rd" 36 | or input_string[-2:] == "th" 37 | ) 38 | 39 | ret_str = input_string 40 | 41 | if has_ordinal(input_string): 42 | ret_str = ( 43 | (num2words.num2words(int(input_string[:-2]), ordinal=True)) 44 | .replace(",", "") 45 | .replace("-", " ") 46 | ) 47 | 48 | return ret_str 49 | 50 | 51 | def format_dollars(dollar_string, dollars): 52 | " format dollar string " 53 | return dollar_string + " dollars" if dollars and int(dollars) != 1 else "one dollar" 54 | 55 | 56 | def format_cents(cents_string, cents): 57 | " format cent string " 58 | return ( 59 | (" and {:} cents".format(cents_string) if int(cents) != 1 else " and one cent") 60 | if (cents_string and cents) 61 | else "" 62 | ) 63 | 64 | 65 | def format_quantities(input_string): 66 | " split off quantities and change into words" 67 | 68 | possible_quant = input_string[-1].upper() if input_string else "" 69 | quantity_dict = {"B": "billion", "M": "million", "K": "thousand"} 70 | quant = quantity_dict[possible_quant] if possible_quant in quantity_dict else "" 71 | 72 | if quant: 73 | input_string = input_string[:-1] 74 | 75 | return input_string, quant 76 | 77 | 78 | def format_dollars_and_cents(input_string): 79 | "formats de-formatted dollars and cents from dollars and cents string" 80 | # split into pieces 81 | dollars, cents = ( 82 | input_string.split(".") if "." 
in input_string else [input_string, None] 83 | ) 84 | 85 | # format all as numbers 86 | dollar_words, cent_words = list( 87 | map( 88 | lambda num: num2words.num2words(int(num)) if num else None, 89 | [dollars, cents + "0" if (cents and len(cents) == 1) else cents], 90 | ) 91 | ) 92 | 93 | # format the output 94 | return format_dollars(dollar_words, dollars) + format_cents(cent_words, cents) 95 | 96 | 97 | def dollars_to_string(input_string): 98 | """ 99 | convert dollar strings '$2', '$2.56', '$10', '$1000000', ... to a string/word with chars a-z 100 | >>> dollars_to_string("$2.56") 101 | 'two dollars and fifty six cents' 102 | >>> dollars_to_string("$3.") 103 | 'three dollars' 104 | >>> dollars_to_string("$3.5") 105 | 'three dollars and fifty cents' 106 | >>> dollars_to_string("$1") 107 | 'one dollar' 108 | >>> dollars_to_string("$1.00") 109 | 'one dollar and zero cents' 110 | >>> dollars_to_string("$1.01") 111 | 'one dollar and one cent' 112 | >>> dollars_to_string("$1.01B") 113 | 'one point zero one billion dollars' 114 | """ 115 | 116 | input_string = input_string.replace("$", "") 117 | 118 | # peel off quantity if present 119 | input_string, quant = format_quantities(input_string) 120 | 121 | if not quant: 122 | ret_str = format_dollars_and_cents(input_string) 123 | else: 124 | ret_str = " ".join([num2words.num2words(float(input_string)), quant, "dollars"]) 125 | 126 | # remove minus signs 127 | ret_str = ret_str.replace("-", " ") 128 | 129 | return ret_str 130 | 131 | 132 | def get_numbers_after_decicmal_point(input_string): 133 | " Format every number after decimal point " 134 | ret_str = "" 135 | decimal = ( 136 | input_string.split(".")[1].strip() if len(input_string.split(".")) > 1 else "" 137 | ) 138 | if decimal: 139 | ret_str += " point" 140 | ret_str += " zero" * decimal.count("0") 141 | ret_str += " " + num2words.num2words(int(decimal)) 142 | return ret_str 143 | 144 | 145 | def digits_to_string(input_string): 146 | """ 147 | convert strings '52.4' 
to string/word with chars a-z 148 | >>> digits_to_string("2.56") 149 | 'two point fifty six' 150 | >>> digits_to_string("2") 151 | 'two' 152 | >>> digits_to_string("1.05") 153 | 'one point zero five' 154 | """ 155 | ret_str = input_string 156 | if input_string: 157 | ret_str = ( 158 | num2words.num2words(int(input_string.split(".")[0])) 159 | if input_string.split(".")[0] != "" 160 | else "" 161 | ) 162 | ret_str += get_numbers_after_decicmal_point(input_string) 163 | 164 | return ret_str.replace("-", " ") 165 | 166 | 167 | def pluralize(substring): 168 | """ 169 | Convert number to number + plural ending 170 | >>> pluralize("5") 171 | 'fives' 172 | >>> pluralize("6") 173 | 'sixes' 174 | >>> pluralize("10") 175 | 'tens' 176 | >>> pluralize("20") 177 | 'twenties' 178 | """ 179 | 180 | # default return value 181 | ret_str = substring 182 | 183 | def is_multiple_of_ten(notten): 184 | "check for twenties, etc. but not 1-10, 100, 1000, etc." 185 | return ( 186 | len(notten) == 2 and notten != "00" and notten != "10" and notten[-1] == "0" 187 | ) 188 | 189 | if is_multiple_of_ten(substring): 190 | ret_str = digits_to_string(substring)[:-1] + "ies" 191 | else: 192 | ret_str = digits_to_string(substring) + ("es" if substring[-1] == "6" else "s") 193 | return ret_str 194 | 195 | 196 | def plural_numbers_to_string(input_string): 197 | """ 198 | Converts plural numbers to strings 199 | >>> plural_numbers_to_string("6s") 200 | 'sixes' 201 | >>> plural_numbers_to_string("8s") 202 | 'eights' 203 | >>> plural_numbers_to_string("80s") 204 | 'eighties' 205 | >>> plural_numbers_to_string("1980s") 206 | 'nineteen eighties' 207 | """ 208 | input_string = "".join(_ for _ in input_string if _.isdigit()) 209 | 210 | ret_str = ( 211 | " ".join((digits_to_string(input_string[:2]), pluralize(input_string[2:]))) 212 | if (len(input_string) == 4 and input_string[-2:] != "00") 213 | else pluralize(input_string) 214 | ) 215 | 216 | return ret_str 217 | 218 | 219 | def 
fraction_to_string(input_string): 220 | """ 221 | Converts fraction to string 222 | >>> fraction_to_string("1/5") 223 | 'one fifth' 224 | """ 225 | 226 | numerator, denominator = input_string.split("/") 227 | 228 | # remove spaces 229 | numerator = numerator.strip() 230 | denominator = denominator.strip() 231 | 232 | numerator = digits_to_string(numerator) 233 | denominator = num2words.num2words(int(denominator), ordinal=True) 234 | return " ".join([numerator, denominator]) 235 | -------------------------------------------------------------------------------- /asrtoolkit/clean_formatting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Text line cleaning functions. For WER calculations, final text should be utf letter chars and \' 5 | """ 6 | 7 | import logging 8 | import string 9 | from collections import OrderedDict 10 | 11 | import regex as re 12 | from fire import Fire 13 | 14 | from asrtoolkit.deformatting_utils import ( 15 | digits_to_string, 16 | dollars_to_string, 17 | fraction_to_string, 18 | ordinal_to_string, 19 | plural_numbers_to_string, 20 | ) 21 | from asrtoolkit.file_utils.script_input_validation import valid_input_file 22 | 23 | LOGGER = logging.getLogger(__name__) 24 | 25 | # preserve any unicode letters 26 | invalid_chars = re.compile(r"[^\p{L}<\[\]> \']", re.IGNORECASE) 27 | 28 | spaces = re.compile(r"\s+") 29 | 30 | KNOWN_REPLACEMENTS = OrderedDict( 31 | [ 32 | ("millions", (re.compile(r"\b(mln|mio|mlns)\b"), lambda m: "million")), 33 | ("pleases", (re.compile(r"\b(plz|pls)\b"), lambda m: "please")), 34 | ("thanks", (re.compile(r"\b(thks|thx)\b"), lambda m: "thanks")), 35 | ("otc", (re.compile(r"\b(otc)\b"), lambda m: "o t c")), 36 | ("ellipses", (re.compile(r"\.{2,}"), lambda m: " ")), 37 | ( 38 | "websites", 39 | ( 40 | re.compile(r"[.](net|org|com|gov)\b"), 41 | lambda m: " dot " + m.group().lower().replace(".", ""), 42 | ), 43 | ), 44 | ( 
45 | "phone_numbers", 46 | ( 47 | re.compile( 48 | r"\b((1|44)[ -.]?)?([\(]?([0-9]{1,}[\)]?[ -.]?){2,5})[0-9]{4}\b" 49 | ), 50 | lambda m: " ".join( 51 | digits_to_string(_) for _ in m.group() if _.isdigit() 52 | ), 53 | ), 54 | ), 55 | ( 56 | "acronyms", 57 | ( 58 | re.compile(r"\b(([A-Z]){1,}[.]?){2,}\b"), 59 | lambda m: " ".join(m.group().lower().replace(".", "")), 60 | ), 61 | ), 62 | ("dashes", (re.compile(r"\-[0-9]\b"), lambda m: "negative " + m.group()[1:])), 63 | ("negatives", (re.compile(r" \- "), lambda m: "")), 64 | ("positives", (re.compile(r"\+"), lambda m: " plus ")), 65 | ( 66 | "ordinals", 67 | ( 68 | re.compile(r"[0-9]{1,}(st|nd|rd|th)"), 69 | lambda m: ordinal_to_string(m.group()), 70 | ), 71 | ), 72 | ( 73 | "many_dollars", 74 | ( 75 | re.compile(r"\$([0-9]{1,}\.?[0-9]{0,})\s(billion|million|trillion)"), 76 | lambda m: " ".join( 77 | [digits_to_string(m.groups()[0]), m.groups()[1], "dollars"] 78 | ), 79 | ), 80 | ), 81 | ( 82 | "dollars", 83 | ( 84 | re.compile(r"\$[0-9]{1,}\.?[0-9]{0,}[mbkMBK]?"), 85 | lambda m: dollars_to_string(m.group()), 86 | ), 87 | ), 88 | ("percent", (re.compile(r"\%"), lambda m: " percent")), 89 | ( 90 | "fractions", 91 | ( 92 | re.compile(r"\b[0-9]\s?\/\s?[0-9]\b"), 93 | lambda m: fraction_to_string(m.group()), 94 | ), 95 | ), 96 | ( 97 | "plural_numbers", 98 | ( 99 | re.compile(r"\b[0-9]{1,}s\b"), 100 | lambda m: plural_numbers_to_string(m.group()), 101 | ), 102 | ), 103 | ( 104 | "numbers", 105 | ( 106 | re.compile(r"[0-9\.]{1,}"), 107 | lambda m: " " + digits_to_string(m.group()) + " ", 108 | ), 109 | ), 110 | ("apostrophes", (re.compile(r"\'"), lambda m: " '")), 111 | ] 112 | ) 113 | 114 | 115 | def remove_special_chars(line, chars_to_replace): 116 | "remove a set of special chars" 117 | for char_to_replace in chars_to_replace: 118 | line = line.replace(char_to_replace, " ") 119 | return line 120 | 121 | 122 | def remove_all_special_chars(line): 123 | """ 124 | Only allow unicode letter characters, spaces, 
apostrophes, 125 | and angle brackets (for noises) to be output 126 | """ 127 | return invalid_chars.sub(" ", line) 128 | 129 | 130 | def remove_double_spaces(line): 131 | """ 132 | Remove all double spaces 133 | """ 134 | return spaces.sub(" ", line) 135 | 136 | 137 | def apply_all_regex_and_replacements(input_line): 138 | """ 139 | For a line and list of paired regex and replacements, 140 | apply all replacements for all regex on the line 141 | """ 142 | 143 | for pat in KNOWN_REPLACEMENTS: 144 | try: 145 | input_line = re.sub( 146 | KNOWN_REPLACEMENTS[pat][0], KNOWN_REPLACEMENTS[pat][1], input_line 147 | ) 148 | except Exception as exc: 149 | LOGGER.exception( 150 | "Exception %s with line %s for pattern %s", exc, input_line, pat 151 | ) 152 | 153 | return input_line 154 | 155 | 156 | def check_for_formatted_chars(input_line): 157 | "returns True if formatting or special chars are present otherwise False" 158 | 159 | return bool(set(input_line).difference(set(string.ascii_lowercase + " "))) 160 | 161 | 162 | def clean_up(input_line): 163 | """ 164 | Apply all text cleaning operations to input line 165 | >>> clean_up("his license plate is a. c, f seven...five ! zero") 166 | 'his license plate is a c f seven five zero' 167 | >>> clean_up("Q2") 168 | 'q two' 169 | >>> clean_up("from our website at www.take2games.com.") 170 | 'from our website at www take two games dot com' 171 | >>> clean_up("NBA 2K18") 172 | 'n b a two k eighteen' 173 | >>> clean_up("launched WWE 2K 18") 174 | 'launched w w e two k eighteen' 175 | >>> clean_up("released L.A. 
Noire, the The VR Case Files for the HTC VIVE system") 176 | 'released l a noire the the v r case files for the h t c v i v e system' 177 | >>> clean_up("Total net bookings were $654 million,") 178 | 'total net bookings were six hundred and fifty four million dollars' 179 | >>> clean_up("net booking which grew 6% to $380 million.") 180 | 'net booking which grew six percent to three hundred and eighty million dollars' 181 | >>> clean_up("to $25 dollars or $0.21 per share price.") 182 | 'to twenty five dollars dollars or zero dollars and twenty one cents per share price' 183 | >>> clean_up("year-over-year") 184 | 'year over year' 185 | >>> clean_up("HTC VIVE") 186 | 'h t c v i v e' 187 | >>> clean_up("you can reach me at 1-(317)-222-2222 or fax me at 555-555-5555") 188 | 'you can reach me at one three one seven two two two two two two two or fax me at five five five five five five five five five five' 189 | >>> clean_up("I heard Mr. McDonald has $6.23") 190 | 'i heard mr mcdonald has six dollars and twenty three cents' 191 | >>> clean_up(" for client X (hide name pls), plz giv $1 mln shs thx") 192 | 'for client x hide name please please giv one million dollars shs thanks' 193 | >>> clean_up("[laughter]") 194 | '[laughter]' 195 | """ 196 | 197 | if check_for_formatted_chars(input_line): 198 | 199 | input_line = remove_special_chars(input_line, ",*&!?") 200 | 201 | input_line = apply_all_regex_and_replacements(input_line) 202 | 203 | input_line = remove_all_special_chars(input_line) 204 | 205 | input_line = input_line.encode().decode("utf-8").lower() 206 | 207 | # check for double spacing 208 | input_line = remove_double_spaces(input_line) 209 | 210 | return input_line.strip() 211 | 212 | 213 | def clean_one_file(input_text_file): 214 | """ 215 | Cleans a single file 216 | """ 217 | with open(input_text_file, "r", encoding="utf-8") as f: 218 | lines = f.read().splitlines() 219 | 220 | cleaned = map(clean_up, lines) 221 | 222 | with open( 223 | 
input_text_file.replace(".txt", "") + "_cleaned.txt", "w", encoding="utf-8" 224 | ) as f: 225 | f.write(" ".join(cleaned)) 226 | 227 | 228 | def clean_text_file(*input_text_files): 229 | """ 230 | Cleans input *.txt files and outputs *_cleaned.txt 231 | """ 232 | for input_text_file in input_text_files: 233 | if not valid_input_file(input_text_file, valid_extensions=["txt"]): 234 | LOGGER.error( 235 | "File %s does not end in .txt - please only use this for cleaning txt files", 236 | input_text_file, 237 | ) 238 | continue 239 | clean_one_file(input_text_file) 240 | 241 | LOGGER.info("File output: %s", input_text_file.replace(".txt", "_cleaned.txt")) 242 | 243 | 244 | def cli(): 245 | Fire(clean_text_file) 246 | 247 | 248 | if __name__ == "__main__": 249 | cli() 250 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![FINOS - Archived](https://cdn.jsdelivr.net/gh/finos/contrib-toolbox@master/images/badge-archived.svg)](https://community.finos.org/docs/governance/Software-Projects/stages/archived) 2 | 3 | 4 | NOTE! This project is archived due to lack of activity; you can still consume this software, although not advised, as it is not actively maintained. If you're interested to restore activity on this repository, please email help@finos.org 5 | 6 | --- 7 | 8 | ## The GreenKey ASRToolkit provides tools for file conversion and ASR corpora organization. These are intended to simplify the workflow for building, customizing, and analyzing ASR models, useful for scientists, engineers, and other technologists in speech recognition. 9 | 10 | ### File formats supported 11 | 12 | File formats have format-specific handlers in asrtoolkit/data_handlers. 
The scripts `convert_transcript` and `wer` support [`stm`](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/infmts.htm), [`srt`](http://zuggy.wz.cz/), [`vtt`](https://w3c.github.io/webvtt/), `txt`, and [GreenKey `json`](https://transcription.greenkeytech.com/) formatted transcripts. A custom `html` format is also available, though this should not be considered a stable format for long term storage as it is subject to change without notice. 13 | 14 | ### convert_transcript 15 | ```text 16 | usage: convert_transcript [-h] input_file output_file 17 | 18 | convert a single transcript from one text file format to another 19 | 20 | positional arguments: 21 | input_file input file 22 | output_file output file 23 | 24 | optional arguments: 25 | -h, --help show this help message and exit 26 | ``` 27 | This tool allows for easy conversion among file formats listed above. 28 | 29 | Note: Attributes of a segment object not present in a parsed file retain their default values 30 | 31 | - For example, a `segment` object is created for each line of an STM line 32 | - each is initialized with the following default values which are not encoded in STM files: `formatted_text=''`; `confidence=1.0` 33 | 34 | 35 | 36 | ### wer 37 | ```text 38 | usage: wer [-h] [--char-level] [--ignore-nsns] 39 | reference_file transcript_file 40 | 41 | Compares a reference and transcript file and calculates word error rate (WER) 42 | between these two files 43 | 44 | positional arguments: 45 | reference_file reference "truth" file 46 | transcript_file transcript possibly containing errors 47 | 48 | optional arguments: 49 | -h, --help show this help message and exit 50 | --char-level calculate character error rate instead of word error rate 51 | --ignore-nsns ignore non silence noises like um, uh, etc. 52 | 53 | This tool allows for easy comparison of reference and hypothesis transcripts in any format listed above. 
54 | ``` 55 | 56 | ### clean_formatting 57 | ```text 58 | usage: clean_formatting.py [-h] files [files ...] 59 | 60 | cleans input *.txt files and outputs *_cleaned.txt 61 | 62 | positional arguments: 63 | files list of input files 64 | 65 | optional arguments: 66 | -h, --help show this help message and exit 67 | 68 | ``` 69 | This script standardizes how abbreviations, numbers, and other formatted text is expressed so that ASR engines can easily use these files as training or testing data. Standardizing the formatting of output is essential for reproducible measurements of ASR accuracy. 70 | 71 | ### split_audio_file 72 | ```text 73 | usage: split_audio_file [-h] [--target-dir TARGET_DIR] audio_file transcript 74 | 75 | Split an audio file using valid segments from a transcript file. For this 76 | utility, transcript files must contain start/stop times. 77 | 78 | positional arguments: 79 | audio_file input audio file 80 | transcript transcript 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | --target-dir TARGET_DIR 85 | Path to target directory 86 | ``` 87 | 88 | ### prepare_audio_corpora 89 | ```text 90 | usage: prepare_audio_corpora [-h] [--target-dir TARGET_DIR] 91 | corpora [corpora ...] 92 | 93 | Copy and organize specified corpora into a target directory. Training, 94 | testing, and development sets will be created automatically if not already 95 | defined. 96 | 97 | positional arguments: 98 | corpora Name of one or more directories in directory this 99 | script is run 100 | 101 | optional arguments: 102 | -h, --help show this help message and exit 103 | --target-dir TARGET_DIR 104 | Path to target directory 105 | ``` 106 | This script scrapes a list of directories for paired STM and SPH files. If `train`, `test`, and `dev` folders are present, these labels are used for the output folder. By default, a target directory of 'input-data' will be created. 
Note that filenames with hyphens will be sanitized to underscores and that audio files will be forced to single channel, 16 kHz, signed PCM format. If two channels are present, only the first will be used. 107 | 108 | ### degrade_audio_file 109 | ```text 110 | usage: degrade_audio_file input_file1.wav input_file2.wav 111 | 112 | Degrade audio files to 8 kHz format similar to G711 codec 113 | ``` 114 | This script reduces audio quality of input audio files so that acoustic models can learn features from telephony with the G711 codec. 115 | 116 | ### extract_excel_spreadsheets 117 | Note that the use of this function requires the separate installation of `pandas`. This can be done via `pip install pandas`. 118 | 119 | ```text 120 | usage: extract_excel_spreadsheets.py [-h] [--input-folder INPUT_FOLDER] 121 | [--output-corpus OUTPUT_CORPUS] 122 | 123 | convert a folder of excel spreadsheets to a corpus of text files 124 | 125 | optional arguments: 126 | -h, --help show this help message and exit 127 | --input-folder INPUT_FOLDER 128 | input folder of excel spreadsheets ending in .xls or 129 | .xlsx 130 | --output-corpus OUTPUT_CORPUS 131 | output folder for storing text corpus 132 | ``` 133 | 134 | 135 | ### align_json 136 | This aligns a gk hypothesis `json` file with a reference text file for creating forced alignment `STM` files for training new ASR models. 
137 | Note that this function requires the installation of a few extra packages: 138 | ```shell 139 | python3 -m pip install spacy textacy https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm 140 | ``` 141 | 142 | ```text 143 | usage: align_json.py [-h] input_json ref output_filename 144 | 145 | align a gk json file against a reference text file 146 | 147 | positional arguments: 148 | input_json input gk json file 149 | ref reference text file 150 | output_filename output_filename 151 | 152 | optional arguments: 153 | -h, --help show this help message and exit 154 | ``` 155 | 156 | ### Requirements 157 | 158 | - Python >= 3.6.1 with `pip` 159 | 160 | ## Contributing 161 | 162 | ### Code of Conduct 163 | 164 | Please make sure you read and observe our [Code of Conduct](https://raw.githubusercontent.com/finos/greenkey-asrtoolkit/master/CODE_OF_CONDUCT.md). 165 | 166 | ### Pull Request process 167 | 168 | 1. Fork it 169 | 2. Create your feature branch (`git checkout -b feature/fooBar`) 170 | 3. Commit your changes (`git commit -am 'Add some fooBar'`) 171 | 4. Push to the branch (`git push origin feature/fooBar`) 172 | 5. Create a new Pull Request 173 | 174 | _NOTE:_ Commits and pull requests to FINOS repositories will only be accepted from those contributors with an active, executed Individual Contributor License Agreement (ICLA) with FINOS OR who are covered under an existing and active Corporate Contribution License Agreement (CCLA) executed with FINOS. Commits from individuals not covered under an ICLA or CCLA will be flagged and blocked by the FINOS Clabot tool. Please note that some CCLAs require individuals/employees to be explicitly named on the CCLA. 175 | 176 | *Need an ICLA? Unsure if you are covered under an existing CCLA?
Email [help@finos.org](mailto:help@finos.org)* 177 | 178 | ## Authors 179 | 180 | - [Matthew Goldey](https://github.com/mgoldey) 181 | - [Tejas Shastry](https://github.com/tshastry) 182 | - [Amy Geojo](https://github.com/ageojo) 183 | - [Svyat Vergun](https://github.com/sv-github) 184 | - [Ashley Shultz](https://github.com/AGiantSquid) 185 | - [Colin Brochtrup](https://github.com/cbrochtrup) 186 | 187 | ## License 188 | 189 | The code in this repository is distributed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 190 | 191 | Copyright 2020 GreenKey Technologies 192 | 193 | 194 | [FINOS]: https://www.finos.org 195 | [Code of Conduct]: https://www.finos.org/code-of-conduct 196 | [Voice Program]: https://finosfoundation.atlassian.net/wiki/spaces/VOICE/overview 197 | [SemVer]: http://semver.org 198 | [list of contributors]: https://github.com/finos/greenkey-asrtoolkit/graphs/contributors 199 | [tags on this repository]: https://github.com/finos/greenkey-asrtoolkit/tags 200 | -------------------------------------------------------------------------------- /asrtoolkit/data_structures/corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Module for organizing SPH/MP3/WAV & STM files from a corpus 4 | """ 5 | 6 | import glob 7 | import os 8 | import random 9 | from concurrent.futures import ThreadPoolExecutor 10 | from functools import partial 11 | 12 | from tqdm import tqdm 13 | 14 | from asrtoolkit.data_structures.audio_file import audio_file 15 | from asrtoolkit.data_structures.exemplar import exemplar 16 | from asrtoolkit.data_structures.time_aligned_text import time_aligned_text 17 | from asrtoolkit.file_utils.name_cleaners import basename, strip_extension 18 | 19 | 20 | def get_files(data_dir, extension): 21 | """ 22 | Gets all files in a data directory with given extension 23 | """ 24 | files = [] 25 | if data_dir and os.path.exists(data_dir): 26 | 
files = glob.glob(data_dir + "/*." + extension) 27 | return files 28 | 29 | 30 | class corpus(object): 31 | """ 32 | Create a corpus object for storing information about 33 | the location and count of files in a corpus 34 | """ 35 | 36 | location = None 37 | exemplars = [] 38 | n_words = 0 39 | 40 | def __init__(self, *args, **kwargs): 41 | """ 42 | Initialize from location and populate list of 43 | SPH, WAV, or MP3 audio files 44 | and STM files into segments 45 | """ 46 | for dictionary in args: 47 | if isinstance(dictionary, dict): 48 | for key in dictionary: 49 | setattr(self, key, dictionary[key]) 50 | for key in kwargs: 51 | setattr(self, key, kwargs[key]) 52 | 53 | # only if not defined above should we search for exemplars 54 | # based on location 55 | if not self.exemplars: 56 | # instantiate exemplars for this object to override 57 | # static class variable 58 | self.exemplars = [] 59 | 60 | audio_extensions_to_try = ["sph", "wav", "mp3"][::-1] 61 | self.exemplars += [ 62 | exemplar( 63 | { 64 | "audio_file": audio_file(fl), 65 | "transcript_file": time_aligned_text( 66 | strip_extension(fl) + ".stm" 67 | ), 68 | } 69 | ) 70 | for audio_extension in audio_extensions_to_try 71 | for fl in ( 72 | get_files(self.location, audio_extension) if self.location else [] 73 | ) 74 | if (os.path.exists(strip_extension(fl) + ".stm")) 75 | ] 76 | 77 | # gather all exemplars from /stm and /sph subdirectories if present 78 | self.exemplars += [ 79 | exemplar( 80 | { 81 | "audio_file": audio_file(fl), 82 | "transcript_file": time_aligned_text( 83 | self.location 84 | + "/stm/" 85 | + basename(strip_extension(fl)) 86 | + ".stm" 87 | ), 88 | } 89 | ) 90 | for audio_extension in audio_extensions_to_try 91 | for fl in ( 92 | get_files(self.location + "/sph/", audio_extension) 93 | if self.location 94 | else [] 95 | ) 96 | if ( 97 | os.path.exists( 98 | self.location + "/stm/" + basename(strip_extension(fl)) + ".stm" 99 | ) 100 | ) 101 | ] 102 | 103 | def validate(self): 104 | 
""" 105 | Check and validate each example after sorting by audio file hash 106 | since stm hash may change 107 | """ 108 | dict_of_examples = {_.audio_file.hash(): _ for _ in self.exemplars} 109 | self.exemplars = [dict_of_examples[_] for _ in set(dict_of_examples)] 110 | return sum(_.validate() for _ in self.exemplars) 111 | 112 | def count_exemplar_words(self): 113 | """ 114 | Count the number of words in valid corpus exemplars 115 | adds attribute n_words to exemplars 116 | """ 117 | valid_exemplars = [_ for _ in self.exemplars if _.validate()] 118 | 119 | total_words = 0 120 | for eg in valid_exemplars: 121 | eg.n_words = eg.count_words() 122 | total_words += eg.n_words 123 | return valid_exemplars, total_words 124 | 125 | def split(self, split_words, min_segments=10): 126 | """ 127 | Select exemplars to create data split with specified number of words and minimum number of segments 128 | Returns the new splits as separate corpora 129 | """ 130 | valid_exemplars, total_words = self.count_exemplar_words() 131 | 132 | # Raise error if we inputs are invalid to avoid infinite loop 133 | if split_words < 0 or split_words > total_words: 134 | raise ValueError( 135 | "cannot split corpus with {} words into split with {} words".format( 136 | total_words, split_words 137 | ) 138 | ) 139 | 140 | exemplars_in_split = [] 141 | word_counter, seg_counter = 0, 0 142 | while word_counter <= split_words or seg_counter <= min_segments: 143 | exemplars_in_split += [ 144 | valid_exemplars.pop(random.randrange(len(valid_exemplars))) 145 | ] 146 | word_counter += exemplars_in_split[-1].n_words 147 | seg_counter += len(exemplars_in_split[-1].transcript_file.segments) 148 | 149 | new_corpus = corpus( 150 | { 151 | "location": self.location, 152 | "exemplars": exemplars_in_split, 153 | } 154 | ) 155 | 156 | remaining_corpus = self - new_corpus 157 | remaining_corpus.location = self.location 158 | 159 | return remaining_corpus, new_corpus 160 | 161 | def log(self): 162 | """ 163 | Log 
what each hashed example contains 164 | """ 165 | return { 166 | _.hash(): { 167 | "audio_file": _.audio_file.location, 168 | "audio_file_hash": _.audio_file.hash(), 169 | "transcript_file": _.transcript_file.location, 170 | "transcript_file_hash": _.transcript_file.hash(), 171 | } 172 | for _ in self.exemplars 173 | } 174 | 175 | def calculate_number_of_segments(self): 176 | """ 177 | Calculate how many segments are in this corpus 178 | """ 179 | return sum(len(eg.transcript_file.segments) for eg in self.exemplars) 180 | 181 | def prepare_for_training(self, target=None, nested=False, sample_rate=16000): 182 | """ 183 | Run validation and audio file preparation steps 184 | """ 185 | 186 | # write corpus back in place if no target 187 | target = self.location if target is None else target 188 | 189 | executor = ThreadPoolExecutor() 190 | 191 | # process audio files concurrently for speed 192 | futures = [ 193 | executor.submit( 194 | partial( 195 | _.prepare_for_training, 196 | target=target, 197 | sample_rate=sample_rate, 198 | nested=nested, 199 | ) 200 | ) 201 | for _ in self.exemplars 202 | ] 203 | 204 | # trigger conversion and gather results 205 | new_exemplars = [future.result() for future in tqdm(futures)] 206 | 207 | new_corpus = corpus( 208 | { 209 | "location": target, 210 | "exemplars": [eg for eg in new_exemplars if eg is not None], 211 | } 212 | ) 213 | new_corpus.validate() 214 | return new_corpus.log() 215 | 216 | def __add__(self, other): 217 | """ Allow addition of corpora via + operator """ 218 | return corpus({"location": None, "exemplars": self.exemplars + other.exemplars}) 219 | 220 | def __iadd__(self, other): 221 | """ Allow addition of corpora via += operator """ 222 | self.exemplars = self.exemplars + other.exemplars 223 | return self 224 | 225 | def __sub__(self, other): 226 | """ Allow subtraction of corpora via - operator """ 227 | return corpus( 228 | { 229 | "location": None, 230 | "exemplars": [_ for _ in self.exemplars if _ not in
other.exemplars], 231 | } 232 | ) 233 | 234 | def __isub__(self, other): 235 | """ Allow subtraction of corpora via -= operator """ 236 | self.exemplars = [_ for _ in self.exemplars if _ not in other.exemplars] 237 | return self 238 | 239 | def __getitem__(self, given): 240 | """ Allow slicing of corpora via [] """ 241 | return corpus( 242 | { 243 | "location": self.location, 244 | "exemplars": [self.exemplars[given]] 245 | if not isinstance(given, slice) 246 | else self.exemplars[given], 247 | } 248 | ) 249 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/align_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import itertools 4 | import logging 5 | from collections import defaultdict 6 | 7 | from toolz import merge 8 | from toolz.sandbox.core import pluck 9 | 10 | from asrtoolkit.alignment.initialize_logger import initialize_logger 11 | from asrtoolkit.data_structures.segment import segment as Segment 12 | 13 | initialize_logger() 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class Extractor: 18 | def __init__(self, extractor_function): 19 | self.extractor_function = extractor_function 20 | 21 | def match_extractions(self, extract1, extract2): 22 | """ 23 | :param extract1: list of spacy/textacy Spans 24 | :param extract2: list of spacy/textacy Spans 25 | :return: 2 lists of spans, extract1_shared and extract2_shared, each with all spans in extract1 and extract2, respectively, 26 | whose text (span.text) appears in both input lists 27 | the two lists may differ in size and in order of elements contained 28 | but sorted(set(extract1_shared))==sorted(set(extract2_shared)) # where sorted uses text of each span 29 | """ 30 | shared = set(x.text for x in extract1).intersection( 31 | set(x.text for x in extract2) 32 | ) 33 | extract1_shared = [x for x in extract1 if x.text in shared] 34 | extract2_shared = [x for x in extract2
if x.text in shared] 35 | return extract1_shared, extract2_shared 36 | 37 | def shared_extractions(self, hyp, ref): 38 | """return: Tuple[List[Span], List[Span]]""" 39 | hyp_extracted = list(self.extractor_function(hyp)) 40 | ref_extracted = list(self.extractor_function(ref)) 41 | return self.match_extractions(hyp_extracted, ref_extracted) 42 | 43 | 44 | ########################################################################## 45 | 46 | 47 | # Aligner Functions 48 | def find_matched_intervals(interval, extractor): 49 | """ 50 | :param interval: Tuple(Tuple(int,int),Tuple(int,int)) 51 | tuple with 2 tuples 52 | inner tuples consist of 2 integers--> the first and last token index of a segment from hyp and ref, respectively 53 | 54 | :param extractor: Extractor object 55 | :return: List of Lists of (matched) Spans (if any) 56 | returns an empty list if no shared segments are extracted 57 | or if shared extractions are not matched 58 | """ 59 | 60 | matched_list = [] 61 | 62 | # If either is a token, do not try to look for matches 63 | try: 64 | if not (len(interval[0]) > 1 and len(interval[1]) > 1): 65 | return matched_list 66 | # token does not support indexing 67 | except TypeError as exc: 68 | LOGGER.info("Error with %s: %s", interval, exc) 69 | return matched_list 70 | 71 | try: 72 | shared_extractions = extractor.shared_extractions(interval[0], interval[1]) 73 | except TypeError as exc: 74 | LOGGER.info("Error with %s: %s", interval, exc) 75 | return matched_list 76 | 77 | if not shared_extractions[0] or not shared_extractions[1]: 78 | return matched_list 79 | else: 80 | gk_shared, ref_shared = shared_extractions 81 | matched_list = sequence_match(gk_shared, ref_shared) 82 | return matched_list 83 | 84 | 85 | def is_sorted(token_matches): 86 | # verify in order 87 | hyp_tokens = list(pluck(0, token_matches)) 88 | ref_tokens = list(pluck(1, token_matches)) 89 | return sorted(hyp_tokens) == hyp_tokens and sorted(ref_tokens) == ref_tokens 90 | 91 | 92 | def overlap_tokens(doc, other_doc):
93 | """ 94 | Get the tokens from the original Doc that are also in the comparison Doc. 95 | """ 96 | overlap = [] 97 | other_tokens = [token.text for token in other_doc] 98 | for token in doc: 99 | if token.text in other_tokens: 100 | other_id = other_tokens.index(token.text) 101 | overlap.append([token, other_doc[other_id]]) 102 | return overlap 103 | 104 | 105 | def sequence_match(gk_shared, stm_shared): # -> List[Tuple[Span, Span]] 106 | """ 107 | :param: gk_shared: list of Spans 108 | :param: stm_shared: list of Spans 109 | Spans are obtained by applying an extractor to aligned segments of text from gk_doc and stm_doc 110 | and filtering results so each list only contains Spans with text present in at least one Span in the other segment 111 | - thus sorted(list(set([span.text for span in {doc}_shared])))-> same for doc=gk and doc=stm 112 | but the order and number of Spans in gk_shared and stm_shared may (and likely do) differ 113 | 114 | :return: list of tuples of spans -> List[Tuple[Span, Span]] 115 | each tuple contains a matched pair from (gk_doc/segment, stm_doc/segment) 116 | """ 117 | gk_text = [left.text for left in gk_shared] 118 | stm_text = [left.text for left in stm_shared] 119 | 120 | # Sequence Search 121 | matches_list = [] 122 | still_left = [] 123 | gk_taken_list = [] 124 | 125 | for stm_item, stm_word in zip(stm_shared, stm_text): 126 | # gk_text: list of potential match spans 127 | if stm_word in gk_text: 128 | # get gk text match index 129 | match_idx = gk_text.index(stm_word) 130 | 131 | # use to get spacy item match -> gk_left = gk_shared 132 | # indexing into spacy objects list 133 | gk_item = gk_shared[match_idx] 134 | 135 | if not matches_list: # first match 136 | matches_list.append([gk_item, stm_item]) 137 | gk_taken_list.append(gk_item) 138 | 139 | elif gk_item not in gk_taken_list: # not already matched 140 | prev_gk, prev_stm = matches_list[-1] # get previous match 141 | 142 | # add to matches if token position of current matched 
gk & stm spans later than last 143 | if ( 144 | prev_gk[0].idx < gk_item[0].idx 145 | and prev_stm[0].idx < stm_item[0].idx 146 | ): 147 | matches_list.append([gk_item, stm_item]) 148 | gk_taken_list.append(gk_item) 149 | 150 | else: 151 | # remove, add to still_left 152 | popped = matches_list.pop() 153 | still_left.append(popped) 154 | return matches_list 155 | 156 | 157 | # Helper Functions 158 | def flatten(embedded_iterable): 159 | return itertools.chain.from_iterable(embedded_iterable) 160 | 161 | 162 | def merge_paired_dicts(dicts1, dicts2): 163 | """ 164 | dicts1, dicts2: list of dicts of equal length -> List[Dict], List[Dict] 165 | :return: list with the same number of dicts as input dicts; 166 | dicts at the same index will be merged 167 | -> equivalent to dict1.update(dict2) for dicts at same index in dicts1 and dicts2 168 | all *unique* keys (& respective values) are retained 169 | if merged dicts contain the same key, the value from dict2 wins (later dicts take precedence in toolz.merge) 170 | 171 | merge is applied to the pair of dicts at each index 172 | it can NOT be passed two lists of dicts directly 173 | """ 174 | return [merge(d1, d2) for d1, d2 in zip(dicts1, dicts2)] 175 | 176 | 177 | def stm_label(gender): 178 | return f"<o,f0,{gender}>" # STM label, e.g. "<o,f0,male>" 179 | 180 | 181 | def word_lattice_to_lines(word_lattice, MAX_DURATION=15): 182 | """ 183 | :param word_lattice: List[Dict] 184 | dicts contain word-level information from transcribed document (start/end time) 185 | and reference document (filename, speaker, gender) 186 | 187 | # from tests.py : 10 tokens, one...
nine; same for both (full match) 188 | lattice = merge_dicts(expected_gk_processed_output, expected_earnings_reference_output) 189 | 190 | :return: dict with keys: ints (stm line) 191 | values: Dict-word level information; all dicts with words, when combined are under 192 | MAX_DURATION and consist of a single speaker 193 | """ 194 | line_dict = defaultdict(list) 195 | line_no = 0 196 | start_time, speaker_id = word_lattice[0]["start"], word_lattice[0]["speaker"] 197 | line_dict[line_no].append(word_lattice[0]) 198 | for word_dict in word_lattice[1:]: 199 | if ( 200 | word_dict["speaker"] != speaker_id 201 | or word_dict["end"] - start_time > MAX_DURATION 202 | ): 203 | # start new line with new speaker and start time 204 | line_no += 1 205 | speaker_id, start_time = word_dict["speaker"], word_dict["start"] 206 | line_dict[line_no].append(word_dict) 207 | else: 208 | line_dict[line_no].append(word_dict) 209 | return line_dict 210 | 211 | 212 | def dict_to_segments(line_dict, doc, token_idx="token_idx"): 213 | """Dict; key=int (stm line number); value: list of Dicts-> all word_dicts for tokens in line""" 214 | segments = [] 215 | for group_key, group in line_dict.items(): 216 | d = group[0] 217 | 218 | first_token_idx, last_token_idx = group[0][token_idx], group[-1][token_idx] 219 | text = doc[first_token_idx : last_token_idx + 1].text 220 | 221 | # fallback logic around speaker gender 222 | gender = d["gender"] 223 | if gender not in ["male", "female"]: 224 | gender = "male" 225 | segment = Segment( 226 | dict( 227 | channel=1, 228 | speaker=d["speaker"], 229 | start=d["start"], 230 | stop=group[-1]["end"], 231 | label=stm_label(gender), 232 | text=text, 233 | ) 234 | ) 235 | segments.append(segment) 236 | return segments 237 | -------------------------------------------------------------------------------- /asrtoolkit/alignment/aligned_doc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import 
logging 4 | from collections import defaultdict, deque 5 | 6 | import spacy 7 | from toolz import merge, merge_sorted 8 | 9 | from asrtoolkit.alignment.align_utils import find_matched_intervals, is_sorted 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | class AlignedDoc: 15 | """ 16 | An aligned document pairs two documents with the set of matched tokens 17 | and unmatched intervals between them 18 | 19 | hyp - hypothesis document; should contain token time information 20 | ref - reference document; its content is taken to be accurate 21 | 22 | """ 23 | 24 | def __init__(self, hyp, ref, extractors, num_extractions=2): 25 | self.hyp = hyp 26 | self.ref = ref 27 | self.extractors = extractors 28 | 29 | self.matched_tokens = deque() 30 | self.matched_dict = defaultdict(dict) 31 | 32 | self.matched_intervals = deque() 33 | self.unmatched_intervals = deque() 34 | 35 | self.unmatched_intervals.append((self.hyp, self.ref)) 36 | 37 | if not num_extractions: 38 | num_extractions = len(self.extractors) 39 | 40 | for extract_id in range(num_extractions): 41 | self.find_alignments(extract_id=extract_id) 42 | 43 | def add_extractors(self, extractors): 44 | """ 45 | Append a list of Extractor object(s) to self.extractors 46 | """ 47 | self.extractors.extend(extractors) 48 | 49 | def start_and_end_token(self, hyp, ref): 50 | """ 51 | Return the start and end token indices of hyp and ref, each a 52 | spacy Doc, Span or Token drawn from self.hyp and self.ref 53 | """ 54 | 55 | if isinstance(hyp, spacy.tokens.Doc) and isinstance(ref, spacy.tokens.Doc): 56 | hyp_start, hyp_end = hyp[0].i, hyp[-1].i 57 | ref_start, ref_end = ref[0].i, ref[-1].i 58 | 59 | elif isinstance(hyp, spacy.tokens.Token) and isinstance( 60 | ref, spacy.tokens.Token 61 | ): 62 | hyp_start, hyp_end = hyp.i, hyp.i 63 | ref_start, ref_end = ref.i, ref.i 64 | else: 65 | hyp_start, hyp_end = hyp.start, hyp.end 66 | ref_start, ref_end = ref.start, ref.end 67 | return hyp_start, hyp_end, ref_start, ref_end 68 | 69 | def segment_to_tokens(self, segment): 70
| """ 71 | segment: spacy Doc/Span/Token object 72 | returns list of token(s) comprising segment 73 | """ 74 | if len(segment) > 1: 75 | return [tok for tok in segment] 76 | else: 77 | return [segment] # is token 78 | 79 | def get_token_idxs(self, tokens): 80 | """tokens: List of one or more Token objects""" 81 | return [tok.i for tok in tokens] 82 | 83 | def _unique_matches(self, matched_segments): 84 | """ 85 | matched_segments: List[span,span] as for matches_list 86 | :return: each paired 87 | """ 88 | assert isinstance(matched_segments, list) 89 | unique_new_spans = [ 90 | match for match in matched_segments if match not in self.matched_intervals 91 | ] 92 | LOGGER.debug( 93 | "Number of non-unique matched_segments: %s", 94 | len(matched_segments) - len(unique_new_spans), 95 | ) 96 | return unique_new_spans 97 | 98 | def find_unmatched_segments(self, interval, matched_segments): 99 | """ 100 | :param interval: doc/span -> unmatched_interval from which matched_segments were obtained (via extraction + sequence_matching) 101 | :param matched_segments: List[List[Span, Span]] -> List of Lists; inner Lists containing matching Spans from hyp and ref respectively; Spans are subsequences of text comprising hyp_segment and ref_segment of interval 102 | :return: List of List of Spans of Sequences of Text from Interval that were not matched 103 | """ 104 | if not matched_segments: 105 | return 106 | 107 | unmatched_segments = [] 108 | (hyp_segment, ref_segment) = interval 109 | 110 | for i in range(len(matched_segments[:-1])): 111 | m1, m2 = matched_segments[i] 112 | m1_next, m2_next = matched_segments[i + 1] 113 | slot1 = hyp_segment[m1.end + 1 : m1_next.start] 114 | slot2 = ref_segment[m2.end + 1 : m2_next.end] 115 | indices = [slot1, slot2] 116 | LOGGER.debug(i, ":", indices) 117 | 118 | if len(slot1) > 1 and len(slot2) > 1: 119 | unmatched_segments.append(indices) 120 | 121 | # special cases 122 | # --> segment btw interval start and first match & last match and 
interval end 123 | hyp_start, hyp_end, ref_start, ref_end = self.start_and_end_token( 124 | hyp=hyp_segment, ref=ref_segment 125 | ) 126 | 127 | m1_first, m2_first = matched_segments[0] 128 | if m1_first.start != hyp_start and m2_first.start != ref_start: 129 | slot1_first = hyp_segment[: m1_first.start] 130 | slot2_first = ref_segment[: m2_first.start] 131 | indices = [slot1_first, slot2_first] 132 | 133 | if len(slot1_first) > 1 and len(slot2_first) > 1: 134 | unmatched_segments.insert(0, indices) 135 | 136 | m1_last, m2_last = matched_segments[-1] 137 | if m1_last.end != hyp_end and m2_last.end != ref_end: 138 | slot1_last = hyp_segment[m1_last.end + 1 :] 139 | slot2_last = ref_segment[m2_last.end + 1 :] 140 | 141 | if len(slot1_last) > 1 and len(slot2_last) > 1: 142 | unmatched_segments.append([slot1_last, slot2_last]) 143 | 144 | return unmatched_segments 145 | 146 | def find_alignments(self, extract_id): 147 | all_matched_segments, all_unmatched_segments = [], [] 148 | 149 | for interval in self.unmatched_intervals: 150 | matched_segments = find_matched_intervals( 151 | interval=interval, extractor=self.extractors[extract_id] 152 | ) 153 | 154 | if matched_segments: 155 | all_matched_segments.extend(matched_segments) 156 | 157 | unmatched_segments = self.find_unmatched_segments( 158 | interval=interval, matched_segments=matched_segments 159 | ) 160 | # find_unmatched_segments may return nothing 161 | for segment in unmatched_segments or []: 162 | all_unmatched_segments.append(segment) 163 | 164 | else: 165 | # keep the whole (hyp, ref) interval intact for the next pass 166 | all_unmatched_segments.append(interval) 167 | 168 | 169 | if all_matched_segments: 170 | # unmatched segments 171 | self.unmatched_intervals.clear() 172 | self.unmatched_intervals.extend(all_unmatched_segments) 173 | # matches 174 | self.matched_intervals.extend(all_matched_segments) 175 | 176 | # dict matches 177 | self.matched_dict[extract_id] = all_matched_segments 178 | 179 | def get_token_matches(self): 180 | """
181 | Retrieves all matched spans across extraction passes, 182 | verifies each paired span contains the same text, then 183 | decomposes pairs into token-level matches, dropping duplicates 184 | 185 | :return: List[Tuple[int,int]] 186 | ints -> token positions in the respective docs: 187 | hyp[hyp_token].i, ref[ref_token].i 188 | """ 189 | 190 | # merge the matched spans from every extraction pass, preserving order 191 | matched_spans = list( 192 | merge_sorted(*(self.matched_dict[key] for key in sorted(self.matched_dict))) 193 | ) 194 | 195 | 196 | token_matches = [] 197 | for hyp_match, ref_match in matched_spans: 198 | assert hyp_match.text == ref_match.text 199 | m1_tokens = self.get_token_idxs(hyp_match) 200 | m2_tokens = self.get_token_idxs(ref_match) 201 | for hyp_tok, ref_tok in zip(m1_tokens, m2_tokens): 202 | matched = (hyp_tok, ref_tok) 203 | if matched not in token_matches and is_sorted( 204 | token_matches + [matched] 205 | ): 206 | token_matches.append(matched) 207 | return token_matches 208 | 209 | def unique_token_matches(self): 210 | """ 211 | Converts token_matches (list of tuples of aligned token ids) to a dict, 212 | keeping for each hyp token the closest matched ref token, so that 213 | every hyp token maps to exactly one ref token 214 | """ 215 | token_mapper = {} 216 | token_matches = self.get_token_matches() 217 | 218 | for tok1, tok2 in sorted(token_matches): 219 | if tok1 not in token_mapper or abs(tok1 - token_mapper[tok1]) > abs( 220 | tok1 - tok2 221 | ): 222 | token_mapper[tok1] = tok2 223 | return token_mapper 224 | 225 | def get_token_metadata(self): 226 | """ 227 | given Tuples of matched token ids from hyp and ref -> self.get_token_matches() 228 | merges user_data corresponding to the hyp token (start_time/duration/end_time) 229 | and ref token of each pair (filename/speaker/gender) [required for STM files] 230 | :return: list of dicts with word-level information (merged from hyp and ref) 231 | """ 232 | token_mapper =
self.unique_token_matches() 233 | 234 | return [ 235 | merge(self.hyp.user_data[d1_tok], self.ref.user_data[d2_tok]) 236 | for d1_tok, d2_tok in token_mapper.items() 237 | ] 238 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2013-2018 Ben Lambert 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS 13 | 14 | # Pickle collected data for later comparisons. 15 | persistent=yes 16 | 17 | # List of plugins (as comma separated values of python modules names) to load, 18 | # usually to register additional checkers. 19 | load-plugins= 20 | 21 | # Use multiple processes to speed up Pylint. 22 | jobs=1 23 | 24 | # Allow loading of arbitrary C extensions. Extensions are imported into the 25 | # active Python interpreter and may run arbitrary code. 26 | unsafe-load-any-extension=no 27 | 28 | # A comma-separated list of package or module names from where C extensions may 29 | # be loaded. 
Extensions are loading into the active Python interpreter and may 30 | # run arbitrary code 31 | extension-pkg-whitelist= 32 | 33 | 34 | [MESSAGES CONTROL] 35 | 36 | # Only show warnings with the listed confidence levels. Leave empty to show 37 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 38 | confidence= 39 | 40 | # Enable the message, report, category or checker with the given id(s). You can 41 | # either give multiple identifier separated by comma (,) or put this option 42 | # multiple time. See also the "--disable" option for examples. 43 | #enable= 44 | 45 | # Disable the message, report, category or checker with the given id(s). You 46 | # can either give multiple identifiers separated by comma (,) or put this 47 | # option multiple times (only on the command line, not in the configuration 48 | # file where it should appear only once).You can also use "--disable=all" to 49 | # disable everything first and then reenable specific checks. For example, if 50 | # you want to run only the similarities checker, you can use "--disable=all 51 | # --enable=similarities". If you want to run only the classes checker, but have 52 | # no Warning level messages displayed, use"--disable=all --enable=classes 53 | # --disable=W" 54 | 55 | disable= 56 | attribute-defined-outside-init, 57 | duplicate-code, 58 | fixme, 59 | invalid-name, 60 | missing-docstring, 61 | protected-access, 62 | too-few-public-methods, 63 | # handled by black 64 | format 65 | 66 | 67 | [REPORTS] 68 | 69 | # Set the output format. Available formats are text, parseable, colorized, msvs 70 | # (visual studio) and html. You can also give a reporter class, eg 71 | # mypackage.mymodule.MyReporterClass. 72 | output-format=text 73 | 74 | # Put messages in a separate file for each module / package specified on the 75 | # command line instead of printing them on stdout. Reports (if any) will be 76 | # written in a file name "pylint_global.[txt|html]". 
77 | files-output=no 78 | 79 | # Tells whether to display a full report or only the messages 80 | reports=no 81 | 82 | # Python expression which should return a note less than 10 (10 is the highest 83 | # note). You have access to the variables errors warning, statement which 84 | # respectively contain the number of errors / warnings messages and the total 85 | # number of statements analyzed. This is used by the global evaluation report 86 | # (RP0004). 87 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 88 | 89 | # Template used to display messages. This is a python new-style format string 90 | # used to format the message information. See doc for all details 91 | #msg-template= 92 | 93 | 94 | [LOGGING] 95 | 96 | # Logging modules to check that the string format arguments are in logging 97 | # function parameter format 98 | logging-modules=logging 99 | 100 | 101 | [MISCELLANEOUS] 102 | 103 | # List of note tags to take in consideration, separated by a comma. 104 | notes=FIXME,XXX,TODO 105 | 106 | 107 | [SIMILARITIES] 108 | 109 | # Minimum lines number of a similarity. 110 | min-similarity-lines=4 111 | 112 | # Ignore comments when computing similarities. 113 | ignore-comments=yes 114 | 115 | # Ignore docstrings when computing similarities. 116 | ignore-docstrings=yes 117 | 118 | # Ignore imports when computing similarities. 119 | ignore-imports=no 120 | 121 | 122 | [VARIABLES] 123 | 124 | # Tells whether we should check for unused import in __init__ files. 125 | init-import=no 126 | 127 | # A regular expression matching the name of dummy variables (i.e. expectedly 128 | # not used). 129 | dummy-variables-rgx=_$|dummy 130 | 131 | # List of additional names supposed to be defined in builtins. Remember that 132 | # you should avoid defining new builtins when possible. 133 | additional-builtins= 134 | 135 | # List of strings which can identify a callback function by name. 
A callback 136 | # name must start or end with one of those strings. 137 | callbacks=cb_,_cb 138 | 139 | 140 | [FORMAT] 141 | 142 | # Maximum number of characters on a single line. 143 | max-line-length=120 144 | 145 | # Regexp for a line that is allowed to be longer than the limit. 146 | ignore-long-lines=^\s*(# )??$ 147 | 148 | # Allow the body of an if to be on the same line as the test if there is no 149 | # else. 150 | single-line-if-stmt=no 151 | 152 | # List of optional constructs for which whitespace checking is disabled 153 | no-space-check=trailing-comma,dict-separator 154 | 155 | # Maximum number of lines in a module 156 | max-module-lines=2000 157 | 158 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 159 | # tab). 160 | indent-string=' ' 161 | 162 | # Number of spaces of indent required inside a hanging or continued line. 163 | indent-after-paren=4 164 | 165 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 166 | expected-line-ending-format= 167 | 168 | 169 | [BASIC] 170 | 171 | # List of builtins function names that should not be used, separated by a comma 172 | bad-functions=map,filter,input 173 | 174 | # Good variable names which should always be accepted, separated by a comma 175 | good-names=i,j,k,ex,Run,_ 176 | 177 | # Bad variable names which should always be refused, separated by a comma 178 | bad-names=foo,bar,baz,toto,tutu,tata 179 | 180 | # Colon-delimited sets of names that determine each other's naming style when 181 | # the name regexes allow several styles. 
182 | name-group= 183 | 184 | # Include a hint for the correct naming format with invalid-name 185 | include-naming-hint=no 186 | 187 | # Regular expression matching correct function names 188 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 189 | 190 | # Naming hint for function names 191 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 192 | 193 | # Regular expression matching correct variable names 194 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 195 | 196 | # Naming hint for variable names 197 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 198 | 199 | # Regular expression matching correct constant names 200 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 201 | 202 | # Naming hint for constant names 203 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 204 | 205 | # Regular expression matching correct attribute names 206 | attr-rgx=[a-z_][a-z0-9_]{2,}$ 207 | 208 | # Naming hint for attribute names 209 | attr-name-hint=[a-z_][a-z0-9_]{2,}$ 210 | 211 | # Regular expression matching correct argument names 212 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 213 | 214 | # Naming hint for argument names 215 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 216 | 217 | # Regular expression matching correct class attribute names 218 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 219 | 220 | # Naming hint for class attribute names 221 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 222 | 223 | # Regular expression matching correct inline iteration names 224 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 225 | 226 | # Naming hint for inline iteration names 227 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 228 | 229 | # Regular expression matching correct class names 230 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 231 | 232 | # Naming hint for class names 233 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 234 | 235 | # Regular expression matching correct module names 236 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 237 | 238 | # Naming hint for module names 239 | 
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 240 | 241 | # Regular expression matching correct method names 242 | method-rgx=[a-z_][a-z0-9_]{2,}$ 243 | 244 | # Naming hint for method names 245 | method-name-hint=[a-z_][a-z0-9_]{2,}$ 246 | 247 | # Regular expression which should only match function or class names that do 248 | # not require a docstring. 249 | no-docstring-rgx=__.*__ 250 | 251 | # Minimum line length for functions/classes that require docstrings, shorter 252 | # ones are exempt. 253 | docstring-min-length=-1 254 | 255 | # List of decorators that define properties, such as abc.abstractproperty. 256 | property-classes=abc.abstractproperty 257 | 258 | 259 | [TYPECHECK] 260 | 261 | # Tells whether missing members accessed in mixin class should be ignored. A 262 | # mixin class is detected if its name ends with "mixin" (case insensitive). 263 | ignore-mixin-members=yes 264 | 265 | # List of module names for which member attributes should not be checked 266 | # (useful for modules/projects where namespaces are manipulated during runtime 267 | # and thus existing member attributes cannot be deduced by static analysis 268 | ignored-modules= 269 | 270 | # List of classes names for which member attributes should not be checked 271 | # (useful for classes with attributes dynamically set). 272 | ignored-classes=SQLObject, optparse.Values, thread._local, _thread._local 273 | 274 | # List of members which are set dynamically and missed by pylint inference 275 | # system, and so shouldn't trigger E1101 when accessed. Python regular 276 | # expressions are accepted. 277 | generated-members=REQUEST,acl_users,aq_parent 278 | 279 | # List of decorators that create context managers from functions, such as 280 | # contextlib.contextmanager. 281 | contextmanager-decorators=contextlib.contextmanager 282 | 283 | 284 | [SPELLING] 285 | 286 | # Spelling dictionary name. Available dictionaries: none. To make it working 287 | # install python-enchant package. 
288 | spelling-dict= 289 | 290 | # List of comma separated words that should not be checked. 291 | spelling-ignore-words= 292 | 293 | # A path to a file that contains private dictionary; one word per line. 294 | spelling-private-dict-file= 295 | 296 | # Tells whether to store unknown words to indicated private dictionary in 297 | # --spelling-private-dict-file option instead of raising a message. 298 | spelling-store-unknown-words=no 299 | 300 | 301 | [DESIGN] 302 | 303 | # Maximum number of arguments for function / method 304 | max-args=10 305 | 306 | # Argument names that match this expression will be ignored. Default to name 307 | # with leading underscore 308 | ignored-argument-names=_.* 309 | 310 | # Maximum number of locals for function / method body 311 | max-locals=25 312 | 313 | # Maximum number of return / yield for function / method body 314 | max-returns=11 315 | 316 | # Maximum number of branch for function / method body 317 | max-branches=26 318 | 319 | # Maximum number of statements in function / method body 320 | max-statements=100 321 | 322 | # Maximum number of parents for a class (see R0901). 323 | max-parents=7 324 | 325 | # Maximum number of attributes for a class (see R0902). 326 | max-attributes=11 327 | 328 | # Minimum number of public methods for a class (see R0903). 329 | min-public-methods=2 330 | 331 | # Maximum number of public methods for a class (see R0904). 332 | max-public-methods=25 333 | 334 | 335 | [CLASSES] 336 | 337 | # List of method names used to declare (i.e. assign) instance attributes. 338 | defining-attr-methods=__init__,__new__,setUp 339 | 340 | # List of valid names for the first argument in a class method. 341 | valid-classmethod-first-arg=cls 342 | 343 | # List of valid names for the first argument in a metaclass class method. 344 | valid-metaclass-classmethod-first-arg=mcs 345 | 346 | # List of member names, which should be excluded from the protected access 347 | # warning. 
348 | exclude-protected=_asdict,_fields,_replace,_source,_make 349 | 350 | 351 | [IMPORTS] 352 | 353 | # Deprecated modules which should not be used, separated by a comma 354 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 355 | 356 | # Create a graph of every (i.e. internal and external) dependencies in the 357 | # given file (report RP0402 must not be disabled) 358 | import-graph= 359 | 360 | # Create a graph of external dependencies in the given file (report RP0402 must 361 | # not be disabled) 362 | ext-import-graph= 363 | 364 | # Create a graph of internal dependencies in the given file (report RP0402 must 365 | # not be disabled) 366 | int-import-graph= 367 | 368 | 369 | [EXCEPTIONS] 370 | 371 | # Exceptions that will emit a warning when being caught. Defaults to 372 | # "Exception" 373 | overgeneral-exceptions=Exception 374 | -------------------------------------------------------------------------------- /samples/BillGatesTEDTalk_intentionally_poor_transcription.txt: -------------------------------------------------------------------------------- 1 | i'm going to talk today about n r g and climate 2 | and that might seem a bit surprising because my full time work at the foundation is mostly about vaccines and seeds about the things that we need to invent and deliver to help the poorest two bellion live better lives 3 | but n r g and climate are extremely important to these people in fact more important than to anyone else on the planet 4 | the climate getting worse means that many years their crops won't grow there will be too much rain not enough rain 5 | things will change in ways that their fragile environment simply can't support and that leads to starvation it leads to uncertainty it leads to unrest sew the the climate changes will be terrible for them 6 | alsew the price of n r g is very important to them in fact if you could pick just one thing to lower the price of to reduce poverty by far you would pick n r g now the price of n r g has come down 
over time 7 | uh really advanced civilization is based on advances in n r g the coal revolution fueled 8 | the industrial revolution and even in the nineteen hundreds we've seen a very rapid decline in the price of electricity and that's why we have refrigerators air conditioning we can make uh modern materials and do sew many things 9 | and sew we're in a wonderful situation with uh electricity in the rich world 10 | but as we make it cheaper and let's say let's go for making it twice as cheap we need to meet a new constraint 11 | and that constraint has to do with co two co two is warming the planet 12 | and the equation on co two is actually a very straightforward one if you sum up the co two that gets emitted 13 | that leads to a temperature increase and that temperature increase leads to some very negative effects the effects on the weather 14 | and perhaps worse the indirect effects in that the natural ecosystems can't adjust to these rapid changes and sew you get ecosystem collapses 15 | now the exact amount of how you map from a a certain increase of co two 16 | to what temperature will be and where the positive feedbacks are there's some uncertainty there but not very much and there's certainly uncertainty about how bad those effects will be but they will be extremely bad 17 | i asked the top scientists on this several times do we really have to get down to near zero can't we just you know cut it in half or a quarter and the answer is that until we get near to zero 18 | the temperature will continue to rise and sew that's that's a big challenge it's very different than saying you know we're a twelve feet high truck trying to get under a ten feet bridge and we can just sort of squeeze under this is something that has to get to zero 19 | now we put out a lot of carbon dioxide every year over twenty six bellion tons for each american 20 | it's about twenty tons uh for people in poor countries it's less than one ton it's an average of about five tons for 
everyone on the planet and somehow we have to make changes that will bring that down to zero 21 | it's been constantly going up it's only various economic changes that have even flattened it at all sew we have to go from rapidly rising to falling and falling all the way to zero this equation has four factors 22 | a little bit of multiplication sew you've got a thing on the left co two that you want to get to zero and that's going to be based on the number of people 23 | the services each person's using on average the n r g on average for each service and the co two being put out 24 | per unit of n r g sew let's look at each one of these and see how we can get this down to zero uh probably one of these numbers is going to have to get pretty near to zero now 25 | and that's back from high school algebra but let's let's take a look first we've got population 26 | now the world today has six point eight bellion people that's headed up to about nine billion now if we do a really great job on new vaccines health care 27 | reproductive health services we could lower that by perhaps ten or fifteen percent but there we see an increase of about one point three 28 | the second factor is the services we use this encompasses everything the food we eat 29 | clothing tv heating these are very good things and getting rid of poverty means providing these services to almost everyone on the planet and it's a great thing 30 | for this number to go up in the rich world perhaps the top one bellion we probably could cut back and use less but every year 31 | this number on average is going to go up and sew over all that will more than double the services delivered per person 32 | here we have a very basic service do you have lighting in your house to be able to read your homework and in fact these kids don't sew they're going out and reading their school work under the street lamps 33 | now efficiency e the n r g for each service here finally we have some good news we have something 
that's not going up through various inventions and new ways of doing lighting 34 | through different types of cars different ways of building buildings there are a lot of services where you can bring the n r g for that service down quite substantially 35 | some individual services even bring it down by ninety percent there are other services like how we make fertilizer or how we do air transport where the rooms for improvement are far far less 36 | and sew overall here if we're optimistic we may get a reduction of a a factor of three to even perhaps a factor of six 37 | but for these first three factors now we've gone from twenty six bellion to at best maybe thirteen billion tons and that just won't cut it sew let's look at this fourth factor this is going to be a key one 38 | and this is the amount of co two put out per each unit of n r g and sew the question is can you actually get that to zero if you burn coal 39 | no if you burn natural gas no almost every way we make electricity today except for the emerging renewables and nuclear puts out co two and sew what we're going to have to do at a global scale 40 | is create a new system and sew we need n r g miracles now when i use the term miracle i don't mean something that's impossible 41 | the microprocessor is a miracle the personal computer is a miracle the internet and its services are a miracle sew the people here have participated in the creation of many miracles usually we don't have a deadline 42 | where you have to get the miracle by a certain date usually you just kind of stand by and some come along some don't this is a case where we actually have to drive it full speed and get a miracle in a a pretty tight time line 43 | now i thought how could i really capture this is there some kind of natural illustration some demonstration that would grab people's imagination here 44 | i thought back to a year ago when i brought mosquitos and somehow people enjoyed that 45 | it it really got them involved in the 
idea of you know there are people who live with mosquitos sew with n r g all i could come up with is this i decided that releasing fireflies 46 | would be my contribution to the environment here this year sew here we have some natural fireflies i'm told they don't bite in fact they might not even not even leave that jar 47 | now there's all sorts gimmicky solutions like that one but they don't really add up too much we need solutions either one or several that have 48 | unbelievable scale and unbelievable reliability and although there's many directions of people seeking i really only see five that 49 | can achieve the big numbers i've left out tide geothermal fusion biofuels those may make some contribution and if they can do better than i expect sew much the better but my key point here 50 | is that we're going to have to work on each of these five and we can't give up any of them because they they look daunting because they all 51 | have significant challenges let's look first at the burning fossil fuels either burning coal or burning natural gas what you need to do there 52 | seems like it might be simple but it's not and that's to take all the co two after you've burned it going out the flue pressurize it create a liquid put it somewhere and hope it stays there 53 | now we have some pilot things that do this at the sixty to eighty percent level but getting up to that full percentage that will be very tricky and agreeing on where these co two quantities should be put 54 | will be hard but the toughest one here is this long term issue who's going to be sure who's going to guarantee 55 | something that is literally bellions of times larger than any type of waste you think of in in terms of nuclear or other things this is a lot of volume 56 | sew that's a tough one next would be nuclear it alsew has three big problems cost particularly in highly regulated countries is high 57 | the issue of the safety really feeling good about nothing could go wrong that even 
though you have these human operators that the fuel 58 | doesn't get used for weapons and then what do you do with the waste and although it's not very large there are a lot of concerns about that people need to feel good about it sew three very tough problems that might be solvable and sew should be worked on 59 | the last three of the five i've grouped together these are what people often refer to as the renewable sources and they actually 60 | although it's great they don't require fuel they have some disadvantages one is that the density 61 | of n r g gathered in these technologies is dramatically less than a power plant this is n r g farming 62 | sew you're talking about many square miles thousands of time more area than than you think of as a normal n r g plant alsew these are intermittent sources the sun doesn't shine 63 | all day it doesn't shine every day and likewise the wind doesn't blow all the time and sew if you depend on these sources 64 | you have to have some way of getting the n r g during those time periods that it's not available sew we've got big cost challenges here 65 | we have transmission challenges for example say this n r g source is outside your country you not only need the technology but you have to deal with the risk of the n r g coming from elsewhere and finally this storage problem 66 | and to dimensionalize this i went through and looked at all the types of batteries that get made for cars for computers for phones for flashlights for everything and compared that 67 | to the amount of electrical n r g the world uses and what i found is that all the batteries we make now could store less than ten minutes 68 | of all the n r g and sew in fact we need a big breakthrough here something that's going to be 69 | a factor of a hundred better than the approaches we have now it's not it's not impossible but it's not a very easy thing now this shows up when you try to get the the intermittent source to be above say twenty to thirty percent 70 
| of what you're using if you're counting on it for a hundred percent you need a an incredible miracle battery 71 | now how we're going to go forward on this what's what's the right approach is it a manhattan project what's the the thing that can get us there well we need lots 72 | of companies working on this hundreds in each of these five paths we need at least a hundred people and a lot of them you'll look at and say they're crazy that's good 73 | and i think here in the ted group we have many people who are already pursuing this 74 | bell gross has several companies including one called esolar that has some great solar thermal technologies vinod khosla's investing in dozens of companies that are doing great things and have interesting possibilities and i'm i'm trying to help back that nathan myhrvold and i actually are backing a company 75 | that perhaps surprisingly is actually taking the nuclear approach there are some innovations in nuclear modular liquid 76 | and innovation really stopped in this industry quite some time ago sew the idea that there's some good ideas laying around is not all that surprising the idea of terrapower 77 | is that instead of burning a part of uranium the one percent which is the u two thirty five we decided let's burn the ninety nine percent 78 | the u two thirty eight it is kind of a crazy idea in fact people had talked about it for a long time 79 | but they could never simulate properly whether it would work or not and sew it's through the advent of modern supercomputers that now you can simulate and see that yes with the right 80 | material's approach this looks like it would work and because you're burning that ninety nine percent you have 81 | greatly improved cost profile you actually burn up the waste and you can actually use as fuel all the leftover waste from today's reactors sew instead of worrying about them you just take that it's a great thing 82 | it breathes this uranium as it goes along sew it's kind of like a 
candle you can see it's it's a log there often referred to as a traveling wave reactor in terms of fuel 83 | this really solves the problem i've got a picture here of a place in kentucky this is the left over the ninety nine percent where they've taken out the part they burn now sew it's called depleted uranium that would power the u s 84 | for hundreds of years and simply by filtering sea water in an inexpensive process you'd have enough fuel for the entire lifetime of the rest of the planet so 85 | you know it's got lots lots of challenges ahead but it is an example of the many hundreds and hundreds of ideas that we need to move forward 86 | sew let's think how should we measure ourselves what should our report card look like well let's go out to where we really need to get and and then look at the intermediate for twenty fifty you've heard many people talk about 87 | this eighty percent reduction that really is very important that we get there and that twenty percent will be used up by 88 | things going on in poor countries still some agriculture hopefully we will have cleaned up forestry cement sew to get to that eighty percent the developed countries 89 | including countries like china will have had to switch their electric electricity generation altogether sew the other grade is are we deploying 90 | this zero emission technology have we deployed it in all the developed countries and we're in the process of of getting it elsewhere that's super important 91 | that's a key element of making that report card sew backing up from there what should the twenty twenty report card look like 92 | well it again it should have the two elements we should go through these efficiency measures to start getting reductions the less we emit the less that sum will be of co two and therefore the less the temperature but in some ways the grade we get there 93 | doing things that don't get us all the way to the big reductions is only equally or maybe even slightly less important 
than the other which is the piece of innovation 94 | on these breakthroughs these breakthroughs we need to move those at full speed and we can measure that in terms of companies pilot projects regulatory things that have been changed 95 | there's a lot of great books that have been written about this the al gore book our choice and the david mckay book sustainable n r g without the hot air they really go through it and i think create a framework that this can be discussed broadly because we need broad backing for this 96 | there's a lot that has to come together sew this is a wish it's a very concrete wish that 97 | we invent this technology if you gave me only one wish for the next fifty years i could pick who's president i could pick a vaccine 98 | which is something i love or i could pick that this thing that's half the cost with no co two gets invented 99 | this is the wish i would pick this is the one with the greatest impact if we don't get this wish the division between the people who think short term and long term will be terrible 100 | between u s and china between poor countries and rich and most of all the lives of those two bellion will be far worse sew what do we have to do what am i appealing to you to step forward and 101 | and drive we need to go for more research funding when countries get together in places like copenhagen they shouldn't 102 | just discuss the co two they should discuss this innovation agenda and you'd be stunned at the ridiculously low levels of spending 103 | on these innovative approaches we do need the market incentives co two tax cap and trade something that gets that price signal out there 104 | we need to get the message out we need to have this dialogue be a more rational more understandable dialogue including the steps that the government takes this is an important wish but it is one i think we can achieve thank you 105 | thank you thank you 106 | thank you just sew i understand more about about terrapower right 107 | i 
mean first of all what can you give a sense of what scale of investment this is 108 | to to actually do the software buy the supercomputer hire all the great scientists which we've done that's only tens of millions and even once we test our materials out 109 | in a russian reactor to make sure that our materials work properly then you'll only be up in the hundreds of millions the tough thing is building the pilot reactor 110 | finding the several bellion finding the regulator the location that will actually build the first one of these once you get the first one built 111 | if it works as advertised then it's just clear as day because the economics the n r g density are sew different than nuclear as we know it 112 | and sew to understand it right this involves building deep into the ground almost like a vertical kind of column of nuclear fuel of this sort of spent uranium and then and then and then the process starts at the top and kind of works down 113 | that's right today you're always refueling the reactor sew you have lots of people and lots of controls that can go wrong that thing where you're opening it up and moving things in and out that's 114 | that's not good sew if you have very very cheap fuel that you can put sixty years in just think of it as a log 115 | put it down and not have those same complexities and it just sits there and burns for the sixty years and and then it's done 116 | it's a it's a nuclear power plant that is its own waste disposal solution 117 | yeah well what happens with the waste you can you can let it sit there there's a lot less waste under this approach then you can actually take that and put it into another one and burn that 118 | and and we start off actually by taking the waste that exists today that's sitting in these cooling pools or dry casking by reactors that's our fuel 119 | to begin with sew the thing that's been a problem from those reactors is actually what gets fed into ours and you're reducing the volume of the 
waste quite dramatically as you're going through this process 120 | but in your talking to different people around the world about the possibilities here where where is there most interest in actually doing something with this 121 | well we haven't uh picked a particular place uh and there's all these interesting disclosure rules about anything that's called nuclear 122 | sew we we we've got a lot of interest that people from the company have been in russia india china i've been back seeing the secretary of n r g here talking about how this 123 | fits into the the n r g agenda sew i'm optimistic you know the french and japanese have done some work this is a a variant on something that has been done it's an important 124 | advance but it's like a fast reactor and a lot of countries have built them sew anybody who's done a fast reactor is a candidate to to be where the first one gets built 125 | sew in in your mind timescale and likelihood of actually taking something like this live 126 | well we need for one of these 127 | high scale electro electro generation things that's very cheap we have twenty years to invent and then twenty years to deploy that's sort of the deadline 128 | that the environment models environmental models have have have shown us that we have to meet and you know terrapower if things go well 129 | which is wishing for a lot could easily meet that and there are fortunately now dozens of companies we need it to be hundreds 130 | who likewise if their science goes well if the funding for their pilot pilot plants goes well that they they can compete for this and it's best if multiple succeed because then you could use a mix a mix of these things we certainly need one to succeed 131 | in terms of big scale possible game changes is this the biggest that you're aware of out there 132 | an n r g breakthrough is the the most important thing it would have been even without the environmental constraint but the environmental constraint just makes it sew 
much 133 | sew much greater in the nuclear space there are other innovators you know we don't know their work as well as we know this one but 134 | the modular people that's a different approach there's a liquid type reactor which seems a little hard but maybe they say that about us 135 | and sew there there are different ones but the beauty of this is a molecule of uranium has a million times as much n r g as a molecule of say coal 136 | and sew if you can deal with the negatives which are essentially the radiation the footprint and cost the potential in terms of effect on land and various things is almost in a class of its own 137 | then what do we have to start taking emergency measures to try and keep the temperature of the earth stable 138 | yeah if you get into that situation it's like 139 | if you've 140 | you've been over eating and you're about to have a heart attack you know then then then where do you go you may need heart surgery or something there is a line of research on what's called geoengineering 141 | which are various techniques that would delay the heating to buy us twenty or thirty years to get our act together 142 | now you hope that's just an insurance policy you hope you don't need to do that some people say you shouldn't even work on the insurance policy because it might make you 143 | lazy that you'll keep eating because you know heart surgery will be there to save you i'm not sure that's wise given the importance of the problem but 144 | there's now the the geoengineering discussion about should that be in the back pocket in case things happen faster or this innovation goes a lot slower than we expect 145 | well unfortunately the skeptics come in different camps i mean the ones who make scientific arguments are very few 146 | you know are they saying there's negative feedback effects that have to do with clouds that offset things there are very very few things that they can even say you know there's a chance in a million of those things 
the main problem we have here 147 | is kind of like aids you make the mistake now and you pay for it a lot later and sew when you have all sorts of urgent problems the idea of taking pain now 148 | that has to do with a gain later and a somewhat uncertain pain thing in fact the ipcc report you know that 149 | that's not necessarily the worst case and there are people in the rich world who look at ipcc and say okay you know that that isn't that big of a deal the fact is it's that uncertain part 150 | that should move us towards this but my dream here is that if you can make it economic and meet the co two constraints then the skeptics say okay 151 | i don't care that it doesn't put out co two i kind of wish it did put out co two but i guess i'll accept it because it's cheaper 152 | than what's come before 153 | is that would be your response to the the the bjorn lomborg argument that basically if you if you spend all this n r g trying to solve the co two problem it's going to take away all your other goals of trying to rid the world of poverty and malaria and sew forth it's it's it's a stupid waste of the earth's resources to put money towards that when there are better things we can do 154 | yeah well the actual spending on the r and d piece you know say the u s should spend ten bellion a year more than it is right now it's not that dramatic it shouldn't take away from other things the thing you get into big money on 155 | and this reasonable people can disagree is when you have something that's non economic and you're trying to fund that that to me mostly is a waste unless you're very close and you're just funding the learning curve and it's going to get very cheap i believe we should try you know more 156 | things that have a potential to be far less expensive if the trade off you get into is let's make n r g super super 157 | expensive then the rich can afford that i mean all of us here could pay five times as much for our n r g and not change our lifestyle 158 
| the disaster is for that two bellion and even lomborg has changed his shtick now is why isn't the r and d 159 | getting more discussed he he's still because of his earlier stuff further associated with the skeptic camp but he's realized that's that's a pretty lonely camp and so 160 | he's making the r and d point and and sew there is a a a thread of something that i i think is appropriate the r and d piece it's crazy how little it's funded 161 | well bell i suspect i i speak on the behalf of most people here to say i really hope your wish comes true thank you sew much 162 | thank you 163 | --------------------------------------------------------------------------------