├── tests
│   ├── test_amazon.py
│   ├── test_job.py
│   ├── test_tools.py
│   └── test_helpers.py
├── transcriptor
│   ├── __init__.py
│   ├── speakers.py
│   ├── markers.py
│   ├── segments.py
│   ├── alternatives.py
│   ├── tools.py
│   ├── vocabulary.py
│   ├── helpers.py
│   ├── srt.py
│   ├── job.py
│   └── amazon.py
├── requirements.in
├── CONTRIBUTING.md
├── setup.py
├── LICENSE
├── requirements.txt
├── .gitignore
└── README.md

/tests/test_amazon.py:
--------------------------------------------------------------------------------
import pytest
--------------------------------------------------------------------------------
/transcriptor/__init__.py:
--------------------------------------------------------------------------------
from .job import Job
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
more-itertools
httpx
pytest
boto3
pytest-mock
requests-mock
pytest-cov
python-slugify
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Format

Use [Black](https://github.com/psf/black) to format your contribution, for example by running `black .` from the repository root.

# Documentation
## In Code

Apply docstrings to any new functions, classes, and methods.
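
For reference, the sketch below documents a hypothetical helper in the numpy-style format already used in `transcriptor/helpers.py`:

```python
def marker_duration(start_time: float, end_time: float) -> float:
    """Return the length of a marker in seconds.

    Parameters
    ----------
    start_time: float
        when the marker begins, in seconds
    end_time: float
        when the marker ends, in seconds

    Returns
    -------
    float
        the difference between the end and start times
    """
    return end_time - start_time
```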
--------------------------------------------------------------------------------
/transcriptor/speakers.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import typing


@dataclass
class Speaker:
    base_name: typing.Optional[str] = None  # default name
    name: typing.Optional[str] = None  # user-given name

    @property
    def label(self):
        return self.name or self.base_name
--------------------------------------------------------------------------------
/transcriptor/markers.py:
--------------------------------------------------------------------------------
from transcriptor.speakers import Speaker
from dataclasses import dataclass

import typing


@dataclass
class Marker:
    """Content should only exist when there are no segments."""

    speaker: typing.Optional[Speaker] = None
    start_time: float = 0.0
    end_time: float = 0.0
    content: typing.Optional[str] = None
--------------------------------------------------------------------------------
/transcriptor/segments.py:
--------------------------------------------------------------------------------
from .alternatives import Alternative
from .speakers import Speaker
from dataclasses import dataclass
import typing


@dataclass
class Segment:
    alternatives: typing.List[Alternative]
    start_time: typing.Optional[float] = 0.0
    end_time: typing.Optional[float] = 0.0
    speaker: typing.Optional[Speaker] = None


def gen_segments(segments):
    """Yield a Segment for each dictionary of keyword arguments."""
    for segment in segments:
        yield Segment(**segment)
--------------------------------------------------------------------------------
/transcriptor/alternatives.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import typing


@dataclass
class Alternative:
    content: str
    confidence: float
    _type: str
    start_time: typing.Optional[float] = 0.0
    end_time: typing.Optional[float] = 0.0
    tag: typing.Optional[str] = None
    # regional times for loading from text
    regional_start_time: typing.Optional[float] = None
    regional_end_time: typing.Optional[float] = None


def gen_alternatives(alternatives):
    """Yield an Alternative for each dictionary of keyword arguments."""
    for alternative in alternatives:
        yield Alternative(**alternative)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

with open("README.md") as filename:
    long_description = filename.read()


setup(
    name='transcriptor',
    version='2020.8.8',
    description='A wrapper for transcription results.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/kjaymiller/transcriptor',
    author='Jay Miller',
    author_email='kjaymiller@gmail.com',
    license='MIT',
    packages=find_packages(),
    install_requires=[
        'more-itertools',
        'httpx',
        'boto3',
        'python-slugify',
    ],
    zip_safe=False,
)
--------------------------------------------------------------------------------
/transcriptor/tools.py:
--------------------------------------------------------------------------------
import re
from datetime import timedelta


def adjust_microseconds(time_val: timedelta, separator: str = ".") -> str:
    """Take the seconds and microseconds from a timedelta and return the
    remaining seconds and milliseconds, not rounded, as a str.
    Seconds will have a leading zero."""

    seconds = time_val.seconds % 60
    milliseconds = time_val.microseconds // 1000  # truncate; do not round
    return f"{seconds:02}{separator}{milliseconds:03}"


def timedelta_to_dict(time_val: timedelta) -> dict:
    """Split a timedelta into hours, minutes, and fractional seconds."""
    hours, remainder = divmod(time_val.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return {
        "hours": hours,
        "minutes": minutes,
        "seconds": seconds + time_val.microseconds / 1_000_000,
    }


def timedelta_from_str(time_str: str) -> timedelta:
    """Given a string in %H:%M:%S,%f, return a timedelta
    >>> timedelta_from_str("01:23:34,123")
    datetime.timedelta(seconds=5014, microseconds=123000)
    """
    time_splitter = r":"
    [h, m, s] = re.split(time_splitter, time_str)
    s = s.replace(",", ".")  # srt time = H:M:S,Mil
    return timedelta(hours=int(h), minutes=int(m), seconds=float(s))
--------------------------------------------------------------------------------
/transcriptor/vocabulary.py:
--------------------------------------------------------------------------------
import typing


class Vocabulary:
    def __init__(
        self,
        phrase: str,
        sounds_like: typing.Optional[str] = '',
        display_as: typing.Optional[str] = '',
        ipa: typing.Optional[str] = '',
        abbreviation: bool = False,
    ):

        if abbreviation and "." not in phrase:
            self.phrase = ''.join([f'{x}.'.upper() for x in phrase])

        elif ' ' in phrase:
            self.phrase = '-'.join(phrase.split(' '))

        else:
            self.phrase = phrase

        self.sounds_like = sounds_like

        if abbreviation:  # F.B.I. -> FBI
            self.display_as = display_as.replace('.', '').upper()

        else:
            self.display_as = display_as

        self.ipa = ipa

    @property
    def table(self):
        return f'{self.phrase}\t{self.ipa}\t{self.sounds_like}\t{self.display_as}'
--------------------------------------------------------------------------------
/tests/test_job.py:
--------------------------------------------------------------------------------
from transcriptor.job import Job
import pytest


@pytest.fixture()
def tmp_file_data():
    return """1
00:00:00,000 --> 00:00:02,123
This, is a test.

2
00:00:03,000 --> 00:00:04,456
Another test for me."""


def test_job_from_srt_creates_markers(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newFile = Job.from_srt(test_file)
    assert len(newFile.markers) == 2


def test_job_from_srt_creates_alternatives(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newJob = Job.from_srt(test_file)
    assert len(newJob.alternatives) == 11


def test_job_punctuations(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newFile = Job.from_srt(test_file)
    punctuations = [x for x in newFile.alternatives if x._type == 'punctuation']
    assert len(punctuations) == 3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Jay Miller

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/tests/test_tools.py:
--------------------------------------------------------------------------------
import pytest
import transcriptor.tools as tools
from datetime import timedelta


@pytest.mark.parametrize('second_val, result', [
    (61.1, "01.100"),  # tests leading zeroes
    (130.1000001, "10.100"),  # tests that milliseconds are returned, not microseconds
    (36001.123567, "01.123"),  # it does not round
])
def test_adjust_microseconds_returns_seconds_and_milliseconds(second_val: float,
                                                              result: str):
    """Tests that, when passed a timedelta, it returns the remaining seconds and
    milliseconds as a string"""

    seconds = timedelta(seconds=second_val)
    assert tools.adjust_microseconds(seconds) == result


@pytest.mark.parametrize('time_val', ["01:23:34,123", "01:23:34.123"])
def test_timedelta_from_str_accepts_comma_or_period(time_val):
    assert tools.timedelta_from_str(time_val) == timedelta(seconds=5014, microseconds=123000)


def test_timedelta_to_dict():
    a = tools.timedelta_to_dict(timedelta(seconds=3661.123))
    assert a['hours'] == 1
    assert a['minutes'] == 1
    assert str(a['seconds'])[:5] == '1.123'  # floats are hard


def test_minutes_endtime_adds_a_minute():
    pass
--------------------------------------------------------------------------------
/transcriptor/helpers.py:
--------------------------------------------------------------------------------
def text_in_range(segments, start_time, end_time):
    """Iterate through the segments and return the content that fits between
    the start and end times.

    Parameters
    ----------
    segments: list
        The segments to iterate over
    start_time: str or float
        Amazon JSON files will return a string, but this will convert to float
    end_time: str or float
        Amazon JSON files will return a string, but this will convert to float

    Returns
    -------
    string
        the content from the alternative with the highest confidence
    """

    start_time = float(start_time)
    end_time = float(end_time)

    segment_range = ""
    for segment in segments:
        # timestamps coming from JSON are strings, so coerce them to floats
        if float(segment["end_time"]) < start_time:
            continue

        if float(segment["start_time"]) <= end_time:
            # get the content from the alternative with the highest confidence score
            best_alternative = max(
                segment["alternatives"], key=lambda alt: alt["confidence"]
            )
            content = best_alternative["content"]

            if segment["type"] == "pronunciation":
                content = f" {content}"

            segment_range += content

        else:
            break  # segments are in order, so later segments cannot be in range

    return segment_range.strip()
--------------------------------------------------------------------------------
/transcriptor/srt.py:
--------------------------------------------------------------------------------
import re
from pathlib import Path

from .job import Job
from .markers import Marker
from .tools import timedelta_from_str


def srt_index_to_Marker(srt_index: str) -> Marker:
    """Given an srt index, return a Marker object"""
    _, timestamps, content = srt_index.strip().split("\n", maxsplit=2)
    start_time, end_time = [timedelta_from_str(x) for x in timestamps.split(" --> ")]
    return Marker(
        start_time=start_time,
        end_time=end_time,
        content=content.strip(),
    )


def timedelta_to_srt_time(time_val) -> str:
    """Format a timedelta as an srt timestamp (HH:MM:SS,mmm)"""
    hours, remainder = divmod(int(time_val.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = time_val.microseconds // 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def marker_to_srt(index: int, marker: Marker) -> str:
    """Given an index and a Marker object, return an srt text index"""
    start_time = timedelta_to_srt_time(marker.start_time)
    end_time = timedelta_to_srt_time(marker.end_time)
    return f"{index}\n{start_time} --> {end_time}\n{marker.content}"


def iterate_srt(srt):
    """Given srt text, yield a Marker for each index"""
    for index in re.split("\n{2,}", srt):
        yield srt_index_to_Marker(index)


def load(filepath) -> Job:
    """Read a .srt file and return a Job object"""
    return loads(Path(filepath).read_text())


def loads(content) -> Job:
    """Read srt content and return a Job object"""
    markers = list(iterate_srt(content))
    return Job(markers=markers)


def write(job: Job) -> str:
    """When passed a Job, convert it to srt-formatted text"""
    segments = [
        marker_to_srt(index, marker)
        for index, marker in enumerate(job.markers, start=1)
    ]
    return "\n\n".join(segments)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
#
# This file is autogenerated by pip-compile
# To update, run:
#
#    pip-compile
#
attrs==20.2.0             # via pytest
boto3==1.15.14            # via -r requirements.in
botocore==1.18.14         # via boto3, s3transfer
certifi==2020.6.20        # via httpx, requests
chardet==3.0.4            # via requests
coverage==5.3             # via pytest-cov
h11==0.11.0               # via httpcore
httpcore==0.12.0          # via httpx
httpx==0.16.0             # via -r requirements.in
idna==2.10                # via requests, rfc3986
iniconfig==1.0.1          # via pytest
jmespath==0.10.0          # via boto3, botocore
more-itertools==8.5.0     # via -r requirements.in
packaging==20.4           # via pytest
pluggy==0.13.1            # via pytest
py==1.9.0                 # via pytest
pyparsing==2.4.7          # via packaging
pytest-cov==2.10.1        # via -r requirements.in
pytest-mock==3.3.1        # via -r requirements.in
pytest==6.1.1             # via -r requirements.in, pytest-cov, pytest-mock
python-dateutil==2.8.1    # via botocore
requests-mock==1.8.0      # via -r requirements.in
requests==2.24.0          # via requests-mock
rfc3986[idna2008]==1.4.0  # via httpx
s3transfer==0.3.3         # via boto3
six==1.15.0               # via packaging, python-dateutil, requests-mock
slugify==0.0.1            # via -r requirements.in
sniffio==1.1.0            # via httpcore, httpx
toml==0.10.1              # via pytest
urllib3==1.25.10          # via botocore, requests
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
import pytest
from transcriptor.helpers import text_in_range


@pytest.fixture()
def segments():
    segment = [
        {
            "start_time": 0.000,
            "end_time": 1.0000,
            "type": "pronunciation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "Once",
            }],
        },
        {
            "start_time": 1.000,
            "end_time": 2.0000,
            "type": "pronunciation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "Upon"
            }],
        },
        {
            "start_time": 3.000,
            "end_time": 4.0000,
            "type": "pronunciation",
            "alternatives": [
                {
                    "confidence": 0.0,
                    "content": "b"
                },
                {
                    "confidence": 0.4,
                    "content": "A"
                },
            ],
        },
        {
            "start_time": 4.000,
            "end_time": 6.0000,
            "type": "punctuation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "."
            }],
        },
    ]
    return segment


def test_text_in_range_gets_only_text_in_range(segments):
    """Given a segment, text_in_range returns the text between the
    start/stop times"""

    # Test that only 2 of the segments are returned
    target_start_time = 1.3
    target_end_time = 3.5

    test_text = text_in_range(segments, target_start_time, target_end_time)
    assert test_text == "Upon A"


def test_text_in_range_doesnt_add_space_for_punctuation(segments):
    """Punctuation should not have a space before it."""

    target_start_time = 2.3
    target_end_time = 7.5

    test_text = text_in_range(segments, target_start_time, target_end_time)
    assert test_text == "A."
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
--------------------------------------------------------------------------------
/transcriptor/job.py:
--------------------------------------------------------------------------------
from .alternatives import Alternative
from .tools import (
    adjust_microseconds,
    timedelta_from_str,
)
from .helpers import text_in_range
from .speakers import Speaker
from .markers import Marker
from .segments import Segment

import typing


class Job:
    """Job objects are the foundation of transcriptor. A job holds the markers,
    segments, speakers, alternatives, and outputs for a transcription."""

    def __init__(
        self,
        *,
        markers: typing.Optional[typing.List[Marker]] = None,
        segments: typing.Optional[typing.List[Segment]] = None,
        speakers: typing.Optional[typing.List[Speaker]] = None,
        text: typing.Optional[str] = None,
    ):

        self.markers = markers
        self.segments = segments
        self.speakers = speakers
        self.text = text

    def _check_marker_content(self, marker):
        """Check the given marker for content; otherwise load the segments between
        the marker start and end times.
        """

        if marker.content:
            return marker.content

        else:
            return text_in_range(
                self.segments,
                start_time=marker.start_time,
                end_time=marker.end_time,
            )

    def _text_from_marker(self) -> typing.List[dict]:
        """Build a dictionary of text for each Marker value"""
        markers = []

        for marker in self.markers:
            markers.append({
                "start_time": marker.start_time,
                "end_time": marker.end_time,
                "content": self._check_marker_content(marker),
                "speaker": marker.speaker if marker.speaker else '',
            })

        return markers

    def to_text(
        self,
        separator: str = "\n\n",
        text_separator: str = ":\n\n",
        disable_speakers: bool = False,
        disable_timestamp: bool = False,
    ) -> str:

        markers = self._text_from_marker()

        lines = []

        for marker in markers:

            if disable_speakers:
                marker['speaker'] = ''

            if disable_timestamp:
                marker['start_time'] = ''

            lines.append(
                f"{marker['speaker']}{marker['start_time']}{text_separator}{marker['content']}"
            )

        return separator.join(lines)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![Transcriptor Logo](https://s3-us-west-2.amazonaws.com/kjaymiller/images/Transcriptor%20Logo%20V1.1.png)
# Transcriptor
## A transcription service wrapper that makes it easier to work with transcriptions.

Transcription services provide a cost-effective way to add accessibility to
your audio and video, but they are often a challenge to integrate into your system.

**Transcriptor aims to make working with transcriptions easier.**

Transcriptor treats each transcription as a Python object.

### Features:

- An object-oriented approach to Transcriptions, Markers, and Speakers
- Nondestructive manipulation of text and references

## Installation

Install transcriptor using pip.

`pip install transcriptor`


## Quickstart

Transcriptor currently supports uploading audio for transcription and
creating a **readable Transcription Job object** from the service's output.

### Importing Transcriptions from AWS Transcribe

```python
from transcriptor import amazon, AmazonJob

job = AmazonJob(filepath='', bucket='')
job.start()
# After some time
job.status()  # If it says COMPLETED
job.build()
```

Alternatively, you can load the object via the _TranscriptFileUri_
(`AmazonJob.from_uri()`) or the JSON object directly (`AmazonJob.from_json()`).
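
For example, here is a minimal sketch of both loading paths; the transcript URI and JSON file name are placeholders for values from your own account:

```python
import json

from transcriptor import AmazonJob

# Load directly from the TranscriptFileUri returned by AWS Transcribe
job = AmazonJob.from_uri('https://example-bucket.s3.amazonaws.com/my-transcription.json')

# Or load from a transcription JSON document you have already downloaded
with open('my-transcription.json') as transcript_file:
    job = AmazonJob.from_json(json.load(transcript_file))
```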

A loaded Transcription Job from AWS Transcribe will give you access to `Markers`, `Speakers`
(if included), and the provided `Alternatives`. You also have the original job
object that you can interact with.

### Importing from a SubRip Subtitle (SRT) File

Text transcriptions from the web vary widely in style, but the most
common format is the SRT file.

You can load an SRT file into transcriptor and use it to interact with the
individual markers.

```python
from transcriptor import Job

Job.from_srt('FILENAME.srt')
```

A Transcription Job loaded from an SRT file will create `Markers`.

These markers are given in order as a list. This gives you the ability to
modify a single Marker or iterate your changes across a range or all of the
Marker objects.
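
As a rough sketch (using the same `FILENAME.srt` as above), a plain loop is enough to adjust every marker and then render the transcript as text:

```python
from datetime import timedelta

from transcriptor import Job

job = Job.from_srt('FILENAME.srt')

# Shift every marker half a second later
for marker in job.markers:
    marker.start_time += timedelta(seconds=0.5)
    marker.end_time += timedelta(seconds=0.5)

# Render the adjusted transcript as plain text, without speaker labels
print(job.to_text(disable_speakers=True))
```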

## Supported Services

- Amazon Transcribe (boto3)

## Planned Support For

- Google Speech-To-Text

## Sponsors
This and much of the work that I do is made possible by those that sponsor me
on GitHub.

### Sponsors at the $20/month Level and Higher
- [Brian Douglas](https://github.com/bdougie)
- [Anthony Shaw](https://github.com/tonybaloney)
- [Carol Willing](https://github.com/willingc)

Thank you to them and all of those that continue to support this project!

[**Sponsor this Project**](https://github.com/sponsors/kjaymiller)
--------------------------------------------------------------------------------
/transcriptor/amazon.py:
--------------------------------------------------------------------------------
from .job import Job
from .markers import Marker
from .alternatives import Alternative
from .helpers import text_in_range
from .speakers import Speaker

from pathlib import Path
from slugify import slugify

import boto3
import more_itertools
import httpx
import logging
import typing

storage = boto3.client("s3")
transcribe = boto3.client("transcribe")


class AmazonEnv:
    def __init__(
        self,
        asr_output: typing.Optional[dict] = None,
        key: typing.Optional[str] = None,
        audio_file: typing.Optional[str] = None,
        bucket: typing.Optional[str] = None,
        is_uploaded: bool = False,
    ):

        self.asr_output = asr_output
        self.is_uploaded = is_uploaded

        if bucket:
            self.bucket = bucket

        if audio_file:
            self.audio_file = Path(audio_file)

        if not key:
            key = slugify(str(audio_file))

        check_key = slugify(key)  # coerce a valid naming structure

        if key != check_key:
            logging.warning(f'invalid {key=}. Will use {check_key} instead.')

        self.key = check_key

    def upload_audio_file(self, **kwargs):
        """Load the file to the Amazon S3 location.
        This is a convenience wrapper for storage.upload_file.
        """
        storage.upload_file(str(self.audio_file), Bucket=self.bucket, Key=self.key, **kwargs)
        self.is_uploaded = True

    def start_transcription(
        self,
        *,
        language: str = 'en-US',
        vocabulary: typing.Optional[str] = None,
        speaker_count: int = 0,
    ):
        """Optionally upload the file and start the transcription job.
        This is a convenience wrapper for transcribe.start_transcription_job.
        """

        if not self.is_uploaded:
            self.upload_audio_file()

        transcribe_job_uri = f"{storage.meta.endpoint_url}/{self.bucket}/{self.key}"
        settings = {}

        if speaker_count > 0:
            settings["ShowSpeakerLabels"] = True
            settings["MaxSpeakerLabels"] = speaker_count

        if vocabulary:
            settings["VocabularyName"] = vocabulary

        return transcribe.start_transcription_job(
            TranscriptionJobName=self.key,
            Media={"MediaFileUri": transcribe_job_uri},
            MediaFormat=Path(self.audio_file).suffix[1:],
            LanguageCode=language,
            Settings=settings,
        )

    @staticmethod
    def from_job(job_name: str) -> Job:
        """Create a Job object based on the TranscriptionJobName"""
        job = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        return AmazonEnv.from_uri(
            job["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
        )

    @staticmethod
    def from_uri(uri: str) -> Job:
        """Create a Job object based on the TranscriptFileUri"""
        response = httpx.get(uri)
        return AmazonEnv.from_json(response.json())

    def build(
        self,
        *,
        split_at: typing.List[str] = ['.', '?', '!'],
        ignore_speakers: bool = False,
    ) -> Job:
        """
        Create a Job object when given an Amazon JSON object.

        Parameters
        ----------
        split_at: list
            punctuation marks to split markers at
        ignore_speakers: bool
            skip speaker labels even when they are present
        """
        pass

    @classmethod
    def from_json(cls, json_file) -> Job:
        """Create a Job object when given an Amazon JSON object"""

        results = json_file['results']

        if "speaker_labels" in results:
            labels = json_file["results"]["speaker_labels"]
            segments = labels["segments"]

        else:
            segment_content = more_itertools.split_when(
                json_file['results']['items'],
                # split_when passes pairs of adjacent items to the predicate
                lambda current, _next: current['type'] == "pronunciation",
            )
            segments = []

            for segment in segment_content:
                segments.append({
                    'start_time': float(segment[0]["start_time"]),
                    'end_time': float(segment[-1]["end_time"]),
                    'speaker': None,
                })


def from_transcription_jobs(**kwargs):
    """Get a list of transcription jobs and generate a Job object for each one."""

    for job in transcribe.list_transcription_jobs(**kwargs)["TranscriptionJobSummaries"]:
        yield AmazonEnv.from_job(job["TranscriptionJobName"])
--------------------------------------------------------------------------------