├── tests
│   ├── test_amazon.py
│   ├── test_job.py
│   ├── test_tools.py
│   └── test_helpers.py
├── transcriptor
│   ├── __init__.py
│   ├── speakers.py
│   ├── markers.py
│   ├── segments.py
│   ├── alternatives.py
│   ├── tools.py
│   ├── vocabulary.py
│   ├── helpers.py
│   ├── srt.py
│   ├── job.py
│   └── amazon.py
├── requirements.in
├── CONTRIBUTING.md
├── setup.py
├── LICENSE
├── requirements.txt
├── .gitignore
└── README.md

/tests/test_amazon.py:
--------------------------------------------------------------------------------
import pytest
--------------------------------------------------------------------------------
/transcriptor/__init__.py:
--------------------------------------------------------------------------------
from .job import Job
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
more-itertools
httpx
pytest
boto3
pytest-mock
requests-mock
pytest-cov
python-slugify
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Format

Use [Black](https://github.com/psf/black) to format your contribution, for example by running `black .` from the repository root.

# Documentation
## In Code

Apply docstrings to any new functions, classes, and methods.
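
For reference, the sketch below documents a hypothetical helper in the numpy-style format already used in `transcriptor/helpers.py`:

```python
def marker_duration(start_time: float, end_time: float) -> float:
    """Return the length of a marker in seconds.

    Parameters
    ----------
    start_time: float
        when the marker begins, in seconds
    end_time: float
        when the marker ends, in seconds

    Returns
    -------
    float
        the difference between the end and start times
    """
    return end_time - start_time
```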
--------------------------------------------------------------------------------
/transcriptor/speakers.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import typing


@dataclass
class Speaker:
    base_name: typing.Optional[str] = None  # default name
    name: typing.Optional[str] = None  # user-given name

    @property
    def label(self):
        return self.name or self.base_name
--------------------------------------------------------------------------------
/transcriptor/markers.py:
--------------------------------------------------------------------------------
from transcriptor.speakers import Speaker
from dataclasses import dataclass

import typing


@dataclass
class Marker:
    """Content should only exist when there are no segments."""

    speaker: typing.Optional[Speaker] = None
    start_time: float = 0.0
    end_time: float = 0.0
    content: typing.Optional[str] = None
--------------------------------------------------------------------------------
/transcriptor/segments.py:
--------------------------------------------------------------------------------
from .alternatives import Alternative
from .speakers import Speaker
from dataclasses import dataclass
import typing


@dataclass
class Segment:
    alternatives: typing.List[Alternative]
    start_time: typing.Optional[float] = 0.0
    end_time: typing.Optional[float] = 0.0
    speaker: typing.Optional[Speaker] = None


def gen_segments(segments):
    """Yield a Segment for each dictionary of keyword arguments."""
    for segment in segments:
        yield Segment(**segment)
--------------------------------------------------------------------------------
/transcriptor/alternatives.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
import typing


@dataclass
class Alternative:
    content: str
    confidence: float
    _type: str
    start_time: typing.Optional[float] = 0.0
    end_time: typing.Optional[float] = 0.0
    tag: typing.Optional[str] = None
    # regional times for loading from text
    regional_start_time: typing.Optional[float] = None
    regional_end_time: typing.Optional[float] = None


def gen_alternatives(alternatives):
    """Yield an Alternative for each dictionary of keyword arguments."""
    for alternative in alternatives:
        yield Alternative(**alternative)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

with open("README.md") as filename:
    long_description = filename.read()


setup(
    name='transcriptor',
    version='2020.8.8',
    description='A wrapper for transcription results.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/kjaymiller/transcriptor',
    author='Jay Miller',
    author_email='kjaymiller@gmail.com',
    license='MIT',
    packages=find_packages(),
    install_requires=[
        'more-itertools',
        'httpx',
        'boto3',
        'python-slugify',
    ],
    zip_safe=False,
)
--------------------------------------------------------------------------------
/transcriptor/tools.py:
--------------------------------------------------------------------------------
import re
from datetime import timedelta


def adjust_microseconds(time_val: timedelta, separator: str = ".") -> str:
    """Take the seconds and microseconds from a timedelta and return the
    remaining seconds and milliseconds, not rounded, as a str.
    Seconds will have a leading zero."""

    seconds = time_val.seconds % 60
    milliseconds = time_val.microseconds // 1000  # truncate; do not round
    return f"{seconds:02}{separator}{milliseconds:03}"


def timedelta_to_dict(time_val: timedelta) -> dict:
    """Split a timedelta into hours, minutes, and fractional seconds."""
    hours, remainder = divmod(time_val.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return {
        "hours": hours,
        "minutes": minutes,
        "seconds": seconds + time_val.microseconds / 1_000_000,
    }


def timedelta_from_str(time_str: str) -> timedelta:
    """Given a string in %H:%M:%S,%f, return a timedelta
    >>> timedelta_from_str("01:23:34,123")
    datetime.timedelta(seconds=5014, microseconds=123000)
    """
    time_splitter = r":"
    [h, m, s] = re.split(time_splitter, time_str)
    s = s.replace(",", ".")  # srt time = H:M:S,Mil
    return timedelta(hours=int(h), minutes=int(m), seconds=float(s))
--------------------------------------------------------------------------------
/transcriptor/vocabulary.py:
--------------------------------------------------------------------------------
import typing


class Vocabulary:
    def __init__(
        self,
        phrase: str,
        sounds_like: typing.Optional[str] = '',
        display_as: typing.Optional[str] = '',
        ipa: typing.Optional[str] = '',
        abbreviation: bool = False,
    ):

        if abbreviation and "." not in phrase:
            self.phrase = ''.join([f'{x}.'.upper() for x in phrase])

        elif ' ' in phrase:
            self.phrase = '-'.join(phrase.split(' '))

        else:
            self.phrase = phrase

        self.sounds_like = sounds_like

        if abbreviation:  # F.B.I. -> FBI
            self.display_as = display_as.replace('.', '').upper()

        else:
            self.display_as = display_as

        self.ipa = ipa

    @property
    def table(self):
        return f'{self.phrase}\t{self.ipa}\t{self.sounds_like}\t{self.display_as}'
--------------------------------------------------------------------------------
/tests/test_job.py:
--------------------------------------------------------------------------------
from transcriptor.job import Job
import pytest


@pytest.fixture()
def tmp_file_data():
    return """1
00:00:00,000 --> 00:00:02,123
This, is a test.

2
00:00:03,000 --> 00:00:04,456
Another test for me."""


def test_job_from_srt_creates_markers(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newFile = Job.from_srt(test_file)
    assert len(newFile.markers) == 2


def test_job_from_srt_creates_alternatives(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newJob = Job.from_srt(test_file)
    assert len(newJob.alternatives) == 11


def test_job_punctuations(tmp_file_data, tmp_path):
    test_file = tmp_path / 'test_file.srt'
    test_file.write_text(tmp_file_data)
    newFile = Job.from_srt(test_file)
    punctuations = [x for x in newFile.alternatives if x._type == 'punctuation']
    assert len(punctuations) == 3
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Jay Miller

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/tests/test_tools.py:
--------------------------------------------------------------------------------
import pytest
import transcriptor.tools as tools
from datetime import timedelta


@pytest.mark.parametrize('second_val, result', [
    (61.1, "01.100"),  # tests leading zeroes
    (130.1000001, "10.100"),  # tests that milliseconds are returned, not microseconds
    (36001.123567, "01.123"),  # it does not round
])
def test_adjust_microseconds_returns_seconds_and_milliseconds(second_val: float,
                                                              result: str):
    """Tests that, when passed a timedelta, it returns the remaining seconds and
    milliseconds as a string"""

    seconds = timedelta(seconds=second_val)
    assert tools.adjust_microseconds(seconds) == result


@pytest.mark.parametrize('time_val', ["01:23:34,123", "01:23:34.123"])
def test_timedelta_from_str_accepts_comma_or_period(time_val):
    assert tools.timedelta_from_str(time_val) == timedelta(seconds=5014, microseconds=123000)


def test_timedelta_to_dict():
    a = tools.timedelta_to_dict(timedelta(seconds=3661.123))
    assert a['hours'] == 1
    assert a['minutes'] == 1
    assert str(a['seconds'])[:5] == '1.123'  # floats are hard


def test_minutes_endtime_adds_a_minute():
    pass
--------------------------------------------------------------------------------
/transcriptor/helpers.py:
--------------------------------------------------------------------------------
def text_in_range(segments, start_time, end_time):
    """Iterate through the segments and return the content that fits between
    the start and end times.

    Parameters
    ----------
    segments: list
        The segments to iterate over
    start_time: str or float
        Amazon JSON files will return a string, but this will convert to float
    end_time: str or float
        Amazon JSON files will return a string, but this will convert to float

    Returns
    -------
    string
        the content from the alternative with the highest confidence
    """

    start_time = float(start_time)
    end_time = float(end_time)

    segment_range = ""
    for segment in segments:
        # timestamps coming from JSON are strings, so coerce them to floats
        if float(segment["end_time"]) < start_time:
            continue

        if float(segment["start_time"]) <= end_time:
            # get the content from the alternative with the highest confidence score
            best_alternative = max(
                segment["alternatives"], key=lambda alt: alt["confidence"]
            )
            content = best_alternative["content"]

            if segment["type"] == "pronunciation":
                content = f" {content}"

            segment_range += content

        else:
            break  # segments are in order, so later segments cannot be in range

    return segment_range.strip()
--------------------------------------------------------------------------------
/transcriptor/srt.py:
--------------------------------------------------------------------------------
import re
from pathlib import Path

from .job import Job
from .markers import Marker
from .tools import timedelta_from_str


def srt_index_to_Marker(srt_index: str) -> Marker:
    """Given an srt index, return a Marker object"""
    _, timestamps, content = srt_index.strip().split("\n", maxsplit=2)
    start_time, end_time = [timedelta_from_str(x) for x in timestamps.split(" --> ")]
    return Marker(
        start_time=start_time,
        end_time=end_time,
        content=content.strip(),
    )


def timedelta_to_srt_time(time_val) -> str:
    """Format a timedelta as an srt timestamp (HH:MM:SS,mmm)"""
    hours, remainder = divmod(int(time_val.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = time_val.microseconds // 1000
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def marker_to_srt(index: int, marker: Marker) -> str:
    """Given an index and a Marker object, return an srt text index"""
    start_time = timedelta_to_srt_time(marker.start_time)
    end_time = timedelta_to_srt_time(marker.end_time)
    return f"{index}\n{start_time} --> {end_time}\n{marker.content}"


def iterate_srt(srt):
    """Given srt text, yield a Marker for each index"""
    for index in re.split("\n{2,}", srt):
        yield srt_index_to_Marker(index)


def load(filepath) -> Job:
    """Read a .srt file and return a Job object"""
    return loads(Path(filepath).read_text())


def loads(content) -> Job:
    """Read srt content and return a Job object"""
    markers = list(iterate_srt(content))
    return Job(markers=markers)


def write(job: Job) -> str:
    """When passed a Job, convert it to srt-formatted text"""
    segments = [
        marker_to_srt(index, marker)
        for index, marker in enumerate(job.markers, start=1)
    ]
    return "\n\n".join(segments)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
#
# This file is autogenerated by pip-compile
# To update, run:
#
#    pip-compile
#
attrs==20.2.0             # via pytest
boto3==1.15.14            # via -r requirements.in
botocore==1.18.14         # via boto3, s3transfer
certifi==2020.6.20        # via httpx, requests
chardet==3.0.4            # via requests
coverage==5.3             # via pytest-cov
h11==0.11.0               # via httpcore
httpcore==0.12.0          # via httpx
httpx==0.16.0             # via -r requirements.in
idna==2.10                # via requests, rfc3986
iniconfig==1.0.1          # via pytest
jmespath==0.10.0          # via boto3, botocore
more-itertools==8.5.0     # via -r requirements.in
packaging==20.4           # via pytest
pluggy==0.13.1            # via pytest
py==1.9.0                 # via pytest
pyparsing==2.4.7          # via packaging
pytest-cov==2.10.1        # via -r requirements.in
pytest-mock==3.3.1        # via -r requirements.in
pytest==6.1.1             # via -r requirements.in, pytest-cov, pytest-mock
python-dateutil==2.8.1    # via botocore
requests-mock==1.8.0      # via -r requirements.in
requests==2.24.0          # via requests-mock
rfc3986[idna2008]==1.4.0  # via httpx
s3transfer==0.3.3         # via boto3
six==1.15.0               # via packaging, python-dateutil, requests-mock
slugify==0.0.1            # via -r requirements.in
sniffio==1.1.0            # via httpcore, httpx
toml==0.10.1              # via pytest
urllib3==1.25.10          # via botocore, requests
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
import pytest
from transcriptor.helpers import text_in_range


@pytest.fixture()
def segments():
    segment = [
        {
            "start_time": 0.000,
            "end_time": 1.0000,
            "type": "pronunciation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "Once",
            }],
        },
        {
            "start_time": 1.000,
            "end_time": 2.0000,
            "type": "pronunciation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "Upon"
            }],
        },
        {
            "start_time": 3.000,
            "end_time": 4.0000,
            "type": "pronunciation",
            "alternatives": [
                {
                    "confidence": 0.0,
                    "content": "b"
                },
                {
                    "confidence": 0.4,
                    "content": "A"
                },
            ],
        },
        {
            "start_time": 4.000,
            "end_time": 6.0000,
            "type": "punctuation",
            "alternatives": [{
                "confidence": 1.0,
                "content": "."
            }],
        },
    ]
    return segment


def test_text_in_range_gets_only_text_in_range(segments):
    """Given a segment, text_in_range returns the text between the
    start/stop times"""

    # Test that only 2 of the segments are returned
    target_start_time = 1.3
    target_end_time = 3.5

    test_text = text_in_range(segments, target_start_time, target_end_time)
    assert test_text == "Upon A"


def test_text_in_range_doesnt_add_space_for_punctuation(segments):
    """Punctuation should not have a space before it."""

    target_start_time = 2.3
    target_end_time = 7.5

    test_text = text_in_range(segments, target_start_time, target_end_time)
    assert test_text == "A."
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
--------------------------------------------------------------------------------
/transcriptor/job.py:
--------------------------------------------------------------------------------
from .alternatives import Alternative
from .tools import (
    adjust_microseconds,
    timedelta_from_str,
)
from .helpers import text_in_range
from .speakers import Speaker
from .markers import Marker
from .segments import Segment

import typing


class Job:
    """Job objects are the foundation of transcriptor. A job holds the markers,
    segments, speakers, alternatives, and outputs for a transcription."""

    def __init__(
        self,
        *,
        markers: typing.Optional[typing.List[Marker]] = None,
        segments: typing.Optional[typing.List[Segment]] = None,
        speakers: typing.Optional[typing.List[Speaker]] = None,
        text: typing.Optional[str] = None,
    ):

        self.markers = markers
        self.segments = segments
        self.speakers = speakers
        self.text = text

    def _check_marker_content(self, marker):
        """Check the given marker for content; otherwise load the segments between
        the marker start and end times.
        """

        if marker.content:
            return marker.content

        else:
            return text_in_range(
                self.segments,
                start_time=marker.start_time,
                end_time=marker.end_time,
            )

    def _text_from_marker(self) -> typing.List[dict]:
        """Build a dictionary of text for each Marker value"""
        markers = []

        for marker in self.markers:
            markers.append({
                "start_time": marker.start_time,
                "end_time": marker.end_time,
                "content": self._check_marker_content(marker),
                "speaker": marker.speaker if marker.speaker else '',
            })

        return markers

    def to_text(
        self,
        separator: str = "\n\n",
        text_separator: str = ":\n\n",
        disable_speakers: bool = False,
        disable_timestamp: bool = False,
    ) -> str:

        markers = self._text_from_marker()

        lines = []

        for marker in markers:

            if disable_speakers:
                marker['speaker'] = ''

            if disable_timestamp:
                marker['start_time'] = ''

            lines.append(
                f"{marker['speaker']}{marker['start_time']}{text_separator}{marker['content']}"
            )

        return separator.join(lines)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![Transcriptor Logo](https://s3-us-west-2.amazonaws.com/kjaymiller/images/Transcriptor%20Logo%20V1.1.png)
# Transcriptor
## A transcription service wrapper that makes it easier to work with transcriptions.

Transcription services provide a cost-effective way to add accessibility to
your audio and video, but they are often a challenge to integrate into your system.

**Transcriptor aims to make working with transcriptions easier.**

Transcriptor treats each transcription as a Python object.

### Features:

- An object-oriented approach to Transcriptions, Markers, and Speakers
- Nondestructive manipulation of text and references

## Installation

Install transcriptor using pip.

`pip install transcriptor`


## Quickstart

Transcriptor currently supports uploading audio for transcription and
creating a **readable Transcription Job object** from the service's output.

### Importing Transcriptions from AWS Transcribe

```python
from transcriptor import amazon, AmazonJob

job = AmazonJob(filepath='', bucket='')
job.start()
# After some time
job.status()  # If it says COMPLETED
job.build()
```

Alternatively, you can load the object via the _TranscriptFileUri_
(`AmazonJob.from_uri()`) or the JSON object directly (`AmazonJob.from_json()`).
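
For example, here is a minimal sketch of both loading paths; the transcript URI and JSON file name are placeholders for values from your own account:

```python
import json

from transcriptor import AmazonJob

# Load directly from the TranscriptFileUri returned by AWS Transcribe
job = AmazonJob.from_uri('https://example-bucket.s3.amazonaws.com/my-transcription.json')

# Or load from a transcription JSON document you have already downloaded
with open('my-transcription.json') as transcript_file:
    job = AmazonJob.from_json(json.load(transcript_file))
```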

A loaded Transcription Job from AWS Transcribe will give you access to `Markers`, `Speakers`
(if included), and the provided `Alternatives`. You also have the original job
object that you can interact with.

### Importing from a SubRip Subtitle (SRT) File

Text transcriptions from the web vary widely in style, but the most
common format is the SRT file.

You can load an SRT file into transcriptor and use it to interact with the
individual markers.

```python
from transcriptor import Job

Job.from_srt('FILENAME.srt')
```

A Transcription Job loaded from an SRT file will create `Markers`.

These markers are given in order as a list. This gives you the ability to
modify a single Marker or iterate your changes across a range or all of the
Marker objects.
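
As a rough sketch (using the same `FILENAME.srt` as above), a plain loop is enough to adjust every marker and then render the transcript as text:

```python
from datetime import timedelta

from transcriptor import Job

job = Job.from_srt('FILENAME.srt')

# Shift every marker half a second later
for marker in job.markers:
    marker.start_time += timedelta(seconds=0.5)
    marker.end_time += timedelta(seconds=0.5)

# Render the adjusted transcript as plain text, without speaker labels
print(job.to_text(disable_speakers=True))
```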

## Supported Services

- Amazon Transcribe (boto3)

## Planned Support For

- Google Speech-To-Text

## Sponsors
This and much of the work that I do is made possible by those that sponsor me
on GitHub.

### Sponsors at the $20/month Level and Higher
- [Brian Douglas](https://github.com/bdougie)
- [Anthony Shaw](https://github.com/tonybaloney)
- [Carol Willing](https://github.com/willingc)

Thank you to them and all of those that continue to support this project!

[**Sponsor this Project**](https://github.com/sponsors/kjaymiller)
--------------------------------------------------------------------------------
/transcriptor/amazon.py:
--------------------------------------------------------------------------------
from .job import Job
from .markers import Marker
from .alternatives import Alternative
from .helpers import text_in_range
from .speakers import Speaker

from pathlib import Path
from slugify import slugify

import boto3
import more_itertools
import httpx
import logging
import typing

storage = boto3.client("s3")
transcribe = boto3.client("transcribe")


class AmazonEnv:
    def __init__(
        self,
        asr_output: typing.Optional[dict] = None,
        key: typing.Optional[str] = None,
        audio_file: typing.Optional[str] = None,
        bucket: typing.Optional[str] = None,
        is_uploaded: bool = False,
    ):

        self.asr_output = asr_output
        self.is_uploaded = is_uploaded

        if bucket:
            self.bucket = bucket

        if audio_file:
            self.audio_file = Path(audio_file)

        if not key:
            key = slugify(str(audio_file))

        check_key = slugify(key)  # coerce a valid naming structure

        if key != check_key:
            logging.warning(f'invalid {key=}. Will use {check_key} instead.')

        self.key = check_key

    def upload_audio_file(self, **kwargs):
        """Load the file to the Amazon S3 location.
        This is a convenience wrapper for storage.upload_file.
        """
        storage.upload_file(str(self.audio_file), Bucket=self.bucket, Key=self.key, **kwargs)
        self.is_uploaded = True

    def start_transcription(
        self,
        *,
        language: str = 'en-US',
        vocabulary: typing.Optional[str] = None,
        speaker_count: int = 0,
    ):
        """Optionally upload the file and start the transcription job.
        This is a convenience wrapper for transcribe.start_transcription_job.
        """

        if not self.is_uploaded:
            self.upload_audio_file()

        transcribe_job_uri = f"{storage.meta.endpoint_url}/{self.bucket}/{self.key}"
        settings = {}

        if speaker_count > 0:
            settings["ShowSpeakerLabels"] = True
            settings["MaxSpeakerLabels"] = speaker_count

        if vocabulary:
            settings["VocabularyName"] = vocabulary

        return transcribe.start_transcription_job(
            TranscriptionJobName=self.key,
            Media={"MediaFileUri": transcribe_job_uri},
            MediaFormat=Path(self.audio_file).suffix[1:],
            LanguageCode=language,
            Settings=settings,
        )

    @staticmethod
    def from_job(job_name: str) -> Job:
        """Create a Job object based on the TranscriptionJobName"""
        job = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        return AmazonEnv.from_uri(
            job["TranscriptionJob"]["Transcript"]["TranscriptFileUri"]
        )

    @staticmethod
    def from_uri(uri: str) -> Job:
        """Create a Job object based on the TranscriptFileUri"""
        response = httpx.get(uri)
        return AmazonEnv.from_json(response.json())

    def build(
        self,
        *,
        split_at: typing.List[str] = ['.', '?', '!'],
        ignore_speakers: bool = False,
    ) -> Job:
        """
        Create a Job object when given an Amazon JSON object.

        Parameters
        ----------
        split_at: list
            punctuation marks to split markers at
        ignore_speakers: bool
            skip speaker labels even when they are present
        """
        pass

    @classmethod
    def from_json(cls, json_file) -> Job:
        """Create a Job object when given an Amazon JSON object"""

        results = json_file['results']

        if "speaker_labels" in results:
            labels = json_file["results"]["speaker_labels"]
            segments = labels["segments"]

        else:
            segment_content = more_itertools.split_when(
                json_file['results']['items'],
                # split_when passes pairs of adjacent items to the predicate
                lambda current, _next: current['type'] == "pronunciation",
            )
            segments = []

            for segment in segment_content:
                segments.append({
                    'start_time': float(segment[0]["start_time"]),
                    'end_time': float(segment[-1]["end_time"]),
                    'speaker': None,
                })


def from_transcription_jobs(**kwargs):
    """Get a list of transcription jobs and generate a Job object for each one."""

    for job in transcribe.list_transcription_jobs(**kwargs)["TranscriptionJobSummaries"]:
        yield AmazonEnv.from_job(job["TranscriptionJobName"])
--------------------------------------------------------------------------------