├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── requirements.txt ├── setup.cfg ├── setup.py └── subsync ├── __init__.py ├── __main__.py ├── bin └── subsync ├── ffmpeg.py ├── log.py ├── main.py ├── media.py ├── model ├── convert.py ├── eval_ann.py ├── eval_logloss.py ├── eval_train.py ├── test.py ├── train_ann.py └── train_data.py ├── net.py ├── subsync.pb ├── test ├── test_440hz_880hz.srt └── test_440hz_880hz.wav └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | training 2 | out 3 | 4 | # Created by https://www.gitignore.io/api/macos,python,windows 5 | 6 | ### macOS ### 7 | *.DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *.class 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | build/ 45 | develop-eggs/ 46 | dist/ 47 | downloads/ 48 | eggs/ 49 | .eggs/ 50 | lib/ 51 | lib64/ 52 | parts/ 53 | sdist/ 54 | var/ 55 | wheels/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | 60 | # PyInstaller 61 | # Usually these files are written by a python script from a template 62 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 63 | *.manifest 64 | *.spec 65 | 66 | # Installer logs 67 | pip-log.txt 68 | pip-delete-this-directory.txt 69 | 70 | # Unit test / coverage reports 71 | htmlcov/ 72 | .tox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | .pytest_cache/ 77 | nosetests.xml 78 | coverage.xml 79 | *.cover 80 | .hypothesis/ 81 | 82 | # Translations 83 | *.mo 84 | *.pot 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule.* 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | 133 | ### Windows ### 134 | # Windows thumbnail cache files 135 | Thumbs.db 136 | ehthumbs.db 137 | ehthumbs_vista.db 138 | 139 | # Folder config file 140 | Desktop.ini 141 | 142 | # Recycle Bin used on file shares 143 | .BIN/ 144 | 145 | # Windows Installer files 146 | *.cab 147 | *.msi 148 | *.msm 149 | *.msp 150 | 151 | # Windows shortcuts 152 | *.lnk 153 | 154 | # Visual studio code 155 | .vscode 156 | 157 | 158 | # End of https://www.gitignore.io/api/macos,python,windows 159 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 
2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include subsync/*.pb
2 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env make
2 | 
3 | init:
4 | 	pip install -r requirements.txt
5 | 
6 | setup:
7 | 	python subsync/model/train_data.py
8 | 
9 | train:
10 | 	python subsync/model/train_ann.py
11 | 
12 | eval:
13 | 	python subsync/model/eval_ann.py
14 | 
15 | logloss:
16 | 	python subsync/model/eval_logloss.py
17 | 
18 | convert:
19 | 	python subsync/model/convert.py
20 | 
21 | test:
22 | 	python subsync/model/test.py
23 | .PHONY: test
24 | 
25 | freeze:
26 | 	pip freeze > requirements.txt
27 | 
28 | dist:
29 | 	python setup.py sdist
30 | 
31 | publish:
32 | 	twine upload --repository-url https://test.pypi.org/legacy/ dist/*
33 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Subsync
2 | **Synchronize your subtitles using machine learning**
3 | 
4 | Subsync analyses and processes the sound from your media files and uses machine learning to detect speech. Speech detection is used to shift existing subtitles for a perfect match between audio and text!
5 | 
6 | ## Features
7 | - [x] Machine learning model for voice activity detection (*not recognition*)
8 | - [x] Shift subtitle as a whole for best match
9 | - [x] Sync every sentence in the subtitle individually
10 | - [ ] Sync using an existing matched subtitle in a different language
11 | 
12 | ## Dependencies
13 | * ffmpeg (https://www.ffmpeg.org/download.html)
14 | 
15 | ## Installation
16 | ```bash
17 | pip install subsync
18 | ```
19 | 
20 | ## Help
21 | ```
22 | usage: subsync [-h] [--version] [--graph] [-d SECONDS] [-m SECONDS] [-s]
23 |                [-r] [--logfile PATH]
24 |                MEDIA [MEDIA ...]
25 | 
26 | positional arguments:
27 |   MEDIA                 media for which to synchronize subtitles
28 | 
29 | optional arguments:
30 |   -h, --help            show this help message and exit
31 |   --version             show program's version number and exit
32 |   --graph               show graph for subtitle synchronization (default:
33 |                         False)
34 |   -d SECONDS, --duration SECONDS
35 |                         duration (in seconds) of the audio sample; a longer
36 |                         sample increases precision but reduces speed (default: 900)
37 |   -m SECONDS, --margin SECONDS
38 |                         the margin in which to search for a subtitle match
39 |                         (default: 12)
40 |   -s, --start           sample audio from the start of the media instead of
41 |                         the middle (default: False)
42 |   -r, --recursive       recursively sync every sentence in the subtitle
43 |                         (default: False)
44 |   --logfile PATH        path to location of log file for logging application
45 |                         specific information (default: None)
46 | ```
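## Example
Assuming a media file and its subtitle sit next to each other (the names `movie.mkv` and `movie.srt` below are only placeholders), a plain invocation shifts each matching `.srt` as a whole, while `--recursive` additionally re-syncs every sentence and therefore samples the full audio track:

```bash
# shift the whole subtitle by the single best offset (15 minute audio sample by default)
subsync movie.mkv

# sync every sentence in the subtitle individually (processes the full audio track)
subsync --recursive movie.mkv
```

The synchronized subtitle is saved in place, so keep a backup if you want to preserve the original timings.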
47 | 
48 | ## Special thanks
49 | [[1] Automatic Subtitle Synchronization through Machine Learning](https://machinelearnings.co/automatic-subtitle-synchronization-e188a9275617)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.1.12
2 | astor==0.6.2
3 | audioread==2.1.5
4 | bleach==1.5.0
5 | cffi==1.11.5
6 | chardet==3.0.4
7 | cycler==0.10.0
8 | decorator==4.2.1
9 | gast==0.2.0
10 | graphviz==0.8.2
11 | grpcio==1.10.0
12 | h5py==2.8.0rc1
13 | html5lib==0.9999999
14 | joblib==0.11
15 | Keras==2.1.5
16 | kiwisolver==1.0.1
17 | librosa==0.6.0
18 | llvmlite==0.22.0
19 | Markdown==2.6.11
20 | matplotlib==2.2.2
21 | numba==0.37.0
22 | numpy==1.14.2
23 | protobuf==3.5.2.post1
24 | pycparser==2.18
25 | pydot==1.2.4
26 | pyparsing==2.2.0
27 | pysrt==1.1.1
28 | python-dateutil==2.7.0
29 | pytz==2018.3
30 | PyYAML==3.12
31 | resampy==0.2.0
32 | scikit-learn==0.19.1
33 | scipy==1.0.0
34 | six==1.11.0
35 | sklearn==0.0
36 | tensorboard==1.6.0
37 | tensorflow==1.5.0
38 | tensorflow-tensorboard==1.5.1
39 | termcolor==1.1.0
40 | Werkzeug==0.14.1
41 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | exec(open('subsync/version.py').read())
4 | 
5 | setup(name='subsync',
6 |       version=__version__,
7 |       description='Synchronize your subtitles with machine learning',
8 |       classifiers=[
9 |         'License :: OSI Approved :: MIT License',
10 |         'Programming Language :: Python :: 3.6',
11 |         'Topic :: Multimedia :: Sound/Audio :: Analysis',
12 |         'Topic :: Multimedia :: Sound/Audio :: Speech',
13 |       ],
14 |       keywords='subtitle synchronize machine learning',
15 |       platforms=["Independent"],
16 |       scripts=['subsync/bin/subsync'],
17 |       include_package_data=True,
18 |       url='https://github.com/tympanix/subsync',
19 |       author='tympanix',
20 |       author_email='tympanix@gmail.com',
21 |       license='MIT',
22 |       packages=['subsync'],
23 |       install_requires=[
24 |           'tensorflow>=1.0.0',
25 |           'numpy',
26 |           'matplotlib',
27 |           'librosa',
28 |           'h5py>=2.9.0',
29 |           'pysrt',
30 |       ],
31 |       zip_safe=False)
32 | 
--------------------------------------------------------------------------------
/subsync/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from .version import __version__ 3 | from .main import run 4 | -------------------------------------------------------------------------------- /subsync/__main__.py: -------------------------------------------------------------------------------- 1 | from .main import run 2 | 3 | if __name__ == '__main__': 4 | run() 5 | -------------------------------------------------------------------------------- /subsync/bin/subsync: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import subsync 4 | subsync.run() 5 | -------------------------------------------------------------------------------- /subsync/ffmpeg.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | import subprocess 4 | import os 5 | import tempfile 6 | import re 7 | import sys 8 | from datetime import timedelta 9 | from subprocess import DEVNULL, STDOUT, PIPE 10 | 11 | 12 | class Transcode: 13 | """ 14 | Transcode is a wrapper around the ffmpeg binary used to transcode 15 | audio from media files. 16 | """ 17 | 18 | def __init__(self, input, binary='ffmpeg', seek=False, start=0, duration=0, channels=2, samplerate=16000, bitrate='160k'): 19 | if seek and start: 20 | raise ValueError("Can't both supply seek and start argument in transcode") 21 | self.input = input 22 | self.bitrate = bitrate 23 | self.channels = channels 24 | self.samplerate = samplerate 25 | self.binary = binary 26 | self.start = start if type(start) is timedelta else timedelta(seconds=start) 27 | self.duration = duration if type(duration) is timedelta else timedelta(seconds=duration) 28 | self.length = self.__length() 29 | if seek: 30 | self.start = max(timedelta(), self.length/2-self.duration/2) 31 | 32 | self.output = os.path.join(tempfile.gettempdir(), 'subsync_' + randomString() + '.wav') 33 | 34 | 35 | def command(self): 36 | cmd = [self.binary, '-y'] 37 | cmd.extend(('-i', shellquote(self.input))) 38 | 39 | if self.start > timedelta(): 40 | cmd.extend(('-ss', duration_str(self.start))) 41 | 42 | if self.duration > timedelta(): 43 | cmd.extend(('-t', self.duration.seconds)) 44 | 45 | cmd.extend(('-ab', self.bitrate)) 46 | cmd.extend(('-ac', self.channels)) 47 | cmd.extend(('-ar', self.samplerate)) 48 | cmd.append('-vn') # no video 49 | cmd.append(self.output) 50 | 51 | return [str(s) for s in cmd] 52 | 53 | 54 | def __length(self): 55 | cmd = subprocess.Popen(['ffprobe', self.input], stdout=PIPE, stderr=STDOUT) 56 | duration = [x.decode("utf-8") for x in cmd.stdout.readlines() if b"Duration" in x] 57 | match = re.search(r'(\d\d):(\d\d):(\d\d)\.(\d\d)', duration[0]) 58 | code = cmd.wait() 59 | if not match or code != 0: 60 | raise RuntimeError('Could not call ffprobe:', self.input) 61 | return timedelta( 62 | hours=int(match.group(1)), 63 | minutes=int(match.group(2)), 64 | seconds=int(match.group(3)), 65 | milliseconds=int(match.group(4))*100 66 | ) 67 | 68 | 69 | def run(self): 70 | code = subprocess.call(' '.join(self.command()), stderr=DEVNULL, shell=True) 71 | if code != 0: 72 | raise RuntimeError('Could not transcode audio:', self.input) 73 | 74 | 75 | def randomString(len=12): 76 | allchar = string.ascii_letters + string.digits 77 | return "".join(random.choice(allchar) for x in range(len)) 78 | 79 | 80 | def duration_str(d): 81 | hours, remainder = divmod(d.seconds, 3600) 82 | minutes, seconds = divmod(remainder, 60) 83 | return 
'{:02d}:{:02d}:{:02d}.{:06d}'.format(hours, minutes, seconds, d.microseconds)
84 | 
85 | 
86 | def shellquote(s):
87 |     if sys.platform == 'win32':
88 |         return "\"" + s.replace("\"", "\\\"") + "\""
89 |     else:
90 |         return "'" + s.replace("'", "'\\''") + "'"
91 | 
--------------------------------------------------------------------------------
/subsync/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | def init_logger(filepath):
4 |     # create file handler which logs even debug messages
5 |     fh = logging.FileHandler(filepath)
6 |     fh.setLevel(logging.DEBUG)
7 |     fh.setFormatter(formatter)
8 |     logger.addHandler(fh)
9 | 
10 | 
11 | logger = logging.getLogger('subsync_logger')
12 | logger.setLevel(logging.DEBUG)
13 | # create console handler with a higher log level
14 | ch = logging.StreamHandler()
15 | ch.setLevel(logging.ERROR)
16 | # create formatter and add it to the handlers
17 | formatter = logging.Formatter('%(asctime)s - %(message)s')
18 | ch.setFormatter(formatter)
19 | # add the handlers to the logger
20 | logger.addHandler(ch)
21 | 
--------------------------------------------------------------------------------
/subsync/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from .log import logger, init_logger
4 | from .version import __version__
5 | 
6 | def run():
7 |     parser = argparse.ArgumentParser(
8 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter
9 |     )
10 |     parser.add_argument('media', metavar='MEDIA', type=str, nargs='+',
11 |                         help='media for which to synchronize subtitles')
12 |     parser.add_argument('--version', action='version', version='%(prog)s {}'.format(__version__))
13 |     parser.add_argument('--graph', dest="graph", action='store_true',
14 |                         help='show graph for subtitle synchronization')
15 |     parser.add_argument('-d', '--duration', dest='duration', type=int, metavar='SECONDS', default=60*15,
16 |                         help='duration (in seconds) of the audio sample; a longer sample increases precision but reduces speed')
17 |     parser.add_argument('-m', '--margin', dest='margin', type=int, metavar='SECONDS', default=12,
18 |                         help='the margin in which to search for a subtitle match')
19 |     parser.add_argument('-s', '--start', dest='start', action='store_true',
20 |                         help='sample audio from the start of the media instead of the middle')
21 |     parser.add_argument('-r', '--recursive', dest='recursive', action='store_true',
22 |                         help='recursively sync every sentence in the subtitle')
23 |     parser.add_argument('--logfile', dest='logfile', type=str, metavar='PATH',
24 |                         help='path to location of log file for logging application specific information')
25 | 
26 |     args = parser.parse_args()
27 | 
28 |     if args.logfile:
29 |         init_logger(args.logfile)
30 | 
31 | 
32 |     from .media import Media
33 |     media = [Media(m) for m in args.media if m]
34 | 
35 |     from .net import NeuralNet
36 |     model = NeuralNet()
37 | 
38 |     for m in media:
39 |         if args.recursive:
40 |             m.mfcc(duration=0, seek=False)
41 |         else:
42 |             m.mfcc(duration=args.duration, seek=not args.start)
43 |         for s in m.subtitles():
44 |             if args.recursive:
45 |                 s.sync_all(model, plot=args.graph, margin=args.margin)
46 |             else:
47 |                 s.sync(model, plot=args.graph, margin=args.margin)
48 | 
--------------------------------------------------------------------------------
/subsync/media.py:
--------------------------------------------------------------------------------
1 | import os
2 | import librosa
3 | import subprocess
4 | import tempfile
5 | import io
6 | 
import pysrt 7 | from pysrt import SubRipTime 8 | import string 9 | import random 10 | import chardet 11 | import re 12 | from datetime import timedelta 13 | 14 | import numpy as np 15 | import sklearn 16 | 17 | from .ffmpeg import Transcode 18 | from .log import logger 19 | 20 | 21 | class Media: 22 | """ 23 | Media class represents a media file on disk for which the content can be 24 | analyzed and retrieved. 25 | """ 26 | 27 | # List of supported media formats 28 | FORMATS = ['.mkv', '.mp4', '.wmv', '.avi', '.flv'] 29 | 30 | # The frequency of the generated audio 31 | FREQ = 16000 32 | 33 | # The number of coefficients to extract from the mfcc 34 | N_MFCC = 13 35 | 36 | # The number of samples in each mfcc coefficient 37 | HOP_LEN = 512.0 38 | 39 | # The length (seconds) of each item in the mfcc analysis 40 | LEN_MFCC = HOP_LEN/FREQ 41 | 42 | 43 | def __init__(self, filepath, subtitles=None): 44 | prefix, ext = os.path.splitext(filepath) 45 | if ext == '.srt': 46 | return self.from_srt(filepath) 47 | if not ext: 48 | raise ValueError('unknown file: "{}"'.format(filepath)) 49 | if ext not in Media.FORMATS: 50 | raise ValueError('filetype {} not supported: "{}"'.format(ext, filepath)) 51 | self.__subtitles = subtitles 52 | self.filepath = os.path.abspath(filepath) 53 | self.filename = os.path.basename(prefix) 54 | self.extension = ext 55 | self.offset = timedelta() 56 | 57 | 58 | def from_srt(self, filepath): 59 | prefix, ext = os.path.splitext(filepath) 60 | if ext != '.srt': 61 | raise ValueError('filetype must be .srt format') 62 | prefix = os.path.basename(re.sub(r'\.\w\w$', '', prefix)) 63 | dir = os.path.dirname(filepath) 64 | for f in os.listdir(dir): 65 | _, ext = os.path.splitext(f) 66 | if f.startswith(prefix) and ext in Media.FORMATS: 67 | return self.__init__(os.path.join(dir, f), subtitles=[filepath]) 68 | raise ValueError('no media for subtitle: "{}"'.format(filepath)) 69 | 70 | 71 | def subtitles(self): 72 | if self.__subtitles is not None: 73 | for s in self.__subtitles: 74 | yield Subtitle(self, s) 75 | else: 76 | dir = os.path.dirname(self.filepath) 77 | for f in os.listdir(dir): 78 | if f.endswith('.srt') and f.startswith(self.filename): 79 | yield Subtitle(self, os.path.join(dir, f)) 80 | 81 | 82 | def mfcc(self, duration=60*15, seek=True): 83 | transcode = Transcode(self.filepath, duration=duration, seek=seek) 84 | self.offset = transcode.start 85 | print("Transcoding...") 86 | transcode.run() 87 | y, sr = librosa.load(transcode.output, sr=Media.FREQ) 88 | print("Analysing...") 89 | self.mfcc = librosa.feature.mfcc(y=y, sr=sr, 90 | hop_length=int(Media.HOP_LEN), 91 | n_mfcc=int(Media.N_MFCC) 92 | ) 93 | os.remove(transcode.output) 94 | return self.mfcc 95 | 96 | 97 | 98 | class Subtitle: 99 | """ 100 | Subtitle class represnets an .srt file on disk and provides 101 | functionality to inspect and manipulate the subtitle content 102 | """ 103 | 104 | def __init__(self, media, path): 105 | self.media = media 106 | self.path = path 107 | self.subs = pysrt.open(self.path, encoding=self._find_encoding()) 108 | 109 | def labels(self, subs=None): 110 | if self.media.mfcc is None: 111 | raise RuntimeError("Must analyse mfcc before generating labels") 112 | samples = len(self.media.mfcc[0]) 113 | labels = np.zeros(samples) 114 | for sub in self.subs if subs is None else subs: 115 | start = timeToPos(sub.start - self.offset()) 116 | end = timeToPos(sub.end - self.offset())+1 117 | for i in range(start, end): 118 | if i >= 0 and i < len(labels): 119 | labels[i] = 1 120 | 121 | 
return labels 122 | 123 | def _find_encoding(self): 124 | data = None 125 | with open(self.path, "rb") as f: 126 | data = f.read() 127 | det = chardet.detect(data) 128 | return det.get("encoding") 129 | 130 | 131 | def offset(self): 132 | d = self.media.offset 133 | hours, remainder = divmod(d.seconds, 3600) 134 | minutes, seconds = divmod(remainder, 60) 135 | return SubRipTime( 136 | hours=hours, minutes=minutes, seconds=seconds, 137 | milliseconds=d.microseconds/1000 138 | ) 139 | 140 | 141 | def logloss(self, pred, actual, margin=12): 142 | blocks = secondsToBlocks(margin) 143 | logloss = np.ones(blocks*2) 144 | indices = np.ones(blocks*2) 145 | nonzero = np.nonzero(actual)[0] 146 | begin = max(nonzero[0]-blocks, 0) 147 | end = min(nonzero[-1]+blocks, len(actual)-1) 148 | pred = pred[begin:end] 149 | actual = actual[begin:end] 150 | for i, offset in enumerate(range(-blocks, blocks)): 151 | snippet = np.roll(actual, offset) 152 | try: 153 | logloss[i] = sklearn.metrics.log_loss(snippet[blocks:-blocks], pred[blocks:-blocks]) 154 | except (ValueError, RuntimeWarning): 155 | pass 156 | indices[i] = offset 157 | 158 | return indices, logloss 159 | 160 | 161 | def sync(self, net, safe=True, margin=12, plot=True): 162 | secs = 0.0 163 | labels = self.labels() 164 | mfcc = self.media.mfcc.T 165 | mfcc = mfcc[..., np.newaxis] 166 | pred = net.predict(mfcc) 167 | x, y = self.logloss(pred, labels, margin=margin) 168 | accept = True 169 | if safe: 170 | mean = np.mean(y) 171 | sd = np.std(y) 172 | accept = np.min(y) < mean - sd 173 | if accept: 174 | secs = blocksToSeconds(x[np.argmin(y)]) 175 | print("Shift {} seconds:".format(secs)) 176 | self.subs.shift(seconds=secs) 177 | self.subs.save(self.path, encoding='utf-8') 178 | if secs != 0.0: 179 | logger.info('{}: {}s'.format(self.path, secs)) 180 | if plot: 181 | self.plot_logloss(x, y) 182 | return secs 183 | 184 | 185 | def sync_all(self, net, margin=16, plot=True): 186 | secs = 0.0 187 | mfcc = self.media.mfcc.T 188 | mfcc = mfcc[..., np.newaxis] 189 | pred = net.predict(mfcc) 190 | print("Fitting...") 191 | self.__sync_all_rec(self.subs, pred) 192 | self.clean() 193 | self.subs.save(self.path, encoding='utf-8') 194 | 195 | 196 | def __sync_all_rec(self, subs, pred, margin=16): 197 | if len(subs) < 3: 198 | return 199 | labels = self.labels(subs=subs) 200 | if np.unique(labels).size <= 1: 201 | return 202 | x, y = self.logloss(pred, labels, margin=max(margin, 0.25)) 203 | #self.plot_logloss(x,y) 204 | #self.plot_labels(labels, pred) 205 | secs = blocksToSeconds(x[np.argmin(y)]) 206 | subs.shift(seconds=secs) 207 | # call recursively 208 | middle = subs[len(subs)//2] 209 | left = subs.slice(ends_before=middle.start) 210 | right = subs.slice(starts_after=middle.start) 211 | self.__sync_all_rec(left, pred, margin=margin/2) 212 | self.__sync_all_rec(right, pred, margin=margin/2) 213 | 214 | 215 | def clean(self): 216 | for i, s in enumerate(self.subs): 217 | if i >= len(self.subs)-1: 218 | return 219 | next = self.subs[i+1] 220 | if s.end > next.start: 221 | s.end = next.start 222 | 223 | 224 | 225 | def plot_logloss(self, x, y): 226 | import matplotlib.pyplot as plt 227 | plt.figure() 228 | plt.plot(x, y) 229 | plt.title('logloss over shifts') 230 | plt.ylabel('logloss') 231 | plt.xlabel('shifts') 232 | plt.legend(['logloss'], loc='upper left') 233 | plt.show() 234 | 235 | def plot_labels(self, labels, pred): 236 | import matplotlib.pyplot as plt 237 | plt.figure() 238 | plt.plot([i for i in range(0,len(labels))], labels, label='labels') 239 | 
plt.title('labels vs predictions') 240 | plt.ylabel('value') 241 | plt.xlabel('time') 242 | plt.legend(['labels'], loc='upper left') 243 | 244 | plt.figure() 245 | plt.plot([i for i in range(0,len(pred))], pred, label='pred') 246 | plt.title('labels vs predictions') 247 | plt.ylabel('value') 248 | plt.xlabel('time') 249 | plt.legend(['pred'], loc='upper left') 250 | plt.show() 251 | 252 | 253 | 254 | # Convert timestamp to seconds 255 | def timeToSec(t): 256 | total_sec = float(t.milliseconds)/1000 257 | total_sec += t.seconds 258 | total_sec += t.minutes*60 259 | total_sec += t.hours*60*60 260 | return total_sec 261 | 262 | 263 | # Return timestamp from cell position 264 | def timeToPos(t, freq=Media.FREQ, hop_len=Media.HOP_LEN): 265 | return round(timeToSec(t)/(hop_len/freq)) 266 | 267 | 268 | def secondsToBlocks(s, hop_len=Media.HOP_LEN, freq=Media.FREQ): 269 | return int(float(s)/(hop_len/freq)) 270 | 271 | 272 | def blocksToSeconds(h, freq=Media.FREQ, hop_len=Media.HOP_LEN): 273 | return float(h)*(hop_len/freq) 274 | -------------------------------------------------------------------------------- /subsync/model/convert.py: -------------------------------------------------------------------------------- 1 | # https://github.com/bitbionic/keras-to-tensorflow 2 | 3 | import os 4 | import os.path as osp 5 | import argparse 6 | 7 | import tensorflow as tf 8 | 9 | from keras.models import load_model 10 | from keras import backend as K 11 | 12 | def convertGraph(modelPath, output, outPath): 13 | ''' 14 | Converts an HD5F file to a .pb file for use with Tensorflow. 15 | Args: 16 | modelPath (str): path to the .h5 file 17 | output (str): name of the referenced output 18 | outPath (str): path to the output .pb file 19 | Returns: 20 | None 21 | ''' 22 | 23 | dir = os.path.dirname(os.path.realpath(__file__)) 24 | outdir = os.path.join(dir, os.path.dirname(outPath)) 25 | name = os.path.basename(outPath) 26 | basename, ext = os.path.splitext(name) 27 | 28 | #NOTE: If using Python > 3.2, this could be replaced with os.makedirs( name, exist_ok=True ) 29 | if not os.path.isdir(outdir): 30 | os.mkdir(outdir) 31 | 32 | K.set_learning_phase(0) 33 | 34 | net_model = load_model(modelPath) 35 | 36 | # Alias the outputs in the model - this sometimes makes them easier to access in TF 37 | tf.identity(net_model.output, name=output) 38 | 39 | sess = K.get_session() 40 | 41 | net_model.summary() 42 | 43 | # Write the graph in human readable 44 | f = '{}.reference.pb.ascii'.format(basename) 45 | tf.train.write_graph(sess.graph.as_graph_def(), outdir, f, as_text=True) 46 | print('Saved the graph definition in ascii format at: ', osp.join(outdir, f)) 47 | 48 | # Write the graph in binary .pb file 49 | from tensorflow.python.framework import graph_util 50 | from tensorflow.python.framework import graph_io 51 | constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), [output]) 52 | graph_io.write_graph(constant_graph, outdir, name, as_text=False) 53 | print('Saved the constant graph (ready for inference) at: ', outPath) 54 | 55 | 56 | if __name__ == '__main__': 57 | convertGraph('out/ann.hdf5', 'speech_out', 'out/subsync.pb') 58 | -------------------------------------------------------------------------------- /subsync/model/eval_ann.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pickle 3 | import sys 4 | import os 5 | 6 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 7 | OUT_DIR = 
os.path.join(DIRNAME, 'out') 8 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 9 | HIST = os.path.join(OUT_DIR, 'ann.hist') 10 | 11 | if not os.path.exists(MODEL): 12 | print("missing model:", MODEL) 13 | sys.exit(1) 14 | 15 | if not os.path.exists(HIST): 16 | print("missing history:", HIST) 17 | sys.exit(1) 18 | 19 | 20 | def plot(history): 21 | # Summarize history for accuracy 22 | plt.figure() 23 | plt.plot(history['acc']) 24 | plt.plot(history['val_acc']) 25 | plt.title('model accuracy') 26 | plt.ylabel('accuracy') 27 | plt.xlabel('epoch') 28 | plt.legend(['train', 'test'], loc='upper left') 29 | 30 | # Summarize history for loss 31 | plt.figure() 32 | plt.plot(history['loss']) 33 | plt.plot(history['val_loss']) 34 | plt.title('model loss') 35 | plt.ylabel('loss') 36 | plt.xlabel('epoch') 37 | plt.legend(['train', 'test'], loc='upper left') 38 | 39 | plt.show() 40 | 41 | 42 | if __name__ == '__main__': 43 | history = pickle.load(open(HIST, "rb")) 44 | 45 | print('val_loss:', min(history['val_loss'])) 46 | print('val_acc:', max(history['val_acc'])) 47 | 48 | try: 49 | plot(history) 50 | except KeyboardInterrupt as e: 51 | sys.exit(0) 52 | -------------------------------------------------------------------------------- /subsync/model/eval_logloss.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import numpy as np 3 | from train_data import * 4 | from train_ann import * 5 | 6 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 7 | 8 | if not os.path.exists(MODEL): 9 | print("missing model:", MODEL) 10 | sys.exit(1) 11 | 12 | 13 | def logloss(pred, actual): 14 | begin = np.argmax(actual) * (-1) 15 | end = np.argmax(actual[::-1]) + 1 16 | print("Calculating {} logloss values".format(end-begin)) 17 | logloss = np.zeros(end-begin) 18 | indices = np.zeros(end-begin) 19 | for i, offset in enumerate(range(begin, end)): 20 | logloss[i] = sklearn.metrics.log_loss(np.roll(actual, offset), pred) 21 | indices[i] = offset 22 | 23 | return indices, logloss 24 | 25 | 26 | def plot_logloss(x, y): 27 | plt.figure() 28 | plt.plot(x, y) 29 | plt.title('logloss over shifts') 30 | plt.ylabel('logloss') 31 | plt.xlabel('shifts') 32 | plt.legend(['logloss'], loc='upper left') 33 | 34 | 35 | def load_model(input_shape): 36 | model = ann_model(input_shape) 37 | model.load_weights(MODEL) 38 | return model 39 | 40 | if __name__ == '__main__': 41 | files = transcode_audio() 42 | mfcc, labels = extract_features(files=files) 43 | 44 | for X, Y in zip(mfcc, labels): 45 | shape = (len(X), 1) 46 | X, Y = prepare_data(X, Y, balance=False) 47 | model = load_model(shape) 48 | print("Predicting...") 49 | pred = model.predict(X, batch_size=32) 50 | print("Done...") 51 | x, y = logloss(pred, Y) 52 | plot_logloss(x, y) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /subsync/model/eval_train.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from train_ann import * 3 | from train_data import * 4 | import os 5 | 6 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 7 | 8 | 9 | if not os.path.exists(MODEL): 10 | print("missing model:", MODEL) 11 | sys.exit(1) 12 | 13 | 14 | def load_model(input_shape): 15 | model = ann_model(input_shape) 16 | model.load_weights(MODEL) 17 | return model 18 | 19 | 20 | def plot_pred(pred, actual): 21 | plt.figure() 22 | plt.plot(pred) 23 | plt.plot(actual) 24 | plt.title('prediction evaluation') 25 | plt.ylabel('label') 26 | 
plt.xlabel('time') 27 | plt.legend(['pred', 'actual'], loc='upper left') 28 | 29 | 30 | if __name__ == '__main__': 31 | files = transcode_audio() 32 | wav, srt = extract_features(files=files) 33 | 34 | for X, Y in zip(wav, srt): 35 | shape = (len(X), 1) 36 | X, Y = prepare_data(X, Y, balance=False) 37 | model = load_model(shape) 38 | pred = model.predict(X, batch_size=32) 39 | plot_pred(pred, Y) 40 | 41 | plt.show() 42 | -------------------------------------------------------------------------------- /subsync/model/test.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import librosa 3 | from train_data import * 4 | import os 5 | 6 | TEST_DIR = os.path.join(DIRNAME, 'test') 7 | 8 | def spectral_centroid(file): 9 | y, sr = librosa.load(file) 10 | cent = librosa.feature.spectral_centroid(y=y, sr=sr) 11 | 12 | plt.figure() 13 | plt.semilogy(cent.T, label='Spectral centroid') 14 | plt.ylabel('Hz') 15 | plt.xticks([]) 16 | plt.xlim([0, cent.shape[-1]]) 17 | plt.legend() 18 | plt.title('log Power spectrogram') 19 | plt.tight_layout() 20 | 21 | 22 | def plot_pred(pred, actual): 23 | plt.figure() 24 | plt.plot(pred) 25 | plt.plot(actual) 26 | plt.title('prediction evaluation') 27 | plt.ylabel('label') 28 | plt.xlabel('time') 29 | plt.legend(['pred', 'actual'], loc='upper left') 30 | 31 | 32 | if __name__ == '__main__': 33 | filename = 'test_440hz_880hz' 34 | audio = os.path.join(TEST_DIR, filename + '.wav') 35 | sub = os.path.join(TEST_DIR, filename + '.srt') 36 | 37 | files = [(audio, sub)] 38 | mfcc, srt = extract_features(files=files) 39 | 40 | 41 | for X, Y in zip(mfcc, srt): 42 | shape = (len(X), 1) 43 | print("Len", len(X[0])) 44 | plot_pred(Y, np.array([])) 45 | spectral_centroid(audio) 46 | 47 | plt.show() -------------------------------------------------------------------------------- /subsync/model/train_ann.py: -------------------------------------------------------------------------------- 1 | from train_data import * 2 | import numpy as np 3 | import pickle 4 | import os 5 | 6 | # Keras imports 7 | from keras.layers import Dense, Input, LSTM, Conv1D, Conv2D, Dropout, Flatten, Activation, MaxPooling2D 8 | from keras.models import Model 9 | from keras.layers.normalization import BatchNormalization 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint 11 | from keras.optimizers import Adam, RMSprop 12 | 13 | # Matplotlib imports 14 | import matplotlib.pylab as plt 15 | 16 | # Sklearn imports 17 | from sklearn.utils import class_weight 18 | 19 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 20 | OUT_DIR = os.path.join(DIRNAME, "out") 21 | 22 | if not os.path.exists(OUT_DIR): 23 | os.makedirs(OUT_DIR) 24 | 25 | """ 26 | Returns a neural network model which can be used for training. 
The model first 27 | needs to be compiled 28 | """ 29 | def ann_model(input_shape): 30 | 31 | inp = Input(shape=input_shape, name='mfcc_in') 32 | model = inp 33 | 34 | model = Conv1D(filters=12, kernel_size=(3), activation='relu')(model) 35 | model = Conv1D(filters=12, kernel_size=(3), activation='relu')(model) 36 | model = Flatten()(model) 37 | 38 | model = Dense(56)(model) 39 | model = Activation('relu')(model) 40 | model = BatchNormalization()(model) 41 | model = Dropout(0.2)(model) 42 | model = Dense(28)(model) 43 | model = Activation('relu')(model) 44 | model = BatchNormalization()(model) 45 | 46 | model = Dense(1)(model) 47 | model = Activation('sigmoid')(model) 48 | 49 | model = Model(inp, model) 50 | return model 51 | 52 | 53 | """ 54 | Trains the neural network using generated test data. Saved the model and the 55 | training history in the ./out folder. 56 | """ 57 | def train_ann(): 58 | X, Y = extract_features() 59 | 60 | # Only consider first media file for now 61 | X, Y = X[0], Y[0] 62 | 63 | shape = (len(X), 1) 64 | model = ann_model(shape) 65 | 66 | filename = "out/ann.hdf5" 67 | 68 | checkpoint = ModelCheckpoint(filepath=filename, monitor='val_loss', verbose=0, save_best_only=True) 69 | cutoff = EarlyStopping(monitor='val_loss', min_delta=1E-3, mode='min', patience=5) 70 | 71 | model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001), metrics=['accuracy']) 72 | 73 | X, Y = prepare_data(X, Y, balance=True) 74 | 75 | print("Label 1:", len(Y[Y==1])) 76 | print("Label 0:", len(Y[Y==0])) 77 | 78 | # Permutate training data in random order 79 | rand = np.random.permutation(np.arange(len(Y))) 80 | X = X[rand] 81 | Y = Y[rand] 82 | 83 | options = { 84 | 'epochs': 200, 85 | 'batch_size': 32, 86 | 'shuffle': True, 87 | 'validation_split': 0.3, 88 | 'verbose': 2, 89 | 'callbacks': [checkpoint, cutoff] 90 | } 91 | 92 | print("Training neural network:", filename) 93 | hist = model.fit(X, Y, **options) 94 | 95 | print('val_loss:', min(hist.history['val_loss'])) 96 | print('val_acc:', max(hist.history['val_acc'])) 97 | 98 | with open('out/ann.hist', 'wb') as hist_file: 99 | pickle.dump(hist.history, hist_file) 100 | 101 | 102 | if __name__ == '__main__': 103 | train_ann() 104 | print("module used to train the artifical neural network") 105 | -------------------------------------------------------------------------------- /subsync/model/train_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import matplotlib.pyplot as plt 3 | import librosa.display 4 | import numpy as np 5 | import subprocess 6 | import librosa 7 | import pysrt 8 | import sys 9 | import os 10 | import re 11 | 12 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 13 | TRAIN_DIR = os.path.join(DIRNAME, 'training') 14 | 15 | FREQ = 16000 # Audio frequency 16 | N_MFCC = 13 17 | HOP_LEN = 512.0 # Num of items per sample 18 | # 1 item = 1/16000 seg = 32 ms 19 | ITEM_TIME = HOP_LEN/FREQ 20 | 21 | 22 | if not os.path.exists(TRAIN_DIR): 23 | print("missing training data in directory:", TRAIN_DIR) 24 | sys.exit(1) 25 | 26 | # Convert timestamp to seconds 27 | def timeToSec(t): 28 | total_sec = float(t.milliseconds)/1000 29 | total_sec += t.seconds 30 | total_sec += t.minutes*60 31 | total_sec += t.hours*60*60 32 | return total_sec 33 | 34 | # Return timestamp from cell position 35 | def timeToPos(t, freq=FREQ, hop_len=HOP_LEN): 36 | return round(timeToSec(t)/(hop_len/freq)) 37 | 38 | 39 | """ 40 | Uses ffmpeg to transcode and extract audio from 
movie files in the training
41 | directory. The function returns a list of tuples: the .wav files and the corresponding
42 | .srt files for processing.
43 | """
44 | def transcode_audio(dir=TRAIN_DIR):
45 |     files = os.listdir(dir)
46 |     p = re.compile(r'.*\.(mkv|avi|mp4)$')  # match supported media files by extension
47 |     files = [ f for f in files if p.match(f) ]
48 | 
49 |     training = []
50 | 
51 |     for f in files:
52 |         name, extension = os.path.splitext(f)
53 |         input = os.path.join(dir, f)
54 |         output = os.path.join(dir, name + '.wav')
55 |         srt = os.path.join(dir, name + '.srt')
56 | 
57 |         if not os.path.exists(srt):
58 |             print("missing subtitle for training:", srt)
59 |             sys.exit(1)
60 | 
61 |         training.append((output, srt))
62 | 
63 |         if os.path.exists(output):
64 |             continue
65 | 
66 |         print("Transcoding:", input)
67 |         command = "ffmpeg -y -i {0} -ab 160k -ac 2 -ar {2} -vn {1}".format(input, output, FREQ)
68 |         code = subprocess.call(command, stderr=subprocess.DEVNULL, shell=True)
69 |         if code != 0:
70 |             raise Exception("ffmpeg returned: {}".format(code))
71 | 
72 |     return training
73 | 
74 | 
75 | """
76 | Extracts the features and labels from the .wav and .srt files. The audio is
77 | processed using MFCC. Returns a tuple where the first element is the MFCC data
78 | and the second element is the labels for the data.
79 | """
80 | def extract_features(files=None):
81 |     if files is None:
82 |         files = transcode_audio()
83 | 
84 |     audio = []
85 |     labels = []
86 | 
87 |     for (wav, srt) in files:
88 |         print("Processing audio:", wav)
89 |         y, sr = librosa.load(wav, sr=FREQ)
90 |         mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=int(HOP_LEN), n_mfcc=int(N_MFCC))
91 |         label = extract_labels(srt, len(mfcc[0]))
92 |         audio.append(mfcc)
93 |         labels.append(label)
94 | 
95 |     return audio, labels
96 | 
97 | 
98 | """
99 | Processes a .srt file and returns a numpy array of labels for each sample. If
100 | there is a subtitle at the i'th sample, there is a 1 at position i, else 0.
101 | """
102 | def extract_labels(srt, samples):
103 |     subs = pysrt.open(srt)
104 |     labels = np.zeros(samples)
105 |     for sub in subs:
106 |         start = timeToPos(sub.start)
107 |         end = timeToPos(sub.end)+1
108 |         for i in range(start, end):
109 |             if i < len(labels):
110 |                 labels[i] = 1
111 | 
112 |     return labels
113 | 
114 | 
115 | """
116 | Returns a mask of indexes in Y (a selection) for which the selection has an
117 | equal/balanced choice for every unique value in Y. That is, exactly the same
118 | number of items is chosen for each class.
119 | """
120 | def balance_classes(Y):
121 |     uniq = np.unique(Y)
122 |     C = [np.squeeze(np.argwhere(Y==c)) for c in uniq]
123 |     minority = min([len(c) for c in C])
124 |     M = [np.random.choice(c, size=minority, replace=False) for c in C]
125 |     return np.append(*M)
126 | 
127 | 
128 | """
129 | Prepares the data for processing in a neural network. First the data is
130 | converted to the proper dimensions, and afterwards the data is balanced
131 | to correct for class imbalance issues.
132 | """ 133 | def prepare_data(X, Y, balance=True): 134 | X = X.T 135 | X = X[..., np.newaxis] 136 | 137 | if balance: 138 | # Balance classes such that there are n of each class 139 | balance = balance_classes(Y) 140 | X = X[balance] 141 | Y = Y[balance] 142 | 143 | return X, Y 144 | 145 | 146 | """ 147 | Used to plot the MFCC spectrograms for inspecting 148 | """ 149 | def plot_mfcc(mfcc): 150 | plt.figure(figsize=(10, 4)) 151 | librosa.display.specshow(mfcc, x_axis='time') 152 | plt.colorbar() 153 | plt.title('MFCC') 154 | plt.tight_layout() 155 | plt.show() 156 | 157 | 158 | if __name__ == '__main__': 159 | mfccs, labels = extract_features() 160 | 161 | for mfcc in mfccs: 162 | plot_mfcc(mfcc) 163 | -------------------------------------------------------------------------------- /subsync/net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | class NeuralNet: 6 | """ 7 | NeuralNet provides a prediction model for predicting speech using 8 | Mel-frequency cepstral coefficients (MFCC) data 9 | """ 10 | 11 | DIR = os.path.dirname(os.path.realpath(__file__)) 12 | 13 | def __init__(self): 14 | model = os.path.join(NeuralNet.DIR, 'subsync.pb') 15 | self.graph = self.load_graph(model) 16 | self.input = self.graph.get_tensor_by_name('subsync/mfcc_in:0') 17 | self.output = self.graph.get_tensor_by_name('subsync/speech_out:0') 18 | 19 | 20 | def summary(self): 21 | for op in self.graph.get_operations(): 22 | print(op.name) 23 | 24 | 25 | def load_graph(self, frozen_graph_filename): 26 | with tf.gfile.GFile(frozen_graph_filename, "rb") as f: 27 | graph_def = tf.GraphDef() 28 | graph_def.ParseFromString(f.read()) 29 | 30 | with tf.Graph().as_default() as graph: 31 | tf.import_graph_def( 32 | graph_def, 33 | input_map=None, 34 | return_elements=None, 35 | name="subsync", 36 | producer_op_list=None 37 | ) 38 | return graph 39 | 40 | 41 | def predict(self, mfcc): 42 | print("Predicting values...") 43 | with tf.Session(graph=self.graph) as sess: 44 | return sess.run(self.output, feed_dict={self.input: mfcc}) 45 | -------------------------------------------------------------------------------- /subsync/subsync.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tympanix/subsync/e5ce40c877c4fb0d9c358e1b0a1e36b2a8b4c0a7/subsync/subsync.pb -------------------------------------------------------------------------------- /subsync/test/test_440hz_880hz.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:01,000 --> 00:00:02,000 3 | High. 4 | 5 | 2 6 | 00:00:03,000 --> 00:00:04,000 7 | High. 8 | -------------------------------------------------------------------------------- /subsync/test/test_440hz_880hz.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tympanix/subsync/e5ce40c877c4fb0d9c358e1b0a1e36b2a8b4c0a7/subsync/test/test_440hz_880hz.wav -------------------------------------------------------------------------------- /subsync/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.6' 2 | --------------------------------------------------------------------------------
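
For readers who want to drive the library from Python rather than from the command line, the modules above compose as follows. This is a minimal sketch that mirrors what `subsync.main.run()` does for a single media file; the file name is a placeholder and ffmpeg must be available on the PATH:

```python
# Minimal programmatic sketch mirroring subsync.main.run() for one media file.
# 'movie.mkv' is a placeholder; any .srt files named like the media file and
# located next to it are discovered automatically.
from subsync.media import Media
from subsync.net import NeuralNet

media = Media('movie.mkv')
media.mfcc(duration=60 * 15, seek=True)   # transcode a 15-minute sample and compute MFCCs
model = NeuralNet()                       # loads the bundled subsync.pb TensorFlow graph

for subtitle in media.subtitles():
    # shift the whole .srt by the offset with the lowest log loss and save it in place
    subtitle.sync(model, plot=False, margin=12)
```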