├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── requirements.txt ├── setup.cfg ├── setup.py └── subsync ├── __init__.py ├── __main__.py ├── bin └── subsync ├── ffmpeg.py ├── log.py ├── main.py ├── media.py ├── model ├── convert.py ├── eval_ann.py ├── eval_logloss.py ├── eval_train.py ├── test.py ├── train_ann.py └── train_data.py ├── net.py ├── subsync.pb ├── test ├── test_440hz_880hz.srt └── test_440hz_880hz.wav └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | training 2 | out 3 | 4 | # Created by https://www.gitignore.io/api/macos,python,windows 5 | 6 | ### macOS ### 7 | *.DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | # Thumbnails 15 | ._* 16 | 17 | # Files that might appear in the root of a volume 18 | .DocumentRevisions-V100 19 | .fseventsd 20 | .Spotlight-V100 21 | .TemporaryItems 22 | .Trashes 23 | .VolumeIcon.icns 24 | .com.apple.timemachine.donotpresent 25 | 26 | # Directories potentially created on remote AFP share 27 | .AppleDB 28 | .AppleDesktop 29 | Network Trash Folder 30 | Temporary Items 31 | .apdisk 32 | 33 | ### Python ### 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *.class 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | build/ 45 | develop-eggs/ 46 | dist/ 47 | downloads/ 48 | eggs/ 49 | .eggs/ 50 | lib/ 51 | lib64/ 52 | parts/ 53 | sdist/ 54 | var/ 55 | wheels/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | 60 | # PyInstaller 61 | # Usually these files are written by a python script from a template 62 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 63 | *.manifest 64 | *.spec 65 | 66 | # Installer logs 67 | pip-log.txt 68 | pip-delete-this-directory.txt 69 | 70 | # Unit test / coverage reports 71 | htmlcov/ 72 | .tox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | .pytest_cache/ 77 | nosetests.xml 78 | coverage.xml 79 | *.cover 80 | .hypothesis/ 81 | 82 | # Translations 83 | *.mo 84 | *.pot 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule.* 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | 133 | ### Windows ### 134 | # Windows thumbnail cache files 135 | Thumbs.db 136 | ehthumbs.db 137 | ehthumbs_vista.db 138 | 139 | # Folder config file 140 | Desktop.ini 141 | 142 | # Recycle Bin used on file shares 143 | .BIN/ 144 | 145 | # Windows Installer files 146 | *.cab 147 | *.msi 148 | *.msm 149 | *.msp 150 | 151 | # Windows shortcuts 152 | *.lnk 153 | 154 | # Visual studio code 155 | .vscode 156 | 157 | 158 | # End of https://www.gitignore.io/api/macos,python,windows 159 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 
2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include subsync/*.pb
2 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env make
2 | 
3 | init:
4 | 	pip install -r requirements.txt
5 | 
6 | setup:
7 | 	python subsync/model/train_data.py
8 | 
9 | train:
10 | 	python subsync/model/train_ann.py
11 | 
12 | eval:
13 | 	python subsync/model/eval_ann.py
14 | 
15 | logloss:
16 | 	python subsync/model/eval_logloss.py
17 | 
18 | convert:
19 | 	python subsync/model/convert.py
20 | 
21 | test:
22 | 	python subsync/model/test.py
23 | .PHONY: test
24 | 
25 | freeze:
26 | 	pip freeze > requirements.txt
27 | 
28 | dist:
29 | 	python setup.py sdist
30 | 
31 | publish:
32 | 	twine upload --repository-url https://test.pypi.org/legacy/ dist/*
33 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Subsync
2 | **Synchronize your subtitles using machine learning**
3 | 
4 | Subsync analyses and processes the sound from your media files and uses machine learning to detect speech. Speech detection is used to shift existing subtitles for a perfect match between audio and text!
5 | 
6 | ## Features
7 | - [x] Machine learning model for voice activity detection (*not recognition*)
8 | - [x] Shift subtitle as a whole for best match
9 | - [x] Sync every sentence in the subtitle individually
10 | - [ ] Sync using an existing matched subtitle in a different language
11 | 
12 | ## Dependencies
13 | * ffmpeg (https://www.ffmpeg.org/download.html)
14 | 
15 | ## Installation
16 | ```bash
17 | pip install subsync
18 | ```
19 | 
20 | ## Help
21 | ```
22 | usage: subsync [-h] [--version] [--graph] [-d SECONDS] [-m SECONDS] [-s]
23 |                [-r] [--logfile PATH]
24 |                MEDIA [MEDIA ...]
25 | 
26 | positional arguments:
27 |   MEDIA                 media for which to synchronize subtitles
28 | 
29 | optional arguments:
30 |   -h, --help            show this help message and exit
31 |   --version             show program's version number and exit
32 |   --graph               show graph for subtitle synchronization (default:
33 |                         False)
34 |   -d SECONDS, --duration SECONDS
35 |                         duration (in seconds) of the audio sample; a longer
36 |                         sample increases precision but reduces speed (default: 900)
37 |   -m SECONDS, --margin SECONDS
38 |                         the margin in which to search for a subtitle match
39 |                         (default: 12)
40 |   -s, --start           sample audio from the start of the media instead of
41 |                         the middle (default: False)
42 |   -r, --recursive       recursively sync every sentence in the subtitle
43 |                         (default: False)
44 |   --logfile PATH        path to location of log file for logging application
45 |                         specific information (default: None)
46 | ```
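## Example
Assuming a media file and its subtitle sit next to each other (the names `movie.mkv` and `movie.srt` below are only placeholders), a plain invocation shifts each matching `.srt` as a whole, while `--recursive` additionally re-syncs every sentence and therefore samples the full audio track:

```bash
# shift the whole subtitle by the single best offset (15 minute audio sample by default)
subsync movie.mkv

# sync every sentence in the subtitle individually (processes the full audio track)
subsync --recursive movie.mkv
```

The synchronized subtitle is saved in place, so keep a backup if you want to preserve the original timings.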
47 | 
48 | ## Special thanks
49 | [[1] Automatic Subtitle Synchronization through Machine Learning](https://machinelearnings.co/automatic-subtitle-synchronization-e188a9275617)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.1.12
2 | astor==0.6.2
3 | audioread==2.1.5
4 | bleach==1.5.0
5 | cffi==1.11.5
6 | chardet==3.0.4
7 | cycler==0.10.0
8 | decorator==4.2.1
9 | gast==0.2.0
10 | graphviz==0.8.2
11 | grpcio==1.10.0
12 | h5py==2.8.0rc1
13 | html5lib==0.9999999
14 | joblib==0.11
15 | Keras==2.1.5
16 | kiwisolver==1.0.1
17 | librosa==0.6.0
18 | llvmlite==0.22.0
19 | Markdown==2.6.11
20 | matplotlib==2.2.2
21 | numba==0.37.0
22 | numpy==1.14.2
23 | protobuf==3.5.2.post1
24 | pycparser==2.18
25 | pydot==1.2.4
26 | pyparsing==2.2.0
27 | pysrt==1.1.1
28 | python-dateutil==2.7.0
29 | pytz==2018.3
30 | PyYAML==3.12
31 | resampy==0.2.0
32 | scikit-learn==0.19.1
33 | scipy==1.0.0
34 | six==1.11.0
35 | sklearn==0.0
36 | tensorboard==1.6.0
37 | tensorflow==1.5.0
38 | tensorflow-tensorboard==1.5.1
39 | termcolor==1.1.0
40 | Werkzeug==0.14.1
41 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | exec(open('subsync/version.py').read())
4 | 
5 | setup(name='subsync',
6 |       version=__version__,
7 |       description='Synchronize your subtitles with machine learning',
8 |       classifiers=[
9 |         'License :: OSI Approved :: MIT License',
10 |         'Programming Language :: Python :: 3.6',
11 |         'Topic :: Multimedia :: Sound/Audio :: Analysis',
12 |         'Topic :: Multimedia :: Sound/Audio :: Speech',
13 |       ],
14 |       keywords='subtitle synchronize machine learning',
15 |       platforms=["Independent"],
16 |       scripts=['subsync/bin/subsync'],
17 |       include_package_data=True,
18 |       url='https://github.com/tympanix/subsync',
19 |       author='tympanix',
20 |       author_email='tympanix@gmail.com',
21 |       license='MIT',
22 |       packages=['subsync'],
23 |       install_requires=[
24 |           'tensorflow>=1.0.0',
25 |           'numpy',
26 |           'matplotlib',
27 |           'librosa',
28 |           'h5py>=2.9.0',
29 |           'pysrt',
30 |       ],
31 |       zip_safe=False)
32 | 
--------------------------------------------------------------------------------
/subsync/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | from .version import __version__ 3 | from .main import run 4 | -------------------------------------------------------------------------------- /subsync/__main__.py: -------------------------------------------------------------------------------- 1 | from .main import run 2 | 3 | if __name__ == '__main__': 4 | run() 5 | -------------------------------------------------------------------------------- /subsync/bin/subsync: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import subsync 4 | subsync.run() 5 | -------------------------------------------------------------------------------- /subsync/ffmpeg.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | import subprocess 4 | import os 5 | import tempfile 6 | import re 7 | import sys 8 | from datetime import timedelta 9 | from subprocess import DEVNULL, STDOUT, PIPE 10 | 11 | 12 | class Transcode: 13 | """ 14 | Transcode is a wrapper around the ffmpeg binary used to transcode 15 | audio from media files. 16 | """ 17 | 18 | def __init__(self, input, binary='ffmpeg', seek=False, start=0, duration=0, channels=2, samplerate=16000, bitrate='160k'): 19 | if seek and start: 20 | raise ValueError("Can't both supply seek and start argument in transcode") 21 | self.input = input 22 | self.bitrate = bitrate 23 | self.channels = channels 24 | self.samplerate = samplerate 25 | self.binary = binary 26 | self.start = start if type(start) is timedelta else timedelta(seconds=start) 27 | self.duration = duration if type(duration) is timedelta else timedelta(seconds=duration) 28 | self.length = self.__length() 29 | if seek: 30 | self.start = max(timedelta(), self.length/2-self.duration/2) 31 | 32 | self.output = os.path.join(tempfile.gettempdir(), 'subsync_' + randomString() + '.wav') 33 | 34 | 35 | def command(self): 36 | cmd = [self.binary, '-y'] 37 | cmd.extend(('-i', shellquote(self.input))) 38 | 39 | if self.start > timedelta(): 40 | cmd.extend(('-ss', duration_str(self.start))) 41 | 42 | if self.duration > timedelta(): 43 | cmd.extend(('-t', self.duration.seconds)) 44 | 45 | cmd.extend(('-ab', self.bitrate)) 46 | cmd.extend(('-ac', self.channels)) 47 | cmd.extend(('-ar', self.samplerate)) 48 | cmd.append('-vn') # no video 49 | cmd.append(self.output) 50 | 51 | return [str(s) for s in cmd] 52 | 53 | 54 | def __length(self): 55 | cmd = subprocess.Popen(['ffprobe', self.input], stdout=PIPE, stderr=STDOUT) 56 | duration = [x.decode("utf-8") for x in cmd.stdout.readlines() if b"Duration" in x] 57 | match = re.search(r'(\d\d):(\d\d):(\d\d)\.(\d\d)', duration[0]) 58 | code = cmd.wait() 59 | if not match or code != 0: 60 | raise RuntimeError('Could not call ffprobe:', self.input) 61 | return timedelta( 62 | hours=int(match.group(1)), 63 | minutes=int(match.group(2)), 64 | seconds=int(match.group(3)), 65 | milliseconds=int(match.group(4))*100 66 | ) 67 | 68 | 69 | def run(self): 70 | code = subprocess.call(' '.join(self.command()), stderr=DEVNULL, shell=True) 71 | if code != 0: 72 | raise RuntimeError('Could not transcode audio:', self.input) 73 | 74 | 75 | def randomString(len=12): 76 | allchar = string.ascii_letters + string.digits 77 | return "".join(random.choice(allchar) for x in range(len)) 78 | 79 | 80 | def duration_str(d): 81 | hours, remainder = divmod(d.seconds, 3600) 82 | minutes, seconds = divmod(remainder, 60) 83 | return 
'{:02d}:{:02d}:{:02d}.{:06d}'.format(hours, minutes, seconds, d.microseconds)
84 | 
85 | 
86 | def shellquote(s):
87 |     if sys.platform == 'win32':
88 |         return "\"" + s.replace("\"", "\\\"") + "\""
89 |     else:
90 |         return "'" + s.replace("'", "'\\''") + "'"
91 | 
--------------------------------------------------------------------------------
/subsync/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | def init_logger(filepath):
4 |     # create file handler which logs even debug messages
5 |     fh = logging.FileHandler(filepath)
6 |     fh.setLevel(logging.DEBUG)
7 |     fh.setFormatter(formatter)
8 |     logger.addHandler(fh)
9 | 
10 | 
11 | logger = logging.getLogger('subsync_logger')
12 | logger.setLevel(logging.DEBUG)
13 | # create console handler with a higher log level
14 | ch = logging.StreamHandler()
15 | ch.setLevel(logging.ERROR)
16 | # create formatter and add it to the handlers
17 | formatter = logging.Formatter('%(asctime)s - %(message)s')
18 | ch.setFormatter(formatter)
19 | # add the handlers to the logger
20 | logger.addHandler(ch)
21 | 
--------------------------------------------------------------------------------
/subsync/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from .log import logger, init_logger
4 | from .version import __version__
5 | 
6 | def run():
7 |     parser = argparse.ArgumentParser(
8 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter
9 |     )
10 |     parser.add_argument('media', metavar='MEDIA', type=str, nargs='+',
11 |                         help='media for which to synchronize subtitles')
12 |     parser.add_argument('--version', action='version', version='%(prog)s {}'.format(__version__))
13 |     parser.add_argument('--graph', dest="graph", action='store_true',
14 |                         help='show graph for subtitle synchronization')
15 |     parser.add_argument('-d', '--duration', dest='duration', type=int, metavar='SECONDS', default=60*15,
16 |                         help='duration (in seconds) of the audio sample; a longer sample increases precision but reduces speed')
17 |     parser.add_argument('-m', '--margin', dest='margin', type=int, metavar='SECONDS', default=12,
18 |                         help='the margin in which to search for a subtitle match')
19 |     parser.add_argument('-s', '--start', dest='start', action='store_true',
20 |                         help='sample audio from the start of the media instead of the middle')
21 |     parser.add_argument('-r', '--recursive', dest='recursive', action='store_true',
22 |                         help='recursively sync every sentence in the subtitle')
23 |     parser.add_argument('--logfile', dest='logfile', type=str, metavar='PATH',
24 |                         help='path to location of log file for logging application specific information')
25 | 
26 |     args = parser.parse_args()
27 | 
28 |     if args.logfile:
29 |         init_logger(args.logfile)
30 | 
31 | 
32 |     from .media import Media
33 |     media = [Media(m) for m in args.media if m]
34 | 
35 |     from .net import NeuralNet
36 |     model = NeuralNet()
37 | 
38 |     for m in media:
39 |         if args.recursive:
40 |             m.mfcc(duration=0, seek=False)
41 |         else:
42 |             m.mfcc(duration=args.duration, seek=not args.start)
43 |         for s in m.subtitles():
44 |             if args.recursive:
45 |                 s.sync_all(model, plot=args.graph, margin=args.margin)
46 |             else:
47 |                 s.sync(model, plot=args.graph, margin=args.margin)
48 | 
--------------------------------------------------------------------------------
/subsync/media.py:
--------------------------------------------------------------------------------
1 | import os
2 | import librosa
3 | import subprocess
4 | import tempfile
5 | import io
6 | 
import pysrt 7 | from pysrt import SubRipTime 8 | import string 9 | import random 10 | import chardet 11 | import re 12 | from datetime import timedelta 13 | 14 | import numpy as np 15 | import sklearn 16 | 17 | from .ffmpeg import Transcode 18 | from .log import logger 19 | 20 | 21 | class Media: 22 | """ 23 | Media class represents a media file on disk for which the content can be 24 | analyzed and retrieved. 25 | """ 26 | 27 | # List of supported media formats 28 | FORMATS = ['.mkv', '.mp4', '.wmv', '.avi', '.flv'] 29 | 30 | # The frequency of the generated audio 31 | FREQ = 16000 32 | 33 | # The number of coefficients to extract from the mfcc 34 | N_MFCC = 13 35 | 36 | # The number of samples in each mfcc coefficient 37 | HOP_LEN = 512.0 38 | 39 | # The length (seconds) of each item in the mfcc analysis 40 | LEN_MFCC = HOP_LEN/FREQ 41 | 42 | 43 | def __init__(self, filepath, subtitles=None): 44 | prefix, ext = os.path.splitext(filepath) 45 | if ext == '.srt': 46 | return self.from_srt(filepath) 47 | if not ext: 48 | raise ValueError('unknown file: "{}"'.format(filepath)) 49 | if ext not in Media.FORMATS: 50 | raise ValueError('filetype {} not supported: "{}"'.format(ext, filepath)) 51 | self.__subtitles = subtitles 52 | self.filepath = os.path.abspath(filepath) 53 | self.filename = os.path.basename(prefix) 54 | self.extension = ext 55 | self.offset = timedelta() 56 | 57 | 58 | def from_srt(self, filepath): 59 | prefix, ext = os.path.splitext(filepath) 60 | if ext != '.srt': 61 | raise ValueError('filetype must be .srt format') 62 | prefix = os.path.basename(re.sub(r'\.\w\w$', '', prefix)) 63 | dir = os.path.dirname(filepath) 64 | for f in os.listdir(dir): 65 | _, ext = os.path.splitext(f) 66 | if f.startswith(prefix) and ext in Media.FORMATS: 67 | return self.__init__(os.path.join(dir, f), subtitles=[filepath]) 68 | raise ValueError('no media for subtitle: "{}"'.format(filepath)) 69 | 70 | 71 | def subtitles(self): 72 | if self.__subtitles is not None: 73 | for s in self.__subtitles: 74 | yield Subtitle(self, s) 75 | else: 76 | dir = os.path.dirname(self.filepath) 77 | for f in os.listdir(dir): 78 | if f.endswith('.srt') and f.startswith(self.filename): 79 | yield Subtitle(self, os.path.join(dir, f)) 80 | 81 | 82 | def mfcc(self, duration=60*15, seek=True): 83 | transcode = Transcode(self.filepath, duration=duration, seek=seek) 84 | self.offset = transcode.start 85 | print("Transcoding...") 86 | transcode.run() 87 | y, sr = librosa.load(transcode.output, sr=Media.FREQ) 88 | print("Analysing...") 89 | self.mfcc = librosa.feature.mfcc(y=y, sr=sr, 90 | hop_length=int(Media.HOP_LEN), 91 | n_mfcc=int(Media.N_MFCC) 92 | ) 93 | os.remove(transcode.output) 94 | return self.mfcc 95 | 96 | 97 | 98 | class Subtitle: 99 | """ 100 | Subtitle class represnets an .srt file on disk and provides 101 | functionality to inspect and manipulate the subtitle content 102 | """ 103 | 104 | def __init__(self, media, path): 105 | self.media = media 106 | self.path = path 107 | self.subs = pysrt.open(self.path, encoding=self._find_encoding()) 108 | 109 | def labels(self, subs=None): 110 | if self.media.mfcc is None: 111 | raise RuntimeError("Must analyse mfcc before generating labels") 112 | samples = len(self.media.mfcc[0]) 113 | labels = np.zeros(samples) 114 | for sub in self.subs if subs is None else subs: 115 | start = timeToPos(sub.start - self.offset()) 116 | end = timeToPos(sub.end - self.offset())+1 117 | for i in range(start, end): 118 | if i >= 0 and i < len(labels): 119 | labels[i] = 1 120 | 121 | 
return labels 122 | 123 | def _find_encoding(self): 124 | data = None 125 | with open(self.path, "rb") as f: 126 | data = f.read() 127 | det = chardet.detect(data) 128 | return det.get("encoding") 129 | 130 | 131 | def offset(self): 132 | d = self.media.offset 133 | hours, remainder = divmod(d.seconds, 3600) 134 | minutes, seconds = divmod(remainder, 60) 135 | return SubRipTime( 136 | hours=hours, minutes=minutes, seconds=seconds, 137 | milliseconds=d.microseconds/1000 138 | ) 139 | 140 | 141 | def logloss(self, pred, actual, margin=12): 142 | blocks = secondsToBlocks(margin) 143 | logloss = np.ones(blocks*2) 144 | indices = np.ones(blocks*2) 145 | nonzero = np.nonzero(actual)[0] 146 | begin = max(nonzero[0]-blocks, 0) 147 | end = min(nonzero[-1]+blocks, len(actual)-1) 148 | pred = pred[begin:end] 149 | actual = actual[begin:end] 150 | for i, offset in enumerate(range(-blocks, blocks)): 151 | snippet = np.roll(actual, offset) 152 | try: 153 | logloss[i] = sklearn.metrics.log_loss(snippet[blocks:-blocks], pred[blocks:-blocks]) 154 | except (ValueError, RuntimeWarning): 155 | pass 156 | indices[i] = offset 157 | 158 | return indices, logloss 159 | 160 | 161 | def sync(self, net, safe=True, margin=12, plot=True): 162 | secs = 0.0 163 | labels = self.labels() 164 | mfcc = self.media.mfcc.T 165 | mfcc = mfcc[..., np.newaxis] 166 | pred = net.predict(mfcc) 167 | x, y = self.logloss(pred, labels, margin=margin) 168 | accept = True 169 | if safe: 170 | mean = np.mean(y) 171 | sd = np.std(y) 172 | accept = np.min(y) < mean - sd 173 | if accept: 174 | secs = blocksToSeconds(x[np.argmin(y)]) 175 | print("Shift {} seconds:".format(secs)) 176 | self.subs.shift(seconds=secs) 177 | self.subs.save(self.path, encoding='utf-8') 178 | if secs != 0.0: 179 | logger.info('{}: {}s'.format(self.path, secs)) 180 | if plot: 181 | self.plot_logloss(x, y) 182 | return secs 183 | 184 | 185 | def sync_all(self, net, margin=16, plot=True): 186 | secs = 0.0 187 | mfcc = self.media.mfcc.T 188 | mfcc = mfcc[..., np.newaxis] 189 | pred = net.predict(mfcc) 190 | print("Fitting...") 191 | self.__sync_all_rec(self.subs, pred) 192 | self.clean() 193 | self.subs.save(self.path, encoding='utf-8') 194 | 195 | 196 | def __sync_all_rec(self, subs, pred, margin=16): 197 | if len(subs) < 3: 198 | return 199 | labels = self.labels(subs=subs) 200 | if np.unique(labels).size <= 1: 201 | return 202 | x, y = self.logloss(pred, labels, margin=max(margin, 0.25)) 203 | #self.plot_logloss(x,y) 204 | #self.plot_labels(labels, pred) 205 | secs = blocksToSeconds(x[np.argmin(y)]) 206 | subs.shift(seconds=secs) 207 | # call recursively 208 | middle = subs[len(subs)//2] 209 | left = subs.slice(ends_before=middle.start) 210 | right = subs.slice(starts_after=middle.start) 211 | self.__sync_all_rec(left, pred, margin=margin/2) 212 | self.__sync_all_rec(right, pred, margin=margin/2) 213 | 214 | 215 | def clean(self): 216 | for i, s in enumerate(self.subs): 217 | if i >= len(self.subs)-1: 218 | return 219 | next = self.subs[i+1] 220 | if s.end > next.start: 221 | s.end = next.start 222 | 223 | 224 | 225 | def plot_logloss(self, x, y): 226 | import matplotlib.pyplot as plt 227 | plt.figure() 228 | plt.plot(x, y) 229 | plt.title('logloss over shifts') 230 | plt.ylabel('logloss') 231 | plt.xlabel('shifts') 232 | plt.legend(['logloss'], loc='upper left') 233 | plt.show() 234 | 235 | def plot_labels(self, labels, pred): 236 | import matplotlib.pyplot as plt 237 | plt.figure() 238 | plt.plot([i for i in range(0,len(labels))], labels, label='labels') 239 | 
plt.title('labels vs predictions') 240 | plt.ylabel('value') 241 | plt.xlabel('time') 242 | plt.legend(['labels'], loc='upper left') 243 | 244 | plt.figure() 245 | plt.plot([i for i in range(0,len(pred))], pred, label='pred') 246 | plt.title('labels vs predictions') 247 | plt.ylabel('value') 248 | plt.xlabel('time') 249 | plt.legend(['pred'], loc='upper left') 250 | plt.show() 251 | 252 | 253 | 254 | # Convert timestamp to seconds 255 | def timeToSec(t): 256 | total_sec = float(t.milliseconds)/1000 257 | total_sec += t.seconds 258 | total_sec += t.minutes*60 259 | total_sec += t.hours*60*60 260 | return total_sec 261 | 262 | 263 | # Return timestamp from cell position 264 | def timeToPos(t, freq=Media.FREQ, hop_len=Media.HOP_LEN): 265 | return round(timeToSec(t)/(hop_len/freq)) 266 | 267 | 268 | def secondsToBlocks(s, hop_len=Media.HOP_LEN, freq=Media.FREQ): 269 | return int(float(s)/(hop_len/freq)) 270 | 271 | 272 | def blocksToSeconds(h, freq=Media.FREQ, hop_len=Media.HOP_LEN): 273 | return float(h)*(hop_len/freq) 274 | -------------------------------------------------------------------------------- /subsync/model/convert.py: -------------------------------------------------------------------------------- 1 | # https://github.com/bitbionic/keras-to-tensorflow 2 | 3 | import os 4 | import os.path as osp 5 | import argparse 6 | 7 | import tensorflow as tf 8 | 9 | from keras.models import load_model 10 | from keras import backend as K 11 | 12 | def convertGraph(modelPath, output, outPath): 13 | ''' 14 | Converts an HD5F file to a .pb file for use with Tensorflow. 15 | Args: 16 | modelPath (str): path to the .h5 file 17 | output (str): name of the referenced output 18 | outPath (str): path to the output .pb file 19 | Returns: 20 | None 21 | ''' 22 | 23 | dir = os.path.dirname(os.path.realpath(__file__)) 24 | outdir = os.path.join(dir, os.path.dirname(outPath)) 25 | name = os.path.basename(outPath) 26 | basename, ext = os.path.splitext(name) 27 | 28 | #NOTE: If using Python > 3.2, this could be replaced with os.makedirs( name, exist_ok=True ) 29 | if not os.path.isdir(outdir): 30 | os.mkdir(outdir) 31 | 32 | K.set_learning_phase(0) 33 | 34 | net_model = load_model(modelPath) 35 | 36 | # Alias the outputs in the model - this sometimes makes them easier to access in TF 37 | tf.identity(net_model.output, name=output) 38 | 39 | sess = K.get_session() 40 | 41 | net_model.summary() 42 | 43 | # Write the graph in human readable 44 | f = '{}.reference.pb.ascii'.format(basename) 45 | tf.train.write_graph(sess.graph.as_graph_def(), outdir, f, as_text=True) 46 | print('Saved the graph definition in ascii format at: ', osp.join(outdir, f)) 47 | 48 | # Write the graph in binary .pb file 49 | from tensorflow.python.framework import graph_util 50 | from tensorflow.python.framework import graph_io 51 | constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), [output]) 52 | graph_io.write_graph(constant_graph, outdir, name, as_text=False) 53 | print('Saved the constant graph (ready for inference) at: ', outPath) 54 | 55 | 56 | if __name__ == '__main__': 57 | convertGraph('out/ann.hdf5', 'speech_out', 'out/subsync.pb') 58 | -------------------------------------------------------------------------------- /subsync/model/eval_ann.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pickle 3 | import sys 4 | import os 5 | 6 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 7 | OUT_DIR = 
os.path.join(DIRNAME, 'out') 8 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 9 | HIST = os.path.join(OUT_DIR, 'ann.hist') 10 | 11 | if not os.path.exists(MODEL): 12 | print("missing model:", MODEL) 13 | sys.exit(1) 14 | 15 | if not os.path.exists(HIST): 16 | print("missing history:", HIST) 17 | sys.exit(1) 18 | 19 | 20 | def plot(history): 21 | # Summarize history for accuracy 22 | plt.figure() 23 | plt.plot(history['acc']) 24 | plt.plot(history['val_acc']) 25 | plt.title('model accuracy') 26 | plt.ylabel('accuracy') 27 | plt.xlabel('epoch') 28 | plt.legend(['train', 'test'], loc='upper left') 29 | 30 | # Summarize history for loss 31 | plt.figure() 32 | plt.plot(history['loss']) 33 | plt.plot(history['val_loss']) 34 | plt.title('model loss') 35 | plt.ylabel('loss') 36 | plt.xlabel('epoch') 37 | plt.legend(['train', 'test'], loc='upper left') 38 | 39 | plt.show() 40 | 41 | 42 | if __name__ == '__main__': 43 | history = pickle.load(open(HIST, "rb")) 44 | 45 | print('val_loss:', min(history['val_loss'])) 46 | print('val_acc:', max(history['val_acc'])) 47 | 48 | try: 49 | plot(history) 50 | except KeyboardInterrupt as e: 51 | sys.exit(0) 52 | -------------------------------------------------------------------------------- /subsync/model/eval_logloss.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | import numpy as np 3 | from train_data import * 4 | from train_ann import * 5 | 6 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 7 | 8 | if not os.path.exists(MODEL): 9 | print("missing model:", MODEL) 10 | sys.exit(1) 11 | 12 | 13 | def logloss(pred, actual): 14 | begin = np.argmax(actual) * (-1) 15 | end = np.argmax(actual[::-1]) + 1 16 | print("Calculating {} logloss values".format(end-begin)) 17 | logloss = np.zeros(end-begin) 18 | indices = np.zeros(end-begin) 19 | for i, offset in enumerate(range(begin, end)): 20 | logloss[i] = sklearn.metrics.log_loss(np.roll(actual, offset), pred) 21 | indices[i] = offset 22 | 23 | return indices, logloss 24 | 25 | 26 | def plot_logloss(x, y): 27 | plt.figure() 28 | plt.plot(x, y) 29 | plt.title('logloss over shifts') 30 | plt.ylabel('logloss') 31 | plt.xlabel('shifts') 32 | plt.legend(['logloss'], loc='upper left') 33 | 34 | 35 | def load_model(input_shape): 36 | model = ann_model(input_shape) 37 | model.load_weights(MODEL) 38 | return model 39 | 40 | if __name__ == '__main__': 41 | files = transcode_audio() 42 | mfcc, labels = extract_features(files=files) 43 | 44 | for X, Y in zip(mfcc, labels): 45 | shape = (len(X), 1) 46 | X, Y = prepare_data(X, Y, balance=False) 47 | model = load_model(shape) 48 | print("Predicting...") 49 | pred = model.predict(X, batch_size=32) 50 | print("Done...") 51 | x, y = logloss(pred, Y) 52 | plot_logloss(x, y) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /subsync/model/eval_train.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from train_ann import * 3 | from train_data import * 4 | import os 5 | 6 | MODEL = os.path.join(OUT_DIR, 'ann.hdf5') 7 | 8 | 9 | if not os.path.exists(MODEL): 10 | print("missing model:", MODEL) 11 | sys.exit(1) 12 | 13 | 14 | def load_model(input_shape): 15 | model = ann_model(input_shape) 16 | model.load_weights(MODEL) 17 | return model 18 | 19 | 20 | def plot_pred(pred, actual): 21 | plt.figure() 22 | plt.plot(pred) 23 | plt.plot(actual) 24 | plt.title('prediction evaluation') 25 | plt.ylabel('label') 26 | 
plt.xlabel('time') 27 | plt.legend(['pred', 'actual'], loc='upper left') 28 | 29 | 30 | if __name__ == '__main__': 31 | files = transcode_audio() 32 | wav, srt = extract_features(files=files) 33 | 34 | for X, Y in zip(wav, srt): 35 | shape = (len(X), 1) 36 | X, Y = prepare_data(X, Y, balance=False) 37 | model = load_model(shape) 38 | pred = model.predict(X, batch_size=32) 39 | plot_pred(pred, Y) 40 | 41 | plt.show() 42 | -------------------------------------------------------------------------------- /subsync/model/test.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import librosa 3 | from train_data import * 4 | import os 5 | 6 | TEST_DIR = os.path.join(DIRNAME, 'test') 7 | 8 | def spectral_centroid(file): 9 | y, sr = librosa.load(file) 10 | cent = librosa.feature.spectral_centroid(y=y, sr=sr) 11 | 12 | plt.figure() 13 | plt.semilogy(cent.T, label='Spectral centroid') 14 | plt.ylabel('Hz') 15 | plt.xticks([]) 16 | plt.xlim([0, cent.shape[-1]]) 17 | plt.legend() 18 | plt.title('log Power spectrogram') 19 | plt.tight_layout() 20 | 21 | 22 | def plot_pred(pred, actual): 23 | plt.figure() 24 | plt.plot(pred) 25 | plt.plot(actual) 26 | plt.title('prediction evaluation') 27 | plt.ylabel('label') 28 | plt.xlabel('time') 29 | plt.legend(['pred', 'actual'], loc='upper left') 30 | 31 | 32 | if __name__ == '__main__': 33 | filename = 'test_440hz_880hz' 34 | audio = os.path.join(TEST_DIR, filename + '.wav') 35 | sub = os.path.join(TEST_DIR, filename + '.srt') 36 | 37 | files = [(audio, sub)] 38 | mfcc, srt = extract_features(files=files) 39 | 40 | 41 | for X, Y in zip(mfcc, srt): 42 | shape = (len(X), 1) 43 | print("Len", len(X[0])) 44 | plot_pred(Y, np.array([])) 45 | spectral_centroid(audio) 46 | 47 | plt.show() -------------------------------------------------------------------------------- /subsync/model/train_ann.py: -------------------------------------------------------------------------------- 1 | from train_data import * 2 | import numpy as np 3 | import pickle 4 | import os 5 | 6 | # Keras imports 7 | from keras.layers import Dense, Input, LSTM, Conv1D, Conv2D, Dropout, Flatten, Activation, MaxPooling2D 8 | from keras.models import Model 9 | from keras.layers.normalization import BatchNormalization 10 | from keras.callbacks import EarlyStopping, ModelCheckpoint 11 | from keras.optimizers import Adam, RMSprop 12 | 13 | # Matplotlib imports 14 | import matplotlib.pylab as plt 15 | 16 | # Sklearn imports 17 | from sklearn.utils import class_weight 18 | 19 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 20 | OUT_DIR = os.path.join(DIRNAME, "out") 21 | 22 | if not os.path.exists(OUT_DIR): 23 | os.makedirs(OUT_DIR) 24 | 25 | """ 26 | Returns a neural network model which can be used for training. 
The model first 27 | needs to be compiled 28 | """ 29 | def ann_model(input_shape): 30 | 31 | inp = Input(shape=input_shape, name='mfcc_in') 32 | model = inp 33 | 34 | model = Conv1D(filters=12, kernel_size=(3), activation='relu')(model) 35 | model = Conv1D(filters=12, kernel_size=(3), activation='relu')(model) 36 | model = Flatten()(model) 37 | 38 | model = Dense(56)(model) 39 | model = Activation('relu')(model) 40 | model = BatchNormalization()(model) 41 | model = Dropout(0.2)(model) 42 | model = Dense(28)(model) 43 | model = Activation('relu')(model) 44 | model = BatchNormalization()(model) 45 | 46 | model = Dense(1)(model) 47 | model = Activation('sigmoid')(model) 48 | 49 | model = Model(inp, model) 50 | return model 51 | 52 | 53 | """ 54 | Trains the neural network using generated test data. Saved the model and the 55 | training history in the ./out folder. 56 | """ 57 | def train_ann(): 58 | X, Y = extract_features() 59 | 60 | # Only consider first media file for now 61 | X, Y = X[0], Y[0] 62 | 63 | shape = (len(X), 1) 64 | model = ann_model(shape) 65 | 66 | filename = "out/ann.hdf5" 67 | 68 | checkpoint = ModelCheckpoint(filepath=filename, monitor='val_loss', verbose=0, save_best_only=True) 69 | cutoff = EarlyStopping(monitor='val_loss', min_delta=1E-3, mode='min', patience=5) 70 | 71 | model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001), metrics=['accuracy']) 72 | 73 | X, Y = prepare_data(X, Y, balance=True) 74 | 75 | print("Label 1:", len(Y[Y==1])) 76 | print("Label 0:", len(Y[Y==0])) 77 | 78 | # Permutate training data in random order 79 | rand = np.random.permutation(np.arange(len(Y))) 80 | X = X[rand] 81 | Y = Y[rand] 82 | 83 | options = { 84 | 'epochs': 200, 85 | 'batch_size': 32, 86 | 'shuffle': True, 87 | 'validation_split': 0.3, 88 | 'verbose': 2, 89 | 'callbacks': [checkpoint, cutoff] 90 | } 91 | 92 | print("Training neural network:", filename) 93 | hist = model.fit(X, Y, **options) 94 | 95 | print('val_loss:', min(hist.history['val_loss'])) 96 | print('val_acc:', max(hist.history['val_acc'])) 97 | 98 | with open('out/ann.hist', 'wb') as hist_file: 99 | pickle.dump(hist.history, hist_file) 100 | 101 | 102 | if __name__ == '__main__': 103 | train_ann() 104 | print("module used to train the artifical neural network") 105 | -------------------------------------------------------------------------------- /subsync/model/train_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import matplotlib.pyplot as plt 3 | import librosa.display 4 | import numpy as np 5 | import subprocess 6 | import librosa 7 | import pysrt 8 | import sys 9 | import os 10 | import re 11 | 12 | DIRNAME = os.path.dirname(os.path.realpath(__file__)) 13 | TRAIN_DIR = os.path.join(DIRNAME, 'training') 14 | 15 | FREQ = 16000 # Audio frequency 16 | N_MFCC = 13 17 | HOP_LEN = 512.0 # Num of items per sample 18 | # 1 item = 1/16000 seg = 32 ms 19 | ITEM_TIME = HOP_LEN/FREQ 20 | 21 | 22 | if not os.path.exists(TRAIN_DIR): 23 | print("missing training data in directory:", TRAIN_DIR) 24 | sys.exit(1) 25 | 26 | # Convert timestamp to seconds 27 | def timeToSec(t): 28 | total_sec = float(t.milliseconds)/1000 29 | total_sec += t.seconds 30 | total_sec += t.minutes*60 31 | total_sec += t.hours*60*60 32 | return total_sec 33 | 34 | # Return timestamp from cell position 35 | def timeToPos(t, freq=FREQ, hop_len=HOP_LEN): 36 | return round(timeToSec(t)/(hop_len/freq)) 37 | 38 | 39 | """ 40 | Uses ffmpeg to transcode and extract audio from 
movie files in the training
41 | directory. The function returns a list of tuples: the .wav files and the corresponding
42 | .srt files for processing.
43 | """
44 | def transcode_audio(dir=TRAIN_DIR):
45 |     files = os.listdir(dir)
46 |     p = re.compile(r'.*\.(mkv|avi|mp4)$')  # match supported media files by extension
47 |     files = [ f for f in files if p.match(f) ]
48 | 
49 |     training = []
50 | 
51 |     for f in files:
52 |         name, extension = os.path.splitext(f)
53 |         input = os.path.join(dir, f)
54 |         output = os.path.join(dir, name + '.wav')
55 |         srt = os.path.join(dir, name + '.srt')
56 | 
57 |         if not os.path.exists(srt):
58 |             print("missing subtitle for training:", srt)
59 |             sys.exit(1)
60 | 
61 |         training.append((output, srt))
62 | 
63 |         if os.path.exists(output):
64 |             continue
65 | 
66 |         print("Transcoding:", input)
67 |         command = "ffmpeg -y -i {0} -ab 160k -ac 2 -ar {2} -vn {1}".format(input, output, FREQ)
68 |         code = subprocess.call(command, stderr=subprocess.DEVNULL, shell=True)
69 |         if code != 0:
70 |             raise Exception("ffmpeg returned: {}".format(code))
71 | 
72 |     return training
73 | 
74 | 
75 | """
76 | Extracts the features and labels from the .wav and .srt files. The audio is
77 | processed using MFCC. Returns a tuple where the first element is the MFCC data
78 | and the second element is the labels for the data.
79 | """
80 | def extract_features(files=None):
81 |     if files is None:
82 |         files = transcode_audio()
83 | 
84 |     audio = []
85 |     labels = []
86 | 
87 |     for (wav, srt) in files:
88 |         print("Processing audio:", wav)
89 |         y, sr = librosa.load(wav, sr=FREQ)
90 |         mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=int(HOP_LEN), n_mfcc=int(N_MFCC))
91 |         label = extract_labels(srt, len(mfcc[0]))
92 |         audio.append(mfcc)
93 |         labels.append(label)
94 | 
95 |     return audio, labels
96 | 
97 | 
98 | """
99 | Processes a .srt file and returns a numpy array of labels for each sample. If
100 | there is a subtitle at the i'th sample, there is a 1 at position i, else 0.
101 | """
102 | def extract_labels(srt, samples):
103 |     subs = pysrt.open(srt)
104 |     labels = np.zeros(samples)
105 |     for sub in subs:
106 |         start = timeToPos(sub.start)
107 |         end = timeToPos(sub.end)+1
108 |         for i in range(start, end):
109 |             if i < len(labels):
110 |                 labels[i] = 1
111 | 
112 |     return labels
113 | 
114 | 
115 | """
116 | Returns a mask of indexes in Y (a selection) for which the selection has an
117 | equal/balanced choice for every unique value in Y. That is, exactly the same
118 | number of items is chosen for each class.
119 | """
120 | def balance_classes(Y):
121 |     uniq = np.unique(Y)
122 |     C = [np.squeeze(np.argwhere(Y==c)) for c in uniq]
123 |     minority = min([len(c) for c in C])
124 |     M = [np.random.choice(c, size=minority, replace=False) for c in C]
125 |     return np.append(*M)
126 | 
127 | 
128 | """
129 | Prepares the data for processing in a neural network. First the data is
130 | converted to the proper dimensions, and afterwards the data is balanced
131 | to correct for class imbalance issues.
132 | """ 133 | def prepare_data(X, Y, balance=True): 134 | X = X.T 135 | X = X[..., np.newaxis] 136 | 137 | if balance: 138 | # Balance classes such that there are n of each class 139 | balance = balance_classes(Y) 140 | X = X[balance] 141 | Y = Y[balance] 142 | 143 | return X, Y 144 | 145 | 146 | """ 147 | Used to plot the MFCC spectrograms for inspecting 148 | """ 149 | def plot_mfcc(mfcc): 150 | plt.figure(figsize=(10, 4)) 151 | librosa.display.specshow(mfcc, x_axis='time') 152 | plt.colorbar() 153 | plt.title('MFCC') 154 | plt.tight_layout() 155 | plt.show() 156 | 157 | 158 | if __name__ == '__main__': 159 | mfccs, labels = extract_features() 160 | 161 | for mfcc in mfccs: 162 | plot_mfcc(mfcc) 163 | -------------------------------------------------------------------------------- /subsync/net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | class NeuralNet: 6 | """ 7 | NeuralNet provides a prediction model for predicting speech using 8 | Mel-frequency cepstral coefficients (MFCC) data 9 | """ 10 | 11 | DIR = os.path.dirname(os.path.realpath(__file__)) 12 | 13 | def __init__(self): 14 | model = os.path.join(NeuralNet.DIR, 'subsync.pb') 15 | self.graph = self.load_graph(model) 16 | self.input = self.graph.get_tensor_by_name('subsync/mfcc_in:0') 17 | self.output = self.graph.get_tensor_by_name('subsync/speech_out:0') 18 | 19 | 20 | def summary(self): 21 | for op in self.graph.get_operations(): 22 | print(op.name) 23 | 24 | 25 | def load_graph(self, frozen_graph_filename): 26 | with tf.gfile.GFile(frozen_graph_filename, "rb") as f: 27 | graph_def = tf.GraphDef() 28 | graph_def.ParseFromString(f.read()) 29 | 30 | with tf.Graph().as_default() as graph: 31 | tf.import_graph_def( 32 | graph_def, 33 | input_map=None, 34 | return_elements=None, 35 | name="subsync", 36 | producer_op_list=None 37 | ) 38 | return graph 39 | 40 | 41 | def predict(self, mfcc): 42 | print("Predicting values...") 43 | with tf.Session(graph=self.graph) as sess: 44 | return sess.run(self.output, feed_dict={self.input: mfcc}) 45 | -------------------------------------------------------------------------------- /subsync/subsync.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tympanix/subsync/e5ce40c877c4fb0d9c358e1b0a1e36b2a8b4c0a7/subsync/subsync.pb -------------------------------------------------------------------------------- /subsync/test/test_440hz_880hz.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:01,000 --> 00:00:02,000 3 | High. 4 | 5 | 2 6 | 00:00:03,000 --> 00:00:04,000 7 | High. 8 | -------------------------------------------------------------------------------- /subsync/test/test_440hz_880hz.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tympanix/subsync/e5ce40c877c4fb0d9c358e1b0a1e36b2a8b4c0a7/subsync/test/test_440hz_880hz.wav -------------------------------------------------------------------------------- /subsync/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.6' 2 | --------------------------------------------------------------------------------
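
For readers who want to drive the library from Python rather than from the command line, the modules above compose as follows. This is a minimal sketch that mirrors what `subsync.main.run()` does for a single media file; the file name is a placeholder and ffmpeg must be available on the PATH:

```python
# Minimal programmatic sketch mirroring subsync.main.run() for one media file.
# 'movie.mkv' is a placeholder; any .srt files named like the media file and
# located next to it are discovered automatically.
from subsync.media import Media
from subsync.net import NeuralNet

media = Media('movie.mkv')
media.mfcc(duration=60 * 15, seek=True)   # transcode a 15-minute sample and compute MFCCs
model = NeuralNet()                       # loads the bundled subsync.pb TensorFlow graph

for subtitle in media.subtitles():
    # shift the whole .srt by the offset with the lowest log loss and save it in place
    subtitle.sync(model, plot=False, margin=12)
```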