├── gtts ├── tests │ ├── __init__.py │ ├── input_files │ │ ├── test_cli_test_utf8.txt │ │ └── test_cli_test_ascii.txt │ ├── test_lang.py │ ├── test_utils.py │ ├── test_tts.py │ └── test_cli.py ├── version.py ├── tokenizer │ ├── __init__.py │ ├── symbols.py │ ├── tests │ │ ├── test_pre_processors.py │ │ ├── test_tokenizer_cases.py │ │ └── test_core.py │ ├── pre_processors.py │ ├── tokenizer_cases.py │ └── core.py ├── __init__.py ├── langs.py ├── lang.py ├── utils.py ├── cli.py └── tts.py ├── news └── .gitignore ├── docs ├── changelog.rst ├── contributing.rst ├── license.rst ├── Makefile ├── index.rst ├── cli.rst ├── conf.py ├── module.rst └── tokenizer.rst ├── .mypy.ini ├── MANIFEST.in ├── pytest.ini ├── .readthedocs.yml ├── setup.py ├── pyproject.toml ├── .github ├── workflows │ ├── autolock.yml │ ├── test.yml │ └── publish.yml └── ISSUE_TEMPLATE │ ├── config.yml │ ├── feature.md │ └── bug.md ├── .gitignore ├── LICENSE ├── CONTRIBUTING.rst ├── setup.cfg ├── README.md ├── scripts └── gen_langs.py └── CHANGELOG.rst /gtts/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /news/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | -------------------------------------------------------------------------------- /gtts/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.2.3' 2 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CHANGELOG.rst 2 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | .. 
include:: ../LICENSE 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include CHANGELOG.rst 3 | include CONTRIBUTING.rst 4 | include LICENSE 5 | include pytest.ini 6 | -------------------------------------------------------------------------------- /gtts/tests/input_files/test_cli_test_utf8.txt: -------------------------------------------------------------------------------- 1 | 这是一个三岁的小孩 2 | 在讲述她从一系列照片里看到的东西。 3 | 对这个世界, 她也许还有很多要学的东西, 4 | 但在一个重要的任务上, 她已经是专家了: 5 | 去理解她所看到的东西。 6 | -------------------------------------------------------------------------------- /gtts/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from .core import RegexBuilder, PreProcessorRegex, PreProcessorSub, Tokenizer # noqa: F401 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | maxversion = 4.6.11 3 | 4 | markers = 5 | net: marks tests that call use the net (using the URL endpoint, deselect with '-m "not net"') 6 | -------------------------------------------------------------------------------- /gtts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .version import __version__ # noqa: F401 3 | from .tts import gTTS, gTTSError 4 | 5 | __all__ = ['gTTS', 'gTTSError'] 6 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Build PDF as extra 2 | formats: 3 | - pdf 4 | 5 | python: 6 | version: 3.7 7 | pip_install: true 8 | extra_requirements: 9 | - docs 10 | -------------------------------------------------------------------------------- /gtts/tests/input_files/test_cli_test_ascii.txt: -------------------------------------------------------------------------------- 1 | Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger? 2 | How much will it cost the website doesn't have the theme i was going for. 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from codecs import open 3 | import site 4 | 5 | # PEP517 6 | site.ENABLE_USER_SITE = True 7 | 8 | exec(open('gtts/version.py').read()) 9 | 10 | setup( 11 | version=__version__, # type: ignore # noqa: F821 12 | test_suite='gtts.tests', 13 | ) 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | 4 | [tool.towncrier] 5 | package = "gtts" 6 | filename = "CHANGELOG.rst" 7 | directory = "news/" 8 | underlines = ["-", "~", "_"] 9 | title_format = "{version} ({project_date})" 10 | issue_format = "`#{issue} `_" 11 | -------------------------------------------------------------------------------- /gtts/tokenizer/symbols.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ABBREVIATIONS = [ 4 | 'dr', 'jr', 'mr', 5 | 'mrs', 'ms', 'msgr', 6 | 'prof', 'sr', 'st'] 7 | 8 | SUB_PAIRS = [ 9 | ('Esq.', 'Esquire') 10 | ] 11 | 12 | ALL_PUNC = u"?!?!.,¡()[]¿…‥،;:—。,、:\n" 13 | 14 | TONE_MARKS = u"?!?!" 15 | 16 | PERIOD_COMMA = u".," 17 | 18 | COLON = u":" 19 | -------------------------------------------------------------------------------- /.github/workflows/autolock.yml: -------------------------------------------------------------------------------- 1 | name: 'Lock Inactive Issues' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 * * * *' 6 | 7 | jobs: 8 | lock: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: dessant/lock-threads@v2 12 | with: 13 | github-token: ${{ github.token }} 14 | issue-lock-inactive-days: '15' 15 | process-only: 'issues' -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Bug Report 4 | url: https://github.com/pndurette/gTTS/issues/new?template=bug.md 5 | about: Create a report to help us improve gTTS! 6 | - name: Feature Request 7 | url: https://github.com/pndurette/gTTS/issues/new?template=feature.md 8 | about: Suggest a new feature for gTTS! 9 | - name: Questions and discussions! 10 | url: https://github.com/pndurette/gTTS/discussions 11 | about: "Place to ask questions and get help about gTTS!!" 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | * [ ] Did you make sure a similar [issue](../) didn't exist? 3 | * [ ] Did you update gTTS to the latest? (`pip install --upgrade gTTS`) 4 | 5 | ## Proposed Behaviour 6 | 7 | 8 | ``` 9 | code 10 | ``` 11 | 12 | ## Context 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = gTTS 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /gtts/tests/test_lang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | from gtts.lang import tts_langs, _extra_langs, _fallback_deprecated_lang 4 | from gtts.langs import _main_langs 5 | 6 | """Test language list""" 7 | 8 | 9 | def test_main_langs(): 10 | """Fetch languages successfully""" 11 | # Safe to assume 'en' (English) will always be there 12 | scraped_langs = _main_langs() 13 | assert 'en' in scraped_langs 14 | 15 | 16 | def test_deprecated_lang(): 17 | """Test language deprecation fallback""" 18 | with pytest.deprecated_call(): 19 | assert _fallback_deprecated_lang('en-gb') == 'en' 20 | 21 | 22 | if __name__ == '__main__': 23 | pytest.main(['-x', __file__]) 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | gTTS 2 | ===== 3 | 4 | :class:`gTTS` (*Google Text-to-Speech*), a Python library and CLI tool to interface with Google Translate's text-to-speech API. Writes spoken ``mp3`` data to a file, a file-like object (bytestring) for further audio manipulation, or ``stdout``. It features flexible pre-processing and tokenizing. 5 | 6 | Installation 7 | ------------ 8 | 9 | .. code-block:: bash 10 | 11 | pip install gTTS 12 | 13 | .. toctree:: 14 | :maxdepth: 3 15 | :caption: Documentation 16 | 17 | cli 18 | module 19 | tokenizer 20 | 21 | .. 
toctree:: 22 | :maxdepth: 2 23 | :caption: Project 24 | 25 | license 26 | contributing 27 | changelog 28 | 29 | Misc 30 | ---- 31 | 32 | * :ref:`genindex` 33 | * :ref:`modindex` 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eggs/ 2 | .mypy_cache/ 3 | pip-wheel-metadata/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env*/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # Installer logs 29 | pip-log.txt 30 | pip-delete-this-directory.txt 31 | 32 | # Unit test / coverage reports 33 | .pytest_cache/ 34 | htmlcov/ 35 | .tox/ 36 | .coverage 37 | .cache 38 | nosetests.xml 39 | coverage.xml 40 | 41 | # Translations 42 | *.mo 43 | 44 | # Mr Developer 45 | .mr.developer.cfg 46 | .project 47 | .pydevproject 48 | 49 | # Rope 50 | .ropeproject 51 | 52 | # Django stuff: 53 | *.log 54 | *.pot 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # Other 60 | .vscode/ 61 | .DS_Store 62 | -------------------------------------------------------------------------------- /gtts/tokenizer/tests/test_pre_processors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | from gtts.tokenizer.pre_processors import tone_marks, end_of_line, abbreviations, word_sub 4 | 5 | 6 | class TestPreProcessors(unittest.TestCase): 7 | def test_tone_marks(self): 8 | _in = "lorem!ipsum?" 9 | _out = "lorem! ipsum? " 10 | self.assertEqual(tone_marks(_in), _out) 11 | 12 | def test_end_of_line(self): 13 | _in = """test- 14 | ing""" 15 | _out = "testing" 16 | self.assertEqual(end_of_line(_in), _out) 17 | 18 | def test_abbreviations(self): 19 | _in = "jr. sr. dr." 20 | _out = "jr sr dr" 21 | self.assertEqual(abbreviations(_in), _out) 22 | 23 | def test_word_sub(self): 24 | _in = "Esq. Bacon" 25 | _out = "Esquire Bacon" 26 | self.assertEqual(word_sub(_in), _out) 27 | 28 | 29 | if __name__ == '__main__': 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites 2 | * [ ] Did you make sure a similar [issue](../) didn't exist? 3 | * [ ] Did you update gTTS to the latest? 
(`pip install --upgrade gTTS`) 4 | 5 | ## Current Behaviour (steps to reproduce) 6 | 7 | 8 | ``` 9 | code 10 | ``` 11 | 12 | ## Expected Behaviour 13 | 14 | 15 | ``` 16 | code 17 | ``` 18 | 19 | ## Context 20 | 21 | 22 | 23 | 24 | ## Environment 25 | 26 | * gTTS version: 27 | * Operating System version: -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: 3 | push: 4 | tags-ignore: 5 | - 'v*' 6 | pull_request: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | test: 12 | name: Unit 13 | 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ['2.x', 3.6, 3.7, 3.8, 3.9] 19 | os: [ubuntu-latest, macOS-latest, windows-latest] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install -e .[tests] 32 | 33 | - name: Unit Tests 34 | run: pytest -v -s gtts/ --cov=gtts --cov-config=setup.cfg --cov-report=xml 35 | env: 36 | TEST_LANGS: all 37 | 38 | - name: Upload Coverage Report 39 | uses: codecov/codecov-action@v1.0.14 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2014-2021 Pierre Nicolas Durette 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - 'v*' 6 | 7 | jobs: 8 | package: 9 | name: Package 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.x'] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | 23 | - name: Install 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -e .[tests] 27 | 28 | - name: Unit Tests 29 | run: pytest -v -s gtts/ 30 | env: 31 | TEST_LANGS: all 32 | 33 | - name: Install Tools 34 | run: pip install --upgrade setuptools wheel twine 35 | 36 | - name: Package (wheel) 37 | run: python setup.py bdist_wheel 38 | 39 | - name: Package (sdist) 40 | if: matrix.python-version == '3.x' 41 | run: python setup.py sdist 42 | 43 | - name: Publish 44 | env: 45 | TWINE_USERNAME: __token__ 46 | TWINE_PASSWORD: ${{ secrets.pypi_token }} 47 | run: twine upload dist/* 48 | -------------------------------------------------------------------------------- /gtts/langs.py: -------------------------------------------------------------------------------- 1 | # Note: this file is generated 2 | _langs = { 3 | "af": "Afrikaans", 4 | "ar": "Arabic", 5 | "bg": "Bulgarian", 6 | "bn": "Bengali", 7 | "bs": "Bosnian", 8 | "ca": "Catalan", 9 | "cs": "Czech", 10 | "cy": "Welsh", 11 | "da": "Danish", 12 | "de": "German", 13 | "el": "Greek", 14 | "en": "English", 15 | "eo": "Esperanto", 16 | "es": "Spanish", 17 | "et": "Estonian", 18 | "fi": "Finnish", 19 | "fr": "French", 20 | "gu": "Gujarati", 21 | "hi": "Hindi", 22 | "hr": "Croatian", 23 | "hu": "Hungarian", 24 | "hy": "Armenian", 25 | "id": "Indonesian", 26 | "is": "Icelandic", 27 | "it": "Italian", 28 | "ja": "Japanese", 29 | "jw": "Javanese", 30 | "km": "Khmer", 31 | "kn": "Kannada", 32 | "ko": "Korean", 33 | "la": "Latin", 34 | "lv": "Latvian", 35 | "mk": "Macedonian", 36 | "ml": "Malayalam", 37 | "mr": "Marathi", 38 | "my": "Myanmar (Burmese)", 39 | "ne": "Nepali", 40 | "nl": "Dutch", 41 | "no": "Norwegian", 42 | "pl": "Polish", 43 | "pt": "Portuguese", 44 | "ro": "Romanian", 45 | "ru": "Russian", 46 | "si": "Sinhala", 47 | "sk": "Slovak", 48 | "sq": "Albanian", 49 | "sr": "Serbian", 50 | "su": "Sundanese", 51 | "sv": "Swedish", 52 | "sw": "Swahili", 53 | "ta": "Tamil", 54 | "te": "Telugu", 55 | "th": "Thai", 56 | "tl": "Filipino", 57 | "tr": "Turkish", 58 | "uk": "Ukrainian", 59 | "ur": "Urdu", 60 | "vi": "Vietnamese", 61 | "zh-CN": "Chinese" 62 | } 63 | 64 | def _main_langs(): 65 | return _langs 66 | -------------------------------------------------------------------------------- /gtts/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | from gtts.utils import _minimize, _len, _clean_tokens, _translate_url 4 | 5 | delim = ' ' 6 | Lmax = 10 7 | 8 | 9 | def test_ascii(): 10 | _in = "Bacon ipsum dolor sit amet" 11 | _out = ["Bacon", "ipsum", "dolor sit", "amet"] 12 | assert _minimize(_in, delim, Lmax) == _out 13 | 14 | 15 | def test_ascii_no_delim(): 16 | _in = "Baconipsumdolorsitametflankcornedbee" 17 | _out = ["Baconipsum", "dolorsitam", "etflankcor", "nedbee"] 18 | assert _minimize(_in, delim, Lmax) == _out 19 | 20 | 21 | def test_unicode(): 22 | _in = u"这是一个三岁的小孩在讲述他从一系列照片里看到的东西。" 23 | 
_out = [u"这是一个三岁的小孩在", u"讲述他从一系列照片里", u"看到的东西。"] 24 | assert _minimize(_in, delim, Lmax) == _out 25 | 26 | 27 | def test_startwith_delim(): 28 | _in = delim + "test" 29 | _out = ["test"] 30 | assert _minimize(_in, delim, Lmax) == _out 31 | 32 | 33 | def test_len_ascii(): 34 | text = "Bacon ipsum dolor sit amet flank corned beef." 35 | assert _len(text) == 45 36 | 37 | 38 | def test_len_unicode(): 39 | text = u"但在一个重要的任务上" 40 | assert _len(text) == 10 41 | 42 | 43 | def test_only_space_and_punc(): 44 | _in = [",(:)?", "\t ", "\n"] 45 | _out = [] 46 | assert _clean_tokens(_in) == _out 47 | 48 | 49 | def test_strip(): 50 | _in = [" Bacon ", "& ", "ipsum\r", "."] 51 | _out = ["Bacon", "&", "ipsum"] 52 | assert _clean_tokens(_in) == _out 53 | 54 | 55 | def test_translate_url(): 56 | _in = {"tld": "qwerty", "path": "asdf"} 57 | _out = "https://translate.google.qwerty/asdf" 58 | assert _translate_url(**_in) == _out 59 | 60 | 61 | if __name__ == '__main__': 62 | pytest.main(['-x', __file__]) 63 | -------------------------------------------------------------------------------- /gtts/tokenizer/tests/test_tokenizer_cases.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | from gtts.tokenizer.tokenizer_cases import tone_marks, period_comma, colon, other_punctuation, legacy_all_punctuation 4 | from gtts.tokenizer import Tokenizer, symbols 5 | 6 | 7 | class TestPreTokenizerCases(unittest.TestCase): 8 | def test_tone_marks(self): 9 | t = Tokenizer([tone_marks]) 10 | _in = "Lorem? Ipsum!" 11 | _out = ['Lorem?', 'Ipsum!'] 12 | self.assertEqual(t.run(_in), _out) 13 | 14 | def test_period_comma(self): 15 | t = Tokenizer([period_comma]) 16 | _in = "Hello, it's 24.5 degrees in the U.K. today. $20,000,000." 17 | _out = ['Hello', "it's 24.5 degrees in the U.K. today", '$20,000,000.'] 18 | self.assertEqual(t.run(_in), _out) 19 | 20 | def test_colon(self): 21 | t = Tokenizer([colon]) 22 | _in = "It's now 6:30 which means: morning missing:space" 23 | _out = ["It's now 6:30 which means", ' morning missing', 'space'] 24 | self.assertEqual(t.run(_in), _out) 25 | 26 | def test_other_punctuation(self): 27 | # String of the unique 'other punctuations' 28 | other_punc_str = ''.join( 29 | set(symbols.ALL_PUNC) - 30 | set(symbols.TONE_MARKS) - 31 | set(symbols.PERIOD_COMMA) - 32 | set(symbols.COLON)) 33 | 34 | t = Tokenizer([other_punctuation]) 35 | self.assertEqual(len(t.run(other_punc_str)) - 1, len(other_punc_str)) 36 | 37 | def test_legacy_all_punctuation(self): 38 | t = Tokenizer([legacy_all_punctuation]) 39 | self.assertEqual(len(t.run(symbols.ALL_PUNC)) - 40 | 1, len(symbols.ALL_PUNC)) 41 | 42 | 43 | if __name__ == '__main__': 44 | unittest.main() 45 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command-line (:mod:`gtts-cli`) 2 | ============================== 3 | 4 | After installing the package, the ``gtts-cli`` tool becomes available:: 5 | 6 | $ gtts-cli 7 | 8 | .. 
click:: gtts.cli:tts_cli 9 | :prog: gtts-cli 10 | :show-nested: 11 | 12 | Examples 13 | -------- 14 | 15 | List available languages:: 16 | 17 | $ gtts-cli --all 18 | 19 | Read 'hello' to ``hello.mp3``:: 20 | 21 | $ gtts-cli 'hello' --output hello.mp3 22 | 23 | Read "c'est la vie" in French to ``cestlavie.mp3``:: 24 | 25 | $ gtts-cli "c'est la vie" --lang fr --output cestlavie.mp3 26 | 27 | Read '你好' to ``你好.mp3`` (in Mandarin, using google.cn):: 28 | 29 | $ gtts-cli '你好' --tld cn --lang zh-cn --output 你好.mp3 30 | 31 | Read 'slow' slowly to ``slow.mp3``:: 32 | 33 | $ gtts-cli 'slow' --slow --output slow.mp3 34 | 35 | Read 'hello' to ``stdout``:: 36 | 37 | $ gtts-cli 'hello' 38 | 39 | Read ``stdin`` to ``hello.mp3`` via ``<text>`` or ``<file>``:: 40 | 41 | $ echo -n 'hello' | gtts-cli - --output hello.mp3 42 | $ echo -n 'hello' | gtts-cli --file - --output hello.mp3 43 | 44 | Read 'no check' to ``nocheck.mp3`` without language checking:: 45 | 46 | $ gtts-cli 'no check' --lang zh --nocheck --output nocheck.mp3 47 | 48 | .. note:: Using ``--nocheck`` can speed up execution. It exists mostly, however, to force a ``<lang>`` language tag that might not be documented but would work with the API, such as specific regional sub-tags of documented tags (examples for 'en': 'en-gb', 'en-au', etc.). 49 | 50 | Playing sound directly 51 | ---------------------- 52 | 53 | You can pipe the output of ``gtts-cli`` into any media player that supports ``stdin``. For example, using the ``play`` command from `SoX `_:: 54 | 55 | $ gtts-cli 'hello' | play -t mp3 - 56 | 57 | -------------------------------------------------------------------------------- /gtts/tokenizer/pre_processors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.tokenizer import PreProcessorRegex, PreProcessorSub, symbols 3 | import re 4 | 5 | 6 | def tone_marks(text): 7 | """Add a space after tone-modifying punctuation. 8 | 9 | Because the `tone_marks` tokenizer case will split after a tone-modifying 10 | punctuation mark, make sure there's whitespace after. 11 | 12 | """ 13 | return PreProcessorRegex( 14 | search_args=symbols.TONE_MARKS, 15 | search_func=lambda x: u"(?<={})".format(x), 16 | repl=' ').run(text) 17 | 18 | 19 | def end_of_line(text): 20 | """Re-form words cut by end-of-line hyphens. 21 | 22 | Remove a hyphen followed by a newline. 23 | 24 | """ 25 | return PreProcessorRegex( 26 | search_args=u'-', 27 | search_func=lambda x: u"{}\n".format(x), 28 | repl='').run(text) 29 | 30 | 31 | def abbreviations(text): 32 | """Remove periods after an abbreviation from a list of known 33 | abbreviations that can be spoken the same without that period. This 34 | prevents having to handle tokenization of that period. 35 | 36 | Note: 37 | Could potentially remove the ending period of a sentence. 38 | 39 | Note: 40 | Abbreviations that Google Translate can't pronounce without 41 | (or even with) a period should be added as a word substitution with a 42 | :class:`PreProcessorSub` pre-processor. Ex.: 'Esq.', 'Esquire'.
43 | 44 | """ 45 | return PreProcessorRegex( 46 | search_args=symbols.ABBREVIATIONS, 47 | search_func=lambda x: r"(?<={})(?=\.).".format(x), 48 | repl='', flags=re.IGNORECASE).run(text) 49 | 50 | 51 | def word_sub(text): 52 | """Word-for-word substitutions.""" 53 | return PreProcessorSub( 54 | sub_pairs=symbols.SUB_PAIRS).run(text) 55 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | Reporting Issues 5 | ---------------- 6 | 7 | On the Github issues_ page. Thanks! 8 | 9 | Submitting Patches 10 | ------------------ 11 | 12 | 1. **Fork**. Follow `PEP 8 `_! 13 | 2. **Write/Update tests** (see below). 14 | 3. **Document**. Docstrings follow the `Google Python Style Guide`_ (docs by Sphinx_). 15 | You can 'test' documentation:: 16 | 17 | $ pip install .[docs] 18 | $ cd docs && make html # generated in docs/_build/html/ 19 | 20 | 4. **Open Pull Request**. To the ``master`` branch. 21 | 5. **Changelog**. This project uses towncrier_ for managing the changelog. Please consider 22 | creating one or more 'news fragments' in the ``/news/`` directory and adding them to 23 | your PR, in the style of ``<issue#>.<type>`` where 'type' is one of: 24 | 'feature', 'bugfix', 'doc', 'removal' or 'misc'. 25 | 26 | See towncrier_ (New Fragments) for more details. Example:: 27 | 28 | $ echo 'Fixed a thing!' > news/1234.bugfix 29 | 30 | .. note:: | Please don't hesitate to contribute! While good tests, docs and structure are 31 | | encouraged, I do welcome great ideas over absolute conformity to the above! 32 | | Thanks! ❤️ 33 | 34 | Testing 35 | ------- 36 | 37 | | Testing is done with the ``unittest`` framework. 38 | | As a rule, the ``./tests/test_<module>.py`` file tests the ``<module>`` module. 39 | 40 | To run all tests (testing only language 'en' and generating an html coverage 41 | report in ``gtts/htmlcov/``):: 42 | 43 | $ pip install .[tests] 44 | $ TEST_LANGS=en pytest -v -s gtts/ --cov=gtts --cov-report=html 45 | 46 | .. _repo: https://github.com/pndurette/gTTS/ 47 | .. _issues: https://github.com/pndurette/gTTS/issues 48 | 49 | .. _Google Python Style Guide: http://google.github.io/styleguide/pyguide.html#Comments 50 | .. _Sphinx: http://www.sphinx-doc.org/ 51 | ..
_towncrier: https://github.com/hawkowl/towncrier 52 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = gTTS 3 | description = gTTS (Google Text-to-Speech), a Python library and CLI tool to interface with Google Translate text-to-speech API 4 | author = Pierre Nicolas Durette 5 | author_email = pndurette@gmail.com 6 | url = https://github.com/pndurette/gTTS 7 | license = MIT 8 | keywords = 9 | gtts 10 | text to speech 11 | Google Translate 12 | TTS 13 | classifiers = 14 | Environment :: Console 15 | Intended Audience :: Developers 16 | License :: OSI Approved :: MIT License 17 | Operating System :: MacOS 18 | Operating System :: Unix 19 | Operating System :: POSIX 20 | Operating System :: POSIX :: Linux 21 | Operating System :: Microsoft :: Windows 22 | Programming Language :: Python :: 2.7 23 | Programming Language :: Python :: 3.6 24 | Programming Language :: Python :: 3.7 25 | Programming Language :: Python :: 3.8 26 | Programming Language :: Python :: 3.9 27 | Topic :: Software Development :: Libraries 28 | Topic :: Multimedia :: Sound/Audio :: Speech 29 | license_file = LICENSE 30 | long_description = file: README.md 31 | long_description_content_type = text/markdown 32 | 33 | [options] 34 | python_requires = >= 2.7 35 | include_package_data = True 36 | packages = find: 37 | install_requires = 38 | six 39 | click 40 | requests 41 | 42 | [options.extras_require] 43 | tests = 44 | pytest == 4.6.11 45 | pytest-cov 46 | flake8 47 | testfixtures 48 | mock 49 | six 50 | docs = 51 | sphinx 52 | sphinx-autobuild 53 | sphinx_rtd_theme 54 | sphinx-click 55 | towncrier 56 | 57 | [options.entry_points] 58 | console_scripts = 59 | gtts-cli = gtts.cli:tts_cli 60 | 61 | [flake8] 62 | max-line-length = 132 63 | exclude = .git,__pycache__,.eggs/,doc/,docs/,build/,dist/,archive/ 64 | ignore = W605, W503, W504 65 | 66 | [coverage:run] 67 | cover_pylib = false 68 | omit = 69 | */site-packages/* 70 | gtts/tests/* 71 | gtts/tokenizer/tests/* 72 | 73 | [coverage:report] 74 | exclude_lines = 75 | pragma: no cover 76 | def __repr__ 77 | log.debug 78 | log.warning 79 | -------------------------------------------------------------------------------- /gtts/tokenizer/tokenizer_cases.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.tokenizer import RegexBuilder, symbols 3 | 4 | 5 | def tone_marks(): 6 | """Keep tone-modifying punctuation by matching following character. 7 | 8 | Assumes the `tone_marks` pre-processor was run for cases where there might 9 | not be any space after a tone-modifying punctuation mark. 10 | """ 11 | return RegexBuilder( 12 | pattern_args=symbols.TONE_MARKS, 13 | pattern_func=lambda x: u"(?<={}).".format(x)).regex 14 | 15 | 16 | def period_comma(): 17 | """Period and comma case. 18 | 19 | Match if not preceded by "." and only if followed by space. 20 | Won't cut in the middle/after dotted abbreviations; won't cut numbers. 21 | 22 | Note: 23 | Won't match if a dotted abbreviation ends a sentence. 24 | 25 | Note: 26 | Won't match the end of a sentence if not followed by a space. 27 | 28 | """ 29 | return RegexBuilder( 30 | pattern_args=symbols.PERIOD_COMMA, 31 | pattern_func=lambda x: r"(? 
6 | 7 | [![PyPI version](https://img.shields.io/pypi/v/gTTS.svg)](https://pypi.org/project/gTTS/) 8 | [![Python versions](https://img.shields.io/pypi/pyversions/gTTS.svg)](https://pypi.org/project/gTTS/) 9 | [![Tests workflow](https://github.com/pndurette/gTTS/workflows/Tests/badge.svg)](https://github.com/pndurette/gTTS/actions) 10 | [![codecov](https://codecov.io/gh/pndurette/gTTS/branch/master/graph/badge.svg)](https://codecov.io/gh/pndurette/gTTS) 11 | [![Commits Since](https://img.shields.io/github/commits-since/pndurette/gTTS/latest.svg)](https://github.com/pndurette/gTTS/commits/) 12 | [![PyPi Downloads](http://pepy.tech/badge/gtts)](http://pepy.tech/project/gtts) 13 | [![Buy me a Coffee](https://img.shields.io/badge/buy%20me%20a-coffee-orange)](https://www.buymeacoffee.com/pndurette) 14 | 15 | ## Features 16 | 17 | - Customizable speech-specific sentence tokenizer that allows for unlimited lengths of text to be read, all while keeping proper intonation, abbreviations, decimals and more; 18 | - Customizable text pre-processors which can, for example, provide pronunciation corrections; 19 | 20 | ### Installation 21 | 22 | $ pip install gTTS 23 | 24 | ### Quickstart 25 | 26 | Command Line: 27 | 28 | $ gtts-cli 'hello' --output hello.mp3 29 | 30 | Module: 31 | 32 | >>> from gtts import gTTS 33 | >>> tts = gTTS('hello') 34 | >>> tts.save('hello.mp3') 35 | 36 | See for documentation and examples. 37 | 38 | ### Disclaimer 39 | 40 | This project is *not* affiliated with Google or Google Cloud. Breaking upstream changes *can* occur without notice. This project is leveraging the undocumented [Google Translate](https://translate.google.com) speech functionality and is *different* from [Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech/). 41 | 42 | ### Project 43 | 44 | - [Questions & community](https://github.com/pndurette/gTTS/discussions) 45 | - [Changelog](CHANGELOG.rst) 46 | - [Contributing](CONTRIBUTING.rst) 47 | 48 | ### Licence 49 | 50 | [The MIT License (MIT)](LICENSE) Copyright © 2014-2021 Pierre Nicolas Durette & [Contributors](https://github.com/pndurette/gTTS/graphs/contributors) 51 | -------------------------------------------------------------------------------- /gtts/lang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.langs import _main_langs 3 | from warnings import warn 4 | import logging 5 | 6 | __all__ = ['tts_langs'] 7 | 8 | # Logger 9 | log = logging.getLogger(__name__) 10 | log.addHandler(logging.NullHandler()) 11 | 12 | 13 | def tts_langs(): 14 | """Languages Google Text-to-Speech supports. 15 | 16 | Returns: 17 | dict: A dictionary of the type `{ '': ''}` 18 | 19 | Where `` is an IETF language tag such as `en` or `zh-TW`, 20 | and `` is the full English name of the language, such as 21 | `English` or `Chinese (Mandarin/Taiwan)`. 22 | 23 | The dictionary returned combines languages from two origins: 24 | 25 | - Languages fetched from Google Translate (pre-generated in :mod:`gtts.langs`) 26 | - Languages that are undocumented variations that were observed to work and 27 | present different dialects or accents. 28 | 29 | """ 30 | langs = dict() 31 | langs.update(_main_langs()) 32 | langs.update(_extra_langs()) 33 | log.debug("langs: {}".format(langs)) 34 | return langs 35 | 36 | 37 | def _extra_langs(): 38 | """Define extra languages. 39 | 40 | Returns: 41 | dict: A dictionnary of extra languages manually defined. 
42 | 43 | Variations of the ones generated in `_main_langs`, 44 | observed to provide different dialects or accents or 45 | just simply accepted by the Google Translate Text-to-Speech API. 46 | 47 | """ 48 | return { 49 | # Chinese 50 | 'zh-TW': 'Chinese (Mandarin/Taiwan)', 51 | 'zh': 'Chinese (Mandarin)' 52 | } 53 | 54 | 55 | def _fallback_deprecated_lang(lang): 56 | """Languages Google Text-to-Speech used to support. 57 | 58 | Language tags that don't work anymore, but that can 59 | fall back to a more general language code to maintain 60 | compatibility. 61 | 62 | Args: 63 | lang (string): The language tag. 64 | 65 | Returns: 66 | string: The language tag, as-is if not deprecated, 67 | or a fallback if one exists. 68 | 69 | Example: 70 | ``en-GB`` returns ``en``. 71 | ``en-gb`` returns ``en``. 72 | 73 | """ 74 | 75 | deprecated = { 76 | # '': [] 77 | 'en': ['en-us', 'en-ca', 'en-uk', 'en-gb', 'en-au', 'en-gh', 'en-in', 78 | 'en-ie', 'en-nz', 'en-ng', 'en-ph', 'en-za', 'en-tz'], 79 | 'fr': ['fr-ca', 'fr-fr'], 80 | 'pt': ['pt-br', 'pt-pt'], 81 | 'es': ['es-es', 'es-us'], 82 | 'zh-CN': ['zh-cn'], 83 | 'zh-TW': ['zh-tw'], 84 | } 85 | 86 | for fallback_lang, deprecated_langs in deprecated.items(): 87 | if lang.lower() in deprecated_langs: 88 | msg = ( 89 | "'{}' has been deprecated, falling back to '{}'. " 90 | "This fallback will be removed in a future version." 91 | ).format(lang, fallback_lang) 92 | 93 | warn(msg, DeprecationWarning) 94 | log.warning(msg) 95 | 96 | return fallback_lang 97 | 98 | return lang -------------------------------------------------------------------------------- /gtts/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.tokenizer.symbols import ALL_PUNC as punc 3 | from string import whitespace as ws 4 | import re 5 | 6 | _ALL_PUNC_OR_SPACE = re.compile(u"^[{}]*$".format(re.escape(punc + ws))) 7 | """Regex that matches if an entire line is only comprised 8 | of whitespace and punctuation 9 | 10 | """ 11 | 12 | 13 | def _minimize(the_string, delim, max_size): 14 | """Recursively split a string in the largest chunks 15 | possible from the highest position of a delimiter all the way 16 | to a maximum size 17 | 18 | Args: 19 | the_string (string): The string to split. 20 | delim (string): The delimiter to split on. 21 | max_size (int): The maximum size of a chunk. 22 | 23 | Returns: 24 | list: the minimized string in tokens 25 | 26 | Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx`` 27 | is the highest index of ``delim`` found in ``the_string``; and at maximum 28 | ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``. 29 | In the latter case, the split will occur at ``the_string[max_size]`` 30 | which can be any character. The function runs itself again on the rest of 31 | ``the_string`` (``the_string[idx:]``) until no chunk is larger than 32 | ``max_size``. 33 | 34 | """ 35 | # Remove `delim` from start of `the_string` 36 | # i.e. prevent a recursive infinite loop on `the_string[0:0]` 37 | # if `the_string` starts with `delim` and is larger than `max_size` 38 | if the_string.startswith(delim): 39 | the_string = the_string[_len(delim):] 40 | 41 | if _len(the_string) > max_size: 42 | try: 43 | # Find the highest index of `delim` in `the_string[0:max_size]` 44 | # i.e.
`the_string` will be cut in half on `delim` index 45 | idx = the_string.rindex(delim, 0, max_size) 46 | except ValueError: 47 | # `delim` not found in `the_string`, index becomes `max_size` 48 | # i.e. `the_string` will be cut in half arbitrarily on `max_size` 49 | idx = max_size 50 | # Call itself again for `the_string[idx:]` 51 | return [the_string[:idx]] + \ 52 | _minimize(the_string[idx:], delim, max_size) 53 | else: 54 | return [the_string] 55 | 56 | 57 | def _len(text): 58 | """Same as ``len(text)`` for a string but that decodes 59 | ``text`` first in Python 2.x 60 | 61 | Args: 62 | text (string): String to get the size of. 63 | 64 | Returns: 65 | int: The size of the string. 66 | """ 67 | try: 68 | # Python 2 69 | return len(unicode(text)) 70 | except NameError: # pragma: no cover 71 | # Python 3 72 | return len(text) 73 | 74 | 75 | def _clean_tokens(tokens): 76 | """Clean a list of strings 77 | 78 | Args: 79 | tokens (list): A list of strings (tokens) to clean. 80 | 81 | Returns: 82 | list: Stripped strings ``tokens`` without the original elements 83 | that only consisted of whitespace and/or punctuation characters. 84 | 85 | """ 86 | return [t.strip() for t in tokens if not _ALL_PUNC_OR_SPACE.match(t)] 87 | 88 | 89 | def _translate_url(tld="com", path=""): 90 | """Generates a Google Translate URL 91 | 92 | Args: 93 | tld (string): Top-level domain for the Google Translate host, 94 | i.e ``https://translate.google.``. Default is ``com``. 95 | path: (string): A path to append to the Google Translate host, 96 | i.e ``https://translate.google.com/``. Default is ``""``. 97 | 98 | Returns: 99 | string: A Google Translate URL `https://translate.google./path` 100 | """ 101 | _GOOGLE_TTS_URL = "https://translate.google.{}/{}" 102 | return _GOOGLE_TTS_URL.format(tld, path) -------------------------------------------------------------------------------- /scripts/gen_langs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.utils import _translate_url 3 | from bs4 import BeautifulSoup 4 | import requests 5 | import logging 6 | import js2py 7 | import json 8 | import sys 9 | import re 10 | 11 | # Logger 12 | log = logging.getLogger(__name__) 13 | log.addHandler(logging.NullHandler()) 14 | 15 | # This file is used to generate the language dict (as a module) 16 | # Needs cleaning up, very much WIP 17 | # Usage: 18 | # * Install gTTS 19 | # * $ python gen_langs.py /langs.py 20 | 21 | 22 | def _get_data_by_key(js_list): 23 | """JavaScript function to generate the languages. 24 | 25 | A payload with the languages is passed to a JavaScript function. 26 | Instead of parsing that payload (combersome), we 'overload' that 27 | function to return what we want. 28 | 29 | """ 30 | 31 | js_function = r""" 32 | function AF_initDataCallback(args) { 33 | return { key: args['key'], data: args['data'] }; 34 | }; 35 | """ 36 | 37 | data_by_key = {} 38 | for js in js_list: 39 | js_code = js_function + js 40 | py_eval = js2py.eval_js(js_code) 41 | data_by_key[py_eval['key']] = py_eval['data'] 42 | 43 | return data_by_key 44 | 45 | 46 | def _fetch_langs(tld="com"): 47 | """Fetch (scrape) languages from Google Translate. 48 | 49 | Google Translate loads a JavaScript Array of 'languages codes' that can 50 | be spoken. We intersect this list with all the languages Google Translate 51 | provides to get the ones that support text-to-speech. 
52 | 53 | Args: 54 | tld (string): Top-level domain for the Google Translate host 55 | to fetch languages from. i.e `https://translate.google.`. 56 | The language names obtained will be in a language locale of the TLD 57 | (e.g. ``tld=fr`` will retrieve the French names of the languages). 58 | Default is ``com``. 59 | 60 | Returns: 61 | dict: A dictionnary of languages from Google Translate 62 | 63 | """ 64 | 65 | URL_BASE = _translate_url(tld) 66 | 67 | headers = { 68 | 'User-Agent': 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " 70 | "AppleWebKit/605.1.15 (KHTML, like Gecko) " 71 | "Version/14.0 Safari/605.1.15" 72 | } 73 | 74 | page = requests.get(URL_BASE, headers=headers) 75 | soup = BeautifulSoup(page.content, 'html.parser') 76 | 77 | scripts = soup.find_all(name='script', string=re.compile(r"^AF_initDataCallback")) 78 | scripts = [s.text for s in scripts] 79 | 80 | data_by_key = _get_data_by_key(scripts) 81 | 82 | # Get all languages (ds:3) 83 | # data for 'ds:3' is 84 | # [ 85 | # [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]], 86 | # [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]] 87 | # ] 88 | # (Note: list[0] and list[1] are identical) 89 | all_langs_raw = data_by_key["ds:3"] 90 | 91 | # Get languages codes that have TTS (ds:6) 92 | # data for 'ds:6' is 93 | # [ 94 | # [['af', 200], ['ar', 200], ...] 95 | # ] 96 | tts_langs_raw = data_by_key["ds:6"] 97 | tts_langs = [lang[0] for lang in tts_langs_raw[0]] 98 | 99 | # Create language dict (and filter only TTS-enabled langs) 100 | # langs = { lang[0], lang[1] for lang in all_langs_raw[0] } 101 | 102 | langs = {k: v for k, v in all_langs_raw[0] if k in tts_langs} 103 | return langs 104 | 105 | 106 | if __name__ == "__main__": 107 | """Language list generation 'main' 108 | 109 | CLI to generate the language list as a dict in 110 | an importable python file/module 111 | 112 | Usage: 113 | python ./scripts/gen_langs.py ./gTTS/gtts/langs.py 114 | 115 | """ 116 | 117 | lang_file_path = sys.argv[1] 118 | with open(lang_file_path, 'w') as f: 119 | langs = _fetch_langs() 120 | 121 | py_content = f"""# Note: this file is generated 122 | _langs = {json.dumps(langs, indent=4, sort_keys=True)} 123 | 124 | def _main_langs(): 125 | return _langs 126 | """ 127 | 128 | f.write(py_content) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'gTTS' 23 | copyright = '2014-2021 Pierre Nicolas Durette' 24 | author = 'Pierre Nicolas Durette' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx_click.ext', 45 | 'sphinx.ext.napoleon', 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ['_templates'] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = '.rst' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path . 70 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = 'sphinx' 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = 'sphinx_rtd_theme' 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 92 | html_static_path = ['_static'] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = 'gTTSdoc' 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 
115 | # 116 | # 'papersize': 'letterpaper', 117 | 118 | # The font size ('10pt', '11pt' or '12pt'). 119 | # 120 | # 'pointsize': '10pt', 121 | 122 | # Additional stuff for the LaTeX preamble. 123 | # 124 | # 'preamble': '', 125 | 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'gTTS.tex', 'gTTS Documentation', 136 | 'Pierre-Nick Durette', 'manual'), 137 | ] 138 | 139 | 140 | # -- Options for manual page output ------------------------------------------- 141 | 142 | # One entry per manual page. List of tuples 143 | # (source start file, name, description, authors, manual section). 144 | man_pages = [ 145 | (master_doc, 'gtts', 'gTTS Documentation', 146 | [author], 1) 147 | ] 148 | 149 | 150 | # -- Options for Texinfo output ---------------------------------------------- 151 | 152 | # Grouping the document tree into Texinfo files. List of tuples 153 | # (source start file, target name, title, author, 154 | # dir menu entry, description, category) 155 | texinfo_documents = [ 156 | (master_doc, 'gTTS', 'gTTS Documentation', 157 | author, 'gTTS', 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | 162 | # -- Extension configuration ------------------------------------------------- 163 | -------------------------------------------------------------------------------- /docs/module.rst: -------------------------------------------------------------------------------- 1 | Module (:mod:`gtts`) 2 | ==================== 3 | 4 | .. contents:: :local: 5 | :depth: 2 6 | 7 | gTTS (:class:`gtts.gTTS`) 8 | ------------------------- 9 | 10 | .. automodule:: gtts.tts 11 | :members: 12 | 13 | Languages (:mod:`gtts.lang`) 14 | ---------------------------- 15 | 16 | .. note:: The easiest way to get a list of available languages is to print them 17 | with ``gtts-cli --all`` 18 | 19 | .. automodule:: gtts.lang 20 | :members: 21 | 22 | Localized 'accents' 23 | ------------------- 24 | 25 | For a given language, Google Translate text-to-speech can speak in different 26 | local 'accents' depending on the Google domain (``google.<tld>``) of the request, 27 | with some examples shown in the table below. 28 | 29 | .. note:: This is an **incomplete** list. Try different combinations of language codes and 30 | `known localized Google domains `_. Feel 31 | free to add new combinations to this list via a Pull Request!
32 | 33 | +---------------------------+--------------------------+----------------------------+ 34 | | Local accent | Language code (``lang``) | Top-level domain (``tld``) | 35 | +===========================+==========================+============================+ 36 | | English (Australia) | ``en`` | ``com.au`` | 37 | +---------------------------+--------------------------+----------------------------+ 38 | | English (United Kingdom) | ``en`` | ``co.uk`` | 39 | +---------------------------+--------------------------+----------------------------+ 40 | | English (United States) | ``en`` | ``com`` (default) | 41 | +---------------------------+--------------------------+----------------------------+ 42 | | English (Canada) | ``en`` | ``ca`` | 43 | +---------------------------+--------------------------+----------------------------+ 44 | | English (India) | ``en`` | ``co.in`` | 45 | +---------------------------+--------------------------+----------------------------+ 46 | | English (Ireland) | ``en`` | ``ie`` | 47 | +---------------------------+--------------------------+----------------------------+ 48 | | English (South Africa) | ``en`` | ``co.za`` | 49 | +---------------------------+--------------------------+----------------------------+ 50 | | French (Canada) | ``fr`` | ``ca`` | 51 | +---------------------------+--------------------------+----------------------------+ 52 | | French (France) | ``fr`` | ``fr`` | 53 | +---------------------------+--------------------------+----------------------------+ 54 | | Mandarin (China Mainland) | ``zh-CN`` | any | 55 | +---------------------------+--------------------------+----------------------------+ 56 | | Mandarin (Taiwan) | ``zh-TW`` | any | 57 | +---------------------------+--------------------------+----------------------------+ 58 | | Portuguese (Brazil) | ``pt`` | ``com.br`` | 59 | +---------------------------+--------------------------+----------------------------+ 60 | | Portuguese (Portugal) | ``pt`` | ``pt`` | 61 | +---------------------------+--------------------------+----------------------------+ 62 | | Spanish (Mexico) | ``es`` | ``com.mx`` | 63 | +---------------------------+--------------------------+----------------------------+ 64 | | Spanish (Spain) | ``es`` | ``es`` | 65 | +---------------------------+--------------------------+----------------------------+ 66 | | Spanish (United States) | ``es`` | ``com`` (default) | 67 | +---------------------------+--------------------------+----------------------------+ 68 | 69 | 70 | Examples 71 | -------- 72 | 73 | Write 'hello' in English to ``hello.mp3``:: 74 | 75 | >>> from gtts import gTTS 76 | >>> tts = gTTS('hello', lang='en') 77 | >>> tts.save('hello.mp3') 78 | 79 | Write 'hello' in Australian English to ``hello.mp3``:: 80 | 81 | >>> from gtts import gTTS 82 | >>> tts = gTTS('hello', lang='en', tld='com.au') 83 | >>> tts.save('hello.mp3') 84 | 85 | Write 'hello bonjour' in English then French to ``hello_bonjour.mp3``:: 86 | 87 | >>> from gtts import gTTS 88 | >>> tts_en = gTTS('hello', lang='en') 89 | >>> tts_fr = gTTS('bonjour', lang='fr') 90 | >>> 91 | >>> with open('hello_bonjour.mp3', 'wb') as f: 92 | ... tts_en.write_to_fp(f) 93 | ... tts_fr.write_to_fp(f) 94 | 95 | Playing sound directly 96 | ---------------------- 97 | 98 | There's quite a few libraries that do this. 
Write 'hello' to a file-like object 99 | to do further manipulation::: 100 | 101 | >>> from gtts import gTTS 102 | >>> from io import BytesIO 103 | >>> 104 | >>> mp3_fp = BytesIO() 105 | >>> tts = gTTS('hello', lang='en') 106 | >>> tts.write_to_fp(mp3_fp) 107 | >>> 108 | >>> # Load `mp3_fp` as an mp3 file in 109 | >>> # the audio library of your choice 110 | 111 | .. note:: See `Issue #26 `_ for 112 | a discussion and examples of direct playback using various methods. 113 | 114 | 115 | Logging 116 | ------- 117 | 118 | :mod:`gtts` does logging using the standard Python logging module. The following loggers are available: 119 | 120 | ``gtts.tts`` 121 | Logger used for the :class:`gTTS` class 122 | 123 | ``gtts.lang`` 124 | Logger used for the :mod:`lang` module (language fetching) 125 | 126 | ``gtts`` 127 | Upstream logger for all of the above 128 | 129 | -------------------------------------------------------------------------------- /gtts/tests/test_tts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pytest 4 | from mock import Mock 5 | from six.moves import urllib 6 | 7 | from gtts.tts import gTTS, gTTSError 8 | from gtts.langs import _main_langs 9 | from gtts.lang import _extra_langs 10 | 11 | # Testing all languages takes some time. 12 | # Set TEST_LANGS envvar to choose languages to test. 13 | # * 'main': Languages extracted from the Web 14 | # * 'extra': Languagee set in Languages.EXTRA_LANGS 15 | # * 'all': All of the above 16 | # * : Languages tags list to test 17 | # Unset TEST_LANGS to test everything ('all') 18 | # See: langs_dict() 19 | 20 | 21 | """Construct a dict of suites of languages to test. 22 | { '' : } 23 | 24 | ex.: { 'fetch' : {'en': 'English', 'fr': 'French'}, 25 | 'extra' : {'en': 'English', 'fr': 'French'} } 26 | ex.: { 'environ' : ['en', 'fr'] } 27 | """ 28 | env = os.environ.get('TEST_LANGS') 29 | if not env or env == 'all': 30 | langs = _main_langs() 31 | langs.update(_extra_langs()) 32 | elif env == 'main': 33 | langs = _main_langs() 34 | elif env == 'extra': 35 | langs = _extra_langs() 36 | else: 37 | env_langs = {l: l for l in env.split(',') if l} 38 | langs = env_langs 39 | 40 | 41 | @pytest.mark.net 42 | @pytest.mark.parametrize('lang', langs.keys(), ids=list(langs.values())) 43 | def test_TTS(tmp_path, lang): 44 | """Test all supported languages and file save""" 45 | 46 | text = "This is a test" 47 | """Create output .mp3 file successfully""" 48 | for slow in (False, True): 49 | filename = tmp_path / 'test_{}_.mp3'.format(lang) 50 | # Create gTTS and save 51 | tts = gTTS(text=text, lang=lang, slow=slow, lang_check=False) 52 | tts.save(filename) 53 | 54 | # Check if files created is > 1.5 55 | assert filename.stat().st_size > 1500 56 | 57 | 58 | @pytest.mark.net 59 | def test_unsupported_language_check(): 60 | """Raise ValueError on unsupported language (with language check)""" 61 | lang = 'xx' 62 | text = "Lorem ipsum" 63 | check = True 64 | with pytest.raises(ValueError): 65 | gTTS(text=text, lang=lang, lang_check=check) 66 | 67 | 68 | def test_empty_string(): 69 | """Raise AssertionError on empty string""" 70 | text = "" 71 | with pytest.raises(AssertionError): 72 | gTTS(text=text) 73 | 74 | 75 | def test_no_text_parts(tmp_path): 76 | """Raises AssertionError on no content to send to API (no text_parts)""" 77 | text = " ..,\n" 78 | with pytest.raises(AssertionError): 79 | filename = tmp_path / 'no_content.txt' 80 | tts = gTTS(text=text) 81 | tts.save(filename) 82 | 
83 | 84 | # Test write_to_fp()/save() cases not covered elsewhere in this file 85 | 86 | def test_bad_fp_type(): 87 | """Raise TypeError if fp is not a file-like object (no .write())""" 88 | # Create gTTS and save 89 | tts = gTTS(text='test') 90 | with pytest.raises(TypeError): 91 | tts.write_to_fp(5) 92 | 93 | 94 | @pytest.mark.net 95 | def test_save(tmp_path): 96 | """Save .mp3 file successfully""" 97 | filename = tmp_path / 'save.mp3' 98 | # Create gTTS and save 99 | tts = gTTS(text='test') 100 | tts.save(filename) 101 | 102 | # Check if file created is > 2k 103 | assert filename.stat().st_size > 2000 104 | 105 | 106 | @pytest.mark.net 107 | def test_get_bodies(): 108 | """get request bodies list""" 109 | tts = gTTS(text='test', tld='com', lang='en') 110 | body = tts.get_bodies()[0] 111 | assert 'test' in body 112 | # \"en\" url-encoded 113 | assert '%5C%22en%5C%22' in body 114 | 115 | 116 | def test_msg(): 117 | """Test gTTsError internal exception handling 118 | Set exception message successfully""" 119 | error1 = gTTSError('test') 120 | assert 'test' == error1.msg 121 | 122 | error2 = gTTSError() 123 | assert error2.msg is None 124 | 125 | 126 | def test_infer_msg(): 127 | """Infer message sucessfully based on context""" 128 | 129 | # Without response: 130 | 131 | # Bad TLD 132 | ttsTLD = Mock(tld='invalid') 133 | errorTLD = gTTSError(tts=ttsTLD) 134 | assert errorTLD.msg == "Failed to connect. Probable cause: Host 'https://translate.google.invalid/' is not reachable" 135 | 136 | # With response: 137 | 138 | # 403 139 | tts403 = Mock() 140 | response403 = Mock(status_code=403, reason='aaa') 141 | error403 = gTTSError(tts=tts403, response=response403) 142 | assert error403.msg == "403 (aaa) from TTS API. Probable cause: Bad token or upstream API changes" 143 | 144 | # 200 (and not lang_check) 145 | tts200 = Mock(lang='xx', lang_check=False) 146 | response404 = Mock(status_code=200, reason='bbb') 147 | error200 = gTTSError(tts=tts200, response=response404) 148 | assert error200.msg == "200 (bbb) from TTS API. Probable cause: No audio stream in response. Unsupported language 'xx'" 149 | 150 | # >= 500 151 | tts500 = Mock() 152 | response500 = Mock(status_code=500, reason='ccc') 153 | error500 = gTTSError(tts=tts500, response=response500) 154 | assert error500.msg == "500 (ccc) from TTS API. Probable cause: Uptream API error. Try again later." 155 | 156 | # Unknown (ex. 100) 157 | tts100 = Mock() 158 | response100 = Mock(status_code=100, reason='ddd') 159 | error100 = gTTSError(tts=tts100, response=response100) 160 | assert error100.msg == "100 (ddd) from TTS API. Probable cause: Unknown" 161 | 162 | 163 | @pytest.mark.net 164 | def test_WebRequest(tmp_path): 165 | """Test Web Requests""" 166 | 167 | text = "Lorem ipsum" 168 | 169 | """Raise gTTSError on unsupported language (without language check)""" 170 | lang = 'xx' 171 | check = False 172 | 173 | with pytest.raises(gTTSError): 174 | filename = tmp_path / 'xx.txt' 175 | # Create gTTS 176 | tts = gTTS(text=text, lang=lang, lang_check=check) 177 | tts.save(filename) 178 | 179 | 180 | if __name__ == '__main__': 181 | pytest.main(['-x', __file__]) 182 | -------------------------------------------------------------------------------- /docs/tokenizer.rst: -------------------------------------------------------------------------------- 1 | .. 
module:: gtts.tokenizer 2 | 3 | Pre-processing and tokenizing 4 | ============================= 5 | 6 | The :mod:`gtts.tokenizer` module powers the default pre-processing and tokenizing features of ``gTTS`` and provides tools to easily expand them. :class:`gtts.tts.gTTS` takes two arguments ``pre_processor_funcs`` (list of functions) and ``tokenizer_func`` (function). See: `Pre-processing`_, `Tokenizing`_. 7 | 8 | .. contents:: :local: 9 | :depth: 2 10 | 11 | Definitions 12 | ----------- 13 | 14 | Pre-processor: 15 | Function that takes text and returns text. Its goal is to modify text (for example correcting pronounciation), and/or to prepare text for proper tokenization (for example enuring spacing after certain characters). 16 | 17 | Tokenizer: 18 | Function that takes text and returns it split into a list of `tokens` (strings). 19 | In the ``gTTS`` context, its goal is to cut the text into smaller segments that do not exceed the maximum character size allowed for each TTS API request, while making the speech sound natural and continuous. 20 | It does so by splitting text where speech would naturaly pause (for example on ".") while handling where it should not (for example on "10.5" or "U.S.A."). Such rules are called `tokenizer cases`, which it takes a list of. 21 | 22 | Tokenizer case: 23 | Function that defines one of the specific cases used by :class:`gtts.tokenizer.core.Tokenizer`. More specefically, it returns a ``regex`` object that describes what to look for for a particular case. :class:`gtts.tokenizer.core.Tokenizer` then creates its main `regex` pattern by joining all `tokenizer cases` with "|". 24 | 25 | 26 | Pre-processing 27 | -------------- 28 | 29 | You can pass a list of any function to :class:`gtts.tts.gTTS`'s ``pre_processor_funcs`` attribute to act as pre-processor (as long as it takes a string and returns a string). 30 | 31 | By default, :class:`gtts.tts.gTTS` takes a list of the following pre-processors, applied in order:: 32 | 33 | [ 34 | pre_processors.tone_marks, 35 | pre_processors.end_of_line, 36 | pre_processors.abbreviations, 37 | pre_processors.word_sub 38 | ] 39 | 40 | .. automodule:: gtts.tokenizer.pre_processors 41 | :members: 42 | 43 | Customizing & Examples 44 | ~~~~~~~~~~~~~~~~~~~~~~ 45 | 46 | This module provides two classes to help build pre-processors: 47 | 48 | * :class:`gtts.tokenizer.core.PreProcessorRegex` (for `regex`-based replacing, as would ``re.sub`` use) 49 | * :class:`gtts.tokenizer.core.PreProcessorSub` (for word-for-word replacements). 50 | 51 | The ``run(text)`` method of those objects returns the processed text. 52 | 53 | Speech corrections (word substitution) 54 | ______________________________________ 55 | 56 | The default substitutions are defined by the :attr:`gtts.tokenizer.symbols.SUB_PAIRS` list. Add a custom one by appending to it: 57 | 58 | :: 59 | 60 | >>> from gtts.tokenizer import pre_processors 61 | >>> import gtts.tokenizer.symbols 62 | >>> 63 | >>> gtts.tokenizer.symbols.SUB_PAIRS.append( 64 | ... ('sub.', 'submarine') 65 | ... ) 66 | >>> test_text = "Have you seen the Queen's new sub.?" 67 | >>> pre_processors.word_sub(test_text) 68 | "Have you seen the Queen's new submarine?" 69 | 70 | Abbreviations 71 | _____________ 72 | 73 | The default abbreviations are defined by the :attr:`gtts.tokenizer.symbols.ABBREVIATIONS` list. Add a custom one to it to add a new abbreviation to remove the period from. 
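For example (a minimal sketch; assumes ``bros`` is not already in the default list)::

    >>> import gtts.tokenizer.symbols
    >>>
    >>> gtts.tokenizer.symbols.ABBREVIATIONS.append('bros')

Subsequent calls to :func:`gtts.tokenizer.pre_processors.abbreviations` should then drop the period after "bros." when pre-processing text.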
*Note: the default list already includes an extensive list of English abbreviations that Google Translate will read even without the period.* 74 | 75 | See :mod:`gtts.tokenizer.pre_processors` for more examples. 76 | 77 | Tokenizing 78 | ---------- 79 | 80 | You can pass any function to :class:`gtts.tts.gTTS`'s ``tokenizer_func`` attribute to act as tokenizer (as long as it takes a string and returns a list of strings). 81 | 82 | By default, :class:`gTTS` takes the :class:`gtts.tokenizer.core.Tokenizer`'s :func:`gtts.tokenizer.core.Tokenizer.run()`, initialized with default `tokenizer cases`:: 83 | 84 | Tokenizer([ 85 | tokenizer_cases.tone_marks, 86 | tokenizer_cases.period_comma, 87 | tokenizer_cases.other_punctuation 88 | ]).run 89 | 90 | The available `tokenizer cases` are as follows: 91 | 92 | .. automodule:: gtts.tokenizer.tokenizer_cases 93 | :members: 94 | 95 | Customizing & Examples 96 | ~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | A `tokenizer case` is a function that returns a compiled `regex` object to be used in a ``re.split()`` context. 99 | 100 | :class:`gtts.tokenizer.core.Tokenizer` takes a list of `tokenizer cases` and joins their pattern with "|" in one single pattern. 101 | 102 | This module provides a class to help build tokenizer cases: :class:`gtts.tokenizer.core.RegexBuilder`. See :class:`gtts.tokenizer.core.RegexBuilder` and :mod:`gtts.tokenizer.tokenizer_cases` for examples. 103 | 104 | Using a 3rd-party tokenizer 105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 106 | 107 | Even though :class:`gtts.tokenizer.core.Tokenizer` works well in this context, there are way more advanced tokenizers and tokenzing techniques. As long as you can restrict the lenght of output tokens, you can use any tokenizer you'd like, such as the ones in `NLTK `_. 108 | 109 | Minimizing 110 | ---------- 111 | 112 | The Google Translate text-to-speech API accepts a maximum of **100 characters**. 113 | 114 | If after tokenization any of the tokens is larger than 100 characters, it will be split in two: 115 | 116 | * On the last space character that is closest to, but before the 100th character; 117 | * Between the 100th and 101st characters if there's no space. 118 | 119 | gtts.tokenizer module reference (:mod:`gtts.tokenizer`) 120 | ------------------------------------------------------- 121 | 122 | .. autoclass:: gtts.tokenizer.core.RegexBuilder 123 | :members: 124 | :undoc-members: 125 | 126 | .. autoclass:: gtts.tokenizer.core.PreProcessorRegex 127 | :members: 128 | :undoc-members: 129 | 130 | .. autoclass:: gtts.tokenizer.core.PreProcessorSub 131 | :members: 132 | :undoc-members: 133 | 134 | .. autoclass:: gtts.tokenizer.core.Tokenizer 135 | :members: 136 | :undoc-members: 137 | 138 | .. autoattribute:: gtts.tokenizer.symbols.ABBREVIATIONS 139 | .. autoattribute:: gtts.tokenizer.symbols.SUB_PAIRS 140 | .. autoattribute:: gtts.tokenizer.symbols.ALL_PUNC 141 | .. 
autoattribute:: gtts.tokenizer.symbols.TONE_MARKS 142 | -------------------------------------------------------------------------------- /gtts/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts import gTTS, gTTSError, __version__ 3 | from gtts.lang import tts_langs 4 | import click 5 | import logging 6 | import logging.config 7 | 8 | # Click settings 9 | CONTEXT_SETTINGS = { 10 | 'help_option_names': ['-h', '--help'] 11 | } 12 | 13 | # Logger settings 14 | LOGGER_SETTINGS = { 15 | 'version': 1, 16 | 'formatters': { 17 | 'default': { 18 | 'format': '%(name)s - %(levelname)s - %(message)s' 19 | } 20 | }, 21 | 'handlers': { 22 | 'console': { 23 | 'class': 'logging.StreamHandler', 24 | 'formatter': 'default' 25 | } 26 | }, 27 | 'loggers': { 28 | 'gtts': { 29 | 'handlers': ['console'], 30 | 'level': 'WARNING' 31 | } 32 | } 33 | } 34 | 35 | # Logger 36 | logging.config.dictConfig(LOGGER_SETTINGS) 37 | log = logging.getLogger('gtts') 38 | 39 | 40 | def sys_encoding(): 41 | """Charset to use for --file |- (stdin)""" 42 | return 'utf8' 43 | 44 | 45 | def validate_text(ctx, param, text): 46 | """Validation callback for the argument. 47 | Ensures (arg) and (opt) are mutually exclusive 48 | """ 49 | if not text and 'file' not in ctx.params: 50 | # No and no 51 | raise click.BadParameter( 52 | " or -f/--file required") 53 | if text and 'file' in ctx.params: 54 | # Both and 55 | raise click.BadParameter( 56 | " and -f/--file can't be used together") 57 | return text 58 | 59 | 60 | def validate_lang(ctx, param, lang): 61 | """Validation callback for the option. 62 | Ensures is a supported language unless the flag is set 63 | """ 64 | if ctx.params['nocheck']: 65 | return lang 66 | 67 | try: 68 | if lang not in tts_langs(): 69 | raise click.UsageError( 70 | "'%s' not in list of supported languages.\n" 71 | "Use --all to list languages or " 72 | "add --nocheck to disable language check." % lang) 73 | else: 74 | # The language is valid. 75 | # No need to let gTTS re-validate. 76 | ctx.params['nocheck'] = True 77 | except RuntimeError as e: 78 | # Only case where the flag can be False 79 | # Non-fatal. gTTS will try to re-validate. 80 | log.debug(str(e), exc_info=True) 81 | 82 | return lang 83 | 84 | 85 | def print_languages(ctx, param, value): 86 | """Callback for flag. 87 | Prints formatted sorted list of supported languages and exits 88 | """ 89 | if not value or ctx.resilient_parsing: 90 | return 91 | 92 | try: 93 | langs = tts_langs() 94 | langs_str_list = sorted("{}: {}".format(k, langs[k]) for k in langs) 95 | click.echo(' ' + '\n '.join(langs_str_list)) 96 | except RuntimeError as e: # pragma: no cover 97 | log.debug(str(e), exc_info=True) 98 | raise click.ClickException("Couldn't fetch language list.") 99 | ctx.exit() 100 | 101 | 102 | def set_debug(ctx, param, debug): 103 | """Callback for flag. 104 | Sets logger level to DEBUG 105 | """ 106 | if debug: 107 | log.setLevel(logging.DEBUG) 108 | return 109 | 110 | 111 | @click.command(context_settings=CONTEXT_SETTINGS) 112 | @click.argument('text', metavar='', required=False, callback=validate_text) 113 | @click.option( 114 | '-f', 115 | '--file', 116 | metavar='', 117 | # For py2.7/unicode. 
If encoding not None Click uses io.open 118 | type=click.File(encoding=sys_encoding()), 119 | help="Read from instead of .") 120 | @click.option( 121 | '-o', 122 | '--output', 123 | metavar='', 124 | type=click.File(mode='wb'), 125 | help="Write to instead of stdout.") 126 | @click.option( 127 | '-s', 128 | '--slow', 129 | default=False, 130 | is_flag=True, 131 | help="Read more slowly.") 132 | @click.option( 133 | '-l', 134 | '--lang', 135 | metavar='', 136 | default='en', 137 | show_default=True, 138 | callback=validate_lang, 139 | help="IETF language tag. Language to speak in. List documented tags with --all.") 140 | @click.option( 141 | '-t', 142 | '--tld', 143 | metavar='', 144 | default='com', 145 | show_default=True, 146 | is_eager=True, # Prioritize to ensure it gets set before 147 | help="Top-level domain for the Google host, i.e https://translate.google.") 148 | @click.option( 149 | '--nocheck', 150 | default=False, 151 | is_flag=True, 152 | is_eager=True, # Prioritize to ensure it gets set before 153 | help="Disable strict IETF language tag checking. Allow undocumented tags.") 154 | @click.option( 155 | '--all', 156 | default=False, 157 | is_flag=True, 158 | is_eager=True, 159 | expose_value=False, 160 | callback=print_languages, 161 | help="Print all documented available IETF language tags and exit.") 162 | @click.option( 163 | '--debug', 164 | default=False, 165 | is_flag=True, 166 | is_eager=True, # Prioritize to see debug logs of callbacks 167 | expose_value=False, 168 | callback=set_debug, 169 | help="Show debug information.") 170 | @click.version_option(version=__version__) 171 | def tts_cli(text, file, output, slow, tld, lang, nocheck): 172 | """ Read to mp3 format using Google Translate's Text-to-Speech API 173 | (set or --file to - for standard input) 174 | """ 175 | 176 | # stdin for 177 | if text == '-': 178 | text = click.get_text_stream('stdin').read() 179 | 180 | # stdout (when no ) 181 | if not output: 182 | output = click.get_binary_stream('stdout') 183 | 184 | # input (stdin on '-' is handled by click.File) 185 | if file: 186 | try: 187 | text = file.read() 188 | except UnicodeDecodeError as e: # pragma: no cover 189 | log.debug(str(e), exc_info=True) 190 | raise click.FileError( 191 | file.name, 192 | " must be encoded using '%s'." 
% 193 | sys_encoding()) 194 | 195 | # TTS 196 | try: 197 | tts = gTTS( 198 | text=text, 199 | lang=lang, 200 | slow=slow, 201 | tld=tld, 202 | lang_check=not nocheck) 203 | tts.write_to_fp(output) 204 | except (ValueError, AssertionError) as e: 205 | raise click.UsageError(str(e)) 206 | except gTTSError as e: 207 | raise click.ClickException(str(e)) 208 | -------------------------------------------------------------------------------- /gtts/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | import re 4 | import os 5 | from click.testing import CliRunner 6 | from gtts.cli import tts_cli 7 | 8 | # Need to look into gTTS' log output to test proper instantiation 9 | # - Use testfixtures.LogCapture() b/c TestCase.assertLogs() needs py3.4+ 10 | # - Clear 'gtts' logger handlers (set in gtts.cli) to reduce test noise 11 | import logging 12 | from testfixtures import LogCapture 13 | logger = logging.getLogger('gtts') 14 | logger.handlers = [] 15 | 16 | 17 | """Test options and arguments""" 18 | 19 | 20 | def runner(args, input=None): 21 | return CliRunner().invoke(tts_cli, args, input) 22 | 23 | 24 | def runner_debug(args, input=None): 25 | return CliRunner().invoke(tts_cli, args + ['--debug'], input) 26 | 27 | 28 | # tests 29 | def test_text_no_text_or_file(): 30 | """One of (arg) and should be set""" 31 | result = runner_debug([]) 32 | 33 | assert " required" in result.output 34 | assert result.exit_code != 0 35 | 36 | 37 | def test_text_text_and_file(tmp_path): 38 | """ (arg) and should not be set together""" 39 | filename = tmp_path / 'test_and_file.txt' 40 | filename.touch() 41 | 42 | result = runner_debug(['--file', str(filename), 'test']) 43 | 44 | assert " can't be used together" in result.output 45 | assert result.exit_code != 0 46 | 47 | 48 | def test_text_empty(tmp_path): 49 | """Exit on no text to speak (via )""" 50 | filename = tmp_path / 'text_empty.txt' 51 | filename.touch() 52 | 53 | result = runner_debug(['--file', str(filename)]) 54 | 55 | assert "No text to speak" in result.output 56 | assert result.exit_code != 0 57 | 58 | 59 | # tests 60 | def test_file_not_exists(): 61 | """ should exist""" 62 | result = runner_debug(['--file', 'notexist.txt', 'test']) 63 | 64 | assert "No such file or directory" in result.output 65 | assert result.exit_code != 0 66 | 67 | 68 | # tests 69 | @pytest.mark.net 70 | def test_all(): 71 | """Option should return a list of languages""" 72 | result = runner(['--all']) 73 | 74 | # One or more of " xy: name" (\n optional to match the last) 75 | # Ex. 
" xx: xxxxx\n xx-yy: xxxxx\n xx: xxxxx" 76 | 77 | assert re.match(r"^(?:\s{2}(\w{2}|\w{2}-\w{2}): .+\n?)+$", result.output) 78 | assert result.exit_code == 0 79 | 80 | 81 | # tests 82 | @pytest.mark.net 83 | def test_lang_not_valid(): 84 | """Invalid should display an error""" 85 | result = runner(['--lang', 'xx', 'test']) 86 | 87 | assert "xx' not in list of supported languages" in result.output 88 | assert result.exit_code != 0 89 | 90 | 91 | @pytest.mark.net 92 | def test_lang_nocheck(): 93 | """Invalid (with ) should display an error message from gtts""" 94 | with LogCapture() as lc: 95 | result = runner_debug(['--lang', 'xx', '--nocheck', 'test']) 96 | 97 | log = str(lc) 98 | 99 | assert 'lang: xx' in log 100 | assert 'lang_check: False' in log 101 | assert "Unsupported language 'xx'" in result.output 102 | assert result.exit_code != 0 103 | 104 | # Param set tests 105 | @pytest.mark.net 106 | def test_params_set(): 107 | """Options should set gTTS instance arguments (read from debug log)""" 108 | with LogCapture() as lc: 109 | result = runner_debug(['--lang', 'fr', '--tld', 'es', '--slow', '--nocheck', 'test']) 110 | 111 | log = str(lc) 112 | 113 | assert 'lang: fr' in log 114 | assert 'tld: es' in log 115 | assert 'lang_check: False' in log 116 | assert 'slow: True' in log 117 | assert 'text: test' in log 118 | assert result.exit_code == 0 119 | 120 | 121 | # Test all input methods 122 | pwd = os.path.dirname(__file__) 123 | 124 | # Text for stdin ('-' for or ) 125 | textstdin = """stdin 126 | test 127 | 123""" 128 | 129 | # Text for stdin ('-' for or ) (Unicode) 130 | textstdin_unicode = u"""你吃饭了吗? 131 | 你最喜欢哪部电影? 132 | 我饿了,我要去做饭了。""" 133 | 134 | # Text for and 135 | text = """Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger? 
136 | How much will it cost the website doesn't have the theme i was going for.""" 137 | 138 | textfile_ascii = os.path.join(pwd, 'input_files', 'test_cli_test_ascii.txt') 139 | 140 | # Text for and (Unicode) 141 | text_unicode = u"""这是一个三岁的小孩 142 | 在讲述她从一系列照片里看到的东西。 143 | 对这个世界, 她也许还有很多要学的东西, 144 | 但在一个重要的任务上, 她已经是专家了: 145 | 去理解她所看到的东西。""" 146 | 147 | textfile_utf8 = os.path.join(pwd, 'input_files', 'test_cli_test_utf8.txt') 148 | 149 | """ 150 | Method that mimics's LogCapture's __str__ method to make 151 | the string in the comprehension a unicode literal for P2.7 152 | https://github.com/Simplistix/testfixtures/blob/32c87902cb111b7ede5a6abca9b597db551c88ef/testfixtures/logcapture.py#L149 153 | """ 154 | 155 | 156 | def logcapture_str(lc): 157 | if not lc.records: 158 | return 'No logging captured' 159 | 160 | return '\n'.join([u"%s %s\n %s" % r for r in lc.actual()]) 161 | 162 | 163 | @pytest.mark.net 164 | def test_stdin_text(): 165 | with LogCapture() as lc: 166 | result = runner_debug(['-'], textstdin) 167 | log = logcapture_str(lc) 168 | 169 | assert 'text: %s' % textstdin in log 170 | assert result.exit_code == 0 171 | 172 | 173 | @pytest.mark.net 174 | def test_stdin_text_unicode(): 175 | with LogCapture() as lc: 176 | result = runner_debug(['-'], textstdin_unicode) 177 | log = logcapture_str(lc) 178 | 179 | assert u'text: %s' % textstdin_unicode in log 180 | assert result.exit_code == 0 181 | 182 | 183 | @pytest.mark.net 184 | def test_stdin_file(): 185 | with LogCapture() as lc: 186 | result = runner_debug(['--file', '-'], textstdin) 187 | log = logcapture_str(lc) 188 | 189 | assert 'text: %s' % textstdin in log 190 | assert result.exit_code == 0 191 | 192 | 193 | @pytest.mark.net 194 | def test_stdin_file_unicode(): 195 | with LogCapture() as lc: 196 | result = runner_debug(['--file', '-'], textstdin_unicode) 197 | log = logcapture_str(lc) 198 | 199 | assert 'text: %s' % textstdin_unicode in log 200 | assert result.exit_code == 0 201 | 202 | 203 | @pytest.mark.net 204 | def test_text(): 205 | with LogCapture() as lc: 206 | result = runner_debug([text]) 207 | log = logcapture_str(lc) 208 | 209 | assert "text: %s" % text in log 210 | assert result.exit_code == 0 211 | 212 | 213 | @pytest.mark.net 214 | def test_text_unicode(): 215 | with LogCapture() as lc: 216 | result = runner_debug([text_unicode]) 217 | log = logcapture_str(lc) 218 | 219 | assert "text: %s" % text_unicode in log 220 | assert result.exit_code == 0 221 | 222 | 223 | @pytest.mark.net 224 | def test_file_ascii(): 225 | with LogCapture() as lc: 226 | result = runner_debug(['--file', textfile_ascii]) 227 | log = logcapture_str(lc) 228 | 229 | assert "text: %s" % text in log 230 | assert result.exit_code == 0 231 | 232 | 233 | @pytest.mark.net 234 | def test_file_utf8(): 235 | with LogCapture() as lc: 236 | result = runner_debug(['--file', textfile_utf8]) 237 | log = logcapture_str(lc) 238 | 239 | assert "text: %s" % text_unicode in log 240 | assert result.exit_code == 0 241 | 242 | 243 | @pytest.mark.net 244 | def test_stdout(): 245 | result = runner(['test']) 246 | 247 | # The MP3 encoding (LAME 3.99.5) used to leave a signature in the raw output 248 | # This no longer appears to be the case 249 | assert result.exit_code == 0 250 | 251 | 252 | @pytest.mark.net 253 | def test_file(tmp_path): 254 | filename = tmp_path / 'out.mp3' 255 | 256 | result = runner(['test', '--output', str(filename)]) 257 | 258 | # Check if files created is > 2k 259 | assert filename.stat().st_size > 2000 260 | assert result.exit_code == 0 
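# Illustrative sketch (not part of the original suite): the --version flag
# is wired through click.version_option in gtts/cli.py, so the reported
# version should include gtts.__version__. No network access assumed.
def test_version_sketch():
    from gtts import __version__

    result = runner(['--version'])

    assert __version__ in result.output
    assert result.exit_code == 0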
261 | 262 | 263 | if __name__ == '__main__': 264 | pytest.main(['-x', __file__]) 265 | -------------------------------------------------------------------------------- /gtts/tokenizer/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | 5 | class RegexBuilder(): 6 | r"""Builds regex using arguments passed into a pattern template. 7 | 8 | Builds a regex object for which the pattern is made from an argument 9 | passed into a template. If more than one argument is passed (iterable), 10 | each pattern is joined by "|" (regex alternation 'or') to create a 11 | single pattern. 12 | 13 | Args: 14 | pattern_args (iteratable): String element(s) to be each passed to 15 | ``pattern_func`` to create a regex pattern. Each element is 16 | ``re.escape``'d before being passed. 17 | pattern_func (callable): A 'template' function that should take a 18 | string and return a string. It should take an element of 19 | ``pattern_args`` and return a valid regex pattern group string. 20 | flags: ``re`` flag(s) to compile with the regex. 21 | 22 | Example: 23 | To create a simple regex that matches on the characters "a", "b", 24 | or "c", followed by a period:: 25 | 26 | >>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x)) 27 | 28 | Looking at ``rb.regex`` we get the following compiled regex:: 29 | 30 | >>> print(rb.regex) 31 | 'a\.|b\.|c\.' 32 | 33 | The above is fairly simple, but this class can help in writing more 34 | complex repetitive regex, making them more readable and easier to 35 | create by using existing data structures. 36 | 37 | Example: 38 | To match the character following the words "lorem", "ipsum", "meili" 39 | or "koda":: 40 | 41 | >>> words = ['lorem', 'ipsum', 'meili', 'koda'] 42 | >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x)) 43 | 44 | Looking at ``rb.regex`` we get the following compiled regex:: 45 | 46 | >>> print(rb.regex) 47 | '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).' 48 | 49 | """ 50 | 51 | def __init__(self, pattern_args, pattern_func, flags=0): 52 | self.pattern_args = pattern_args 53 | self.pattern_func = pattern_func 54 | self.flags = flags 55 | 56 | # Compile 57 | self.regex = self._compile() 58 | 59 | def _compile(self): 60 | alts = [] 61 | for arg in self.pattern_args: 62 | arg = re.escape(arg) 63 | alt = self.pattern_func(arg) 64 | alts.append(alt) 65 | 66 | pattern = '|'.join(alts) 67 | return re.compile(pattern, self.flags) 68 | 69 | def __repr__(self): # pragma: no cover 70 | return str(self.regex) 71 | 72 | 73 | class PreProcessorRegex(): 74 | r"""Regex-based substitution text pre-processor. 75 | 76 | Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a 77 | :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl`` 78 | replacement parameter. 79 | 80 | Args: 81 | search_args (iteratable): String element(s) to be each passed to 82 | ``search_func`` to create a regex pattern. Each element is 83 | ``re.escape``'d before being passed. 84 | search_func (callable): A 'template' function that should take a 85 | string and return a string. It should take an element of 86 | ``search_args`` and return a valid regex search pattern string. 87 | repl (string): The common replacement passed to the ``sub`` method for 88 | each ``regex``. Can be a raw string (the case of a regex 89 | backreference, for example) 90 | flags: ``re`` flag(s) to compile with each `regex`. 91 | 92 | Example: 93 | Add "!" 
after the words "lorem" or "ipsum", while ignoring case:: 94 | 95 | >>> import re 96 | >>> words = ['lorem', 'ipsum'] 97 | >>> pp = PreProcessorRegex(words, 98 | ... lambda x: "({})".format(x), r'\\1!', 99 | ... re.IGNORECASE) 100 | 101 | In this case, the regex is a group and the replacement uses its 102 | backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the 103 | following list of search/replacement pairs:: 104 | 105 | >>> print(pp) 106 | (re.compile('(lorem)', re.IGNORECASE), repl='\1!'), 107 | (re.compile('(ipsum)', re.IGNORECASE), repl='\1!') 108 | 109 | It can then be run on any string of text:: 110 | 111 | >>> pp.run("LOREM ipSuM") 112 | "LOREM! ipSuM!" 113 | 114 | See :mod:`gtts.tokenizer.pre_processors` for more examples. 115 | 116 | """ 117 | 118 | def __init__(self, search_args, search_func, repl, flags=0): 119 | self.repl = repl 120 | 121 | # Create regex list 122 | self.regexes = [] 123 | for arg in search_args: 124 | rb = RegexBuilder([arg], search_func, flags) 125 | self.regexes.append(rb.regex) 126 | 127 | def run(self, text): 128 | """Run each regex substitution on ``text``. 129 | 130 | Args: 131 | text (string): the input text. 132 | 133 | Returns: 134 | string: text after all substitutions have been sequentially 135 | applied. 136 | 137 | """ 138 | for regex in self.regexes: 139 | text = regex.sub(self.repl, text) 140 | return text 141 | 142 | def __repr__(self): # pragma: no cover 143 | subs_strs = [] 144 | for r in self.regexes: 145 | subs_strs.append("({}, repl='{}')".format(r, self.repl)) 146 | return ", ".join(subs_strs) 147 | 148 | 149 | class PreProcessorSub(): 150 | r"""Simple substitution text preprocessor. 151 | 152 | Performs string-for-string substitution from list a find/replace pairs. 153 | It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default 154 | simple substitution regex. 155 | 156 | Args: 157 | sub_pairs (list): A list of tuples of the style 158 | ``(, )`` 159 | ignore_case (bool): Ignore case during search. Defaults to ``True``. 160 | 161 | Example: 162 | Replace all occurences of "Mac" to "PC" and "Firefox" to "Chrome":: 163 | 164 | >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')] 165 | >>> pp = PreProcessorSub(sub_pairs) 166 | 167 | Looking at the ``pp``, we get the following list of 168 | search (regex)/replacement pairs:: 169 | 170 | >>> print(pp) 171 | (re.compile('Mac', re.IGNORECASE), repl='PC'), 172 | (re.compile('Firefox', re.IGNORECASE), repl='Chrome') 173 | 174 | It can then be run on any string of text:: 175 | 176 | >>> pp.run("I use firefox on my mac") 177 | "I use Chrome on my PC" 178 | 179 | See :mod:`gtts.tokenizer.pre_processors` for more examples. 180 | 181 | """ 182 | 183 | def __init__(self, sub_pairs, ignore_case=True): 184 | def search_func(x): 185 | return u"{}".format(x) 186 | 187 | flags = re.I if ignore_case else 0 188 | 189 | # Create pre-processor list 190 | self.pre_processors = [] 191 | for sub_pair in sub_pairs: 192 | pattern, repl = sub_pair 193 | pp = PreProcessorRegex([pattern], search_func, repl, flags) 194 | self.pre_processors.append(pp) 195 | 196 | def run(self, text): 197 | """Run each substitution on ``text``. 198 | 199 | Args: 200 | text (string): the input text. 201 | 202 | Returns: 203 | string: text after all substitutions have been sequentially 204 | applied. 
205 | 206 | """ 207 | for pp in self.pre_processors: 208 | text = pp.run(text) 209 | return text 210 | 211 | def __repr__(self): # pragma: no cover 212 | return ", ".join([str(pp) for pp in self.pre_processors]) 213 | 214 | 215 | class Tokenizer(): 216 | r"""An extensible but simple generic rule-based tokenizer. 217 | 218 | A generic and simple string tokenizer that takes a list of functions 219 | (called `tokenizer cases`) returning ``regex`` objects and joins them by 220 | "|" (regex alternation 'or') to create a single regex to use with the 221 | standard ``regex.split()`` function. 222 | 223 | ``regex_funcs`` is a list of any function that can return a ``regex`` 224 | (from ``re.compile()``) object, such as a 225 | :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex`` 226 | attribute). 227 | 228 | See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples. 229 | 230 | Args: 231 | regex_funcs (list): List of compiled ``regex`` objects. Each 232 | functions's pattern will be joined into a single pattern and 233 | compiled. 234 | flags: ``re`` flag(s) to compile with the final regex. Defaults to 235 | ``re.IGNORECASE`` 236 | 237 | Note: 238 | When the ``regex`` objects obtained from ``regex_funcs`` are joined, 239 | their individual ``re`` flags are ignored in favour of ``flags``. 240 | 241 | Raises: 242 | TypeError: When an element of ``regex_funcs`` is not a function, or 243 | a function that does not return a compiled ``regex`` object. 244 | 245 | Warning: 246 | Joined ``regex`` patterns can easily interfere with one another in 247 | unexpected ways. It is recommanded that each tokenizer case operate 248 | on distinct or non-overlapping chracters/sets of characters 249 | (For example, a tokenizer case for the period (".") should also 250 | handle not matching/cutting on decimals, instead of making that 251 | a seperate tokenizer case). 252 | 253 | Example: 254 | A tokenizer with a two simple case (*Note: these are bad cases to 255 | tokenize on, this is simply a usage example*):: 256 | 257 | >>> import re, RegexBuilder 258 | >>> 259 | >>> def case1(): 260 | ... return re.compile("\,") 261 | >>> 262 | >>> def case2(): 263 | ... return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex 264 | >>> 265 | >>> t = Tokenizer([case1, case2]) 266 | 267 | Looking at ``case1().pattern``, we get:: 268 | 269 | >>> print(case1().pattern) 270 | '\\,' 271 | 272 | Looking at ``case2().pattern``, we get:: 273 | 274 | >>> print(case2().pattern) 275 | 'a\\.|b\\.|c\\.' 276 | 277 | Finally, looking at ``t``, we get them combined:: 278 | 279 | >>> print(t) 280 | 're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE) 281 | from: [, ]' 282 | 283 | It can then be run on any string of text:: 284 | 285 | >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend") 286 | ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"] 287 | 288 | """ 289 | 290 | def __init__(self, regex_funcs, flags=re.IGNORECASE): 291 | self.regex_funcs = regex_funcs 292 | self.flags = flags 293 | 294 | try: 295 | # Combine 296 | self.total_regex = self._combine_regex() 297 | except (TypeError, AttributeError) as e: # pragma: no cover 298 | raise TypeError( 299 | "Tokenizer() expects a list of functions returning " 300 | "regular expression objects (i.e. re.compile). 
" + str(e)) 301 | 302 | def _combine_regex(self): 303 | alts = [] 304 | for func in self.regex_funcs: 305 | alts.append(func()) 306 | 307 | pattern = '|'.join(alt.pattern for alt in alts) 308 | return re.compile(pattern, self.flags) 309 | 310 | def run(self, text): 311 | """Tokenize `text`. 312 | 313 | Args: 314 | text (string): the input text to tokenize. 315 | 316 | Returns: 317 | list: A list of strings (token) split according to the tokenizer cases. 318 | 319 | """ 320 | return self.total_regex.split(text) 321 | 322 | def __repr__(self): # pragma: no cover 323 | return str(self.total_regex) + " from: " + str(self.regex_funcs) 324 | -------------------------------------------------------------------------------- /gtts/tts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gtts.tokenizer import pre_processors, Tokenizer, tokenizer_cases 3 | from gtts.utils import _minimize, _len, _clean_tokens, _translate_url 4 | from gtts.lang import tts_langs, _fallback_deprecated_lang 5 | 6 | from six.moves import urllib 7 | try: 8 | from urllib.parse import quote 9 | import urllib3 10 | except ImportError: 11 | from urllib import quote 12 | import urllib2 13 | import requests 14 | import logging 15 | import json 16 | import re 17 | import base64 18 | 19 | __all__ = ['gTTS', 'gTTSError'] 20 | 21 | # Logger 22 | log = logging.getLogger(__name__) 23 | log.addHandler(logging.NullHandler()) 24 | 25 | 26 | class Speed: 27 | """Read Speed 28 | 29 | The Google TTS Translate API supports two speeds: 30 | Slow: True 31 | Normal: None 32 | """ 33 | SLOW = True 34 | NORMAL = None 35 | 36 | 37 | class gTTS: 38 | """gTTS -- Google Text-to-Speech. 39 | 40 | An interface to Google Translate's Text-to-Speech API. 41 | 42 | Args: 43 | text (string): The text to be read. 44 | tld (string): Top-level domain for the Google Translate host, 45 | i.e `https://translate.google.`. Different Google domains 46 | can produce different localized 'accents' for a given 47 | language. This is also useful when ``google.com`` might be blocked 48 | within a network but a local or different Google host 49 | (e.g. ``google.cn``) is not. Default is ``com``. 50 | lang (string, optional): The language (IETF language tag) to 51 | read the text in. Default is ``en``. 52 | slow (bool, optional): Reads text more slowly. Defaults to ``False``. 53 | lang_check (bool, optional): Strictly enforce an existing ``lang``, 54 | to catch a language error early. If set to ``True``, 55 | a ``ValueError`` is raised if ``lang`` doesn't exist. 56 | Setting ``lang_check`` to ``False`` skips Web requests 57 | (to validate language) and therefore speeds up instanciation. 58 | Default is ``True``. 59 | pre_processor_funcs (list): A list of zero or more functions that are 60 | called to transform (pre-process) text before tokenizing. Those 61 | functions must take a string and return a string. Defaults to:: 62 | 63 | [ 64 | pre_processors.tone_marks, 65 | pre_processors.end_of_line, 66 | pre_processors.abbreviations, 67 | pre_processors.word_sub 68 | ] 69 | 70 | tokenizer_func (callable): A function that takes in a string and 71 | returns a list of string (tokens). 
Defaults to:: 72 | 73 | Tokenizer([ 74 | tokenizer_cases.tone_marks, 75 | tokenizer_cases.period_comma, 76 | tokenizer_cases.colon, 77 | tokenizer_cases.other_punctuation 78 | ]).run 79 | 80 | See Also: 81 | :doc:`Pre-processing and tokenizing ` 82 | 83 | Raises: 84 | AssertionError: When ``text`` is ``None`` or empty; when there's nothing 85 | left to speak after pre-precessing, tokenizing and cleaning. 86 | ValueError: When ``lang_check`` is ``True`` and ``lang`` is not supported. 87 | RuntimeError: When ``lang_check`` is ``True`` but there's an error loading 88 | the languages dictionary. 89 | 90 | """ 91 | 92 | GOOGLE_TTS_MAX_CHARS = 100 # Max characters the Google TTS API takes at a time 93 | GOOGLE_TTS_HEADERS = { 94 | "Referer": "http://translate.google.com/", 95 | "User-Agent": 96 | "Mozilla/5.0 (Windows NT 10.0; WOW64) " 97 | "AppleWebKit/537.36 (KHTML, like Gecko) " 98 | "Chrome/47.0.2526.106 Safari/537.36", 99 | "Content-Type": "application/x-www-form-urlencoded;charset=utf-8" 100 | } 101 | GOOGLE_TTS_RPC = "jQ1olc" 102 | 103 | def __init__( 104 | self, 105 | text, 106 | tld='com', 107 | lang='en', 108 | slow=False, 109 | lang_check=True, 110 | pre_processor_funcs=[ 111 | pre_processors.tone_marks, 112 | pre_processors.end_of_line, 113 | pre_processors.abbreviations, 114 | pre_processors.word_sub 115 | ], 116 | tokenizer_func=Tokenizer([ 117 | tokenizer_cases.tone_marks, 118 | tokenizer_cases.period_comma, 119 | tokenizer_cases.colon, 120 | tokenizer_cases.other_punctuation 121 | ]).run 122 | ): 123 | 124 | # Debug 125 | for k, v in dict(locals()).items(): 126 | if k == 'self': 127 | continue 128 | log.debug("%s: %s", k, v) 129 | 130 | # Text 131 | assert text, 'No text to speak' 132 | self.text = text 133 | 134 | # Translate URL top-level domain 135 | self.tld = tld 136 | 137 | # Language 138 | self.lang_check = lang_check 139 | self.lang = lang 140 | 141 | if self.lang_check: 142 | # Fallback lang in case it is deprecated 143 | self.lang = _fallback_deprecated_lang(lang) 144 | 145 | try: 146 | langs = tts_langs() 147 | if self.lang not in langs: 148 | raise ValueError("Language not supported: %s" % lang) 149 | except RuntimeError as e: 150 | log.debug(str(e), exc_info=True) 151 | log.warning(str(e)) 152 | 153 | # Read speed 154 | if slow: 155 | self.speed = Speed.SLOW 156 | else: 157 | self.speed = Speed.NORMAL 158 | 159 | # Pre-processors and tokenizer 160 | self.pre_processor_funcs = pre_processor_funcs 161 | self.tokenizer_func = tokenizer_func 162 | 163 | def _tokenize(self, text): 164 | # Pre-clean 165 | text = text.strip() 166 | 167 | # Apply pre-processors 168 | for pp in self.pre_processor_funcs: 169 | log.debug("pre-processing: %s", pp) 170 | text = pp(text) 171 | 172 | if _len(text) <= self.GOOGLE_TTS_MAX_CHARS: 173 | return _clean_tokens([text]) 174 | 175 | # Tokenize 176 | log.debug("tokenizing: %s", self.tokenizer_func) 177 | tokens = self.tokenizer_func(text) 178 | 179 | # Clean 180 | tokens = _clean_tokens(tokens) 181 | 182 | # Minimize 183 | min_tokens = [] 184 | for t in tokens: 185 | min_tokens += _minimize(t, ' ', self.GOOGLE_TTS_MAX_CHARS) 186 | 187 | # Filter empty tokens, post-minimize 188 | tokens = [t for t in min_tokens if t] 189 | 190 | return min_tokens 191 | 192 | def _prepare_requests(self): 193 | """Created the TTS API the request(s) without sending them. 194 | 195 | Returns: 196 | list: ``requests.PreparedRequests_``. `_``. 
197 | """ 198 | # TTS API URL 199 | translate_url = _translate_url(tld=self.tld, path="_/TranslateWebserverUi/data/batchexecute") 200 | 201 | text_parts = self._tokenize(self.text) 202 | log.debug("text_parts: %s", str(text_parts)) 203 | log.debug("text_parts: %i", len(text_parts)) 204 | assert text_parts, 'No text to send to TTS API' 205 | 206 | prepared_requests = [] 207 | for idx, part in enumerate(text_parts): 208 | data = self._package_rpc(part) 209 | 210 | log.debug("data-%i: %s", idx, data) 211 | 212 | # Request 213 | r = requests.Request(method='POST', 214 | url=translate_url, 215 | data=data, 216 | headers=self.GOOGLE_TTS_HEADERS) 217 | 218 | # Prepare request 219 | prepared_requests.append(r.prepare()) 220 | 221 | return prepared_requests 222 | 223 | def _package_rpc(self, text): 224 | parameter = [text, self.lang, self.speed, "null"] 225 | escaped_parameter = json.dumps(parameter, separators=(',', ':')) 226 | 227 | rpc = [[[self.GOOGLE_TTS_RPC, escaped_parameter, None, "generic"]]] 228 | espaced_rpc = json.dumps(rpc, separators=(',', ':')) 229 | return "f.req={}&".format(quote(espaced_rpc)) 230 | 231 | def get_bodies(self): 232 | """Get TTS API request bodies(s) that would be sent to the TTS API. 233 | 234 | Returns: 235 | list: A list of TTS API request bodiess to make. 236 | """ 237 | return [pr.body for pr in self._prepare_requests()] 238 | 239 | def stream(self): 240 | """Do the TTS API request(s) and stream bytes 241 | 242 | Raises: 243 | :class:`gTTSError`: When there's an error with the API request. 244 | TypeError: When ``fp`` is not a file-like object that takes bytes. 245 | 246 | """ 247 | # When disabling ssl verify in requests (for proxies and firewalls), 248 | # urllib3 prints an insecure warning on stdout. We disable that. 249 | try: 250 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 251 | except: 252 | pass 253 | 254 | 255 | 256 | prepared_requests = self._prepare_requests() 257 | for idx, pr in enumerate(prepared_requests): 258 | try: 259 | with requests.Session() as s: 260 | # Send request 261 | r = s.send(request=pr, 262 | proxies=urllib.request.getproxies(), 263 | verify=False) 264 | 265 | log.debug("headers-%i: %s", idx, r.request.headers) 266 | log.debug("url-%i: %s", idx, r.request.url) 267 | log.debug("status-%i: %s", idx, r.status_code) 268 | 269 | r.raise_for_status() 270 | except requests.exceptions.HTTPError as e: # pragma: no cover 271 | # Request successful, bad response 272 | log.debug(str(e)) 273 | raise gTTSError(tts=self, response=r) 274 | except requests.exceptions.RequestException as e: # pragma: no cover 275 | # Request failed 276 | log.debug(str(e)) 277 | raise gTTSError(tts=self) 278 | 279 | # Write 280 | for line in r.iter_lines(chunk_size=1024): 281 | decoded_line = line.decode('utf-8') 282 | if 'jQ1olc' in decoded_line: 283 | audio_search = re.search(r'jQ1olc","\[\\"(.*)\\"]', decoded_line) 284 | if audio_search: 285 | as_bytes = audio_search.group(1).encode('ascii') 286 | yield base64.b64decode(as_bytes) 287 | else: 288 | # Request successful, good response, 289 | # no audio stream in response 290 | raise gTTSError(tts=self, response=r) 291 | log.debug("part-%i created", idx) 292 | 293 | def write_to_fp(self, fp): 294 | """Do the TTS API request(s) and write bytes to a file-like object. 295 | 296 | Args: 297 | fp (file object): Any file-like object to write the ``mp3`` to. 298 | 299 | Raises: 300 | :class:`gTTSError`: When there's an error with the API request. 
301 | TypeError: When ``fp`` is not a file-like object that takes bytes. 302 | 303 | """ 304 | 305 | try: 306 | for idx, decoded in enumerate(self.stream()): 307 | fp.write(decoded) 308 | log.debug("part-%i written to %s", idx, fp) 309 | except (AttributeError, TypeError) as e: 310 | raise TypeError( 311 | "'fp' is not a file-like object or it does not take bytes: %s" % 312 | str(e)) 313 | 314 | def save(self, savefile): 315 | """Do the TTS API request and write result to file. 316 | 317 | Args: 318 | savefile (string): The path and file name to save the ``mp3`` to. 319 | 320 | Raises: 321 | :class:`gTTSError`: When there's an error with the API request. 322 | 323 | """ 324 | with open(str(savefile), 'wb') as f: 325 | self.write_to_fp(f) 326 | log.debug("Saved to %s", savefile) 327 | 328 | 329 | class gTTSError(Exception): 330 | """Exception that uses context to present a meaningful error message""" 331 | 332 | def __init__(self, msg=None, **kwargs): 333 | self.tts = kwargs.pop('tts', None) 334 | self.rsp = kwargs.pop('response', None) 335 | if msg: 336 | self.msg = msg 337 | elif self.tts is not None: 338 | self.msg = self.infer_msg(self.tts, self.rsp) 339 | else: 340 | self.msg = None 341 | super(gTTSError, self).__init__(self.msg) 342 | 343 | def infer_msg(self, tts, rsp=None): 344 | """Attempt to guess what went wrong by using known 345 | information (e.g. http response) and observed behaviour 346 | 347 | """ 348 | cause = "Unknown" 349 | 350 | if rsp is None: 351 | premise = "Failed to connect" 352 | 353 | if tts.tld != 'com': 354 | host = _translate_url(tld=tts.tld) 355 | cause = "Host '{}' is not reachable".format(host) 356 | 357 | else: 358 | # rsp should be 359 | # http://docs.python-requests.org/en/master/api/ 360 | status = rsp.status_code 361 | reason = rsp.reason 362 | 363 | premise = "{:d} ({}) from TTS API".format(status, reason) 364 | 365 | if status == 403: 366 | cause = "Bad token or upstream API changes" 367 | elif status == 200 and not tts.lang_check: 368 | cause = "No audio stream in response. Unsupported language '%s'" % self.tts.lang 369 | elif status >= 500: 370 | cause = "Uptream API error. Try again later." 371 | 372 | return "{}. Probable cause: {}".format(premise, cause) 373 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | .. NOTE: You should *NOT* be adding new change log entries to this file, this 2 | file is managed by towncrier. You *may* edit previous change logs to 3 | fix problems like typo corrections or such. 4 | 5 | To add a new change log entry, please see CONTRIBUTING.rst 6 | 7 | Changelog 8 | ========= 9 | 10 | .. towncrier release notes start 11 | 12 | 2.2.3 (2021-06-17) 13 | ------------------ 14 | 15 | Features 16 | ~~~~~~~~ 17 | 18 | - Added Bulgarian language support (`#302 `_) 19 | 20 | 21 | 2.2.2 (2021-02-03) 22 | ------------------ 23 | 24 | Features 25 | ~~~~~~~~ 26 | 27 | - Adds a language fallback feature for deprecated languages to maintain compatiblity (e.g. ``en-us`` becomes ``en``). Fallback can be disabled with ``lang_check=False`` or ``--nocheck`` for the cli (`#267 `_) 28 | 29 | 30 | Bugfixes 31 | ~~~~~~~~ 32 | 33 | - Fix Python 2.7 compatiblity (!). Python 2 is long gone, but the cut wasn't clearly communicated for gTTS, so it was restored. Python 2 support will be completely removed in the next major release. 
(`#255 `_) 34 | - Language code case sensitivity is maintained throughout (`#267 `_) 35 | 36 | 37 | Deprecations and Removals 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | - The following list of 'hyphenated' language codes no longer work and have been removed: ``en-us``, ``en-ca``, ``en-uk``, ``en-gb``, ``en-au``, ``en-gh``, ``en-in``, ``en-ie``, ``en-nz``, ``en-ng``, ``en-ph``, ``en-za``, ``en-tz``, ``fr-ca``, ``fr-fr``, ``pt-br``, ``pt-pt``, ``es-es``, ``es-us``, ``zh-cn``, ``zh-tw`` (`#267 `_) 41 | - Removed the ``gtts.get_url()`` method (outdated since ``2.1.0``) (`#270 `_) 42 | 43 | 44 | 2.2.1 (2020-11-15) 45 | ------------------ 46 | 47 | Bugfixes 48 | ~~~~~~~~ 49 | 50 | - ``_package_rpc()`` was erroneously packaging the entire text instead of tokenized part (`#252 `_) 51 | 52 | 53 | Improved Documentation 54 | ~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | - Removes reference to automatic retrieval of languages (`#250 `_) 57 | 58 | 59 | Misc 60 | ~~~~ 61 | 62 | - `#251 `_ 63 | 64 | 65 | 2.2.0 (2020-11-14) 66 | ------------------ 67 | 68 | Features 69 | ~~~~~~~~ 70 | 71 | - Switch to the newer Google TTS API (thanks to `@Boudewijn26! `_). See `his great writeup `_ for more on the methodology and why this was necessary. (`#226 `_, `#232 `_, `#236 `_, `#241 `_) 72 | 73 | 74 | Deprecations and Removals 75 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | 77 | - Removed automatic language download from the main code, which has become too unreliable & slow. 78 | Languages will still be fetched but a pre-generated list will be shipped with ``gTTS``. (`#233 `_, `#241 `_, `#242 `_, `#243 `_) 79 | - Because languages are now pre-generated, removed custom TLD support for language URL (which allowed to get language **names** in other than English) (`#245 `_) 80 | 81 | 82 | Misc 83 | ~~~~ 84 | 85 | - `#245 `_ 86 | 87 | 88 | 2.1.2 (2020-11-10) 89 | ------------------ 90 | 91 | Features 92 | ~~~~~~~~ 93 | 94 | - Update `gTTS-token` to `1.1.4` (`#238 `_) 95 | 96 | 97 | Bugfixes 98 | ~~~~~~~~ 99 | 100 | - Fixed an issue where some tokens could be empty after minimization (`#229 `_, `#239 `_) 101 | 102 | 103 | Improved Documentation 104 | ~~~~~~~~~~~~~~~~~~~~~~ 105 | 106 | - Grammar, spelling and example fixes (`#227 `_) 107 | 108 | 109 | Misc 110 | ~~~~ 111 | 112 | - `#218 `_, `#230 `_, `#231 `_, `#239 `_ 113 | 114 | 115 | 2.1.1 (2020-01-25) 116 | ------------------ 117 | 118 | Bugfixes 119 | ~~~~~~~~ 120 | 121 | - Debug mode now uses a copy of locals() to prevent RuntimeError (`#213 `_) 122 | 123 | 124 | 2.1.0 (2020-01-01) 125 | ------------------ 126 | 127 | Features 128 | ~~~~~~~~ 129 | 130 | - The ``gtts`` module 131 | 132 | - Added the ability to customize the Google Translate URL hostname. 133 | This is useful when ``google.com`` might be blocked within a network but 134 | a local or different Google host (e.g. ``google.cn``) is not 135 | (`#143 `_, `#203 `_): 136 | 137 | - New ``gTTS()`` parameter ``tld`` to specify the top-level 138 | domain to use for the Google hostname, i.e ``https://translate.google.`` 139 | (default: ``com``). 140 | - Languages are also now fetched using the same customized hostname. 141 | 142 | - Pre-generated TTS API request URLs can now be obtained instead of 143 | writing an ``mp3`` file to disk (for example to be used in an 144 | external program): 145 | 146 | - New ``get_urls()`` method returns the list of URLs generated by ``gTTS``, 147 | which can be used in lieu of ``write_to_fp()`` or ``save()``. 
148 | 149 | - The ``gtts-cli`` command-line tool 150 | 151 | - New ``--tld`` option to match the new ``gtts`` customizable hostname (`#200 `_, `#207 `_) 152 | 153 | - Other 154 | 155 | - Added Python 3.8 support (`#204 `_) 156 | 157 | 158 | Bugfixes 159 | ~~~~~~~~ 160 | 161 | - Changed default word-for-word pre-processor (``('M.', 'Monsieur')``) which would substitute any 'm.' for 'monsieur' (e.g. 'them.' became 'themonsieur') (`#197 `_) 162 | 163 | 164 | Improved Documentation 165 | ~~~~~~~~~~~~~~~~~~~~~~ 166 | 167 | - Added examples for newer features (`#205 `_, `#207 `_) 168 | 169 | 170 | Misc 171 | ~~~~ 172 | 173 | - `#204 `_, `#205 `_, `#207 `_ 174 | 175 | 176 | 2.0.4 (2019-08-29) 177 | ------------------ 178 | 179 | Features 180 | ~~~~~~~~ 181 | 182 | - gTTS is now built as a wheel package (Python 2 & 3) (`#181 `_) 183 | 184 | 185 | Improved Documentation 186 | ~~~~~~~~~~~~~~~~~~~~~~ 187 | 188 | - Fixed bad example in docs (`#163 `_, `#166 `_) 189 | 190 | 191 | Misc 192 | ~~~~ 193 | 194 | - `#164 `_, `#171 `_, `#173 `_, `#185 `_ 195 | 196 | 197 | 2.0.3 (2018-12-15) 198 | ------------------ 199 | 200 | Features 201 | ~~~~~~~~ 202 | 203 | - Added new tokenizer case for ':' preventing cut in the middle of a time notation (`#135 `_) 204 | 205 | 206 | Misc 207 | ~~~~ 208 | 209 | - `#159 `_ 210 | 211 | 212 | 2.0.2 (2018-12-09) 213 | ------------------ 214 | 215 | Features 216 | ~~~~~~~~ 217 | 218 | - Added Python 3.7 support, modernization of packaging, testing and CI (`#126 `_) 219 | 220 | 221 | Bugfixes 222 | ~~~~~~~~ 223 | 224 | - Fixed language retrieval/validation broken from new Google Translate page (`#156 `_) 225 | 226 | 227 | 2.0.1 (2018-06-20) 228 | ------------------ 229 | 230 | Bugfixes 231 | ~~~~~~~~ 232 | 233 | - Fixed an UnicodeDecodeError when installing gTTS if system locale was not 234 | utf-8 (`#120 `_) 235 | 236 | 237 | Improved Documentation 238 | ~~~~~~~~~~~~~~~~~~~~~~ 239 | 240 | - Added *Pre-processing and tokenizing > Minimizing* section about the API's 241 | 100 characters limit and how larger tokens are handled (`#121 242 | `_) 243 | 244 | 245 | Misc 246 | ~~~~ 247 | 248 | - `#122 `_ 249 | 250 | 251 | 2.0.0 (2018-04-30) 252 | ------------------ 253 | (`#108 `_) 254 | 255 | Features 256 | ~~~~~~~~ 257 | 258 | - The ``gtts`` module 259 | 260 | - New logger ("gtts") replaces all occurrences of ``print()`` 261 | - Languages list is now obtained automatically (``gtts.lang``) 262 | (`#91 `_, 263 | `#94 `_, 264 | `#106 `_) 265 | - Added a curated list of language sub-tags that 266 | have been observed to provide different dialects or accents 267 | (e.g. "en-gb", "fr-ca") 268 | - New ``gTTS()`` parameter ``lang_check`` to disable language 269 | checking. 270 | - ``gTTS()`` now delegates the ``text`` tokenizing to the 271 | API request methods (i.e. ``write_to_fp()``, ``save()``), 272 | allowing ``gTTS`` instances to be modified/reused 273 | - Rewrote tokenizing and added pre-processing (see below) 274 | - New ``gTTS()`` parameters ``pre_processor_funcs`` and 275 | ``tokenizer_func`` to configure pre-processing and tokenizing 276 | (or use a 3rd party tokenizer) 277 | - Error handling: 278 | 279 | - Added new exception ``gTTSError`` raised on API request errors. 
280 | It attempts to guess what went wrong based on known information 281 | and observed behaviour 282 | (`#60 `_, 283 | `#106 `_) 284 | - ``gTTS.write_to_fp()`` and ``gTTS.save()`` also raise ``gTTSError`` 285 | on `gtts_token` error 286 | - ``gTTS.write_to_fp()`` raises ``TypeError`` when ``fp`` is not a 287 | file-like object or one that doesn't take bytes 288 | - ``gTTS()`` raises ``ValueError`` on unsupported languages 289 | (and ``lang_check`` is ``True``) 290 | - More fine-grained error handling throughout (e.g. 291 | `request failed` vs. `request successful with a bad response`) 292 | 293 | - Tokenizer (and new pre-processors): 294 | 295 | - Rewrote and greatly expanded tokenizer (``gtts.tokenizer``) 296 | - Smarter token 'cleaning' that will remove tokens that only contain 297 | characters that can't be spoken (i.e. punctuation and whitespace) 298 | - Decoupled token minimizing from tokenizing, making the latter usable 299 | in other contexts 300 | - New flexible speech-centric text pre-processing 301 | - New flexible full-featured regex-based tokenizer 302 | (``gtts.tokenizer.core.Tokenizer``) 303 | - New ``RegexBuilder``, ``PreProcessorRegex`` and ``PreProcessorSub`` classes 304 | to make writing regex-powered text `pre-processors` and `tokenizer cases` 305 | easier 306 | - Pre-processors: 307 | 308 | - Re-form words cut by end-of-line hyphens 309 | - Remove periods after a (customizable) list of known abbreviations 310 | (e.g. "jr", "sr", "dr") that can be spoken the same without a period 311 | - Perform speech corrections by doing word-for-word replacements 312 | from a (customizable) list of tuples 313 | 314 | - Tokenizing: 315 | 316 | - Keep punctuation that modify the inflection of speech (e.g. "?", "!") 317 | - Don't split in the middle of numbers (e.g. "10.5", "20,000,000") 318 | (`#101 `_) 319 | - Don't split on "dotted" abbreviations and accronyms (e.g. "U.S.A") 320 | - Added Chinese comma (","), ellipsis ("…") to punctuation list 321 | to tokenize on (`#86 `_) 322 | 323 | - The ``gtts-cli`` command-line tool 324 | 325 | - Rewrote cli as first-class citizen module (``gtts.cli``), 326 | powered by `Click `_ 327 | - Windows support using `setuptool`'s `entry_points` 328 | - Better support for Unicode I/O in Python 2 329 | - All arguments are now pre-validated 330 | - New ``--nocheck`` flag to skip language pre-checking 331 | - New ``--all`` flag to list all available languages 332 | - Either the ``--file`` option or the ```` argument can be set to 333 | "-" to read from ``stdin`` 334 | - The ``--debug`` flag uses logging and doesn't pollute ``stdout`` 335 | anymore 336 | 337 | 338 | Bugfixes 339 | ~~~~~~~~ 340 | 341 | - ``_minimize()``: Fixed an infinite recursion loop that would occur 342 | when a token started with the miminizing delimiter (i.e. a space) 343 | (`#86 `_) 344 | - ``_minimize()``: Handle the case where a token of more than 100 345 | characters did not contain a space (e.g. in Chinese). 
346 | - Fixed an issue that fused multiline text together if the total number of 347 | characters was less than 100 348 | - Fixed ``gtts-cli`` Unicode errors in Python 2.7 (famous last words) 349 | (`#78 `_, 350 | `#93 `_, 351 | `#96 `_) 352 | 353 | 354 | Deprecations and Removals 355 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 356 | 357 | - Dropped Python 3.3 support 358 | - Removed ``debug`` parameter of ``gTTS`` (in favour of logger) 359 | - ``gtts-cli``: Changed long option name of ``-o`` to ``--output`` 360 | instead of ``--destination`` 361 | - ``gTTS()`` will raise a ``ValueError`` rather than an ``AssertionError`` 362 | on unsupported language 363 | 364 | 365 | Improved Documentation 366 | ~~~~~~~~~~~~~~~~~~~~~~ 367 | 368 | - Rewrote all documentation files as reStructuredText 369 | - Comprehensive documentation writen for `Sphinx `_, published to http://gtts.readthedocs.io 370 | - Changelog built with `towncrier `_ 371 | 372 | Misc 373 | ~~~~ 374 | 375 | - Major test re-work 376 | - Language tests can read a ``TEST_LANGS`` enviromment variable so 377 | not all language tests are run every time. 378 | - Added `AppVeyor `_ CI for Windows 379 | - `PEP 8 `_ compliance 380 | 381 | 382 | 1.2.2 (2017-08-15) 383 | ------------------ 384 | 385 | Misc 386 | ~~~~ 387 | 388 | - Update LICENCE, add to manifest (`#77 `_) 389 | 390 | 391 | 1.2.1 (2017-08-02) 392 | ------------------ 393 | 394 | Features 395 | ~~~~~~~~ 396 | 397 | - Add Unicode punctuation to the tokenizer (such as for Chinese and Japanese) 398 | (`#75 `_) 399 | 400 | 401 | Bugfixes 402 | ~~~~~~~~ 403 | 404 | - Fix > 100 characters non-ASCII split, ``unicode()`` for Python 2 (`#71 405 | `_, `#73 406 | `_, `#75 407 | `_) 408 | 409 | 410 | 1.2.0 (2017-04-15) 411 | ------------------ 412 | 413 | Features 414 | ~~~~~~~~ 415 | 416 | - Option for slower read speed (``slow=True`` for ``gTTS()``, ``--slow`` for 417 | ``gtts-cli``) (`#40 `_, `#41 418 | `_, `#64 419 | `_, `#67 420 | `_) 421 | - System proxy settings are passed transparently to all http requests (`#45 422 | `_, `#68 423 | `_) 424 | - Silence SSL warnings from urllib3 (`#69 425 | `_) 426 | 427 | 428 | Bugfixes 429 | ~~~~~~~~ 430 | 431 | - The text to read is now cut in proper chunks in Python 2 unicode. This 432 | broke reading for many languages such as Russian. 433 | - Disabled SSL verify on http requests to accommodate certain firewalls 434 | and proxies. 
428 | Bugfixes
429 | ~~~~~~~~
430 |
431 | - The text to read is now cut in proper chunks in Python 2 unicode
432 |   (improper chunking broke reading for many languages such as Russian)
433 | - Disabled SSL verify on HTTP requests to accommodate certain firewalls
434 |   and proxies.
435 | - Better Python 2/3 support in general (`#9 `_,
436 |   `#48 `_, `#68
437 |   `_)
438 |
439 |
440 | Deprecations and Removals
441 | ~~~~~~~~~~~~~~~~~~~~~~~~~
442 |
443 | - 'pt-br' : 'Portuguese (Brazil)' (it was the same as 'pt' and not Brazilian)
444 |   (`#69 `_)
445 |
446 |
447 | 1.1.8 (2017-01-15)
448 | ------------------
449 |
450 | Features
451 | ~~~~~~~~
452 |
453 | - Added ``stdin`` support via the '-' ``text`` argument to ``gtts-cli`` (`#56
454 |   `_)
455 |
456 |
457 | 1.1.7 (2016-12-14)
458 | ------------------
459 |
460 | Features
461 | ~~~~~~~~
462 |
463 | - Added utf-8 support to ``gtts-cli`` (`#52
464 |   `_)
465 |
466 |
467 | 1.1.6 (2016-07-20)
468 | ------------------
469 |
470 | Features
471 | ~~~~~~~~
472 |
473 | - Added 'bn' : 'Bengali' (`#39 `_,
474 |   `#44 `_)
475 |
476 |
477 | Deprecations and Removals
478 | ~~~~~~~~~~~~~~~~~~~~~~~~~
479 |
480 | - 'ht' : 'Haitian Creole' (removed by Google) (`#43
481 |   `_)
482 |
483 |
484 | 1.1.5 (2016-05-13)
485 | ------------------
486 |
487 | Bugfixes
488 | ~~~~~~~~
489 |
490 | - Fixed HTTP 403s by updating the client argument to reflect new API usage
491 |   (`#32 `_, `#33
492 |   `_)
493 |
494 |
495 | 1.1.4 (2016-02-22)
496 | ------------------
497 |
498 | Features
499 | ~~~~~~~~
500 |
501 | - Spun off token calculation to `gTTS-Token
502 |   `_ (`#23
503 |   `_, `#29
504 |   `_)
505 |
506 |
507 | 1.1.3 (2016-01-24)
508 | ------------------
509 |
510 | Bugfixes
511 | ~~~~~~~~
512 |
513 | - ``gtts-cli`` works with Python 3 (`#20
514 |   `_)
515 | - Better support for non-ASCII characters (`#21
516 |   `_, `#22
517 |   `_)
518 |
519 |
520 | Misc
521 | ~~~~
522 |
523 | - Moved the gTTS token out to its own module (`#19 `_)
524 |
525 |
526 | 1.1.2 (2016-01-13)
527 | ------------------
528 |
529 | Features
530 | ~~~~~~~~
531 |
532 | - Added gTTS token (``tk`` URL parameter) calculation (`#14
533 |   `_, `#15
534 |   `_, `#17
535 |   `_)
536 |
537 |
538 | 1.0.7 (2015-10-07)
539 | ------------------
540 |
541 | Features
542 | ~~~~~~~~
543 |
544 | - Added ``stdout`` support to ``gtts-cli``; text is now an argument rather
545 |   than an option (`#10 `_)
546 |
547 |
548 | 1.0.6 (2015-07-30)
549 | ------------------
550 |
551 | Features
552 | ~~~~~~~~
553 |
554 | - Raise an exception on bad HTTP response (4xx or 5xx) (`#8
555 |   `_)
556 |
557 |
558 | Bugfixes
559 | ~~~~~~~~
560 |
561 | - Added the ``client=t`` parameter for the API HTTP request (`#8
562 |   `_)
563 |
564 |
565 | 1.0.5 (2015-07-15)
566 | ------------------
567 |
568 | Features
569 | ~~~~~~~~
570 |
571 | - ``write_to_fp()`` to write to a file-like object (`#6
572 |   `_)
573 |
574 |
575 | 1.0.4 (2015-05-11)
576 | ------------------
577 |
578 | Features
579 | ~~~~~~~~
580 |
581 | - Added Languages: `zh-yue` : 'Chinese (Cantonese)', `en-uk` : 'English (United
582 |   Kingdom)', `pt-br` : 'Portuguese (Brazil)', `es-es` : 'Spanish (Spain)',
583 |   `es-us` : 'Spanish (United States)', `zh-cn` : 'Chinese (Mandarin/China)',
584 |   `zh-tw` : 'Chinese (Mandarin/Taiwan)' (`#4
585 |   `_)
586 |
587 |
588 | Bugfixes
589 | ~~~~~~~~
590 |
591 | - ``gtts-cli`` prints the version and pretty-prints available languages;
592 |   language codes are now case-insensitive (`#4 `_)
593 |
594 |
595 | 1.0.3 (2014-11-21)
596 | ------------------
597 |
598 | Features
599 | ~~~~~~~~
600 |
601 | - Added Languages: 'en-us' : 'English (United States)', 'en-au' : 'English
602 |   (Australia)' (`#3 `_)
603 |
604 |
605 | 1.0.2 (2014-05-15)
606 | ------------------
607 |
608 | Features
609 | ~~~~~~~~
610 |
611 | - Python 3 support
612 |
613 |
614 | 1.0.1 (2014-05-15)
615 | ------------------
616 |
617 | Misc
618 | ~~~~
619 |
620 | - SemVer versioning, CI changes
621 |
622 |
623 | 1.0 (2014-05-08)
624 | ----------------
625 |
626 | Features
627 | ~~~~~~~~
628 |
629 | - Initial release
630 |
631 |
632 |
--------------------------------------------------------------------------------