├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   ├── linux.yaml
    │   ├── macos.yaml
    │   ├── publish_pypi.yaml
    │   └── windows.yaml
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE.txt
├── README.md
├── phonemizer
    ├── __init__.py
    ├── backend
    │   ├── __init__.py
    │   ├── base.py
    │   ├── espeak
    │   │   ├── __init__.py
    │   │   ├── api.py
    │   │   ├── base.py
    │   │   ├── espeak.py
    │   │   ├── language_switch.py
    │   │   ├── mbrola.py
    │   │   ├── voice.py
    │   │   ├── words_mismatch.py
    │   │   └── wrapper.py
    │   ├── festival
    │   │   ├── __init__.py
    │   │   ├── festival.py
    │   │   └── lispy.py
    │   └── segments.py
    ├── logger.py
    ├── main.py
    ├── phonemize.py
    ├── punctuation.py
    ├── separator.py
    ├── share
    │   ├── festival
    │   │   └── phonemize.scm
    │   └── segments
    │   │   ├── chintang.g2p
    │   │   ├── cree.g2p
    │   │   ├── inuktitut.g2p
    │   │   ├── japanese.g2p
    │   │   ├── sesotho.g2p
    │   │   └── yucatec.g2p
    ├── utils.py
    └── version.py
├── setup.cfg
├── setup.py
└── test
    ├── __init__.py
    ├── test_espeak.py
    ├── test_espeak_lang_switch.py
    ├── test_espeak_wrapper.py
    ├── test_festival.py
    ├── test_import.py
    ├── test_main.py
    ├── test_mbrola.py
    ├── test_phonemize.py
    ├── test_punctuation.py
    ├── test_segments.py
    ├── test_separator.py
    └── test_utils.py


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **Phonemizer version**
14 | The output of `phonemize --version` from the command line, very helpful!
15 | 
16 | **System**
17 | Your OS (Linux distribution, Windows, ...) and, optionally, your Python version.
18 | 
19 | **To reproduce**
20 | A short example (Python script or command) reproducing the bug.
21 | 
22 | **Expected behavior**
23 | A clear and concise description of what you expected to happen.
24 | 
25 | **Additional context**
26 | Add any other context about the problem here.
27 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/workflows/linux.yaml:
--------------------------------------------------------------------------------
 1 | # Test on Linux Ubuntu with festival-2.5 with various Python and espeak versions
 2 | 
 3 | name: Linux
 4 | 
 5 | on: [push, pull_request]
 6 | 
 7 | jobs:
 8 |   # test with every Python version of the matrix, espeak-ng installed from apt
 9 |   python-version:
10 |     runs-on: ubuntu-latest
11 | 
12 |     strategy:
13 |       matrix:
14 |         # versions are quoted so that '3.10' is not read as the float 3.1
15 |         python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
16 | 
17 |     steps:
18 |       - name: Checkout phonemizer
19 |         uses: actions/checkout@v2
20 | 
21 |       - name: Setup python
22 |         uses: actions/setup-python@v2
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 | 
26 |       - name: Install system dependencies
27 |         run: |
28 |           sudo apt-get update
29 |           sudo apt-get install -y espeak-ng festival mbrola mbrola-fr1
30 | 
31 |       - name: Install phonemizer
32 |         run: |
33 |           pip install --upgrade pip pytest pytest-cov
34 |           python setup.py install
35 | 
36 |       - name: Version phonemizer
37 |         run: phonemize --version
38 | 
39 |       - name: Test phonemizer
40 |         run: pytest -v --cov=phonemizer --cov-report=xml test/
41 | 
42 |       - name: Upload coverage to Codecov
43 |         if: ${{ matrix.python-version == '3.9' }}
44 |         uses: codecov/codecov-action@v2
45 |         with:
46 |           files: coverage.xml
47 |           verbose: true
48 | 
49 |   # test with every espeak version of the matrix (no Python version is pinned)
50 |   espeak-version:
51 |     runs-on: ubuntu-latest
52 | 
53 |     strategy:
54 |       matrix:
55 |         espeak-version: ['1.48.03', '1.49.2', '1.50']
56 | 
57 |     steps:
58 |       - name: Checkout phonemizer
59 |         uses: actions/checkout@v2
60 | 
61 |       - name: Setup python
62 |         uses: actions/setup-python@v2
63 | 
64 |       - name: Install system dependencies
65 |         run: |
66 |           sudo apt-get update
67 |           sudo apt-get install -y festival mbrola mbrola-fr1
68 | 
69 |       - name: Install espeak-1.48
70 |         if: ${{ matrix.espeak-version == '1.48.03' }}
71 |         run: sudo apt-get install -y espeak
72 | 
73 |       # espeak>=1.49 is built from the espeak-ng git tag named after the version
74 |       - name: Install espeak>=1.49
75 |         if: ${{ matrix.espeak-version != '1.48.03' }}
76 |         env:
77 |           ESPEAK_VERSION: ${{ matrix.espeak-version }}
78 |         run: |
79 |           sudo apt-get install -y make autoconf automake libtool pkg-config gcc libsonic-dev libpcaudio-dev git
80 |           git clone --depth 1 --branch $ESPEAK_VERSION https://github.com/espeak-ng/espeak-ng.git
81 |           cd espeak-ng
82 |           ./autogen.sh
83 |           ./configure
84 |           make
85 |           sudo make install
86 |           sudo ldconfig
87 |           espeak --version
88 | 
89 |       - name: Install phonemizer
90 |         run: |
91 |           pip install --upgrade pip pytest
92 |           python setup.py install
93 | 
94 |       - name: Version phonemizer
95 |         run: phonemize --version
96 | 
97 |       - name: Test phonemizer
98 |         run: pytest -v
99 | 


--------------------------------------------------------------------------------
/.github/workflows/macos.yaml:
--------------------------------------------------------------------------------
 1 | # Test on macos with espeak-1.48 installed from homebrew and festival-2.4
 2 | # compiled from source. To save time and resources, festival is cached
 3 | # across runs.
 4 | 
 5 | name: MacOS
 6 | 
 7 | on:
 8 |   - push
 9 |   - pull_request
10 | 
11 | jobs:
12 |   test:
13 |     runs-on: macos-latest
14 | 
15 |     env:
16 |       # festival executable, located inside the cached 'festival' folder
17 |       PHONEMIZER_FESTIVAL_EXECUTABLE: ${{ github.workspace }}/festival/build_festival/festival/bin/festival
18 | 
19 |     steps:
20 |       - name: Checkout phonemizer
21 |         uses: actions/checkout@v2
22 | 
23 |       - name: Setup python
24 |         uses: actions/setup-python@v2
25 | 
26 |       - name: Install espeak-1.48
27 |         run: |
28 |           brew update
29 |           brew install espeak
30 | 
31 |       - name: Cache festival
32 |         id: cache-festival
33 |         uses: actions/cache@v2
34 |         with:
35 |           key: ${{ runner.os }}-festival
36 |           path: ${{ github.workspace }}/festival
37 | 
38 |       # the two steps below are skipped when festival is restored from cache
39 |       - name: Checkout festival
40 |         if: ${{ steps.cache-festival.outputs.cache-hit != 'true' }}
41 |         uses: actions/checkout@v2
42 |         with:
43 |           path: festival
44 |           repository: pettarin/setup-festival-mbrola
45 | 
46 |       - name: Install festival
47 |         if: ${{ steps.cache-festival.outputs.cache-hit != 'true' }}
48 |         working-directory: festival
49 |         run: bash setup_festival_mbrola.sh . festival
50 | 
51 |       - name: Install phonemizer
52 |         run: |
53 |           pip install --upgrade pip
54 |           python setup.py install
55 |           pip install --upgrade pytest
56 | 
57 |       - name: Version phonemizer
58 |         run: phonemize --version
59 | 
60 |       - name: Test phonemizer
61 |         run: pytest -v
62 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_pypi.yaml:
--------------------------------------------------------------------------------
 1 | # Upload to pypi on new tags
 2 | 
 3 | name: Publish to Pypi
 4 | 
 5 | on:
 6 |   push:
 7 |     # only tags starting with 'v' trigger a release (glob quoted on purpose)
 8 |     tags:
 9 |       - 'v*'
10 | 
11 | jobs:
12 |   publish:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       - name: Checkout phonemizer
17 |         uses: actions/checkout@v2
18 | 
19 |       - name: Setup python
20 |         uses: actions/setup-python@v2
21 | 
22 |       - name: Install system dependencies
23 |         run: |
24 |           sudo apt-get update
25 |           sudo apt-get install -y espeak-ng festival mbrola mbrola-fr1
26 | 
27 |       # phonemizer is installed and tested before the sdist and wheel are built
28 |       - name: Build phonemizer
29 |         run: |
30 |           pip install --upgrade pip pytest wheel
31 |           python setup.py install
32 |           pytest
33 |           python setup.py sdist bdist_wheel
34 | 
35 |       - name: Publish to Pypi
36 |         uses: pypa/gh-action-pypi-publish@release/v1
37 |         with:
38 |           user: __token__
39 |           password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.github/workflows/windows.yaml:
--------------------------------------------------------------------------------
 1 | # Test on windows with espeak-1.50 and festival-2.5
 2 | 
 3 | name: Windows
 4 | 
 5 | on: [push, pull_request]
 6 | 
 7 | jobs:
 8 |   test:
 9 |     runs-on: windows-latest
10 | 
11 |     env:
12 |       PHONEMIZER_ESPEAK_LIBRARY: "C:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
13 |       PHONEMIZER_FESTIVAL_EXECUTABLE: "C:\\festival\\src\\main\\festival.exe"
14 | 
15 |     steps:
16 |       - name: Checkout phonemizer
17 |         uses: actions/checkout@v2
18 | 
19 |       - name: Setup python
20 |         uses: actions/setup-python@v2
21 | 
22 |       - name: Cache festival
23 |         uses: actions/cache@v2
24 |         id: cache-festival
25 |         with:
26 |           path: |
27 |             C:\festival
28 |             C:\speech_tools
29 |           key: ${{ runner.os }}-festival
30 | 
31 |       # espeak is not cached (only festival is), so it is installed on every run
32 |       - name: Install espeak
33 |         run: |
34 |           $source = 'https://github.com/espeak-ng/espeak-ng/releases/download/1.50/espeak-ng-20191129-b702b03-x64.msi'
35 |           Invoke-WebRequest -Uri $source -OutFile espeak.msi
36 |           Start-Process msiexec.exe -Wait -ArgumentList '/I espeak.msi /qn'
37 | 
38 |       # skipped when festival and speech_tools are restored from the cache
39 |       - name: Install festival
40 |         if: steps.cache-festival.outputs.cache-hit != 'true'
41 |         run: |
42 |           $uri = "https://sourceforge.net/projects/e-guidedog/files/related-third-party-software/0.3"
43 | 
44 |           $webclient = New-Object System.Net.WebClient
45 |           $webclient.DownloadFile("$uri" + "/festival-2.5-win.7z", "festival-2.5.7z")
46 |           $webclient.DownloadFile("$uri" + "/speech_tools-2.5-win.7z", "speech_tools-2.5.7z")
47 | 
48 |           set-alias sz "$env:ProgramFiles\7-Zip\7z.exe"
49 |           sz x -oC:\ festival-2.5.7z
50 |           sz x -oC:\ speech_tools-2.5.7z
51 | 
52 |       - name: Install phonemizer
53 |         run: |
54 |           pip install pytest
55 |           python setup.py install
56 | 
57 |       - name: Version phonemizer
58 |         run: |
59 |           phonemize --version
60 | 
61 |       - name: Test phonemizer
62 |         run: pytest -v
63 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.egg-info/
 2 | *.pyc
 3 | /.cache/*
 4 | /.pytest_cache/
 5 | /build/*
 6 | /dist/*
 7 | .coverage*
 8 | coverage.xml
 9 | htmlcov/*
10 | .eggs/*
11 | test/htmlcov


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # ChangeLog
  2 | 
  3 | Version numbers follow [semantic versioning](https://semver.org)
  4 | 
  5 | ## phonemizer-3.0
  6 | 
  7 | * **breaking change**
  8 | 
  9 |   * Do not remove empty lines from output. For example:
 10 | 
 11 |   ```python
 12 |   # this is now
 13 |   phonemize(["hello", "!??"]) == ['həloʊ ', '']
 14 |   # this was
 15 |   phonemize(["hello", "!??"]) == ['həloʊ ']
 16 |   ```
 17 | 
 18 |   * Default backend in the `phonemize` function is now `espeak` (was
 19 |     `festival`).
 20 | 
 21 |   * `espeak-mbrola` backend now requires `espeak>=1.49`.
 22 | 
 23 |   * `--espeak-path` option renamed as `--espeak-library` and
 24 |     `PHONEMIZER_ESPEAK_PATH` environment variable renamed as
 25 |     `PHONEMIZER_ESPEAK_LIBRARY`.
 26 | 
 27 |   * `--festival-path` option renamed as `--festival-executable` and
 28 |     `PHONEMIZER_FESTIVAL_PATH` environment variable renamed as
 29 |     `PHONEMIZER_FESTIVAL_EXECUTABLE`.
 30 | 
 31 |   * The methods `backend.phonemize()` from the backend classes take only a list
 32 |     of str as input text (was either a str or a list of str).
 33 | 
 34 |   * The methods `backend.version()` from the backend classes return a tuple of
 35 |     int instead of a str.
 36 | 
 37 | * **improvements**
 38 | 
 39 |   * `espeak` and `mbrola` backends now rely on the `espeak` shared library using
 40 |     the `ctypes` Python module, instead of relying on the `espeak` executable
 41 |     through subprocesses. This implies drastic speed improvements, up to 40 times
 42 |     faster.
 43 | 
 44 | * **new features**
 45 | 
 46 |   * New option `--prepend-text` to prepend the input text to phonemized
 47 |     utterances, so as to have both orthographic and phonemized available at
 48 |     output.
 49 | 
 50 |   * New option `--tie` for the `espeak` backend to display a tie character
 51 |     within multi-letter phonemes. (see issue
 52 |     [#74](https://github.com/bootphon/phonemizer/issues/74)).
 53 | 
 54 |   * New option `--words-mismatch` for the `espeak` backend. This allows
 55 |     detecting when espeak merges consecutive words or drops a word from the
 56 |     orthographic text. Possible actions are to ignore those mismatches, to issue
 57 |     a warning for each line where a mismatch is detected, or to remove those
 58 |     lines from the output.
 59 | 
 60 | * **bugfixes**
 61 | 
 62 |   * phonemizer's logger no longer conflicts with other loggers when imported from
 63 |     Python (see PR [#61](https://github.com/bootphon/phonemizer/pull/61)).
 64 | 
 65 | 
 66 | ## phonemizer-2.2.2
 67 | 
 68 | * **bugfixes**
 69 | 
 70 |   * fixed installation from source (bug introduced in 2.2.1, see
 71 |     issue [#52](https://github.com/bootphon/phonemizer/issues/52)).
 72 | 
 73 |   * Fixed a bug when trying to restore punctuation on an empty text (see issue
 74 |     [#54](https://github.com/bootphon/phonemizer/issues/54)).
 75 | 
 76 |   * Fixed an edge case bug when using custom punctuation marks (see issue
 77 |     [#55](https://github.com/bootphon/phonemizer/issues/55)).
 78 | 
 79 |   * Fixed regex issue that causes digits to be considered punctuation (see
 80 |     issue [#60](https://github.com/bootphon/phonemizer/pull/60)).
 81 | 
 82 | 
 83 | ## phonemizer-2.2.1
 84 | 
 85 | * **improvements**
 86 | 
 87 |   From Python import the phonemize function using `from phonemizer import
 88 |   phonemize` instead of `from phonemizer.phonemize import phonemize`. The
 89 |   second import is still available for compatibility.
 90 | 
 91 | * **bugfixes**
 92 | 
 93 |   * Fixed a minor bug in `utils.chunks`.
 94 | 
 95 |   * Fixed warnings on language switching for espeak backend when using parallel
 96 |     jobs (see issue [#50](https://github.com/bootphon/phonemizer/issues/50)).
 97 | 
 98 |   * Save file in utf-8 explicitly for Windows compat (see issue
 99 |     [#43](https://github.com/bootphon/phonemizer/issues/43)).
100 | 
101 |   * Fixed build and tests in Dockerfile (see issue
102 |     [#45](https://github.com/bootphon/phonemizer/issues/45)).
103 | 
104 | 
105 | ## phonemizer-2.2
106 | 
107 | * **new features**
108 | 
109 |   * New option ``--list-languages`` to list the available languages for a given
110 |     backend from the command line.
111 | 
112 |   * The ``--sampa`` option of the ``espeak`` backend has been replaced by a new
113 |     backend ``espeak-mbrola``.
114 | 
115 |     * The former ``--sampa`` option (introduced in phonemizer-2.0) outputs
116 |       phones that are not standard SAMPA but are adapted to the espeak TTS
117 |       front-end.
118 | 
119 |     * On the other hand the ``espeak-mbrola`` backend allows espeak to output
120 |       phones in standard SAMPA (adapted to the mbrola TTS front-end). This
121 |       backend requires mbrola to be installed, as well as additional mbrola
122 |       voices to support needed languages. **This backend does not support word
123 |       separation nor punctuation preservation**.
124 | 
125 | * **bugfixes**
126 | 
127 |   * Fixed issues with punctuation processing on some corner cases, see issues
128 |     [#39](https://github.com/bootphon/phonemizer/issues/39) and
129 |     [#40](https://github.com/bootphon/phonemizer/issues/40).
130 | 
131 |   * Improvements and updates in the documentation (Readme, ``phonemize --help``
132 |     and Python code).
133 | 
134 |   * Fixed a test when using ``espeak>=1.50``.
135 | 
136 |   * Empty lines are correctly ignored when reading text from a file.
137 | 
138 | 
139 | ## phonemizer-2.1
140 | 
141 | * **new features**
142 | 
143 |   * Possibility to preserve the punctuation (ignored and silently removed by
144 |     default) in the phonemized output with the new option
145 |     ``--preserve-punctuation`` from command line (or the equivalent
146 |     ``preserve-punctuation`` from Python API). With the ``punctuation-marks``
147 |     option, one can overload the default marks considered as punctuation.
148 | 
149 |   * It is now possible to specify the path to a custom ``espeak`` or
150 |     ``festival`` executable (for instance to use a local installation or to test
151 |     different versions). Either specify the ``PHONEMIZER_ESPEAK_PATH``
152 |     environment variable, the ``--espeak-path`` option from command line or use
153 |     the ``EspeakBackend.set_espeak_path`` method from the Python API. Similarly
154 |     for festival use ``PHONEMIZER_FESTIVAL_PATH``, ``--festival-path`` or
155 |     ``FestivalBackend.set_festival_path``.
156 | 
157 |   * The ``--sampa`` option is now available for espeak (was available only for
158 |     espeak-ng).
159 | 
160 |   * When using ``espeak`` with SAMPA output, some SAMPA phones are corrected to
161 |     correspond to the normalized SAMPA alphabet (espeak seems not to respect
162 |     it). The corrections are language specific. A correction file must be placed
163 |     in ``phonemizer/share/espeak``. This has been implemented only for French
164 |     so far.
165 | 
166 | * **bugfixes**
167 | 
168 |   * parses correctly the version of ``espeak-ng`` even for dev versions (e.g.
169 |     ``1.51-dev``).
170 | 
171 |   * fixed an issue with ``espeak`` backend, where multiple phone separators can be
172 |     present at the end of a word, see
173 |     [#31](https://github.com/bootphon/phonemizer/issues/31).
174 | 
175 |   * added an additional stress symbol ``-`` for ``espeak``.
176 | 
177 | 
178 | ## phonemizer-2.0.1
179 | 
180 | * **bugfixes**
181 | 
182 |   * ``keep-flags`` was not the default argument for ``language_switch`` in the
183 |     class ``EspeakBackend``.
184 | 
185 |   * fixed an issue with punctuation processing in the espeak backend, see
186 |     [#26](https://github.com/bootphon/phonemizer/issues/26)
187 | 
188 | * **improvements**
189 | 
190 |   * log a warning if using ``python2``.
191 | 
192 | 
193 | ## phonemizer-2.0
194 | 
195 | * **incompatible change**
196 | 
197 |   Starting with ``phonemizer-2.0`` only python3 is supported. **Compatibility
198 |   with python2 is no longer ensured nor tested.** https://pythonclock.org.
199 | 
200 | * **bugfixes**
201 | 
202 |   * new ``--language-switch`` option to use with ``espeak`` backend to deal
203 |     with language switching on phonemized output. In previous version there was
204 |     a bug in detection of the language switching flags (sometimes removed,
205 |     sometimes not). Now you can choose to keep the flags, to remove them, or to
206 |     delete the whole utterance.
207 | 
208 |   * bugfix in a test with `espeak>=1.49.3`.
209 | 
210 |   * bugfix using `NamedTemporaryFile` on windows, see
211 |     [#21](https://github.com/bootphon/phonemizer/issues/21).
212 | 
213 |   * bugfix when calling *festival* or *espeak* subprocesses on Windows, see
214 |     [#17](https://github.com/bootphon/phonemizer/issues/17).
215 | 
216 |   * bugfix in detecting recent versions of *espeak-ng*, see
217 |     [#18](https://github.com/bootphon/phonemizer/issues/18).
218 | 
219 |   * bugfix when using utf8 input on *espeak* backend (python2), see
220 |     [#19](https://github.com/bootphon/phonemizer/issues/19).
221 | 
222 | 
223 | * **new features and improvements**
224 | 
225 |   * new `--sampa` option to output phonemes in SAMPA alphabet instead of IPA,
226 |     available for espeak-ng only.
227 | 
228 |   * new ``--with-stress`` option to use with ``espeak`` backend to not remove the
229 |     stresses on phonemized output. For instance:
230 | 
231 |         $ echo "hello world" | phonemize
232 |         həloʊ wɜːld
233 |         $ echo "hello world" | phonemize --with-stress
234 |         həlˈoʊ wˈɜːld
235 | 
236 |   * improved logging: by default only warnings are displayed, use the new
237 |     ``--quiet`` option to inhibit all log messages or ``--verbose`` to see all of
238 |     them. Log messages now display level name (debug/info/warning).
239 | 
240 |   * improved code organization:
241 | 
242 |     * backends are now implemented in the ``backend`` submodule
243 |       as separated source files.
244 | 
245 |     * improved version string (displays uninstalled backends, moved outside of
246 |       main for use from Python).
247 | 
248 |     * improved logger implemented in its own module so as a call to phonemizer
249 |       from CLI or API yields the same log messages.
250 | 
251 | 
252 | ## phonemizer-1.0
253 | 
254 | * **incompatible changes**
255 | 
256 |   The following changes break the compatibility with previous versions
257 |   of phonemizer (0.X.Y):
258 | 
259 |   * command-line `phonemize` program: new `--backend
260 |     <espeak|festival|segments>` option, default language is now
261 |     *espeak en-us* (was *festival en-us*),
262 | 
263 |   * it is now illegal to have the same separator at different levels
264 |     (for instance a space for both word and phone),
265 | 
266 |   * from Python, must import the phonemize function as `from
267 |     phonemizer.phonemize import phonemize`, was `from phonemizer
268 |     import phonemize`.
269 | 
270 | * New backend [segments](https://github.com/cldf/segments) for
271 |   phonemization based on grapheme-to-phoneme mappings.
272 | 
273 | * Major refactoring of the backends implementation and separators (as
274 |   Python classes).
275 | 
276 | * Input to phonemizer now supports utf8.
277 | 
278 | * Better handling of errors (display of a meaningful message).
279 | 
280 | * Fixed a bug in fetching espeak version on macos, see
281 |   [#14](https://github.com/bootphon/phonemizer/issues/14).
282 | 
283 | ## phonemizer-0.3.3
284 | 
285 | * Fix a bug introduced in phonemizer-0.3.2 (apostrophes in festival
286 |   backend). See [#12](https://github.com/bootphon/phonemizer/issues/12).
287 | 
288 | 
289 | ## phonemizer-0.3.2
290 | 
291 | * Continuous integration with travis-ci.
292 | 
293 | * Support for docker.
294 | 
295 | * Better support for different versions of espeak/festival.
296 | 
297 | * Minor bugfixes and improved tests.
298 | 
299 | 
300 | ## phonemizer-0.3.1
301 | 
302 | * New espeak or espeak-ng backend with more than 100 languages.
303 | 
304 | * Support for Python 2.7 and 3.5.
305 | 
306 | * Integration with zenodo for citation.
307 | 
308 | * Various bugfixes and minor improvements.
309 | 
310 | 
311 | ## phonemizer-0.2
312 | 
313 | * First public release.
314 | 
315 | * Support for festival backend, American English only.
316 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to contribute?
 2 | 
 3 | We welcome and encourage every bug report, feature and pull request.
 4 | 
 5 | 
 6 | ## You have an issue with `phonemizer` or a feature request
 7 | 
 8 | Please open an [issue on github](https://github.com/bootphon/phonemizer/issues)
 9 | and follow the template from there.
10 | 
11 | 
12 | ## You want to contribute code
13 | 
14 | If you're willing to take it upon yourself to improve `phonemizer`, via
15 | bugfixes, improvements and new features, please follow these steps:
16 | 
17 | - Submit an issue explaining what you're willing to fix or add to this package.
18 |   We can discuss with you the best way to do it, considering the current
19 |   state of things.
20 | 
21 | - Fork the `phonemizer` repo, code away and open a pull-request. If you add some
22 |   code or change significantly a function, please test it by adding more unit
23 |   tests.
24 | 
25 | - Please conform to the following conventions:
26 | 
27 |     - Python code follows [PEP 8 style](https://pep8.org).
28 |     - Docstrings follow [Google
29 |       style](https://google.github.io/styleguide/pyguide.html#s3.8-comments-and-docstrings).
30 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Docker image of phonemizer, built on festival-2.5.0 and espeak-ng-1.50
 2 | # from the ubuntu repo. Build the image with:
 3 | #
 4 | #    sudo docker build -t phonemizer .
 5 | #
 6 | # and then open a bash session in a container with:
 7 | #
 8 | #    sudo docker run -it phonemizer /bin/bash
 9 | #
10 | # Phonemizer is then usable from within docker. See the docker doc for
11 | # advanced usage.
12 | 
13 | 
14 | # the parent image is an official Ubuntu
15 | FROM ubuntu:20.04
16 | 
17 | ENV LANG=C.UTF-8 \
18 |     LC_ALL=C.UTF-8
19 | 
20 | # every following instruction runs from /phonemizer
21 | WORKDIR /phonemizer
22 | 
23 | # system dependencies
24 | RUN apt-get update \
25 |     && apt-get upgrade -y \
26 |     && apt-get install -y \
27 |         espeak-ng \
28 |         festival \
29 |         festlex-cmu \
30 |         festlex-poslex \
31 |         festvox-us1 \
32 |         git \
33 |         mbrola \
34 |         mbrola-fr1 \
35 |         python3 \
36 |         python3-pip \
37 |     && apt-get clean
38 | 
39 | # a recent pytest is needed, so it is installed through pip
40 | RUN pip3 install pytest
41 | 
42 | # the tests expect the executable 'python', not 'python3'
43 | RUN ln -s /usr/bin/python3 /usr/bin/python
44 | 
45 | # copy the phonemizer code into the working directory of the image
46 | COPY . .
47 | 
48 | # install phonemizer and run the tests (WORKDIR is already /phonemizer)
49 | RUN python3 setup.py install \
50 |     && phonemize --version \
51 |     && python3 -m pytest -v test
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Linux](https://github.com/bootphon/phonemizer/actions/workflows/linux.yaml/badge.svg?branch=master)](
  2 | https://github.com/bootphon/phonemizer/actions/workflows/linux.yaml)
  3 | [![MacOS](https://github.com/bootphon/phonemizer/actions/workflows/macos.yaml/badge.svg?branch=master)](
  4 | https://github.com/bootphon/phonemizer/actions/workflows/macos.yaml)
  5 | [![Windows](https://github.com/bootphon/phonemizer/actions/workflows/windows.yaml/badge.svg?branch=master)](
  6 | https://github.com/bootphon/phonemizer/actions/workflows/windows.yaml)
  7 | [![Codecov](https://img.shields.io/codecov/c/github/bootphon/phonemizer)](
  8 | https://codecov.io/gh/bootphon/phonemizer) [![GitHub release (latest
  9 | SemVer)](https://img.shields.io/github/v/release/bootphon/phonemizer)](
 10 | https://github.com/bootphon/phonemizer/releases/latest)
 11 | [![DOI](https://zenodo.org/badge/56728069.svg)](
 12 | https://doi.org/10.5281/zenodo.1045825)
 13 | 
 14 | # Phonemizer -- *foʊnmaɪzɚ*
 15 | 
 16 | * The phonemizer allows simple phonemization of words and texts in many languages.
 17 | 
 18 | * Provides both the `phonemize` command-line tool and the Python function
 19 |   `phonemizer.phonemize`.
 20 | 
 21 | * It uses four backends: espeak, espeak-mbrola, festival and segments.
 22 | 
 23 |   * [espeak-ng](https://github.com/espeak-ng/espeak-ng) supports a lot of
 24 |     languages and IPA (International Phonetic Alphabet) output.
 25 | 
 26 |   * [espeak-ng-mbrola](https://github.com/espeak-ng/espeak-ng/blob/master/docs/mbrola.md)
 27 |     uses the SAMPA phonetic alphabet instead of IPA but does not preserve word
 28 |     boundaries.
 29 | 
 30 |   * [festival](http://www.cstr.ed.ac.uk/projects/festival) currently supports
 31 |     only American English. It uses a [custom
 32 |     phoneset](http://www.festvox.org/bsv/c4711.html), but it allows tokenization
 33 |     at the syllable level.
 34 | 
 35 |   * [segments](https://github.com/cldf/segments) is a Unicode tokenizer that
 36 |     builds a phonemization from a grapheme to phoneme mapping provided as a file
 37 |     by the user.
 38 | 
 39 | 
 40 | ## Installation
 41 | 
 42 | **You need python>=3.6.** If you really need to use python2, use an [older
 43 | version](https://github.com/bootphon/phonemizer/releases/tag/v1.0) of
 44 | the phonemizer.
 45 | 
 46 | 
 47 | ### Dependencies
 48 | 
 49 | * You need to install
 50 |   [festival](http://www.festvox.org/docs/manual-2.4.0/festival_6.html#Installation),
 51 |   [espeak-ng](https://github.com/espeak-ng/espeak-ng#espeak-ng-text-to-speech)
 52 |   and [mbrola](https://github.com/numediart/MBROLA) on your system. On
 53 |   Debian/Ubuntu simply run:
 54 | 
 55 |         $ sudo apt-get install festival espeak-ng mbrola
 56 | 
 57 | * When using the **espeak-mbrola** backend, additional mbrola voices must be
 58 |   installed (see
 59 |   [here](https://github.com/espeak-ng/espeak-ng/blob/master/docs/mbrola.md)). On
 60 |   Debian/Ubuntu, list the possible voices with `apt search mbrola`.
 61 | 
 62 | 
 63 | ### Phonemizer
 64 | 
 65 | * The simplest way is using pip:
 66 | 
 67 |         $ pip install phonemizer
 68 | 
 69 | * **OR** install it from sources with:
 70 | 
 71 |         $ git clone https://github.com/bootphon/phonemizer
 72 |         $ cd phonemizer
 73 |         $ [sudo] python setup.py install
 74 | 
 75 |   If you encounter an error such as `ImportError: No module named
 76 |   setuptools` during installation, refer to [issue
 77 |   11](https://github.com/bootphon/phonemizer/issues/11).
 78 | 
 79 | 
 80 | ### Docker image
 81 | 
 82 | Alternatively you can run the phonemizer within docker, using the
 83 | provided `Dockerfile`. To build the docker image, run:
 84 | 
 85 |     $ git clone https://github.com/bootphon/phonemizer
 86 |     $ cd phonemizer
 87 |     $ sudo docker build -t phonemizer .
 88 | 
 89 | Then run an interactive session with:
 90 | 
 91 |     $ sudo docker run -it phonemizer /bin/bash
 92 | 
 93 | 
 94 | ### Testing
 95 | 
 96 | When installed from sources or within a Docker image, you can run the test
 97 | suite from the root `phonemizer` folder (once you installed `pytest`):
 98 | 
 99 |     $ pip install pytest
100 |     $ pytest
101 | 
102 | 
103 | ## Python usage
104 | 
105 | In Python import the `phonemize` function with `from phonemizer import
106 | phonemize`. See
107 | [here](https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32)
108 | for function documentation.
109 | 
110 | 
111 | ## Command-line examples
112 | 
113 | **The examples below can be run from Python using the `phonemize` function**
114 | 
115 | 
116 | For a complete list of available options, run:
117 | 
118 |     $ phonemize --help
119 | 
120 | See the installed backends with the `--version` option:
121 | 
122 |     $ phonemize --version
123 |     phonemizer-3.0
124 |     available backends: espeak-ng-1.50, espeak-mbrola, festival-2.5.0, segments-2.1.3
125 | 
126 | 
127 | ### Input/output examples
128 | 
129 | * from stdin to stdout:
130 | 
131 |         $ echo "hello world" | phonemize
132 |         həloʊ wɜːld
133 | 
134 | * Prepend the input text to output:
135 | 
136 |         $ echo "hello world" | phonemize --prepend-text
137 |         hello world | həloʊ wɜːld
138 |         $ echo "hello world" | phonemize --prepend-text=';'
139 |         hello world ; həloʊ wɜːld
140 | 
141 | * from file to stdout
142 | 
143 |         $ echo "hello world" > hello.txt
144 |         $ phonemize hello.txt
145 |         həloʊ wɜːld
146 | 
147 | * from file to file
148 | 
149 |         $ phonemize hello.txt -o hello.phon --strip
150 |         $ cat hello.phon
151 |         həloʊ wɜːld
152 | 
153 | 
154 | ### Backends
155 | 
156 | * The default is to use **espeak** us-english:
157 | 
158 |         $ echo "hello world" | phonemize
159 |         həloʊ wɜːld
160 |         $ echo "hello world" | phonemize -l en-us -b espeak
161 |         həloʊ wɜːld
162 |         $ echo 'hello world' | phonemize -l en-us -b espeak --tie
163 |         həlo͡ʊ wɜːld
164 | 
165 | * Use **festival** US English instead
166 | 
167 |         $ echo "hello world" | phonemize -l en-us -b festival
168 |         hhaxlow werld
169 | 
170 | * In French, using **espeak** and **espeak-mbrola**, with custom token
171 |   separators (see below). **espeak-mbrola** does not support words separation.
172 | 
173 |         $ echo "bonjour le monde" | phonemize -b espeak -l fr-fr -p ' ' -w '/w '
174 |         b ɔ̃ ʒ u ʁ /w l ə /w m ɔ̃ d /w
175 |         $ echo "bonjour le monde" | phonemize -b espeak-mbrola -l mb-fr1 -p ' ' -w '/w '
176 |         b o~ Z u R l @ m o~ d
177 | 
178 | * In Japanese, using **segments**
179 | 
180 |         $ echo 'konnichiwa' | phonemize -b segments -l japanese
181 |         konnitʃiwa
182 |         $ echo 'konnichiwa' | phonemize -b segments -l ./phonemizer/share/segments/japanese.g2p
183 |         konnitʃiwa
184 | 
185 | 
186 | ### Supported languages
187 | 
188 | The exhaustive list of supported languages is available with the command
189 | `phonemize --list-languages [--backend <backend>]`.
190 | 
191 | * Languages supported by **espeak** are available
192 |   [here](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md).
193 | 
194 | * Languages supported by **espeak-mbrola** are available
195 |   [here](https://github.com/numediart/MBROLA-voices). Please note that the
196 |   mbrola voices are not bundled with the phonemizer and must be installed
197 |   separately.
198 | 
199 | * Languages supported by **festival** are:
200 | 
201 |         en-us -> english-us
202 | 
203 | * Languages supported by the **segments** backend are:
204 | 
205 |         chintang  -> ./phonemizer/share/segments/chintang.g2p
206 |         cree      -> ./phonemizer/share/segments/cree.g2p
207 |         inuktitut -> ./phonemizer/share/segments/inuktitut.g2p
208 |         japanese  -> ./phonemizer/share/segments/japanese.g2p
209 |         sesotho   -> ./phonemizer/share/segments/sesotho.g2p
210 |         yucatec   -> ./phonemizer/share/segments/yucatec.g2p
211 | 
212 |   Instead of a language you can also provide a file specifying a
213 |   grapheme to phone mapping (see the files above for examples).
214 | 
215 | 
216 | ### Token separators
217 | 
218 | You can specify separators for phones, syllables (**festival** only) and
219 | words (except **espeak-mbrola**).
220 | 
221 |     $ echo "hello world" | phonemize -b festival -w ' ' -p ''
222 |     hhaxlow werld
223 | 
224 |     $ echo "hello world" | phonemize -b festival -p ' ' -w ''
225 |     hh ax l ow w er l d
226 | 
227 |     $ echo "hello world" | phonemize -b festival -p '-' -s '|'
228 |     hh-ax-l-|ow-| w-er-l-d-|
229 | 
230 |     $ echo "hello world" | phonemize -b festival -p '-' -s '|' --strip
231 |     hh-ax-l|ow w-er-l-d
232 | 
233 |     $ echo "hello world" | phonemize -b festival -p ' ' -s ';esyll ' -w ';eword '
234 |     hh ax l ;esyll ow ;esyll ;eword w er l d ;esyll ;eword
235 | 
236 | You cannot specify the same separator for several tokens (for instance
237 | a space for both phones and words):
238 | 
239 |     $ echo "hello world" | phonemize -b festival -p ' ' -w ' '
240 |     fatal error: illegal separator with word=" ", syllable="" and phone=" ",
241 |     must be all differents if not empty
242 | 
243 | 
244 | ### Punctuation
245 | 
246 | By default the punctuation is removed in the phonemized output. You can preserve
247 | it using the ``--preserve-punctuation`` option (not supported by the
248 | **espeak-mbrola** backend):
249 | 
250 |     $ echo "hello, world!" | phonemize --strip
251 |     həloʊ wɜːld
252 | 
253 |     $ echo "hello, world!" | phonemize --preserve-punctuation --strip
254 |     həloʊ, wɜːld!
255 | 
256 | 
257 | ### Espeak specific options
258 | 
259 | * The **espeak** backend can output the stresses on phones:
260 | 
261 |         $ echo "hello world" | phonemize -l en-us -b espeak --with-stress
262 |         həlˈoʊ wˈɜːld
263 | 
264 | * The **espeak** backend can add tie on multi-characters phonemes:
265 | 
266 |         $ echo "hello world" | phonemize -l en-us -b espeak --tie
267 |         həlo͡ʊ wɜːld
268 | 
269 | * The **espeak** backend can switch languages during phonemization (below from
270 |   French to English), use the ``--language-switch`` option to deal with it:
271 | 
272 |         $ echo "j'aime le football" | phonemize -l fr-fr -b espeak --language-switch keep-flags
273 |         [WARNING] fount 1 utterances containing language switches on lines 1
274 |         [WARNING] extra phones may appear in the "fr-fr" phoneset
275 |         [WARNING] language switch flags have been kept (applying "keep-flags" policy)
276 |         ʒɛm lə- (en)fʊtbɔːl(fr)
277 | 
278 |         $ echo "j'aime le football" | phonemize -l fr-fr -b espeak --language-switch remove-flags
279 |         [WARNING] fount 1 utterances containing language switches on lines 1
280 |         [WARNING] extra phones may appear in the "fr-fr" phoneset
281 |         [WARNING] language switch flags have been removed (applying "remove-flags" policy)
282 |         ʒɛm lə- fʊtbɔːl
283 | 
284 |         $ echo "j'aime le football" | phonemize -l fr-fr -b espeak --language-switch remove-utterance
285 |         [WARNING] removed 1 utterances containing language switches (applying "remove-utterance" policy)
286 | 
287 | 
288 | * The **espeak** backend sometimes merges words together in the output, use the
289 |   `--words-mismatch` option to deal with it:
290 | 
291 |         $ echo "that's it, words are merged" | phonemize -l en-us -b espeak
292 |         [WARNING] words count mismatch on 100.0% of the lines (1/1)
293 |         ðætsɪt wɜːdz ɑːɹ mɜːdʒd
294 | 
295 | 
296 | ## Licence
297 | 
298 | **Copyright 2015-2021 Mathieu Bernard**
299 | 
300 | This program is free software: you can redistribute it and/or modify
301 | it under the terms of the GNU General Public License as published by
302 | the Free Software Foundation, either version 3 of the License, or
303 | (at your option) any later version.
304 | 
305 | This program is distributed in the hope that it will be useful,
306 | but WITHOUT ANY WARRANTY; without even the implied warranty of
307 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
308 | GNU General Public License for more details.
309 | 
310 | You should have received a copy of the GNU General Public License
311 | along with this program. If not, see <http://www.gnu.org/licenses/>.
312 | 


--------------------------------------------------------------------------------
/phonemizer/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonologizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonologizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonologizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Multilingual text to phones converter"""
16 | 
# Version string of the phonemizer package.
__version__ = "3.0"
# NOTE(review): the purpose of this flag is not visible from this module --
# confirm with its consumers. Also note that the string literal just below
# reads as documentation for `__version__` although it follows this flag.
__RESEMBLE__ = True
"""Phonemizer version"""


# Detect whether the package is being imported by its own build process.
try:  # pragma: nocover
    # This variable is injected in the __builtins__ by the build process. In
    # that case we don't want to import phonemize as there are missing
    # dependencies. Merely evaluating the name raises NameError when it has
    # not been injected.
    __PHONEMIZER_SETUP__
except NameError:
    # regular import: the build process did not define the flag
    __PHONEMIZER_SETUP__ = False


if __PHONEMIZER_SETUP__:  # pragma: nocover
    # build-time import: only report it on stderr, `phonemize` is not exposed
    import sys
    sys.stderr.write(
        'Partial import of phonemizer during the build process.\n')
else:
    # regular import: expose `phonemize` at the package top level
    # pylint: disable=unused-import
    from .phonemize import phonemize
38 | 


--------------------------------------------------------------------------------
/phonemizer/backend/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonologizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonologizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonologizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Multilingual text to phonemes converter"""
16 | 
17 | # pylint: disable=unused-import
18 | 
19 | from .espeak.espeak import EspeakBackend
20 | from .espeak.mbrola import EspeakMbrolaBackend
21 | from .festival.festival import FestivalBackend
22 | from .segments import SegmentsBackend
23 | 
24 | 
# Each backend class is registered under the name it reports through its own
# `name()` method.
BACKENDS = {
    backend.name(): backend
    for backend in (
        EspeakBackend,
        FestivalBackend,
        SegmentsBackend,
        EspeakMbrolaBackend,
    )
}
"""The different phonemization backends as a mapping (name, class)"""
28 | 


--------------------------------------------------------------------------------
/phonemizer/backend/base.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Abstract base class for phonemization backends"""
 16 | 
 17 | import abc
 18 | import itertools
 19 | import joblib
 20 | 
 21 | from phonemizer.separator import default_separator
 22 | from phonemizer.logger import get_logger
 23 | from phonemizer.punctuation import Punctuation
 24 | from phonemizer.utils import chunks
 25 | 
 26 | 
 27 | class BaseBackend(abc.ABC):
 28 |     """Abstract base class of all the phonemization backends
 29 | 
 30 |     Provides a common interface to all backends. The central method is
 31 |     `phonemize()`
 32 | 
 33 |     Parameters
 34 |     ----------
 35 |     language (str): The language code of the input text, must be supported by
 36 |       the backend. If `backend` is 'segments', the language can be a file with
 37 |       a grapheme to phoneme mapping.
 38 | 
 39 |     preserve_punctuation (bool): When True, will keep the punctuation in the
 40 |       phonemized output. Not supported by the 'espeak-mbrola' backend. Default
 41 |       to False and remove all the punctuation.
 42 | 
 43 |     punctuation_marks (str): The punctuation marks to consider when dealing
 44 |       with punctuation, either for removal or preservation. Default to
 45 |       Punctuation.default_marks().
 46 | 
 47 |     logger (logging.Logger): the logging instance where to send
 48 |       messages. If not specified, use the default system logger.
 49 | 
 50 |     Raises
 51 |     ------
 52 |     RuntimeError if the backend is not available of if the `language` cannot be
 53 |     initialized.
 54 | 
 55 |     """
 56 |     def __init__(self, language,
 57 |                  punctuation_marks=Punctuation.default_marks(),
 58 |                  preserve_punctuation=False,
 59 |                  logger=get_logger()):
 60 |         # ensure the backend is installed on the system
 61 |         if not self.is_available():
 62 |             raise RuntimeError(  # pragma: nocover
 63 |                 '{} not installed on your system'.format(self.name()))
 64 | 
 65 |         self._logger = logger
 66 |         self._logger.info(
 67 |             'initializing backend %s-%s',
 68 |             self.name(), '.'.join(str(v) for v in self.version()))
 69 | 
 70 |         # ensure the backend support the requested language
 71 |         self._language = self._init_language(language)
 72 | 
 73 |         # setup punctuation processing
 74 |         self._preserve_punctuation = preserve_punctuation
 75 |         self._punctuator = Punctuation(punctuation_marks)
 76 | 
 77 |     @classmethod
 78 |     def _init_language(cls, language):
 79 |         """Language initialization
 80 | 
 81 |         This method may be overloaded in child classes (see Segments backend)
 82 | 
 83 |         """
 84 |         if not cls.is_supported_language(language):
 85 |             raise RuntimeError(
 86 |                 f'language "{language}" is not supported by the '
 87 |                 f'{cls.name()} backend')
 88 |         return language
 89 | 
 90 |     @property
 91 |     def logger(self):
 92 |         """A logging.Logger instance where to send messages"""
 93 |         return self._logger
 94 | 
 95 |     @property
 96 |     def language(self):
 97 |         """The language code configured to be used for phonemization"""
 98 |         return self._language
 99 | 
100 |     @staticmethod
101 |     @abc.abstractmethod
102 |     def name():
103 |         """The name of the backend"""
104 | 
105 |     @classmethod
106 |     @abc.abstractmethod
107 |     def is_available(cls):
108 |         """Returns True if the backend is installed, False otherwise"""
109 | 
110 |     @classmethod
111 |     @abc.abstractmethod
112 |     def version(cls):
113 |         """Return the backend version as a tuple (major, minor, patch)"""
114 | 
115 |     @staticmethod
116 |     @abc.abstractmethod
117 |     def supported_languages():
118 |         """Return a dict of language codes -> name supported by the backend"""
119 | 
120 |     @classmethod
121 |     def is_supported_language(cls, language):
122 |         """Returns True if `language` is supported by the backend"""
123 |         return language in cls.supported_languages()
124 | 
125 |     def phonemize(self, text, separator=default_separator,
126 |                   strip=False, njobs=1):
127 |         """Returns the `text` phonemized for the given language
128 | 
129 |         Parameters
130 |         ----------
131 |         text (list of str): The text to be phonemized. Each string in the list
132 |           is considered as a separated line. Each line is considered as a text
133 |           utterance. Any empty utterance will be ignored.
134 | 
135 |         separator (Separator): string separators between phonemes, syllables
136 |           and words, default to separator.default_separator. Syllable separator
137 |           is considered only for the festival backend. Word separator is
138 |           ignored by the 'espeak-mbrola' backend.
139 | 
140 |         strip (bool): If True, don't output the last word and phone separators
141 |           of a token, default to False.
142 | 
143 |         njobs (int): The number of parallel jobs to launch. The input text is
144 |           split in `njobs` parts, phonemized on parallel instances of the
145 |           backend and the outputs are finally collapsed.
146 | 
147 |         Returns
148 |         -------
149 |         phonemized text (list of str) : The input `text` phonemized for the
150 |           given `language` and `backend`.
151 | 
152 |         Raises
153 |         ------
154 |         RuntimeError if something went wrong during the phonemization
155 | 
156 |         """
157 |         if isinstance(text, str):
158 |             # changed in phonemizer-3.0, warn the user
159 |             self.logger.error(
160 |                 'input text to phonemize() is str but it must be list')
161 | 
162 |         text, punctuation_marks = self._phonemize_preprocess(text)
163 | 
164 |         if njobs == 1:
165 |             # phonemize the text forced as a string
166 |             phonemized = self._phonemize_aux(text, 0, separator, strip)
167 |         else:
168 |             # If using parallel jobs, disable the log as stderr is not
169 |             # picklable.
170 |             self.logger.info('running %s on %s jobs', self.name(), njobs)
171 | 
172 |             # we have here a list of phonemized chunks
173 |             phonemized = joblib.Parallel(n_jobs=njobs)(
174 |                 joblib.delayed(self._phonemize_aux)(
175 |                     # chunk[0] is the text, chunk[1] is the offset
176 |                     chunk[0], chunk[1], separator, strip)
177 |                 for chunk in zip(*chunks(text, njobs)))
178 | 
179 |             # flatten them in a single list
180 |             phonemized = self._flatten(phonemized)
181 | 
182 |         return self._phonemize_postprocess(phonemized, punctuation_marks)
183 | 
184 |     @staticmethod
185 |     def _flatten(phonemized):
186 |         """Flatten a list of lists into a single one
187 | 
188 |         From [[1, 2], [3], [4]] returns [1, 2, 3, 4]. This method is used to
189 |         format the output as obtained using multiple jobs.
190 | 
191 |         """
192 |         return list(itertools.chain(*phonemized))
193 | 
194 |     @abc.abstractmethod
195 |     def _phonemize_aux(self, text, offset, separator, strip):
196 |         """The "concrete" phonemization method
197 | 
198 |         Must be implemented in child classes. `separator` and `strip`
199 |         parameters are as given to the phonemize() method. `text` is as
200 |         returned by _phonemize_preprocess(). `offset` is line number of the
201 |         first line in `text` with respect to the original text (this is only
202 |         usefull with running on chunks in multiple jobs. When using a single
203 |         jobs the offset is 0).
204 | 
205 |         """
206 | 
207 |     def _phonemize_preprocess(self, text):
208 |         """Preprocess the text before phonemization
209 | 
210 |         Removes the punctuation (keep trace of punctuation marks for further
211 |         restoration if required by the `preserve_punctuation` option).
212 | 
213 |         """
214 |         if self._preserve_punctuation:
215 |             # a tuple (text, punctuation marks)
216 |             return self._punctuator.preserve(text)
217 |         return self._punctuator.remove(text), []
218 | 
219 |     def _phonemize_postprocess(self, phonemized, punctuation_marks):
220 |         """Postprocess the raw phonemized output
221 | 
222 |         Restores the punctuation as needed.
223 | 
224 |         """
225 |         if self._preserve_punctuation:
226 |             return self._punctuator.restore(phonemized, punctuation_marks)
227 |         return phonemized
228 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonologizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonologizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonologizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Phonemizer module for espeak backend implementation"""
16 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/api.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Low-level bindings to the espeak API"""
 16 | 
 17 | import atexit
 18 | import ctypes
 19 | import pathlib
 20 | import shutil
 21 | import sys
 22 | import tempfile
 23 | import weakref
 24 | 
 25 | from phonemizer.backend.espeak.voice import EspeakVoice
 26 | 
 27 | if sys.platform != 'win32':
 28 |     # cause a crash on Windows
 29 |     import dlinfo
 30 | 
 31 | 
 32 | class EspeakAPI:
 33 |     """Exposes the espeak API to the EspeakWrapper
 34 | 
 35 |     This class exposes only low-level bindings to the API and should not be
 36 |     used directly.
 37 | 
 38 |     """
    def __init__(self, library):
        """Loads a private copy of the espeak `library` and initializes it

        The library is first loaded from `library` to find its path on disk,
        then copied to a fresh temporary directory, and the copy is the one
        loaded and initialized for this instance.

        Raises a RuntimeError if the library cannot be loaded or initialized.

        """
        # set to None to avoid an AttributeError in _delete if the __init__
        # method raises, will be properly initialized below
        self._library = None

        # Because the library is not designed to be wrapped nor to be used in
        # multithreaded/multiprocess contexts (massive use of global variables)
        # we need a copy of the original library for each instance of the
        # wrapper... (see "man dlopen" on Linux/MacOS: we cannot load two times
        # the same library because a reference is then returned by dlopen). The
        # tweak is therefore to make a copy of the original library in a
        # different (temporary) directory.
        try:
            # load the original library in order to retrieve its full path.
            # Forced as str as it is required on Windows.
            espeak = ctypes.cdll.LoadLibrary(str(library))
            library_path = self._shared_library_path(espeak)
            del espeak
        except OSError as error:
            raise RuntimeError(
                f'failed to load espeak library: {str(error)}') from None

        # directory receiving the library copy, removed by _delete()
        self._tempdir = tempfile.mkdtemp()

        # properly exit when the wrapper object is destroyed (see
        # https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods).
        # But... weakref implementation does not work on windows so we register
        # the cleanup with atexit. This means that, on Windows, all the
        # temporary directories created by EspeakAPI instances will remain on
        # disk until the Python process exit.
        #
        # NOTE(review): at this point self._library is still None, and this is
        # the value captured by weakref.finalize. On non-Windows platforms
        # _delete() is therefore called with library=None: its
        # espeak_Terminate() call is skipped and only the temporary directory
        # is removed. Confirm this is intended before changing the order.
        if sys.platform == 'win32':  # pragma: nocover
            atexit.register(self._delete_win32)
        else:
            weakref.finalize(self, self._delete, self._library, self._tempdir)

        # copy the library file, under its original name, to the tempdir
        espeak_copy = pathlib.Path(self._tempdir) / library_path.name
        shutil.copy(library_path, espeak_copy, follow_symlinks=False)

        # finally load the library copy and initialize it. 0x02 is
        # AUDIO_OUTPUT_SYNCHRONOUS in the espeak API
        self._library = ctypes.cdll.LoadLibrary(str(espeak_copy))
        try:
            if self._library.espeak_Initialize(0x02, 0, None, 0) <= 0:
                raise RuntimeError(  # pragma: nocover
                    'failed to initialize espeak shared library')
        except AttributeError:  # pragma: nocover
            # the loaded library does not expose espeak_Initialize
            raise RuntimeError(
                'failed to load espeak library') from None

        # the path to the original one (the copy is considered an
        # implementation detail and is not exposed)
        self._library_path = library_path
 92 | 
    def _delete_win32(self):  # pragma: nocover
        """Cleanup proxy registered with atexit on Windows

        Forwards the current library and temporary directory of the instance
        to the _delete() static method.

        """
        # Windows does not support static methods with ctypes libraries
        # (library == None) so we use a proxy method...
        self._delete(self._library, self._tempdir)
 97 | 
 98 |     @staticmethod
 99 |     def _delete(library, tempdir):
100 |         try:
101 |             # clean up the espeak library allocated memory
102 |             library.espeak_Terminate()
103 |         except AttributeError:  # library not loaded
104 |             pass
105 | 
106 |         # on Windows it is required to unload the library or the .dll file
107 |         # cannot be erased from the temporary directory
108 |         if sys.platform == 'win32':  # pragma: nocover
109 |             # pylint: disable=import-outside-toplevel
110 |             # pylint: disable=protected-access
111 |             # pylint: disable=no-member
112 |             import _ctypes
113 |             _ctypes.FreeLibrary(library._handle)
114 | 
115 |         # clean up the tempdir containing the copy of the library
116 |         shutil.rmtree(tempdir)
117 | 
    @property
    def library_path(self):
        """Absolute path to the espeak library being in use

        This is the path of the original library, as stored by the
        constructor, and not the path of the copy loaded from the temporary
        directory.

        """
        return self._library_path
122 | 
123 |     @staticmethod
124 |     def _shared_library_path(library):
125 |         """Returns the absolute path to `library`
126 | 
127 |         This function is cross-platform and works for Linux, MacOS and Windows.
128 |         Raises a RuntimeError if the library path cannot be retrieved
129 | 
130 |         """
131 |         # pylint: disable=protected-access
132 |         path = pathlib.Path(library._name).resolve()
133 |         if path.is_file():
134 |             return path
135 | 
136 |         try:
137 |             # Linux or MacOS only, ImportError on Windows
138 |             return pathlib.Path(dlinfo.DLInfo(library).path).resolve()
139 |         except (Exception, ImportError):  # pragma: nocover
140 |             raise RuntimeError(
141 |                 f'failed to retrieve the path to {library} library') from None
142 | 
143 |     def info(self):
144 |         """Bindings to espeak_Info
145 | 
146 |         Returns
147 |         -------
148 |         version, data_path: encoded strings containing the espeak version
149 |             number and data path respectively
150 | 
151 |         """
152 |         f_info = self._library.espeak_Info
153 |         f_info.restype = ctypes.c_char_p
154 |         data_path = ctypes.c_char_p()
155 |         version = f_info(ctypes.byref(data_path))
156 |         return version, data_path.value
157 | 
158 |     def list_voices(self, name):
159 |         """Bindings to espeak_ListVoices
160 | 
161 |         Parameters
162 |         ----------
163 |         name (str or None): if specified, a filter on voices to be listed
164 | 
165 |         Returns
166 |         -------
167 |         voices: a pointer to EspeakVoice.Struct instances
168 | 
169 |         """
170 |         f_list_voices = self._library.espeak_ListVoices
171 |         f_list_voices.argtypes = [ctypes.POINTER(EspeakVoice.Struct)]
172 |         f_list_voices.restype = ctypes.POINTER(
173 |             ctypes.POINTER(EspeakVoice.Struct))
174 |         return f_list_voices(name)
175 | 
176 |     def set_voice_by_name(self, name):
177 |         """Bindings to espeak_SetVoiceByName
178 | 
179 |         Parameters
180 |         ----------
181 |         name (str) : the voice name to setup
182 | 
183 |         Returns
184 |         -------
185 |         0 on success, non-zero integer on failure
186 | 
187 |         """
188 |         f_set_voice_by_name = self._library.espeak_SetVoiceByName
189 |         f_set_voice_by_name.argtypes = [ctypes.c_char_p]
190 |         return f_set_voice_by_name(name)
191 | 
192 |     def get_current_voice(self):
193 |         """Bindings to espeak_GetCurrentVoice
194 | 
195 |         Returns
196 |         -------
197 |         a EspeakVoice.Struct instance or None if no voice has been setup
198 | 
199 |         """
200 |         f_get_current_voice = self._library.espeak_GetCurrentVoice
201 |         f_get_current_voice.restype = ctypes.POINTER(EspeakVoice.Struct)
202 |         return f_get_current_voice().contents
203 | 
204 |     def text_to_phonemes(self, text_ptr, text_mode, phonemes_mode):
205 |         """Bindings to espeak_TextToPhonemes
206 | 
207 |         Parameters
208 |         ----------
209 |         text_ptr (pointer): the text to be phonemized, as a pointer to a
210 |             pointer of chars
211 |         text_mode (bits field): see espeak sources for details
212 |         phonemes_mode (bits field): see espeak sources for details
213 | 
214 |         Returns
215 |         -------
216 |         an encoded string containing the computed phonemes
217 | 
218 |         """
219 |         f_text_to_phonemes = self._library.espeak_TextToPhonemes
220 |         f_text_to_phonemes.restype = ctypes.c_char_p
221 |         f_text_to_phonemes.argtypes = [
222 |             ctypes.POINTER(ctypes.c_char_p),
223 |             ctypes.c_int,
224 |             ctypes.c_int]
225 |         return f_text_to_phonemes(text_ptr, text_mode, phonemes_mode)
226 | 
227 |     def set_phoneme_trace(self, mode, file_pointer):
228 |         """"Bindings on espeak_SetPhonemeTrace
229 | 
230 |         This method must be called before any call to synthetize()
231 | 
232 |         Parameters
233 |         ----------
234 |         mode (bits field): see espeak sources for details
235 |         file_pointer (FILE*): a pointer to an opened file in which to output
236 |             the phoneme trace
237 | 
238 |         """
239 |         f_set_phoneme_trace = self._library.espeak_SetPhonemeTrace
240 |         f_set_phoneme_trace.argtypes = [
241 |             ctypes.c_int,
242 |             ctypes.c_void_p]
243 |         f_set_phoneme_trace(mode, file_pointer)
244 | 
245 |     def synthetize(self, text_ptr, size, mode):
246 |         """Bindings on espeak_Synth
247 | 
248 |         The output phonemes are sent to the file specified by a call to
249 |         set_phoneme_trace().
250 | 
251 |         Parameters
252 |         ----------
253 |         text (pointer) : a pointer to chars
254 |         size (int) : number of chars in `text`
255 |         mode (bits field) : see espeak sources for details
256 | 
257 |         Returns
258 |         -------
259 |         0 on success, non-zero integer on failure
260 | 
261 |         """
262 |         f_synthetize = self._library.espeak_Synth
263 |         f_synthetize.argtypes = [
264 |             ctypes.c_void_p,
265 |             ctypes.c_size_t,
266 |             ctypes.c_uint,
267 |             ctypes.c_int,  # position_type
268 |             ctypes.c_uint,
269 |             ctypes.POINTER(ctypes.c_uint),
270 |             ctypes.c_void_p]
271 |         return f_synthetize(text_ptr, size, 0, 1, 0, mode, None, None)
272 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/base.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Base class of espeak backends for the phonemizer"""
 16 | 
 17 | import abc
 18 | 
 19 | from phonemizer.backend.base import BaseBackend
 20 | from phonemizer.backend.espeak.wrapper import EspeakWrapper
 21 | from phonemizer.logger import get_logger
 22 | from phonemizer.punctuation import Punctuation
 23 | 
 24 | 
 25 | class BaseEspeakBackend(BaseBackend):
 26 |     """Abstract espeak backend for the phonemizer
 27 | 
 28 |     Base class of the concrete backends Espeak and EspeakMbrola. It provides
 29 |     facilities to find espeak library and read espeak version.
 30 | 
 31 |     """
 32 |     def __init__(self, language,
 33 |                  punctuation_marks=Punctuation.default_marks(),
 34 |                  preserve_punctuation=False,
 35 |                  logger=get_logger()):
 36 |         self._espeak = EspeakWrapper()
 37 |         logger.debug('loaded %s', self._espeak.library_path)
 38 | 
 39 |         super().__init__(
 40 |             language,
 41 |             punctuation_marks=punctuation_marks,
 42 |             preserve_punctuation=preserve_punctuation,
 43 |             logger=logger)
 44 | 
 45 |     @classmethod
 46 |     def set_library(cls, library):
 47 |         """Sets the espeak backend to use `library`
 48 | 
 49 |         If this is not set, the backend uses the default espeak shared library
 50 |         from the system installation.
 51 | 
 52 |         Parameters
 53 |         ----------
 54 |         library (str or None) : the path to the espeak shared library to use as
 55 |             backend. Set `library` to None to restore the default.
 56 | 
 57 |         """
 58 |         EspeakWrapper.set_library(library)
 59 | 
 60 |     @classmethod
 61 |     def library(cls):
 62 |         """Returns the espeak library used as backend
 63 | 
 64 |         The following precedence rule applies for library lookup:
 65 | 
 66 |         1. As specified by BaseEspeakBackend.set_library()
 67 |         2. Or as specified by the environment variable
 68 |            PHONEMIZER_ESPEAK_LIBRARY
 69 |         3. Or the default espeak library found on the system
 70 | 
 71 |         Raises
 72 |         ------
 73 |         RuntimeError if the espeak library cannot be found or if the
 74 |             environment variable PHONEMIZER_ESPEAK_LIBRARY is set to a
 75 |             non-readable file
 76 | 
 77 |         """
 78 |         return EspeakWrapper.library()
 79 | 
 80 |     @classmethod
 81 |     def is_available(cls):
 82 |         try:
 83 |             EspeakWrapper()
 84 |         except RuntimeError:  # pragma: nocover
 85 |             return False
 86 |         return True
 87 | 
 88 |     @classmethod
 89 |     def is_espeak_ng(cls):
 90 |         """Returns True if using espeak-ng, False otherwise"""
 91 |         # espeak-ng starts with version 1.49
 92 |         return cls.version() >= (1, 49)
 93 | 
 94 |     @classmethod
 95 |     def version(cls):
 96 |         """Espeak version as a tuple (major, minor, patch)
 97 | 
 98 |         Raises
 99 |         ------
100 |         RuntimeError if BaseEspeakBackend.is_available() is False or if the
101 |             version cannot be extracted for some reason.
102 | 
103 |         """
104 |         return EspeakWrapper().version
105 | 
106 |     @abc.abstractmethod
107 |     def _postprocess_line(self, line, num, separator, strip):
108 |         pass
109 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/espeak.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Espeak backend for the phonemizer"""
 16 | 
 17 | import itertools
 18 | import re
 19 | 
 20 | from phonemizer.backend.espeak.base import BaseEspeakBackend
 21 | from phonemizer.backend.espeak.wrapper import EspeakWrapper
 22 | from phonemizer.backend.espeak.language_switch import (
 23 |     get_language_switch_processor)
 24 | from phonemizer.backend.espeak.words_mismatch import (
 25 |     get_words_mismatch_processor)
 26 | from phonemizer.logger import get_logger
 27 | from phonemizer.punctuation import Punctuation
 28 | 
 29 | 
 30 | class EspeakBackend(BaseEspeakBackend):
 31 |     """Espeak backend for the phonemizer"""
 32 |     # a regular expression to find phonemes stresses in espeak output
 33 |     _ESPEAK_STRESS_RE = re.compile(r"[ˈˌ'-]+")
 34 | 
 35 |     # pylint: disable=too-many-arguments
 36 |     def __init__(self, language,
 37 |                  punctuation_marks=Punctuation.default_marks(),
 38 |                  preserve_punctuation=False,
 39 |                  with_stress=False,
 40 |                  tie=False,
 41 |                  language_switch='keep-flags',
 42 |                  words_mismatch='ignore',
 43 |                  logger=get_logger()):
 44 |         super().__init__(
 45 |             language, punctuation_marks=punctuation_marks,
 46 |             preserve_punctuation=preserve_punctuation, logger=logger)
 47 | 
 48 |         self._espeak.set_voice(language)
 49 |         self._with_stress = with_stress
 50 |         self._tie = self._init_tie(tie)
 51 |         self._lang_switch = get_language_switch_processor(
 52 |             language_switch, self.logger, self.language)
 53 |         # self._words_mismatch = get_words_mismatch_processor(
 54 |         #     words_mismatch, self.logger)
 55 | 
 56 |     @staticmethod
 57 |     def _init_tie(tie):
 58 |         if not tie:
 59 |             return False
 60 | 
 61 |         if tie is True:  # default U+361 tie character
 62 |             return '͡'
 63 | 
 64 |         # non default tie charcacter
 65 |         tie = str(tie)
 66 |         if len(tie) != 1:
 67 |             raise RuntimeError(
 68 |                 f'explicit tie must be a single charcacter but is {tie}')
 69 |         return tie
 70 | 
 71 |     @staticmethod
 72 |     def name():
 73 |         return 'espeak'
 74 | 
 75 |     @classmethod
 76 |     def supported_languages(cls):
 77 |         return {
 78 |             voice.language: voice.name
 79 |             for voice in EspeakWrapper().available_voices()}
 80 | 
 81 |     def _phonemize_aux(self, text, offset, separator, strip):
 82 |         if self._tie and separator.phone:
 83 |             self.logger.warning(
 84 |                 'cannot use ties AND phone separation, '
 85 |                 'ignoring phone separator')
 86 | 
 87 |         output = []
 88 |         lang_switches = []
 89 |         for num, line in enumerate(text, start=1):
 90 |             line = self._espeak.text_to_phonemes(line, self._tie)
 91 |             line, has_switch = self._postprocess_line(
 92 |                 line, num, separator, strip)
 93 |             output.append(line)
 94 |             if has_switch:
 95 |                 lang_switches.append(num + offset)
 96 | 
 97 |         return output, lang_switches
 98 | 
 99 |     def _process_stress(self, word):
100 |         if self._with_stress:
101 |             return word
102 |         # remove the stresses on phonemes
103 |         return re.sub(self._ESPEAK_STRESS_RE, '', word)
104 | 
105 |     def _process_tie(self, word, separator):
106 |         # NOTE a bug in espeak append ties to (en) flags so as (͡e͡n).
107 |         # We do not correct it here.
108 |         if self._tie and self._tie != '͡':
109 |             # replace default '͡' by the requested one
110 |             return word.replace('͡', self._tie)
111 |         return word.replace('_', separator.phone)
112 | 
113 |     def _postprocess_line(self, line, num, separator, strip):
114 |         # espeak can split an utterance into several lines because
115 |         # of punctuation, here we merge the lines into a single one
116 |         line = line.strip().replace('\n', ' ').replace('  ', ' ')
117 | 
118 |         # due to a bug in espeak-ng, some additional separators can be
119 |         # added at the end of a word. Here a quick fix to solve that
120 |         # issue. See https://github.com/espeak-ng/espeak-ng/issues/694
121 |         line = re.sub(r'_+', '_', line)
122 |         line = re.sub(r'_ ', ' ', line)
123 | 
124 |         line, has_switch = self._lang_switch.process(line)
125 |         if not line:
126 |             return '', has_switch
127 | 
128 |         out_line = ''
129 |         for word in line.split(' '):
130 |             word = self._process_stress(word.strip())
131 |             if not strip and not self._tie:
132 |                 word += '_'
133 |             word = self._process_tie(word, separator)
134 |             out_line += word + separator.word
135 | 
136 |         if strip and separator.word:
137 |             # erase the last word separator from the line
138 |             out_line = out_line[:-len(separator.word)]
139 | 
140 |         return out_line, has_switch
141 | 
142 |     def _phonemize_preprocess(self, text):
143 |         text, punctuation_marks = super()._phonemize_preprocess(text)
144 |         # self._words_mismatch.count_text(text)
145 |         return text, punctuation_marks
146 | 
147 |     def _phonemize_postprocess(self, phonemized, punctuation_marks):
148 |         text = phonemized[0]
149 |         switches = phonemized[1]
150 | 
151 |         # self._words_mismatch.count_phonemized(text)
152 |         self._lang_switch.warning(switches)
153 | 
154 |         phonemized = super()._phonemize_postprocess(text, punctuation_marks)
155 |         # return self._words_mismatch.process(phonemized)
156 |         return phonemized
157 | 
158 |     @staticmethod
159 |     def _flatten(phonemized):
160 |         """Specialization of BaseBackend._flatten for the espeak backend
161 | 
162 |         From [([1, 2], ['a', 'b']), ([3],), ([4], ['c'])] to [[1, 2, 3, 4],
163 |         ['a', 'b', 'c']].
164 | 
165 |         """
166 |         flattened = []
167 |         for i in range(len(phonemized[0])):
168 |             flattened.append(
169 |                 list(itertools.chain(
170 |                     c for chunk in phonemized for c in chunk[i])))
171 |         return flattened
172 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/language_switch.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Manages language switches for the espeak backend
 16 | 
 17 | This module is used in phonemizer.backend.EspeakBackend and should be
 18 | considered private.
 19 | 
 20 | It manages languages switches that occur during phonemization, where a part of
 21 | a text is phonemized in a language different from the target language. For
 22 | instance the sentence "j'aime le football" in French will be phonemized by
 23 | espeak as "ʒɛm lə (en)fʊtbɔːl(fr)", "football" being pronounced as an English
 24 | word. This may cause two issues to end users. First, it introduces undesirable
 25 | (.) language switch flags. Second, it may introduce extra phones that are not
 26 | present in the target language phoneset.
 27 | 
 28 | This module implements 3 alternative solutions the user can choose when
 29 | initializing the espeak backend:
 30 | - 'keep-flags' preserves the language switch flags,
 31 | - 'remove-flags' removes the flags (.) but preserves the words with alternative
 32 |   phoneset,
 33 | - 'remove-utterance' removes the utterances where flags are detected.
 34 | 
 35 | """
 36 | 
 37 | import abc
 38 | import re
 39 | 
 40 | 
 41 | def get_language_switch_processor(mode, logger, language):
 42 |     """Returns a language switch processor initialized from `mode`
 43 | 
 44 |     The `mode` can be one of the following:
 45 |     - 'keep-flags' to preserve the switch flags
 46 |     - 'remove-flags' to suppress the switch flags
 47 |     - 'remove-utterance' to suppress the entire utterance
 48 | 
 49 |     Raises a RuntimeError if the `mode` is unknown.
 50 | 
 51 |     """
 52 |     processors = {
 53 |         'keep-flags': KeepFlags,
 54 |         'remove-flags': RemoveFlags,
 55 |         'remove-utterance': RemoveUtterances}
 56 | 
 57 |     try:
 58 |         return processors[mode](logger, language)
 59 |     except KeyError:
 60 |         raise RuntimeError(
 61 |             f'mode "{mode}" invalid, must be in {", ".join(processors.keys())}'
 62 |         ) from None
 63 | 
 64 | 
 65 | class BaseLanguageSwitch(abc.ABC):
 66 |     """The base class for language switch processors
 67 | 
 68 |     Parameters
 69 |     ----------
 70 |     logger (logging.Logger) : a logger instance to send warnings when language
 71 |         switches are detected.
 72 |     language (str) : the language code currently in use by the phonemizer, to
 73 |         customize warning content
 74 | 
 75 |     """
 76 |     # a regular expression to find language switch flags in espeak output,
 77 |     # Switches have the following form (here a switch from English to French):
 78 |     # "something (fr)quelque chose(en) another thing".
 79 |     _ESPEAK_FLAGS_RE = re.compile(r'\(.+?\)')
 80 | 
 81 |     def __init__(self, logger, language):
 82 |         self._logger = logger
 83 |         self._language = language
 84 | 
 85 |     @classmethod
 86 |     def is_language_switch(cls, utterance):
 87 |         """Returns True is a language switch is present in the `utterance`"""
 88 |         return bool(cls._ESPEAK_FLAGS_RE.search(utterance))
 89 | 
 90 |     @classmethod
 91 |     @abc.abstractmethod
 92 |     def process(cls, utterance):
 93 |         """Detects and process language switches according to the mode
 94 | 
 95 |         This method is called on each utterance as a phonemization
 96 |         post-processing step.
 97 | 
 98 |         Returns
 99 |         -------
100 |         processed_utterance (str) : the utterance either preserved, deleted (as
101 |             '') or with the switch removed
102 |         has_switch (bool): True if a language switch flag is found in the
103 |             `utterance` and False otherwise
104 | 
105 |         """
106 | 
107 |     @abc.abstractmethod
108 |     def warning(self, switches):
109 |         """Sends warnings to the logger with recorded language switches
110 | 
111 |         This method is called a single time at the very end of the
112 |         phonemization process.
113 | 
114 |         Parameters
115 |         ----------
116 |         switches (list of int) : the line numbers where language switches has
117 |             been detected during phonemization
118 | 
119 |         """
120 | 
121 | 
122 | class KeepFlags(BaseLanguageSwitch):
123 |     """Preserves utterances even if language switch flags are present"""
124 |     @classmethod
125 |     def process(cls, utterance):
126 |         return utterance, cls.is_language_switch(utterance)
127 | 
128 |     def warning(self, switches):
129 |         if not switches:
130 |             return
131 | 
132 |         nswitches = len(switches)
133 |         self._logger.warning(
134 |             '%s utterances containing language switches '
135 |             'on lines %s', nswitches,
136 |             ', '.join(str(switch) for switch in sorted(switches)))
137 |         self._logger.warning(
138 |             'extra phones may appear in the "%s" phoneset', self._language)
139 |         self._logger.warning(
140 |             'language switch flags have been kept '
141 |             '(applying "keep-flags" policy)')
142 | 
143 | 
144 | class RemoveFlags(BaseLanguageSwitch):
145 |     """Removes the language switch flags when detected"""
146 |     @classmethod
147 |     def process(cls, utterance):
148 |         if cls.is_language_switch(utterance):
149 |             # remove all the (lang) flags in the current utterance
150 |             return re.sub(cls._ESPEAK_FLAGS_RE, '', utterance), True
151 |         return utterance, False
152 | 
153 |     def warning(self, switches):
154 |         if not switches:
155 |             return
156 | 
157 |         nswitches = len(switches)
158 |         self._logger.warning(
159 |             '%s utterances containing language switches '
160 |             'on lines %s', nswitches,
161 |             ', '.join(str(switch) for switch in sorted(switches)))
162 |         self._logger.warning(
163 |             'extra phones may appear in the "%s" phoneset', self._language)
164 |         self._logger.warning(
165 |             'language switch flags have been removed '
166 |             '(applying "remove-flags" policy)')
167 | 
168 | 
169 | class RemoveUtterances(BaseLanguageSwitch):
170 |     """Remove the entire utterance when a language switch flag is detected"""
171 |     @classmethod
172 |     def process(cls, utterance):
173 |         if cls.is_language_switch(utterance):
174 |             # drop the entire utterance
175 |             return '', True
176 |         return utterance, False
177 | 
178 |     def warning(self, switches):
179 |         if not switches:
180 |             return
181 | 
182 |         nswitches = len(switches)
183 |         self._logger.warning(
184 |             'removed %s utterances containing language switches '
185 |             '(applying "remove-utterance" policy)', nswitches)
186 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/mbrola.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Mbrola backend for the phonemizer"""
 16 | 
 17 | import pathlib
 18 | import shutil
 19 | import sys
 20 | 
 21 | from phonemizer.backend.espeak.base import BaseEspeakBackend
 22 | from phonemizer.backend.espeak.wrapper import EspeakWrapper
 23 | from phonemizer.logger import get_logger
 24 | 
 25 | 
 26 | class EspeakMbrolaBackend(BaseEspeakBackend):
 27 |     """Espeak-mbrola backend for the phonemizer"""
 28 |     # this will be initialized once, at the first call to supported_languages()
 29 |     _supported_languages = None
 30 | 
 31 |     def __init__(self, language, logger=get_logger()):
 32 |         super().__init__(language, logger=logger)
 33 |         self._espeak.set_voice(language)
 34 | 
 35 |     @staticmethod
 36 |     def name():
 37 |         return 'espeak-mbrola'
 38 | 
 39 |     @staticmethod
 40 |     def is_available():
 41 |         """Mbrola backend is available for espeak>=1.49"""
 42 |         return (
 43 |             BaseEspeakBackend.is_available() and
 44 |             shutil.which('mbrola') and
 45 |             BaseEspeakBackend.is_espeak_ng())
 46 | 
 47 |     @classmethod
 48 |     def _all_supported_languages(cls):
 49 |         # retrieve the mbrola voices. This voices must be installed separately.
 50 |         voices = EspeakWrapper().available_voices('mbrola')
 51 |         return {voice.identifier[3:]: voice.name for voice in voices}
 52 | 
 53 |     @classmethod
 54 |     def _is_language_installed(cls, language, data_path):
 55 |         """Returns True if the required mbrola voice is installed"""
 56 |         # this is a reimplementation of LoadMbrolaTable from espeak
 57 |         # synth_mbrola.h sources
 58 |         voice = language[3:]  # remove mb- prefix
 59 | 
 60 |         if pathlib.Path(data_path / 'mbrola' / voice).is_file():
 61 |             return True  # pragma: nocover
 62 | 
 63 |         if sys.platform != 'win32':
 64 |             candidates = [
 65 |                 f'/usr/share/mbrola/{voice}',
 66 |                 f'/usr/share/mbrola/{voice}/{voice}',
 67 |                 f'/usr/share/mbrola/voices/{voice}']
 68 |             for candidate in candidates:
 69 |                 if pathlib.Path(candidate).is_file():
 70 |                     return True
 71 | 
 72 |         return False
 73 | 
 74 |     @classmethod
 75 |     def supported_languages(cls):  # pragma: nocover
 76 |         """Returns the list of installed mbrola voices"""
 77 |         if cls._supported_languages is None:
 78 |             data_path = EspeakWrapper().data_path
 79 |             cls._supported_languages = {
 80 |                 k: v for k, v in cls._all_supported_languages().items()
 81 |                 if cls._is_language_installed(k, data_path)}
 82 |         return cls._supported_languages
 83 | 
 84 |     def _phonemize_aux(self, text, offset, separator, strip):
 85 |         output = []
 86 |         for num, line in enumerate(text, start=1):
 87 |             line = self._espeak.synthetize(line)
 88 |             line = self._postprocess_line(line, offset + num, separator, strip)
 89 |             output.append(line)
 90 |         return output
 91 | 
 92 |     def _postprocess_line(self, line, num, separator, strip):
 93 |         # retrieve the phonemes with the correct SAMPA alphabet (but
 94 |         # without word separation)
 95 |         phonemes = (
 96 |             phn.split('\t')[0] for phn in line.split('\n') if phn.strip())
 97 |         phonemes = separator.phone.join(pho for pho in phonemes if pho != '_')
 98 | 
 99 |         if not strip:
100 |             phonemes += separator.phone
101 | 
102 |         return phonemes
103 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/voice.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonemizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonemizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Voice struct from Espeak API exposed to Python"""
16 | 
17 | import ctypes
18 | 
19 | 
20 | # This class could be a dataclass but, for compatibility with python-3.6, we
21 | # don't use the dataclasses module.
22 | class EspeakVoice:
23 |     """A helper class to expose voice structures within C and Python"""
24 |     def __init__(self, name='', language='', identifier=''):
25 |         self._name = name
26 |         self._language = language
27 |         self._identifier = identifier
28 | 
29 |     @property
30 |     def name(self):
31 |         "Voice name"
32 |         return self._name
33 | 
34 |     @property
35 |     def language(self):
36 |         """Language code"""
37 |         return self._language
38 | 
39 |     @property
40 |     def identifier(self):
41 |         """Path to the voice file wrt espeak data path"""
42 |         return self._identifier
43 | 
44 |     def __eq__(self, other):
45 |         return (
46 |             self.name == other.name and
47 |             self.language == other.language and
48 |             self.identifier == other.identifier)
49 | 
50 |     def __hash__(self):
51 |         return hash((self.name, self.language, self.identifier))
52 | 
53 |     class Struct(ctypes.Structure):  # pylint: disable=too-few-public-methods
54 |         """A helper class to fetch voices information from the espeak library.
55 | 
56 |         The espeak_VOICE struct is defined in speak_lib.h from the espeak code.
57 |         Here we use only name (voice name), languages (language code) and
58 |         identifier (voice file) information.
59 | 
60 |         """
61 |         _fields_ = [
62 |             ('name', ctypes.c_char_p),
63 |             ('languages', ctypes.c_char_p),
64 |             ('identifier', ctypes.c_char_p)]
65 | 
66 |     def to_ctypes(self):
67 |         """Converts the Voice instance to  an espeak ctypes structure"""
68 |         return self.Struct(
69 |             self.name.encode('utf8') if self.name else None,
70 |             self.language.encode('utf8') if self.language else None,
71 |             self.identifier.encode('utf8') if self.identifier else None)
72 | 
73 |     @classmethod
74 |     def from_ctypes(cls, struct):
75 |         """Returns a Voice instance built from an espeak ctypes structure"""
76 |         return cls(
77 |             name=(struct.name or b'').decode(),
78 |             # discard a useless char prepended by espeak
79 |             language=(struct.languages or b'0').decode()[1:],
80 |             identifier=(struct.identifier or b'').decode())
81 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/words_mismatch.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Manages words count mismatches for the espeak backend"""
 16 | 
 17 | 
 18 | import abc
 19 | import re
 20 | 
 21 | 
 22 | def get_words_mismatch_processor(mode, logger):
 23 |     """Returns a word count mismatch processor according to `mode`
 24 | 
 25 |     The `mode` can be one of the following:
 26 |     - `ignore` to ignore words mismatches
 27 |     - `warn` to display a warning on each mismatched utterance
 28 |     - `remove` to remove any utterance containing a words mismatch
 29 | 
 30 |     Raises a RuntimeError if the `mode` is unknown.
 31 | 
 32 |     """
 33 |     processors = {
 34 |         'ignore': Ignore,
 35 |         'warn': Warn,
 36 |         'remove': Remove}
 37 | 
 38 |     try:
 39 |         return processors[mode](logger)
 40 |     except KeyError:
 41 |         raise RuntimeError(
 42 |             f'mode {mode} invalid, must be in {", ".join(processors.keys())}'
 43 |         ) from None
 44 | 
 45 | 
 46 | class BaseWordsMismatch(abc.ABC):
 47 |     """The base class of all word count mismatch processors"""
 48 |     _RE_SPACES = re.compile(r'\s+')
 49 | 
 50 |     def __init__(self, logger):
 51 |         self._logger = logger
 52 |         self._count_txt = []
 53 |         self._count_phn = []
 54 | 
 55 |     @classmethod
 56 |     def _count_words(cls, text):
 57 |         """Return the number of words contained in each line of `text`"""
 58 |         return [
 59 |             len([w for w in cls._RE_SPACES.split(line.strip()) if w])
 60 |             for line in text]
 61 | 
 62 |     def _mismatched_lines(self):
 63 |         """Returns a list of (num_line, nwords_input, nwords_output)
 64 | 
 65 |         Consider only the lines where nwords_input != nwords_output. Raises a
 66 |         RuntimeError if input and output do not have the same number of lines.
 67 | 
 68 |         """
 69 |         if len(self._count_txt) != len(self._count_phn):
 70 |             raise RuntimeError(  # pragma: nocover
 71 |                 f'number of lines in input and output must be equal, '
 72 |                 f'we have: input={len(self._count_txt)}, '
 73 |                 f'output={len(self._count_phn)}')
 74 | 
 75 |         return [
 76 |             (n, t, p) for n, (t, p) in
 77 |             enumerate(zip(self._count_txt, self._count_phn))
 78 |             if t != p]
 79 | 
 80 |     def _resume(self, nmismatch, nlines):
 81 |         """Logs a high level undetailed warning"""
 82 |         if nmismatch:
 83 |             self._logger.warning(
 84 |                 'words count mismatch on %s%% of the lines (%s/%s)',
 85 |                 round(nmismatch / nlines, 2) * 100, nmismatch, nlines)
 86 | 
 87 |     def count_text(self, text):
 88 |         """Stores the number of words in each input line"""
 89 |         self._count_txt = self._count_words(text)
 90 | 
 91 |     def count_phonemized(self, text):
 92 |         """Stores the number of words in each output line"""
 93 |         self._count_phn = self._count_words(text)
 94 | 
 95 |     @abc.abstractmethod
 96 |     def process(self, text):
 97 |         """Detects and process word count misatches according to the mode
 98 | 
 99 |         This method is called at the very end of phonemization, during
100 |         post-processing.
101 | 
102 |         """
103 | 
104 | 
class Ignore(BaseWordsMismatch):
    """Keeps the text unchanged, only the summary warning is logged"""
    def process(self, text):
        nmismatch = len(self._mismatched_lines())
        self._resume(nmismatch, len(text))
        return text
110 | 
111 | 
class Warn(BaseWordsMismatch):
    """Logs a warning for each line with a word count mismatch"""
    def process(self, text):
        mismatched = self._mismatched_lines()

        # line numbers are reported starting at 1
        for index, expected, observed in mismatched:
            self._logger.warning(
                'words count mismatch on line %s '
                '(expected %s words but get %s)',
                index + 1, expected, observed)

        self._resume(len(mismatched), len(text))
        return text
124 | 
125 | 
class Remove(BaseWordsMismatch):
    """Removes any utterance containing a word count mismatch"""
    def process(self, text):
        """Replaces each mismatched line of `text` by an empty string

        The list `text` is modified in place and returned. Warnings are
        logged only when at least one line is mismatched.

        """
        mismatch = [line[0] for line in self._mismatched_lines()]

        # do not announce a removal when there is nothing to remove
        if mismatch:
            self._resume(len(mismatch), len(text))
            self._logger.warning('removing the mismatched lines')

            for index in mismatch:
                text[index] = ''
        return text
136 | 


--------------------------------------------------------------------------------
/phonemizer/backend/espeak/wrapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Wrapper on espeak-ng library"""
 16 | 
 17 | import ctypes
 18 | import ctypes.util
 19 | import functools
 20 | import os
 21 | import pathlib
 22 | import sys
 23 | import tempfile
 24 | import weakref
 25 | 
 26 | from phonemizer.backend.espeak.api import EspeakAPI
 27 | from phonemizer.backend.espeak.voice import EspeakVoice
 28 | 
 29 | 
 30 | class EspeakWrapper:
 31 |     """Wrapper on espeak shared library
 32 | 
 33 |     The aim of this wrapper is not to be exhaustive but to encapsulate the
 34 |     espeak functions required for phonemization. It relies on a espeak shared
 35 |     library (*.so on Linux, *.dylib on Mac and *.dll on Windows) that must be
 36 |     installed on the system.
 37 | 
 38 |     Use the function `EspeakWrapper.set_library()` before instanciation to
 39 |     customize the library to use.
 40 | 
 41 |     Raises
 42 |     ------
 43 |     RuntimeError if the espeak shared library cannot be loaded
 44 | 
 45 |     """
 46 |     # a static variable used to overload the default espeak library installed
 47 |     # on the system. The user can choose an alternative espeak library with
 48 |     # the method EspeakWrapper.set_library().
 49 |     _ESPEAK_LIBRARY = None
 50 | 
 51 |     def __init__(self):
 52 |         # the following attributes are accessed through properties and are
 53 |         # lazily initialized
 54 |         self._version = None
 55 |         self._data_path = None
 56 |         self._voice = None
 57 | 
 58 |         # load the espeak API
 59 |         self._espeak = EspeakAPI(self.library())
 60 | 
 61 |         # lazy loading of attributes only required for the synthetize method
 62 |         self._libc_ = None
 63 |         self._tempfile_ = None
 64 | 
 65 |     @property
 66 |     def _libc(self):
 67 |         if self._libc_ is None:
 68 |             self._libc_ = (
 69 |                 ctypes.windll.msvcrt if sys.platform == 'win32' else
 70 |                 ctypes.cdll.LoadLibrary(ctypes.util.find_library('c')))
 71 |         return self._libc_
 72 | 
 73 |     @property
 74 |     def _tempfile(self):
 75 |         if self._tempfile_ is None:
 76 |             # this will automatically removed at exit
 77 |             # pylint: disable=consider-using-with
 78 |             self._tempfile_ = tempfile.NamedTemporaryFile()
 79 |             weakref.finalize(self._tempfile_, self._tempfile_.close)
 80 |         return self._tempfile_
 81 | 
 82 |     def __getstate__(self):
 83 |         """For pickling, when phonemizing on multiple jobs"""
 84 |         return {
 85 |             'version': self._version,
 86 |             'data_path': self._data_path,
 87 |             'voice': self._voice}
 88 | 
 89 |     def __setstate__(self, state):
 90 |         """For unpickling, when phonemizing on multiple jobs"""
 91 |         self.__init__()
 92 |         self._version = state['version']
 93 |         self._data_path = state['data_path']
 94 |         self._voice = state['voice']
 95 |         if self._voice:
 96 |             if 'mb' in self._voice.identifier:  # mbrola voice
 97 |                 self.set_voice(self._voice.identifier[3:])
 98 |             else:
 99 |                 self.set_voice(self._voice.language)
100 | 
    @classmethod
    def set_library(cls, library):
        """Sets the espeak backend to use `library`

        If this is not set, the backend uses the default espeak shared library
        from the system installation. The value is stored on the class and has
        precedence over any other lookup done by `library()`.

        Parameters
        ----------
        library (str or None) : the path to the espeak shared library to use as
          backend. Set `library` to None to restore the default.

        """
        cls._ESPEAK_LIBRARY = library
115 | 
116 |     @classmethod
117 |     def library(cls):
118 |         """Returns the espeak library used as backend
119 | 
120 |         The following precedence rule applies for library lookup:
121 | 
122 |         1. As specified by BaseEspeakBackend.set_library()
123 |         2. Or as specified by the environment variable
124 |            PHONEMIZER_ESPEAK_LIBRARY
125 |         3. Or the default espeak library found on the system
126 | 
127 |         Raises
128 |         ------
129 |         RuntimeError if the espeak library cannot be found or if the
130 |           environment variable PHONEMIZER_ESPEAK_LIBRARY is set to a
131 |           non-readable file
132 | 
133 |         """
134 |         if cls._ESPEAK_LIBRARY:
135 |             return cls._ESPEAK_LIBRARY
136 | 
137 |         if 'PHONEMIZER_ESPEAK_LIBRARY' in os.environ:
138 |             library = pathlib.Path(os.environ['PHONEMIZER_ESPEAK_LIBRARY'])
139 |             if not (library.is_file() and os.access(library, os.R_OK)):
140 |                 raise RuntimeError(  # pragma: nocover
141 |                     f'PHONEMIZER_ESPEAK_LIBRARY={library} '
142 |                     f'is not a readable file')
143 |             return library.resolve()
144 | 
145 |         library = (
146 |             ctypes.util.find_library('espeak-ng') or
147 |             ctypes.util.find_library('espeak'))
148 |         if not library:  # pragma: nocover
149 |             raise RuntimeError(
150 |                 'failed to find espeak library')
151 |         return library
152 | 
153 |     def _fetch_version_and_path(self):
154 |         """Initializes version and dapa path from the espeak library"""
155 |         version, data_path = self._espeak.info()
156 | 
157 |         # pylint: disable=no-member
158 |         self._data_path = pathlib.Path(data_path.decode())
159 |         if not self._data_path.is_dir():  # pragma: nocover
160 |             raise RuntimeError('failed to retrieve espeak data directory')
161 | 
162 |         # espeak-1.48 appends the release date to version number, here we
163 |         # simply ignore it
164 |         version = version.decode().strip().split(' ')[0].replace('-dev', '')
165 |         self._version = tuple(int(v) for v in version.split('.'))
166 | 
167 |     @property
168 |     def version(self):
169 |         """The espeak version as a tuple of integers (major, minor, patch)"""
170 |         if self._version is None:
171 |             self._fetch_version_and_path()
172 |         return self._version
173 | 
    @property
    def library_path(self):
        """The espeak library as a pathlib.Path instance

        The value is read from the `library_path` attribute of the espeak API
        loaded at initialization.

        """
        return self._espeak.library_path
178 | 
179 |     @property
180 |     def data_path(self):
181 |         """The espeak data directory as a pathlib.Path instance"""
182 |         if self._data_path is None:
183 |             self._fetch_version_and_path()
184 |         return self._data_path
185 | 
    @property
    def voice(self):
        """The configured voice as an EspeakVoice instance

        If `set_voice` has not been called, returns None. The voice is also
        restored by `__setstate__` when the wrapper is unpickled.

        """
        return self._voice
194 | 
195 |     @functools.lru_cache(maxsize=None)
196 |     def available_voices(self, name=None):
197 |         """Voices available for phonemization, as a list of `EspeakVoice`"""
198 |         if name:
199 |             name = EspeakVoice(language=name).to_ctypes()
200 |         voices = self._espeak.list_voices(name or None)
201 | 
202 |         index = 0
203 |         available_voices = []
204 |         # voices is an array to pointers, terminated by None
205 |         while voices[index]:
206 |             voice = voices[index].contents
207 |             available_voices.append(EspeakVoice(
208 |                 name=os.fsdecode(voice.name).replace('_', ' '),
209 |                 language=os.fsdecode(voice.languages)[1:],
210 |                 identifier=os.fsdecode(voice.identifier)))
211 |             index += 1
212 |         return available_voices
213 | 
214 |     def set_voice(self, voice_code):
215 |         """Setup the voice to use for phonemization
216 | 
217 |         Parameters
218 |         ----------
219 |         voice_code (str) : Must be a valid language code that is actually
220 |           supported by espeak
221 | 
222 |         Raises
223 |         ------
224 |         RuntimeError if the required voice cannot be initialized
225 | 
226 |         """
227 |         if 'mb' in voice_code:
228 |             # this is an mbrola voice code. Select the voice by using
229 |             # identifier in the format 'mb/{voice_code}'
230 |             available = {
231 |                 voice.identifier[3:]: voice.identifier
232 |                 for voice in self.available_voices('mbrola')}
233 |         else:
234 |             # this are espeak voices. Select the voice using it's attached
235 |             # language code. Consider only the first voice of a given code as
236 |             # they are sorted by relevancy
237 |             available = {}
238 |             for voice in self.available_voices():
239 |                 if voice.language not in available:
240 |                     available[voice.language] = voice.identifier
241 | 
242 |         try:
243 |             voice_name = available[voice_code]
244 |         except KeyError:
245 |             raise RuntimeError(f'invalid voice code "{voice_code}"') from None
246 | 
247 |         if self._espeak.set_voice_by_name(voice_name.encode('utf8')) != 0:
248 |             raise RuntimeError(  # pragma: nocover
249 |                 f'failed to load voice "{voice_code}"')
250 | 
251 |         voice = self._get_voice()
252 |         if not voice:  # pragma: nocover
253 |             raise RuntimeError(f'failed to load voice "{voice_code}"')
254 |         self._voice = voice
255 | 
256 |     def _get_voice(self):
257 |         """Returns the current voice used for phonemization
258 | 
259 |         If no voice has been set up, returns None.
260 | 
261 |         """
262 |         voice = self._espeak.get_current_voice()
263 |         if voice.name:
264 |             return EspeakVoice.from_ctypes(voice)
265 |         return None  # pragma: nocover
266 | 
267 |     def text_to_phonemes(self, text, tie=False):
268 |         """Translates a text into phonemes, must call set_voice() first.
269 | 
270 |         This method is used by the Espeak backend. Wrapper on the
271 |         espeak_TextToPhonemes function.
272 | 
273 |         Parameters
274 |         ----------
275 |         text (str) : the text to phonemize
276 | 
277 |         tie (bool, optional) : When True use a '͡' character between
278 |           consecutive characters of a single phoneme. Else separate phoneme
279 |           with '_'. This option requires espeak>=1.49. Default to False.
280 | 
281 |         Returns
282 |         -------
283 |         phonemes (str) : the phonemes for the text encoded in IPA, with '_' as
284 |           phonemes separator (excepted if ``tie`` is True) and ' ' as word
285 |           separator.
286 | 
287 |         """
288 |         if self.voice is None:  # pragma: nocover
289 |             raise RuntimeError('no voice specified')
290 | 
291 |         if tie and self.version <= (1, 48, 3):
292 |             raise RuntimeError(  # pragma: nocover
293 |                 'tie option only compatible with espeak>=1.49')
294 | 
295 |         # from Python string to C void** (a pointer to a pointer to chars)
296 |         text_ptr = ctypes.pointer(ctypes.c_char_p(text.encode('utf8')))
297 | 
298 |         # input text is encoded as UTF8
299 |         text_mode = 1
300 | 
301 |         # output phonemes in IPA and separated by _, or with a tie character if
302 |         # required. See comments for the function espeak_TextToPhonemes in
303 |         # speak_lib.h of the espeak sources for details.
304 |         if self.version <= (1, 48, 3):  # pragma: nocover
305 |             phonemes_mode = 0x03 | 0x01 << 4
306 |         elif tie:
307 |             phonemes_mode = 0x02 | 0x01 << 7 | ord('͡') << 8
308 |         else:
309 |             phonemes_mode = ord('_') << 8 | 0x02
310 | 
311 |         result = []
312 |         while text_ptr.contents.value is not None:
313 |             phonemes = self._espeak.text_to_phonemes(
314 |                 text_ptr, text_mode, phonemes_mode)
315 |             if phonemes:
316 |                 result.append(phonemes.decode())
317 |         return ' '.join(result)
318 | 
319 |     def synthetize(self, text):
320 |         """Translates a text into phonemes, must call set_voice() first.
321 | 
322 |         Only compatible with espeak>=1.49. This method is used by the
323 |         EspeakMbrola backend. Wrapper on the espeak_Synthesize function.
324 | 
325 |         Parameters
326 |         ----------
327 |         text (str) : the text to phonemize
328 | 
329 |         Returns
330 |         -------
331 |         phonemes (str) : the phonemes for the text encoded in SAMPA, with '_'
332 |           as phonemes separator and no word separation.
333 | 
334 |         """
335 | 
336 |         if self.version < (1, 49):  # pragma: nocover
337 |             raise RuntimeError('not compatible with espeak<=1.48')
338 |         if self.voice is None:  # pragma: nocover
339 |             raise RuntimeError('no voice specified')
340 | 
341 |         # init libc fopen and fclose functions
342 |         self._libc.fopen.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
343 |         self._libc.fopen.restype = ctypes.c_void_p
344 |         self._libc.fclose.argtypes = [ctypes.c_void_p]
345 |         self._libc.fclose.restype = ctypes.c_int
346 | 
347 |         # output phonemes in SAMPA and separated by _. Write the result to a
348 |         # tempfile which is read back after phonemization (seems not possible
349 |         # to redirect to stdout). See comments for the function
350 |         # espeak_SetPhonemeTrace in speak_lib.h of the espeak sources for
351 |         # details.
352 |         self._tempfile.truncate(0)
353 |         file_p = self._libc.fopen(
354 |             self._tempfile.name.encode(),
355 |             self._tempfile.mode.encode())
356 | 
357 |         self._espeak.set_phoneme_trace(0x01 << 4 | ord('_') << 8, file_p)
358 |         status = self._espeak.synthetize(
359 |             ctypes.c_char_p(text.encode('utf8')),
360 |             ctypes.c_size_t(len(text) + 1),
361 |             ctypes.c_uint(0x01))
362 |         self._libc.fclose(file_p)  # because flush does not work...
363 | 
364 |         if status != 0:  # pragma: nocover
365 |             raise RuntimeError('failed to synthetize')
366 | 
367 |         self._tempfile.seek(0)
368 |         phonemized = self._tempfile.read().decode().strip()
369 |         return phonemized
370 | 


--------------------------------------------------------------------------------
/phonemizer/backend/festival/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonologizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonologizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonologizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Phonemizer module for festival backend implementation"""
16 | 


--------------------------------------------------------------------------------
/phonemizer/backend/festival/festival.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Festival backend for the phonemizer"""
 16 | 
 17 | 
 18 | import os
 19 | import pathlib
 20 | import re
 21 | import shlex
 22 | import shutil
 23 | import subprocess
 24 | import sys
 25 | import tempfile
 26 | 
 27 | from phonemizer.backend.festival import lispy
 28 | from phonemizer.backend.base import BaseBackend
 29 | from phonemizer.logger import get_logger
 30 | from phonemizer.punctuation import Punctuation
 31 | from phonemizer.utils import get_package_resource, version_as_tuple
 32 | 
 33 | 
 34 | class FestivalBackend(BaseBackend):
 35 |     """Festival backend for the phonemizer"""
 36 |     # a static variable used to overload the default festival binary installed
 37 |     # on the system. The user can choose an alternative festival binary with
 38 |     # the method FestivalBackend.set_executable().
 39 |     _FESTIVAL_EXECUTABLE = None
 40 | 
    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        """Initializes the backend and loads the festival Scheme script

        The arguments are forwarded unchanged to the parent class
        constructor. The default values of `punctuation_marks` and `logger`
        are evaluated once, when the class is defined.

        Raises
        ------
        RuntimeError if the festival executable cannot be found, as
          `executable()` is called to build the debug message.

        """
        super().__init__(
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

        self.logger.debug('festival executable is %s', self.executable())

        # the Scheme script to be send to festival, used as a format template
        # by _process() to insert the name of the file to phonemize
        script_file = get_package_resource('festival/phonemize.scm')
        with open(script_file, 'r') as fscript:
            self._script = fscript.read()
        self.logger.debug('loaded %s', script_file)
 58 | 
    @staticmethod
    def name():
        """The name of the backend, here 'festival'"""
        return 'festival'
 62 | 
 63 |     @classmethod
 64 |     def set_executable(cls, executable):
 65 |         """Sets the festival backend to use `executable`
 66 | 
 67 |         If this is not set, the backend uses the default festival executable
 68 |         from the system installation.
 69 | 
 70 |         Parameters
 71 |         ----------
 72 |         executable (str) : the path to the festival executable to use as
 73 |             backend. Set `executable` to None to restore the default.
 74 | 
 75 |         Raises
 76 |         ------
 77 |         RuntimeError if `executable` is not an executable file.
 78 | 
 79 |         """
 80 |         if executable is None:
 81 |             cls._FESTIVAL_EXECUTABLE = None
 82 |             return
 83 | 
 84 |         executable = pathlib.Path(executable)
 85 |         if not (executable.is_file() and os.access(executable, os.X_OK)):
 86 |             raise RuntimeError(
 87 |                 f'{executable} is not an executable file')
 88 | 
 89 |         cls._FESTIVAL_EXECUTABLE = executable.resolve()
 90 | 
 91 |     @classmethod
 92 |     def executable(cls):
 93 |         """Returns the absolute path to the festival executable used as backend
 94 | 
 95 |         The following precedence rule applies for executable lookup:
 96 | 
 97 |         1. As specified by FestivalBackend.set_executable()
 98 |         2. Or as specified by the environment variable
 99 |            PHONEMIZER_FESTIVAL_EXECUTABLE
100 |         3. Or the default 'festival' binary found on the system with
101 |           `shutil.which('festival')`
102 | 
103 |         Raises
104 |         ------
105 |         RuntimeError if the festival executable cannot be found or if the
106 |             environment variable PHONEMIZER_FESTIVAL_EXECUTABLE is set to a
107 |             non-executable file
108 | 
109 |         """
110 |         if cls._FESTIVAL_EXECUTABLE:
111 |             return cls._FESTIVAL_EXECUTABLE
112 | 
113 |         if 'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ:
114 |             executable = pathlib.Path(os.environ[
115 |                 'PHONEMIZER_FESTIVAL_EXECUTABLE'])
116 |             if not (
117 |                     executable.is_file()
118 |                     and os.access(executable, mode=os.X_OK)
119 |             ):
120 |                 raise RuntimeError(
121 |                     f'PHONEMIZER_FESTIVAL_EXECUTABLE={executable} '
122 |                     f'is not an executable file')
123 |             return executable.resolve()
124 | 
125 |         executable = shutil.which('festival')
126 |         if not executable:  # pragma: nocover
127 |             raise RuntimeError(
128 |                 'failed to find festival executable')
129 |         return pathlib.Path(executable).resolve()
130 | 
131 |     @classmethod
132 |     def is_available(cls):
133 |         """True if the festival executable is available, False otherwise"""
134 |         try:
135 |             cls.executable()
136 |         except RuntimeError:  # pragma: nocover
137 |             return False
138 |         return True
139 | 
140 |     @classmethod
141 |     def version(cls):
142 |         """Festival version as a tupe of integers (major, minor, patch)
143 | 
144 |         Raises
145 |         ------
146 |         RuntimeError if FestivalBackend.is_available() is False or if the
147 |             version cannot be extracted for some reason.
148 | 
149 |         """
150 | 
151 |         festival = cls.executable()
152 | 
153 |         # the full version version string includes extra information
154 |         # we don't need
155 |         long_version = subprocess.check_output(
156 |             [festival, '--version']).decode('latin1').strip()
157 | 
158 |         # extract the version number with a regular expression
159 |         festival_version_re = r'.* ([0-9\.]+[0-9]):'
160 |         try:
161 |             version = re.match(festival_version_re, long_version).group(1)
162 |         except AttributeError:
163 |             raise RuntimeError(
164 |                 f'cannot extract festival version from {festival}') from None
165 | 
166 |         return version_as_tuple(version)
167 | 
    @staticmethod
    def supported_languages():
        """A dictionary of language codes -> name supported by festival

        Actually only en-us (American English) is supported.

        """
        return {'en-us': 'english-us'}
176 | 
177 |     # pylint: disable=unused-argument
178 |     def _phonemize_aux(self, text, offset, separator, strip):
179 |         """Return a phonemized version of `text` with festival
180 | 
181 |         This function is a wrapper on festival, a text to speech
182 |         program, allowing simple phonemization of some English
183 |         text. The US phoneset we use is the default one in festival,
184 |         as described at http://www.festvox.org/bsv/c4711.html
185 | 
186 |         Any opening and closing parenthesis in `text` are removed, as
187 |         they interfer with the Scheme expression syntax. Moreover
188 |         double quotes are replaced by simple quotes because double
189 |         quotes denotes utterances boundaries in festival.
190 | 
191 |         Parsing a ill-formed Scheme expression during post-processing
192 |         (typically with unbalanced parenthesis) raises an IndexError.
193 | 
194 |         """
195 |         text = self._preprocess(text)
196 |         if len(text) == 0:
197 |             return []
198 |         text = self._process(text)
199 |         text = self._postprocess(text, separator, strip)
200 |         return text
201 | 
202 |     @staticmethod
203 |     def _double_quoted(line):
204 |         """Return the string `line` surrounded by double quotes"""
205 |         return '"' + line + '"'
206 | 
207 |     @staticmethod
208 |     def _cleaned(line):
209 |         """Remove 'forbidden' characters from the line"""
210 |         # special case (very unlikely but causes a crash in festival)
211 |         # where a line is only made of '
212 |         if set(line) == set("'"):
213 |             line = ''
214 | 
215 |         # remove forbidden characters (reserved for scheme, ie festival
216 |         # scripting language)
217 |         return line.replace('"', '').replace('(', '').replace(')', '').strip()
218 | 
219 |     @classmethod
220 |     def _preprocess(cls, text):
221 |         """Returns the contents of `text` formatted for festival input
222 | 
223 |         This function adds double quotes to begining and end of each
224 |         line in text, if not already presents. The returned result is
225 |         a multiline string. Empty lines in inputs are ignored.
226 | 
227 |         """
228 |         cleaned_text = (
229 |             cls._cleaned(line) for line in text if line != '')
230 | 
231 |         return '\n'.join(
232 |             cls._double_quoted(line) for line in cleaned_text if line != '')
233 | 
    def _process(self, text):
        """Return the raw phonemization of `text`

        This function delegates to festival the text analysis and
        syllabic structure extraction.

        The `text` is written to a first temporary file. The Scheme script
        loaded at initialization is formatted with the name of that file and
        written to a second temporary file, which is executed by festival in
        batch mode. Both files are removed before returning.

        Return a string containing the "SylStructure" relation tree of
        the text, as a scheme expression.

        Raises a RuntimeError if festival fails.

        """
        # delete=False because the files are closed before festival reads
        # them, they are explicitly removed in the finally clauses
        with tempfile.NamedTemporaryFile('w+', delete=False) as data:
            try:
                # save the text as a tempfile
                data.write(text)
                data.close()

                # fix the path name for windows
                name = data.name
                if sys.platform == 'win32':  # pragma: nocover
                    name = name.replace('\\', '\\\\')

                with tempfile.NamedTemporaryFile('w+', delete=False) as scm:
                    try:
                        # insert the name of the text file in the script
                        scm.write(self._script.format(name))
                        scm.close()

                        cmd = f'{self.executable()} -b {scm.name}'
                        if self.logger:
                            self.logger.debug('running %s', cmd)

                        # redirect stderr to a tempfile and displaying it only
                        # on errors. Messages are something like: "UniSyn:
                        # using default diphone ax-ax for y-pau". This is
                        # related to wave synthesis (done by festival during
                        # phonemization).
                        with tempfile.TemporaryFile('w+') as fstderr:
                            return self._run_festival(cmd, fstderr)
                    finally:
                        os.remove(scm.name)
            finally:
                os.remove(data.name)
275 | 
276 |     @staticmethod
277 |     def _run_festival(cmd, fstderr):
278 |         """Runs the festival command for phonemization
279 | 
280 |         Returns the raw phonemized output (need to be postprocesses). Raises a
281 |         RuntimeError if festival fails.
282 | 
283 |         """
284 |         try:
285 |             output = subprocess.check_output(
286 |                 shlex.split(cmd, posix=False), stderr=fstderr)
287 | 
288 |             # festival seems to use latin1 and not utf8
289 |             return re.sub(' +', ' ', output.decode('latin1'))
290 | 
291 |         except subprocess.CalledProcessError as err:  # pragma: nocover
292 |             fstderr.seek(0)
293 |             raise RuntimeError(
294 |                 f'Command "{cmd}" returned exit status {err.returncode}, '
295 |                 f'output is:\n{fstderr.read()}') from None
296 | 
297 |     @staticmethod
298 |     def _postprocess_syll(syll, separator, strip):
299 |         """Parse a syllable from festival to phonemized output"""
300 |         sep = separator.phone
301 |         out = (phone[0][0].replace('"', '') for phone in syll[1:])
302 |         out = sep.join(o for o in out if o != '')
303 |         return out if strip else out + sep
304 | 
305 |     @classmethod
306 |     def _postprocess_word(cls, word, separator, strip):
307 |         """Parse a word from festival to phonemized output"""
308 |         sep = separator.syllable
309 |         out = sep.join(
310 |             cls._postprocess_syll(syll, separator, strip)
311 |             for syll in word[1:])
312 |         return out if strip else out + sep
313 | 
314 |     @classmethod
315 |     def _postprocess_line(cls, line, separator, strip):
316 |         """Parse a line from festival to phonemized output"""
317 |         sep = separator.word
318 |         out = []
319 |         for word in lispy.parse(line):
320 |             word = cls._postprocess_word(word, separator, strip)
321 |             if word != '':
322 |                 out.append(word)
323 |         out = sep.join(out)
324 | 
325 |         return out if strip else out + sep
326 | 
327 |     @classmethod
328 |     def _postprocess(cls, tree, separator, strip):
329 |         """Conversion from festival syllable tree to desired format"""
330 |         return [cls._postprocess_line(line, separator, strip)
331 |                 for line in tree.split('\n')
332 |                 if line not in ['', '(nil nil nil)']]
333 | 


--------------------------------------------------------------------------------
/phonemizer/backend/festival/lispy.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonemizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonemizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Parse a Scheme expression as a nested list
16 | 
17 | The main function of this module is lispy.parse, other ones should be
18 | considered private. This module is a dependency of the festival
19 | backend.
20 | 
21 | From http://www.norvig.com/lispy.html
22 | 
23 | """
24 | 
25 | 
def parse(program):
    """Parse a string holding a Scheme expression

    The `program` is first split into tokens, the first complete expression
    found in those tokens is then returned as a nested list of strings.

    Raises an IndexError if the expression is not valid scheme
    (unbalanced parenthesis).

    >>> parse('(+ 2 (* 5 2))')
    ['+', '2', ['*', '5', '2']]

    """
    tokens = _tokenize(program)
    return _read_from_tokens(tokens)
39 | 
40 | 
41 | def _tokenize(chars):
42 |     "Convert a string of characters into a list of tokens."
43 |     return chars.replace('(', ' ( ').replace(')', ' ) ').split()
44 | 
45 | 
46 | def _read_from_tokens(tokens):
47 |     "Read an expression from a sequence of tokens"
48 |     if len(tokens) == 0:  # pragma: nocover
49 |         raise SyntaxError('unexpected EOF while reading')
50 | 
51 |     token = tokens.pop(0)
52 |     if token == '(':
53 |         expr = []
54 |         while tokens[0] != ')':
55 |             expr.append(_read_from_tokens(tokens))
56 |         tokens.pop(0)  # pop off ')'
57 |         return expr
58 | 
59 |     if token == ')':  # pragma: nocover
60 |         raise SyntaxError('unexpected )')
61 | 
62 |     return token
63 | 


--------------------------------------------------------------------------------
/phonemizer/backend/segments.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Segments backend for the phonemizer"""
 16 | 
 17 | import pathlib
 18 | 
 19 | import segments
 20 | from phonemizer.backend.base import BaseBackend
 21 | from phonemizer.logger import get_logger
 22 | from phonemizer.punctuation import Punctuation
 23 | from phonemizer.utils import get_package_resource, version_as_tuple
 24 | 
 25 | 
 26 | class SegmentsBackend(BaseBackend):
 27 |     """Segments backends for the phonemizer
 28 | 
 29 |     The phonemize method will raise a ValueError when parsing an
 30 |     unknown morpheme.
 31 | 
 32 |     """
 33 |     def __init__(self, language,
 34 |                  punctuation_marks=Punctuation.default_marks(),
 35 |                  preserve_punctuation=False,
 36 |                  logger=get_logger()):
 37 |         # will be initialized in _init_language() from super().__init__()
 38 |         self._tokenizer = None
 39 |         super().__init__(
 40 |             language,
 41 |             punctuation_marks=punctuation_marks,
 42 |             preserve_punctuation=preserve_punctuation,
 43 |             logger=logger)
 44 | 
 45 |     def _init_language(self, language):
 46 |         # load the grapheme to phoneme mapping
 47 |         profile = self._load_g2p_profile(language)
 48 |         self._tokenizer = segments.Tokenizer(profile=profile)
 49 | 
 50 |         # this is the language code
 51 |         return pathlib.Path(language).stem
 52 | 
 53 |     @staticmethod
 54 |     def name():
 55 |         return 'segments'
 56 | 
 57 |     @staticmethod
 58 |     def version():
 59 |         return version_as_tuple(segments.__version__)
 60 | 
 61 |     @staticmethod
 62 |     def is_available():
 63 |         return True
 64 | 
 65 |     @staticmethod
 66 |     def supported_languages():
 67 |         """Returns a dict of language: file supported by the segments backend
 68 | 
 69 |         The supported languages have a grapheme to phoneme conversion file
 70 |         bundled with phonemizer. Users can also use their own file as
 71 |         parameter of the phonemize() function.
 72 | 
 73 |         """
 74 |         # directory phonemizer/share/segments
 75 |         directory = get_package_resource('segments')
 76 | 
 77 |         # supported languages are files with the 'g2p' extension
 78 |         return {g2p.stem: g2p
 79 |                 for g2p in directory.iterdir() if g2p.suffix == '.g2p'}
 80 | 
 81 |     @classmethod
 82 |     def is_supported_language(cls, language):
 83 |         if pathlib.Path(language).is_file():
 84 |             try:
 85 |                 cls._load_g2p_profile(language)
 86 |                 return True
 87 |             except RuntimeError:
 88 |                 return False
 89 |         return language in cls.supported_languages()
 90 | 
 91 |     @classmethod
 92 |     def _load_g2p_profile(cls, language):
 93 |         """Returns a segments profile from a `language`"""
 94 |         # make sure the g2p file exists
 95 |         if not pathlib.Path(language).is_file():
 96 |             try:
 97 |                 language = cls.supported_languages()[language]
 98 |             except KeyError:
 99 |                 raise RuntimeError(
100 |                     f'grapheme to phoneme file not found: '
101 |                     f'{language}') from None
102 | 
103 |         # load the mapping grapheme -> phoneme from the file, make sure all
104 |         # lines are well formatted
105 |         g2p = {}
106 |         with open(language, 'r', encoding='utf8') as flang:
107 |             for num, line in enumerate(flang):
108 |                 elts = line.strip().split()
109 |                 if not len(elts) == 2:
110 |                     raise RuntimeError(
111 |                         'grapheme to phoneme file, line {} must have 2 rows '
112 |                         'but have {}: {}'.format(num + 1, len(elts), language))
113 |                 g2p[elts[0]] = elts[1]
114 | 
115 |         # build the segments profile from the g2p mapping
116 |         return segments.Profile(
117 |             *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])
118 | 
119 |     # pylint: disable=unused-argument
120 |     def _phonemize_aux(self, text, offset, separator, strip):
121 |         # tokenize the input text per utterance
122 |         phonemized = (
123 |             self._tokenizer(line, column='mapping', errors='strict')
124 |             for line in text)
125 | 
126 |         # the output of segments is always strip, so we need to add
127 |         # token separation at the end when strip is False.
128 |         if not strip:
129 |             # add word separator at end of utterance
130 |             phonemized = (p + ' # ' for p in phonemized)
131 |             # add phoneme separator at end of word
132 |             phonemized = (p.replace(' # ', '  # ') for p in phonemized)
133 | 
134 |         # replace default separators by our custom ones
135 |         phonemized = (p.replace(' # ', '#') for p in phonemized)
136 |         phonemized = (p.replace(' ', separator.phone) for p in phonemized)
137 |         phonemized = (p.replace('#', separator.word) for p in phonemized)
138 | 
139 |         # return the result as a list of utterances
140 |         return list(phonemized)
141 | 


--------------------------------------------------------------------------------
/phonemizer/logger.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonemizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonemizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Logging facilities for the phonemizer"""
16 | 
17 | import logging
18 | import sys
19 | 
20 | 
def get_logger(verbosity='quiet', name='phonemizer'):
    """Builds and returns the logging.Logger instance called `name`

    Any handler previously attached to that logger is discarded and replaced
    by a single one: a handler writing to the standard error stream (stderr),
    or a null handler when `verbosity` is 'quiet'.

    Parameters
    ----------
    verbosity (str) : The level of verbosity, must be 'verbose' (logger level
      is DEBUG), 'normal' (logger level is WARNING) or 'quiet' (do not display
      anything).
    name (str) : The logger name, default to 'phonemizer'

    Raises
    ------
    RuntimeError if `verbosity` is not 'normal', 'verbose', or 'quiet'.

    """
    choices = ('normal', 'verbose', 'quiet')
    if verbosity not in choices:
        raise RuntimeError(
            f'verbosity is {verbosity} but must be in '
            f'{", ".join(choices)}')

    logger = logging.getLogger(name)

    # forget the handlers of any previous configuration
    logger.handlers = []

    # the level is DEBUG in verbose mode and WARNING otherwise
    if verbosity == 'verbose':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    # nothing is displayed in quiet mode, else messages go to stderr
    if verbosity == 'quiet':
        handler = logging.NullHandler()
    else:
        handler = logging.StreamHandler(sys.stderr)

    handler.setFormatter(logging.Formatter('[%(levelname)s] %(message)s'))
    logger.addHandler(handler)
    return logger
63 | 


--------------------------------------------------------------------------------
/phonemizer/phonemize.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Provides the phonemize function
 16 | 
 17 | To use it in your own code, type:
 18 | 
 19 |     from phonemizer import phonemize
 20 | 
 21 | """
 22 | 
 23 | import os
 24 | import sys
 25 | 
 26 | from phonemizer.backend import BACKENDS
 27 | from phonemizer.logger import get_logger
 28 | from phonemizer.punctuation import Punctuation
 29 | from phonemizer.separator import default_separator
 30 | from phonemizer.utils import list2str, str2list
 31 | 
 32 | 
 33 | def phonemize(  # pylint: disable=too-many-arguments
 34 |         text,
 35 |         language='en-us',
 36 |         backend='espeak',
 37 |         separator=default_separator,
 38 |         strip=False,
 39 |         prepend_text=False,
 40 |         preserve_punctuation=False,
 41 |         punctuation_marks=Punctuation.default_marks(),
 42 |         with_stress=False,
 43 |         tie=False,
 44 |         language_switch='keep-flags',
 45 |         words_mismatch='ignore',
 46 |         njobs=1,
 47 |         logger=get_logger()):
 48 |     """Multilingual text to phonemes converter
 49 | 
 50 |     Return a phonemized version of an input `text`, given its `language` and a
 51 |     phonemization `backend`.
 52 | 
 53 |     Note
 54 |     ----
 55 |     To improve the processing speed it is better to minimize the calls to this
 56 |     function: provide the input text as a list and call phonemize() a single
 57 |     time is much more efficient than calling it on each element of the list.
 58 |     Indeed the initialization of the phonemization backend can be expensive,
 59 |     especially for espeak. In one exemple,
 60 | 
 61 |     Do this:
 62 | 
 63 |     >>> text = [line1, line2, ...]
 64 |     >>> phonemize(text, ...)
 65 | 
 66 |     Not this:
 67 | 
 68 |     >>> for line in text:
 69 |     >>>     phonemize(line, ...)
 70 | 
 71 |     Parameters
 72 |     ----------
 73 |     text (str or list of str): The text to be phonemized. Any empty line will
 74 |       be ignored. If `text` is an str, it can be multiline (lines being
 75 |       separated by \n). If `text` is a list, each element is considered as a
 76 |       separated line. Each line is considered as a text utterance.
 77 | 
 78 |     language (str): The language code of the input text, must be supported by
 79 |       the backend. If `backend` is 'segments', the language can be a file with
 80 |       a grapheme to phoneme mapping.
 81 | 
 82 |     backend (str, optional): The software backend to use for phonemization,
 83 |       must be 'festival' (US English only is supported, coded 'en-us'),
 84 |       'espeak', 'espeak-mbrola' or 'segments'.
 85 | 
 86 |     separator (Separator): string separators between phonemes, syllables and
 87 |       words, default to separator.default_separator. Syllable separator is
 88 |       considered only for the festival backend. Word separator is ignored by
 89 |       the 'espeak-mbrola' backend. Initialize it as follows:
 90 |         >>> from phonemizer.separator import Separator
 91 |         >>> separator = Separator(phone='-', word=' ')
 92 | 
 93 |     strip (bool, optional): If True, don't output the last word and phone
 94 |       separators of a token, default to False.
 95 | 
 96 |     prepend_text (bool, optional): When True, returns a pair (input utterance,
 97 |       phonemized utterance) for each line of the input text. When False,
 98 |       returns only the phonemized utterances. Default to False
 99 | 
100 |     preserve_punctuation (bool, optional): When True, will keep the punctuation
101 |       in the phonemized output. Not supported by the 'espeak-mbrola' backend.
102 |       Default to False and remove all the punctuation.
103 | 
104 |     punctuation_marks (str, optional): The punctuation marks to consider when
105 |       dealing with punctuation, either for removal or preservation. Default to
106 |       Punctuation.default_marks().
107 | 
108 |     with_stress (bool, optional): This option is only valid for the 'espeak'
109 |       backend. When True the stresses on phonemes are present (stresses
110 |       characters are ˈ'ˌ). When False stresses are removed. Default to False.
111 | 
112 |     tie (bool or char, optional): This option is only valid for the 'espeak'
113 |       backend with espeak>=1.49. It is incompatible with phone separator. When
114 |       not False, use a tie character within multi-letter phoneme names. When
115 |       True, the char 'U+361' is used (as in d͡ʒ), 'z' means ZWJ character,
116 |       default to False.
117 | 
118 |     language_switch (str, optional): Espeak can output some words in another
119 |       language (typically English) when phonemizing a text. This option setups
120 |       the policy to use when such a language switch occurs. Three values are
121 |       available: 'keep-flags' (the default), 'remove-flags' or
122 |       'remove-utterance'. The 'keep-flags' policy keeps the language switching
123 |       flags, for example "(en) or (jp)", in the output. The 'remove-flags'
124 |       policy removes them and the 'remove-utterance' policy removes the whole
125 |       line of text including a language switch. This option is only valid for
126 |       the 'espeak' backend.
127 | 
128 |     words_mismatch (str, optional): Espeak can join two consecutive words or
129 |       drop some words, yielding a word count mismatch between orthographic and
130 |       phonemized text. This option setups the policy to use when such a words
131 |       count mismatch occurs. Three values are available: 'ignore' (the default)
132 |       which do nothing, 'warn' which issue a warning for each mismatched line,
133 |       and 'remove' which remove the mismatched lines from the output.
134 | 
135 |     njobs (int): The number of parallel jobs to launch. The input text is split
136 |       in `njobs` parts, phonemized on parallel instances of the backend and the
137 |       outputs are finally collapsed.
138 | 
139 |     logger (logging.Logger): the logging instance where to send messages. If
140 |       not specified, use the default system logger.
141 | 
142 |     Returns
143 |     -------
144 |     phonemized text (str or list of str) : The input `text` phonemized for the
145 |       given `language` and `backend`. The returned value has the same type of
146 |       the input text (either a list or a string), excepted if `prepend_input`
147 |       is True where the output is forced as a list of pairs (input_text,
148 |       phonemized text).
149 | 
150 |     Raises
151 |     ------
152 |     RuntimeError if the `backend` is not valid or is valid but not installed,
153 |       if the `language` is not supported by the `backend`, if any incompatible
154 |       options are used.
155 | 
156 |     """
157 |     # ensure we are using a compatible Python version
158 |     if sys.version_info < (3, 6):  # pragma: nocover
159 |         logger.error(
160 |             'Your are using python-%s which is unsupported by the phonemizer, '
161 |             'please update to python>=3.6', ".".join(sys.version_info))
162 | 
163 |     # ensure the arguments are valid
164 |     _check_arguments(
165 |         backend, with_stress, tie, separator, language_switch, words_mismatch)
166 | 
167 |     # preserve_punctuation and word separator not valid for espeak-mbrola
168 |     if backend == 'espeak-mbrola' and preserve_punctuation:
169 |         logger.warning('espeak-mbrola backend cannot preserve punctuation')
170 |     if backend == 'espeak-mbrola' and separator.word:
171 |         logger.warning('espeak-mbrola backend cannot preserve word separation')
172 | 
173 |     # initialize the phonemization backend
174 |     if backend == 'espeak':
175 |         phonemizer = BACKENDS[backend](
176 |             language,
177 |             punctuation_marks=punctuation_marks,
178 |             preserve_punctuation=preserve_punctuation,
179 |             with_stress=with_stress,
180 |             tie=tie,
181 |             language_switch=language_switch,
182 |             words_mismatch=words_mismatch,
183 |             logger=logger)
184 |     elif backend == 'espeak-mbrola':
185 |         phonemizer = BACKENDS[backend](
186 |             language,
187 |             logger=logger)
188 |     else:  # festival or segments
189 |         phonemizer = BACKENDS[backend](
190 |             language,
191 |             punctuation_marks=punctuation_marks,
192 |             preserve_punctuation=preserve_punctuation,
193 |             logger=logger)
194 | 
195 |     # do the phonemization
196 |     return _phonemize(phonemizer, text, separator, strip, njobs, prepend_text)
197 | 
198 | 
199 | def _check_arguments(  # pylint: disable=too-many-arguments
200 |         backend, with_stress, tie, separator, language_switch, words_mismatch):
201 |     """Auxiliary function to phonemize()
202 | 
203 |     Ensures the parameters are compatible with each other, raises a
204 |     RuntimeError the first encountered error.
205 | 
206 |     """
207 |     # ensure the backend is either espeak, festival or segments
208 |     if backend not in ('espeak', 'espeak-mbrola', 'festival', 'segments'):
209 |         raise RuntimeError(
210 |             '{} is not a supported backend, choose in {}.'
211 |             .format(backend, ', '.join(
212 |                 ('espeak', 'espeak-mbrola', 'festival', 'segments'))))
213 | 
214 |     # with_stress option only valid for espeak
215 |     if with_stress and backend != 'espeak':
216 |         raise RuntimeError(
217 |             'the "with_stress" option is available for espeak backend only, '
218 |             'but you are using {} backend'.format(backend))
219 | 
220 |     # tie option only valid for espeak
221 |     if tie and backend != 'espeak':
222 |         raise RuntimeError(
223 |             'the "tie" option is available for espeak backend only, '
224 |             'but you are using {} backend'.format(backend))
225 | 
226 |     # tie option incompatible with phone separator
227 |     if tie and separator.phone:
228 |         raise RuntimeError(
229 |             'the "tie" option is incompatible with phone separator '
230 |             f'(which is "{separator.phone}")')
231 | 
232 |     # language_switch option only valid for espeak
233 |     if language_switch != 'keep-flags' and backend != 'espeak':
234 |         raise RuntimeError(
235 |             'the "language_switch" option is available for espeak backend '
236 |             'only, but you are using {} backend'.format(backend))
237 | 
238 |     # words_mismatch option only valid for espeak
239 |     if words_mismatch != 'ignore' and backend != 'espeak':
240 |         raise RuntimeError(
241 |             'the "words_mismatch" option is available for espeak backend '
242 |             'only, but you are using {} backend'.format(backend))
243 | 
244 | 
def _phonemize(  # pylint: disable=too-many-arguments
        backend, text, separator, strip, njobs, prepend_text):
    """Runs the `backend` on the `text` and formats the output

    The `text` is converted to a list of lines, empty lines being ignored,
    and phonemized by `backend.phonemize()`. Returns a list of pairs (line,
    phonemized line) if `prepend_text` is True, else a string if `text` is a
    string, else the list of phonemized lines as returned by the backend.

    """
    # the output is converted back to a string only for a string input
    from_string = type(text) == str  # pylint: disable=unidiomatic-typecheck

    # one utterance per line, without line separators nor blank lines
    lines = [line.strip(os.linesep) for line in str2list(text)]
    lines = [line for line in lines if line.strip()]

    phonemized = backend.phonemize(
        lines, separator=separator, strip=strip, njobs=njobs)

    # at that point, the phonemized text is a list of str
    if prepend_text:
        return list(zip(lines, phonemized))
    return list2str(phonemized) if from_string else phonemized
270 | 


--------------------------------------------------------------------------------
/phonemizer/punctuation.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Implementation of punctuation processing"""
 16 | 
 17 | 
 18 | import collections
 19 | import re
 20 | import six
 21 | from phonemizer.utils import str2list
 22 | 
 23 | 
# The punctuation marks considered by default.
_DEFAULT_MARKS = ';:,.!?¡¿—…"«»“”'


# A punctuation mark found in a text, as needed to restore it afterwards:
# `index` is the number of the line where the mark was found, `mark` is the
# matched text (the marks along with their surrounding spaces) and `position`
# locates the mark within the line: 'B' (begin), 'E' (end), 'I' (in the
# middle) or 'A' (alone, the line is made only of punctuation).
_MarkIndex = collections.namedtuple(
    '_mark_index', ['index', 'mark', 'position'])
 30 | 
 31 | 
 32 | class Punctuation:
 33 |     """Preserve or remove the punctuation during phonemization
 34 | 
 35 |     Backends behave differently with punctuation: festival and espeak ignore it
 36 |     and remove it silently whereas segments will raise an error. The
 37 |     Punctuation class solves that issue by "hiding" the punctuation to the
 38 |     phonemization backend and restoring it afterwards.
 39 | 
 40 |     Parameters
 41 |     ----------
 42 |     marks (str) : The list of punctuation marks to considerate for processing
 43 |         (either removal or preservation). Each mark must be made of a single
 44 |         character. Default to Punctuation.default_marks().
 45 | 
 46 |     """
 47 |     def __init__(self, marks=_DEFAULT_MARKS):
 48 |         self._marks = None
 49 |         self._marks_re = None
 50 |         self.marks = marks
 51 | 
 52 |     @staticmethod
    def default_marks():
        """Returns the default punctuation marks as a string

        Those are the marks used when a Punctuation instance is created
        without the `marks` argument.

        """
        return _DEFAULT_MARKS
 56 | 
 57 |     @property
    def marks(self):
        """The punctuation marks as a string

        Each mark is a single character of the string. Duplicated marks are
        removed when the marks are set.

        """
        return self._marks
 61 | 
 62 |     @marks.setter
 63 |     def marks(self, value):
 64 |         if not isinstance(value, six.string_types):
 65 |             raise ValueError('punctuation marks must be defined as a string')
 66 |         self._marks = ''.join(set(value))
 67 | 
 68 |         # catching all the marks in one regular expression: zero or more spaces
 69 |         # + one or more marks + zero or more spaces.
 70 |         self._marks_re = re.compile(fr'(\s*[{re.escape(self._marks)}]+\s*)+')
 71 | 
 72 |     def remove(self, text):
 73 |         """Returns the `text` with all punctuation marks replaced by spaces
 74 | 
 75 |         The input `text` can be a string or a list and is returned with the
 76 |         same type and punctuation removed.
 77 | 
 78 |         """
 79 |         def aux(text):
 80 |             return re.sub(self._marks_re, ' ', text).strip()
 81 | 
 82 |         if isinstance(text, six.string_types):
 83 |             return aux(text)
 84 |         return [aux(line) for line in text]
 85 | 
 86 |     def preserve(self, text):
 87 |         """Removes punctuation from `text`, allowing for furter restoration
 88 | 
 89 |         This method returns the text as a list of punctuated chunks, along with
 90 |         a list of punctuation marks for furter restoration:
 91 | 
 92 |             'hello, my world!' -> ['hello', 'my world'], [',', '!']
 93 | 
 94 |         """
 95 |         text = str2list(text)
 96 |         preserved_text = []
 97 |         preserved_marks = []
 98 | 
 99 |         for num, line in enumerate(text):
100 |             line, marks = self._preserve_line(line, num)
101 |             preserved_text += line
102 |             preserved_marks += marks
103 |         return [line for line in preserved_text if line], preserved_marks
104 | 
105 |     def _preserve_line(self, line, num):
106 |         """Auxiliary method for Punctuation.preserve()"""
107 |         matches = list(re.finditer(self._marks_re, line))
108 |         if not matches:
109 |             return [line], []
110 | 
111 |         # the line is made only of punctuation marks
112 |         if len(matches) == 1 and matches[0].group() == line:
113 |             return [], [_MarkIndex(num, line, 'A')]
114 | 
115 |         # build the list of mark indexes required to restore the punctuation
116 |         marks = []
117 |         for match in matches:
118 |             # find the position of the punctuation mark in the utterance:
119 |             # begin (B), end (E), in the middle (I) or alone (A)
120 |             position = 'I'
121 |             if match == matches[0] and line.startswith(match.group()):
122 |                 position = 'B'
123 |             elif match == matches[-1] and line.endswith(match.group()):
124 |                 position = 'E'
125 |             marks.append(_MarkIndex(num, match.group(), position))
126 | 
127 |         # split the line into sublines, each separated by a punctuation mark
128 |         preserved_line = []
129 |         for mark in marks:
130 |             split = line.split(mark.mark)
131 |             prefix, suffix = split[0], mark.mark.join(split[1:])
132 |             preserved_line.append(prefix)
133 |             line = suffix
134 | 
135 |         # append any trailing text to the preserved line
136 |         return preserved_line + [line], marks
137 | 
138 |     @classmethod
139 |     def restore(cls, text, marks):
140 |         """Restore punctuation in a text.
141 | 
142 |         This is the reverse operation of Punctuation.preserve(). It takes a
143 |         list of punctuated chunks and a list of punctuation marks. It returns a
144 |         a punctuated text as a list:
145 | 
146 |             ['hello', 'my world'], [',', '!'] -> ['hello, my world!']
147 | 
148 |         """
149 |         return cls._restore_aux(str2list(text), marks, 0)
150 | 
    @classmethod
    def _restore_current(cls, current, text, marks, num):
        """Auxiliary method for Punctuation._restore_aux()

        Inserts the mark `current` into the first chunk(s) of `text`
        according to `current.position`, then recurses through
        _restore_aux() on the remaining marks `marks[1:]`.

        """
        if current.position == 'B':
            # prepend the mark to the first chunk, `num` is unchanged
            return cls._restore_aux(
                [current.mark + text[0]] + text[1:], marks[1:], num)

        if current.position == 'E':
            # the mark closes the first chunk, which is emitted as is;
            # continue with the next chunks and `num + 1`
            return [text[0] + current.mark] + cls._restore_aux(
                text[1:], marks[1:], num + 1)

        if current.position == 'A':
            # the mark is emitted alone, no chunk of `text` is consumed
            return [current.mark] + cls._restore_aux(text, marks[1:], num + 1)

        # position == 'I'
        if len(text) == 1:  # pragma: nocover
            # a corner case where the final part of an intermediate
            # mark (I) has not been phonemized
            return cls._restore_aux([text[0] + current.mark], marks[1:], num)

        # glue the two first chunks together around the mark
        return cls._restore_aux(
            [text[0] + current.mark + text[1]] + text[2:], marks[1:], num)
173 | 
174 |     @classmethod
175 |     def _restore_aux(cls, text, marks, num):
176 |         """Auxiliary method for Punctuation.restore()"""
177 |         if not marks:
178 |             return text
179 | 
180 |         # nothing have been phonemized, returns the marks alone
181 |         if not text:
182 |             return [''.join(m.mark for m in marks)]
183 | 
184 |         current = marks[0]
185 |         if current.index == num:  # place the current mark here
186 |             return cls._restore_current(current, text, marks, num)
187 | 
188 |         return [text[0]] + cls._restore_aux(text[1:], marks, num + 1)
189 | 


--------------------------------------------------------------------------------
/phonemizer/separator.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Provides the Separator tuple and its default value"""
 16 | 
 17 | 
 18 | class Separator:
 19 |     """Defines phone, syllable and word boundary tokens"""
 20 |     def __init__(self, word=' ', syllable=None, phone=None):
 21 |         # check we have different separators, None excluded
 22 |         sep1 = list(sep for sep in (phone, syllable, word) if sep)
 23 |         sep2 = set(sep for sep in (phone, syllable, word) if sep)
 24 |         if len(sep1) != len(sep2):
 25 |             raise ValueError(
 26 |                 'illegal separator with word="{}", syllable="{}" and '
 27 |                 'phone="{}", must be all differents if not empty'
 28 |                 .format(phone, syllable, word))
 29 | 
 30 |         self._phone = str(phone) if phone else ''
 31 |         self._syllable = str(syllable) if syllable else ''
 32 |         self._word = str(word) if word else ''
 33 | 
 34 |     def __eq__(self, other):
 35 |         return (
 36 |             self.phone == other.phone
 37 |             and self.syllable == other.syllable
 38 |             and self.word == other.word)
 39 | 
 40 |     def __str__(self):
 41 |         return (
 42 |             f'(phone: "{self.phone}", '
 43 |             f'syllable: "{self.syllable}", '
 44 |             f'word: "{self.word}")')
 45 | 
 46 |     @property
 47 |     def phone(self):
 48 |         """Phones separator"""
 49 |         return self._phone
 50 | 
 51 |     @property
 52 |     def syllable(self):
 53 |         """Syllables separator"""
 54 |         return self._syllable
 55 | 
 56 |     @property
 57 |     def word(self):
 58 |         """Words separator"""
 59 |         return self._word
 60 | 
 61 |     def __contains__(self, value):
 62 |         """Returns True if the separator has `value` as token separation"""
 63 |         return value in (self.phone, self.syllable, self.word)
 64 | 
 65 |     def input_output_separator(self, field_separator):
 66 |         """Returns a suitable input/output separator based on token separator
 67 | 
 68 |         The input/output separator split orthographic and phonetic texts when
 69 |         using the --prepend-text option from command-line.
 70 | 
 71 |         Parameters
 72 |         ----------
 73 | 
 74 |         field_separator (bool or str): If str, ensures it's value is not
 75 |           already defined as a token separator. If True choose one of "|",
 76 |           "||", "|||", "||||" (the first one that is not defined as a token
 77 |           separator)
 78 | 
 79 |         Returns
 80 |         -------
 81 |         The input/output separator, or False if `field_separator` is False
 82 | 
 83 |         Raises
 84 |         ------
 85 |         RuntimeError if `field_separator` is a str but is already registered as
 86 |           token separator
 87 | 
 88 |         """
 89 |         if not field_separator:
 90 |             return False
 91 | 
 92 |         if isinstance(field_separator, str):
 93 |             if field_separator in self:
 94 |                 raise RuntimeError(
 95 |                     f'cannot prepend input with "{field_separator}" because '
 96 |                     f'it is already a token separator: {self}')
 97 |             return field_separator
 98 | 
 99 |         if field_separator is True:
100 |             field_separator = '|'
101 |             while field_separator in self:
102 |                 field_separator += '|'
103 |             return field_separator
104 | 
105 |         # not a bool nor a str
106 |         raise RuntimeError(
107 |             'invalid input/output separator, must be bool or str but is'
108 |             f'{field_separator}')
109 | 
110 | 
# module-level instance built with empty phone and syllable tokens and a
# single space as word token
default_separator = Separator(phone='', syllable='', word=' ')
"""The default separation characters for phonemes, syllables and words"""
113 | 


--------------------------------------------------------------------------------
/phonemizer/share/festival/phonemize.scm:
--------------------------------------------------------------------------------
 1 | ;; Copyright 2015-2021 Mathieu Bernard
 2 | ;;
 3 | ;; This file is part of phonemizer: you can redistribute it and/or
 4 | ;; modify it under the terms of the GNU General Public License as
 5 | ;; published by the Free Software Foundation, either version 3 of the
 6 | ;; License, or (at your option) any later version.
 7 | ;;
 8 | ;; Phonemizer is distributed in the hope that it will be useful, but
 9 | ;; WITHOUT ANY WARRANTY; without even the implied warranty of
10 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | ;; General Public License for more details.
12 | ;;
13 | ;; You should have received a copy of the GNU General Public License
14 | ;; along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | 
16 | ;; This script is executed by festival for English text phonemization.
(define (phonemize line)
  "(phonemize LINE)
Extract the phonemes of the string LINE as a tree and write it to stdout."
  ;; build a Text utterance from LINE; `utterance` is assigned with set!
  ;; and not bound locally
  (set! utterance (eval (list 'Utterance 'Text line)))
  ;; synthesize the utterance before reading its SylStructure relation
  (utt.synth utterance)
  ;; Use of print instead of pprintf to have each utterance on one line
  (print (utt.relation_tree utterance "SylStructure")))
24 | 
;; The "{}" placeholder below has to be replaced by the name of the text
;; file you want to read data from. To be parsed by festival as a unique
;; utterance, each line of that file must begin and end with
;; double-quotes.
(set! lines (load "{}" t))
;; phonemize each loaded line in turn
(mapcar (lambda (line) (phonemize line)) lines)
31 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/chintang.g2p:
--------------------------------------------------------------------------------
 1 | a ʌ
 2 | â aː
 3 | b b
 4 | ch tʃ
 5 | d d
 6 | e eː
 7 | f f
 8 | g g
 9 | h h
10 | i ɪ
11 | î iː
12 | j dʒ
13 | k k
14 | kw kʷ
15 | l l
16 | m m
17 | n n
18 | o ʊ
19 | p p
20 | s s
21 | sh ʃ
22 | t t
23 | th θ
24 | u ʊ
25 | û o
26 | w w
27 | y j
28 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/cree.g2p:
--------------------------------------------------------------------------------
 1 | a	ʌ
 2 | â	aː
 3 | b	b
 4 | ch	tʃ
 5 | d	d
 6 | e	eː
 7 | f	f
 8 | g	g
 9 | h	h
10 | i	ɪ
11 | î	iː
12 | j	dʒ
13 | k	k
14 | kw	kʷ
15 | l	l
16 | m	m
17 | n	n
18 | o	ʊ
19 | p	p
20 | s	s
21 | sh	ʃ
22 | t	t
23 | th	θ
24 | u	ʊ
25 | û	o
26 | w	w
27 | y	j
28 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/inuktitut.g2p:
--------------------------------------------------------------------------------
 1 | a	a
 2 | g	g
 3 | h	h
 4 | i	i
 5 | j	j
 6 | k	k
 7 | l	l
 8 | ll	ɬ
 9 | m	m
10 | n	n
11 | ng	ŋ
12 | nng	ŋŋ
13 | p	p
14 | q	q
15 | r	ʁ
16 | rng	ɴ
17 | s	s
18 | t	t
19 | u	u
20 | v	v
21 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/japanese.g2p:
--------------------------------------------------------------------------------
 1 | a	a
 2 | aa	aː
 3 | b	b
 4 | by	bʲ
 5 | ch	tʃ
 6 | d	d
 7 | e	e
 8 | ee	eː
 9 | f	ɸ
10 | g	g
11 | gy	gʲ
12 | h	h
13 | hy	ç
14 | i	i
15 | j	dʒ
16 | k	k
17 | ky	kʲ
18 | m	m
19 | my	mʲ
20 | n	n
21 | ny	ɲ
22 | o	o
23 | oo	oː
24 | p	p
25 | py	pʲ
26 | r	r
27 | ry	rʲ
28 | sh	ʃ
29 | t	t
30 | ts	t͡s
31 | u	ɯ
32 | uu	ɯː
33 | w	w
34 | y	j
35 | z	z
36 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/sesotho.g2p:
--------------------------------------------------------------------------------
 1 | a	a
 2 | b	b
 3 | ch	tʃʰ
 4 | d	d
 5 | e	e
 6 | f	f
 7 | g	χ
 8 | h	h
 9 | hl	ɬ
10 | i	i
11 | j	dʒ
12 | k	k
13 | kg	kx
14 | kh	kʰ
15 | l	l
16 | m	m
17 | n	n
18 | ng	ŋ
19 | nq	ǃ̃
20 | ny	ɲ
21 | o	o
22 | p	t
23 | ph	pʰ
24 | q	ǃ
25 | qh	ǃʰ
26 | r	r
27 | s	s
28 | sh	ʃ
29 | t	t
30 | th	tʰ
31 | tj	tʃ
32 | tl	tɬ
33 | tlh	tɬʰ
34 | ts	t͡s
35 | tsh	t͡sʰ
36 | u	u
37 | w	w
38 | y	j
39 | 


--------------------------------------------------------------------------------
/phonemizer/share/segments/yucatec.g2p:
--------------------------------------------------------------------------------
 1 | a	a
 2 | aa	aː
 3 | aʼ	a̰
 4 | aʼa	a̰ː
 5 | b	b
 6 | ch	t̠͡ʃ
 7 | chʼ	t̠͡ʃʼ
 8 | e	e
 9 | ee	eː
10 | eʼ	ḛ
11 | eʼe	ḛː
12 | f	f
13 | h	h
14 | i	i
15 | ii	iː
16 | iʼ	ḭ
17 | iʼi	ḭː
18 | j	x
19 | k	k
20 | kʼ	kʼ
21 | l	l
22 | m	m
23 | n	n
24 | ñ	n
25 | o	o
26 | oo	oː
27 | oʼ	o̰
28 | oʼo	o̰ː
29 | p	pʼ
30 | pʼ	pʼ
31 | qu	k
32 | r	r
33 | s	s
34 | x	ʃ
35 | t	t
36 | ts	t͡s
37 | tsʼ	t͡sʼ
38 | tʼ	tʼ
39 | u	u
40 | uu	uː
41 | uʼ	ṵ
42 | uʼu	ṵː
43 | w	w
44 | y	j
45 | z	s
46 | 


--------------------------------------------------------------------------------
/phonemizer/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Provides utility functions for the phonemizer"""
 16 | 
 17 | import os
 18 | import pathlib
 19 | 
 20 | import pkg_resources
 21 | 
 22 | 
 23 | def cumsum(iterable):
 24 |     """Returns the cumulative sum of the `iterable` as a list"""
 25 |     res = []
 26 |     cumulative = 0
 27 |     for value in iterable:
 28 |         cumulative += value
 29 |         res.append(cumulative)
 30 |     return res
 31 | 
 32 | 
 33 | def str2list(text):
 34 |     """Returns the string `text` as a list of lines, split by \n"""
 35 |     if isinstance(text, str):
 36 |         return text.strip(os.linesep).split(os.linesep)
 37 |     return text
 38 | 
 39 | 
 40 | def list2str(text):
 41 |     """Returns the list of lines `text` as a single string separated by \n"""
 42 |     if isinstance(text, str):
 43 |         return text
 44 |     return os.linesep.join(text)
 45 | 
 46 | 
 47 | def chunks(text, num):
 48 |     """Return a maximum of `num` equally sized chunks of a `text`
 49 | 
 50 |     This method is usefull when phonemizing a single text on multiple jobs.
 51 | 
 52 |     The exact number of chunks returned is `m = min(num, len(str2list(text)))`.
 53 |     Only the m-1 first chunks have equal size. The last chunk can be longer.
 54 |     The input `text` can be a list or a string. Return a list of `m` strings.
 55 | 
 56 |     Parameters
 57 |     ----------
 58 |     text (str or list) : The text to divide in chunks
 59 | 
 60 |     num (int) : The number of chunks to build, must be a strictly positive
 61 |     integer.
 62 | 
 63 |     Returns
 64 |     -------
 65 |     chunks (list of list of str) : The chunked text with utterances separated
 66 |         by '\n'.
 67 | 
 68 |     offsets (list of int) : offset used below to recover the line numbers in
 69 |         the input text wrt the chunks
 70 | 
 71 |     """
 72 |     text = str2list(text)
 73 |     size = int(max(1, len(text) / num))
 74 |     nchunks = min(num, len(text))
 75 | 
 76 |     text_chunks = [
 77 |         text[i*size:(i+1)*size] for i in range(nchunks - 1)]
 78 | 
 79 |     last = text[(nchunks - 1)*size:]
 80 |     if last:
 81 |         text_chunks.append(last)
 82 | 
 83 |     offsets = [0] + cumsum((len(c) for c in text_chunks[:-1]))
 84 |     return text_chunks, offsets
 85 | 
 86 | 
 87 | def get_package_resource(path):
 88 |     """Returns the absolute path to a phonemizer resource file or directory
 89 | 
 90 |     The packages resource are stored within the source tree in the
 91 |     'phonemizer/share' directory and, once the package is installed, are moved
 92 |     to another system directory (e.g. /share/phonemizer).
 93 | 
 94 |     Parameters
 95 |     ----------
 96 |     path (str) : the file or directory to get, must be relative to
 97 |         'phonemizer/share'.
 98 | 
 99 |     Raises
100 |     ------
101 |     ValueError if the required `path` is not found
102 | 
103 |     Returns
104 |     -------
105 |     The absolute path to the required resource as a `pathlib.Path`
106 | 
107 |     """
108 |     path = pathlib.Path(
109 |         pkg_resources.resource_filename(
110 |             pkg_resources.Requirement.parse('phonemizer'),
111 |             f'phonemizer/share/{path}'))
112 | 
113 |     if not path.exists():  # pragma: nocover
114 |         raise ValueError(f'the requested resource does not exist: {path}')
115 | 
116 |     return path.resolve()
117 | 
118 | 
def version_as_tuple(version):
    """Converts a version string to a tuple of integers

    Any '-dev' found in the version string is dropped before the conversion.
    For instance '1.2.3' gives (1, 2, 3) and '0.2-dev' gives (0, 2).

    """
    numbers = version.replace('-dev', '').split('.')
    return tuple(map(int, numbers))
127 | 


--------------------------------------------------------------------------------
/phonemizer/version.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Mathieu Bernard
 2 | #
 3 | # This file is part of phonemizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonemizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Phonemizer version description"""
16 | 
17 | import pkg_resources
18 | 
19 | from phonemizer.backend import (
20 |     EspeakBackend, EspeakMbrolaBackend, FestivalBackend, SegmentsBackend)
21 | 
22 | 
23 | def _version_as_str(vers):
24 |     """From (1, 49, 3) to '1.49.3'"""
25 |     return '.'.join(str(v) for v in vers)
26 | 
27 | 
def version():
    """Return version information for front and backends

    The returned string starts with the phonemizer version, followed by one
    line listing the available backends and one line listing the
    unavailable ones (each line being omitted when its list is empty).

    """
    # version of the phonemizer
    report = (
        'phonemizer-' + pkg_resources.get_distribution('phonemizer').version)

    # for each backend, check if it is available or not. If so get its version
    found, missing = [], []

    if EspeakBackend.is_available():
        flavor = 'ng-' if EspeakBackend.is_espeak_ng() else ''
        found.append(
            'espeak-' + flavor + _version_as_str(EspeakBackend.version()))
    else:  # pragma: nocover
        missing.append('espeak')

    # no version number is reported for the mbrola backend
    if EspeakMbrolaBackend.is_available():
        found.append('espeak-mbrola')
    else:  # pragma: nocover
        missing.append('espeak-mbrola')

    # festival and segments follow the same "<name>-<version>" pattern
    for name, backend in (
            ('festival', FestivalBackend), ('segments', SegmentsBackend)):
        if backend.is_available():
            found.append(name + '-' + _version_as_str(backend.version()))
        else:  # pragma: nocover
            missing.append(name)

    # resumes the backends status in the final version string
    if found:
        report += '\navailable backends: ' + ', '.join(found)
    if missing:  # pragma: nocover
        report += '\nuninstalled backends: ' + ', '.join(missing)

    return report
69 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
# pytest options: tests are collected from the test directory
[tool:pytest]
minversion = 5.0
testpaths = test
python_files = test/*.py
filterwarnings =
    # ignore some deprecation warnings (on regexp escape sequence) for segments
    # module and its dependencies
    ignore::DeprecationWarning:.*segments.*
    ignore::DeprecationWarning:.*csvw.*
    ignore::DeprecationWarning:.*clldutils.*

# source and output directories of the sphinx documentation
[build_sphinx]
source-dir = doc/source
build-dir = doc/build

# lines matching one of those patterns are excluded from the coverage report
[coverage:report]
exclude_lines =
    pragma: nocover
    @abc.abstractmethod

# NOTE(review): dash-separated key, recent setuptools releases may prefer
# description_file -- confirm against the targeted setuptools version
[metadata]
description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright 2015-2021 Mathieu Bernard
 4 | #
 5 | # This file is part of phonemizer: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as
 7 | # published by the Free Software Foundation, either version 3 of the
 8 | # License, or (at your option) any later version.
 9 | #
10 | # Phonemizer is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
17 | """Setup script for the phonemizer package"""
18 | 
19 | import builtins
20 | import codecs
21 | import setuptools
22 | 
23 | 
24 | # This is a bit hackish: we are setting a global variable so that the main
25 | # phonemizer __init__ can detect if it is being loaded by the setup routine, to
26 | # avoid attempting to load components that aren't built yet.
27 | builtins.__PHONEMIZER_SETUP__ = True
28 | 
29 | 
30 | import phonemizer
31 | 
32 | 
# read the long description through a context manager so the README file
# handle is closed as soon as its content is loaded
with codecs.open('README.md', encoding='utf-8') as readme:
    LONG_DESCRIPTION = readme.read()


setuptools.setup(
    # general description
    name='phonemizer',
    description=' Simple text to phones converter for multiple languages',
    version=phonemizer.__version__,

    # python package dependencies
    install_requires=['joblib', 'segments', 'attrs>=18.1', 'dlinfo'],

    # include Python code and any files in phonemizer/share
    packages=setuptools.find_packages(),
    package_data={
        'phonemizer': [
            'share/espeak/*', 'share/festival/*', 'share/segments/*']},

    # define the command-line script to use
    entry_points={'console_scripts': ['phonemize = phonemizer.main:main']},

    # metadata for upload to PyPI
    author='Mathieu Bernard',
    author_email='mathieu.a.bernard@inria.fr',
    license='GPL3',
    keywords='linguistics G2P phone festival espeak TTS',
    url='https://github.com/bootphon/phonemizer',
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: "
        "GNU General Public License v3 or later (GPLv3+)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    zip_safe=True,
)
68 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/resemble-ai/phonemizer/389c1716ce856369b8dffe5eced6579768feb9a6/test/__init__.py


--------------------------------------------------------------------------------
/test/test_espeak.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the espeak backend"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | # pylint: disable=redefined-outer-name
 19 | 
 20 | import os
 21 | import shutil
 22 | import pytest
 23 | 
 24 | from phonemizer.backend import EspeakBackend
 25 | from phonemizer.backend.espeak.wrapper import EspeakWrapper
 26 | from phonemizer.separator import Separator, default_separator
 27 | 
 28 | 
 29 | def test_english():
 30 |     backend = EspeakBackend('en-us')
 31 |     text = ['hello world', 'goodbye', 'third line', 'yet another']
 32 |     out = backend.phonemize(text, default_separator, True)
 33 |     assert out == ['həloʊ wɜːld', 'ɡʊdbaɪ', 'θɜːd laɪn', 'jɛt ɐnʌðɚ']
 34 | 
 35 | 
 36 | def test_stress():
 37 |     backend = EspeakBackend('en-us', with_stress=False)
 38 |     assert backend.phonemize(
 39 |         ['hello world'], default_separator, True) == ['həloʊ wɜːld']
 40 | 
 41 |     backend = EspeakBackend('en-us', with_stress=True)
 42 |     assert backend.phonemize(
 43 |         ['hello world'], default_separator, True) == ['həlˈoʊ wˈɜːld']
 44 | 
 45 | 
 46 | def test_french():
 47 |     backend = EspeakBackend('fr-fr')
 48 |     text = ['bonjour le monde']
 49 |     sep = Separator(word=';eword ', syllable=None, phone=' ')
 50 |     expected = ['b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
 51 |     out = backend.phonemize(text, sep, False)
 52 |     assert out == expected
 53 | 
 54 | 
 55 | @pytest.mark.skipif(
 56 |     (
 57 |         not EspeakBackend.is_espeak_ng() or
 58 |         # Arabic is not supported by the Windows msi installer from espeak-ng
 59 |         # github release
 60 |         not EspeakBackend.is_supported_language('ar')),
 61 |     reason='Arabic is not supported')
 62 | def test_arabic():
 63 |     backend = EspeakBackend('ar')
 64 |     text = ['السلام عليكم']
 65 |     sep = Separator()
 66 | 
 67 |     # Arabic seems to have changed starting at espeak-ng-1.49.3
 68 |     if EspeakBackend.version() >= (1, 49, 3):
 69 |         expected = ['ʔassalaːm ʕliːkm ']
 70 |     else:
 71 |         expected = ['ʔassalaam ʕaliijkum ']
 72 |     out = backend.phonemize(text, sep, False)
 73 |     assert out == expected
 74 | 
 75 | 
 76 | # see https://github.com/bootphon/phonemizer/issues/31
 77 | def test_phone_separator_simple():
 78 |     text = ['The lion and the tiger ran']
 79 |     sep = Separator(phone='_')
 80 |     backend = EspeakBackend('en-us')
 81 | 
 82 |     output = backend.phonemize(text, separator=sep, strip=True)
 83 |     expected = ['ð_ə l_aɪə_n æ_n_d ð_ə t_aɪ_ɡ_ɚ ɹ_æ_n']
 84 |     assert expected == output
 85 | 
 86 |     output = backend.phonemize(text, separator=sep, strip=False)
 87 |     expected = ['ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ ']
 88 |     assert expected == output
 89 | 
 90 | 
 91 | @pytest.mark.parametrize(
 92 |     'text, expected',
 93 |     (('the hello but the', 'ð_ə h_ə_l_oʊ b_ʌ_t ð_ə'),
 94 |      # ('Here there and everywhere', 'h_ɪɹ ð_ɛɹ æ_n_d ɛ_v_ɹ_ɪ_w_ɛɹ'),
 95 |      # ('He was hungry and tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i æ_n_d t_aɪɚ_d'),
 96 |      ('He was hungry but tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i b_ʌ_t t_aɪɚ_d')))
 97 | def test_phone_separator(text, expected):
 98 |     sep = Separator(phone='_')
 99 |     backend = EspeakBackend('en-us')
100 |     output = backend.phonemize([text], separator=sep, strip=True)[0]
101 |     assert output == expected
102 | 
103 | 
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_good():
    # remember the library in use before touching the configuration
    espeak = EspeakBackend.library()
    try:
        # resetting to None must give back the very same library
        EspeakBackend.set_library(None)
        assert espeak == EspeakBackend.library()

        # explicitly set the library to the path reported by the wrapper
        library = EspeakWrapper().library_path
        EspeakBackend.set_library(library)

        # the backend must still work with that explicit path
        test_english()

    # restore the espeak path to default
    finally:
        EspeakBackend.set_library(None)
121 | 
122 | 
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_bad():
    import sys  # pylint: disable=import-outside-toplevel

    try:
        # corrupt the default espeak path, try to use python executable
        # instead. When no `python` command is on the PATH shutil.which()
        # returns None, and set_library(None) would restore the default
        # library instead of corrupting it: fall back to the running
        # interpreter in that case.
        binary = shutil.which('python') or sys.executable
        EspeakBackend.set_library(binary)

        with pytest.raises(RuntimeError):
            EspeakBackend('en-us')
        with pytest.raises(RuntimeError):
            EspeakBackend.version()

        # a file which is not a binary at all must be rejected as well
        EspeakBackend.set_library(__file__)
        with pytest.raises(RuntimeError):
            EspeakBackend('en-us')

    # restore the espeak path to default
    finally:
        EspeakBackend.set_library(None)
144 | 
145 | 
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_venv():
    import sys  # pylint: disable=import-outside-toplevel

    try:
        # point the environment variable to the python executable. When no
        # `python` command is on the PATH shutil.which() returns None, which
        # cannot be stored in os.environ: fall back to the running
        # interpreter in that case.
        os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = (
            shutil.which('python') or sys.executable)
        with pytest.raises(RuntimeError):
            EspeakBackend('en-us').phonemize(['hello'])
        with pytest.raises(RuntimeError):
            EspeakBackend.version()

        # a file which is not a binary at all must be rejected as well
        os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = __file__
        with pytest.raises(RuntimeError):
            EspeakBackend.version()

    finally:
        # leave the environment as it was found (the test is skipped when
        # the variable is already defined)
        try:
            del os.environ['PHONEMIZER_ESPEAK_LIBRARY']
        except KeyError:
            pass
167 | 
168 | 
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
@pytest.mark.parametrize(
    'tie, expected', [
        (False, 'dʒ_æ_k_i_ tʃ_æ_n_ '),
        (True, 'd͡ʒæki t͡ʃæn '),
        ('8', 'd8ʒæki t8ʃæn ')])
def test_tie_simple(caplog, tie, expected):
    backend = EspeakBackend('en-us', tie=tie)
    separator = Separator(word=' ', phone='_')
    phonemized = backend.phonemize(['Jackie Chan'], separator=separator)
    assert phonemized[0] == expected

    if not tie:
        return

    # with a tie the phone separator is ignored and a message is logged
    logged = [record[2] for record in caplog.record_tuples]
    assert (
        'cannot use ties AND phone separation, ignoring phone separator'
        in logged)
188 | 
189 | 
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
def test_tie_utf8():
    # NOTE this is a bug in espeak to append ties on (en) language switch
    # flags. For now phonemizer does not fix it.
    backend = EspeakBackend('fr-fr', tie=True)

    # a French word alone
    # used to be 'bɔ̃͡ʒuʁ '
    assert backend.phonemize(['bonjour']) == ['bɔ̃ʒuʁ ']

    # a sentence ending with an English word, the language switch flags
    # carry ties in the expected output
    # used to be 'ty ɛm lə (͡e͡n͡)fʊtbɔ͡ːl(͡f͡r͡)'
    assert backend.phonemize(
        ['tu aimes le football']) == ['ty ɛm lə (͡e͡n)fʊtbɔːl(͡f͡r) ']

    assert backend.phonemize(
        ['bonjour apple']) == ['bɔ̃ʒuʁ (͡e͡n)apə͡l(͡f͡r) ']
207 | 
208 | 
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
def test_tie_bad():
    # building a backend with the tie 'abc' must raise a RuntimeError
    with pytest.raises(RuntimeError):
        EspeakBackend('en-us', tie='abc')
215 | 


--------------------------------------------------------------------------------
/test/test_espeak_lang_switch.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the espeak backend language switch processing"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | # pylint: disable=redefined-outer-name
 19 | 
 20 | import pytest
 21 | 
 22 | from phonemizer.backend import EspeakBackend
 23 | from phonemizer.separator import Separator
 24 | 
 25 | 
 26 | @pytest.fixture
 27 | def langswitch_text():
 28 |     return [
 29 |         "j'aime l'anglais",
 30 |         "j'aime le football",
 31 |         "football",
 32 |         "surtout le real madrid",
 33 |         "n'utilise pas google"]
 34 | 
 35 | 
 36 | @pytest.mark.skipif(
 37 |     not EspeakBackend.is_espeak_ng(),
 38 |     reason='language switch only exists for espeak-ng')
 39 | @pytest.mark.parametrize('njobs', [1, 3])
 40 | def test_language_switch_keep_flags(caplog, langswitch_text, njobs):
 41 |     backend = EspeakBackend('fr-fr', language_switch='keep-flags')
 42 |     out = backend.phonemize(
 43 |         langswitch_text, separator=Separator(), strip=True, njobs=njobs)
 44 |     assert out == [
 45 |         'ʒɛm lɑ̃ɡlɛ',
 46 |         'ʒɛm lə (en)fʊtbɔːl(fr)',
 47 |         '(en)fʊtbɔːl(fr)',
 48 |         'syʁtu lə (en)ɹiəl(fr) madʁid',
 49 |         'nytiliz pa (en)ɡuːɡəl(fr)']
 50 | 
 51 |     messages = [msg[2] for msg in caplog.record_tuples]
 52 |     assert (
 53 |         '4 utterances containing language switches on lines 2, 3, 4, 5'
 54 |         in messages)
 55 |     assert (
 56 |         'language switch flags have been kept (applying "keep-flags" policy)'
 57 |         in messages)
 58 | 
 59 | 
 60 | @pytest.mark.skipif(
 61 |     not EspeakBackend.is_espeak_ng(),
 62 |     reason='language switch only exists for espeak-ng')
 63 | @pytest.mark.parametrize('njobs', [1, 3])
 64 | def test_language_switch_default(caplog, langswitch_text, njobs):
 65 |     # default behavior is to keep the flags
 66 |     backend = EspeakBackend('fr-fr')
 67 |     out = backend.phonemize(
 68 |         langswitch_text, separator=Separator(), strip=True, njobs=njobs)
 69 |     assert out == [
 70 |         'ʒɛm lɑ̃ɡlɛ',
 71 |         'ʒɛm lə (en)fʊtbɔːl(fr)',
 72 |         '(en)fʊtbɔːl(fr)',
 73 |         'syʁtu lə (en)ɹiəl(fr) madʁid',
 74 |         'nytiliz pa (en)ɡuːɡəl(fr)']
 75 | 
 76 |     messages = [msg[2] for msg in caplog.record_tuples]
 77 |     assert (
 78 |         '4 utterances containing language switches on lines 2, 3, 4, 5'
 79 |         in messages)
 80 |     assert (
 81 |         'language switch flags have been kept (applying "keep-flags" policy)'
 82 |         in messages)
 83 | 
 84 | 
 85 | @pytest.mark.skipif(
 86 |     not EspeakBackend.is_espeak_ng(),
 87 |     reason='language switch only exists for espeak-ng')
 88 | @pytest.mark.parametrize('njobs', [1, 3])
 89 | def test_language_switch_remove_flags(caplog, langswitch_text, njobs):
 90 |     backend = EspeakBackend('fr-fr', language_switch='remove-flags')
 91 |     out = backend.phonemize(
 92 |         langswitch_text, separator=Separator(), strip=True, njobs=njobs)
 93 |     assert out == [
 94 |         'ʒɛm lɑ̃ɡlɛ',
 95 |         'ʒɛm lə fʊtbɔːl',
 96 |         'fʊtbɔːl',
 97 |         'syʁtu lə ɹiəl madʁid',
 98 |         'nytiliz pa ɡuːɡəl']
 99 | 
100 |     messages = [msg[2] for msg in caplog.record_tuples]
101 |     assert (
102 |         '4 utterances containing language switches on lines 2, 3, 4, 5'
103 |         in messages)
104 |     assert (
105 |         'language switch flags have been removed '
106 |         '(applying "remove-flags" policy)'
107 |         in messages)
108 | 
109 | 
110 | @pytest.mark.skipif(
111 |     not EspeakBackend.is_espeak_ng(),
112 |     reason='language switch only exists for espeak-ng')
113 | @pytest.mark.parametrize('njobs', [1, 3])
114 | def test_language_switch_remove_utterance(caplog, langswitch_text, njobs):
115 |     backend = EspeakBackend('fr-fr', language_switch='remove-utterance')
116 |     out = backend.phonemize(
117 |         langswitch_text, separator=Separator(), strip=True, njobs=njobs)
118 |     assert out == ['ʒɛm lɑ̃ɡlɛ', '', '', '', '']
119 | 
120 |     messages = [msg[2] for msg in caplog.record_tuples]
121 |     assert (
122 |         'removed 4 utterances containing language switches '
123 |         '(applying "remove-utterance" policy)'
124 |         in messages)
125 | 
126 |     with pytest.raises(RuntimeError):
127 |         backend = EspeakBackend('fr-fr', language_switch='foo')
128 | 
129 | 
130 | @pytest.mark.skipif(
131 |     not EspeakBackend.is_espeak_ng(),
132 |     reason='language switch only exists for espeak-ng')
133 | @pytest.mark.parametrize(
134 |     'policy', ('keep-flags', 'remove-flags', 'remove-utterance'))
135 | def test_no_switch(policy, caplog):
136 |     text = ["j'aime l'anglais", "tu parles le français"]
137 |     backend = EspeakBackend('fr-fr', language_switch=policy)
138 |     out = backend.phonemize(text, separator=Separator(), strip=True)
139 |     assert out == ['ʒɛm lɑ̃ɡlɛ', 'ty paʁl lə fʁɑ̃sɛ']
140 | 
141 |     messages = [msg[2] for msg in caplog.record_tuples]
142 |     assert not messages
143 | 


--------------------------------------------------------------------------------
/test/test_espeak_wrapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the EspeakWrapper class"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | # pylint: disable=redefined-outer-name
 19 | 
 20 | import os
 21 | import pathlib
 22 | import pickle
 23 | import sys
 24 | 
 25 | import pytest
 26 | 
 27 | from phonemizer.backend.espeak.wrapper import EspeakWrapper
 28 | from phonemizer.backend import EspeakMbrolaBackend
 29 | 
 30 | 
 31 | @pytest.fixture
 32 | def wrapper():
 33 |     return EspeakWrapper()
 34 | 
 35 | 
 36 | def test_basic(wrapper):
 37 |     assert wrapper.version >= (1, 48)
 38 |     assert 'espeak' in str(wrapper.library_path)
 39 |     assert os.path.isabs(wrapper.library_path)
 40 |     assert os.path.isabs(wrapper.data_path)  # not None, no raise
 41 | 
 42 | 
 43 | def test_available_voices(wrapper):
 44 |     espeak = set(wrapper.available_voices())
 45 |     assert espeak
 46 | 
 47 |     mbrola = set(wrapper.available_voices('mbrola'))
 48 |     # can be empty if no mbrola voice installed (occurs only on Windows, at
 49 |     # least within the github CI pipeline)
 50 |     if mbrola:
 51 |         assert not espeak.intersection(mbrola)
 52 | 
 53 | 
 54 | def test_set_get_voice(wrapper):
 55 |     assert wrapper.voice is None
 56 |     with pytest.raises(RuntimeError) as err:
 57 |         wrapper.set_voice('')
 58 |     assert 'invalid voice code ""' in str(err)
 59 | 
 60 |     wrapper.set_voice('fr-fr')
 61 |     assert wrapper.voice.language == 'fr-fr'
 62 |     assert wrapper.voice.name in (
 63 |         'French (France)',  # >1.48.3
 64 |         'french')           # older espeak
 65 | 
 66 |     wrapper.set_voice('en-us')
 67 |     assert wrapper.voice.language == 'en-us'
 68 |     assert wrapper.voice.name in (
 69 |         'English (America)',  # >1.48.3
 70 |         'english-us')         # older espeak
 71 | 
 72 |     # no mbrola voices available on Windows by default (at least on the github
 73 |     # CI pipeline)
 74 |     if sys.platform != 'win32':
 75 |         wrapper.set_voice('mb-af1')
 76 |         assert wrapper.voice.language == 'af'
 77 |         assert wrapper.voice.name == 'afrikaans-mbrola-1'
 78 | 
 79 |     with pytest.raises(RuntimeError) as err:
 80 |         wrapper.set_voice('some non existant voice code')
 81 |     assert 'invalid voice code' in str(err)
 82 | 
 83 | 
 84 | def _test_pickle(voice):
 85 |     # the wrapper is pickled when using espeak backend on multiple jobs
 86 |     wrapper = EspeakWrapper()
 87 |     wrapper.set_voice(voice)
 88 | 
 89 |     dump = pickle.dumps(wrapper)
 90 |     wrapper2 = pickle.loads(dump)
 91 | 
 92 |     assert wrapper.version == wrapper2.version
 93 |     assert wrapper.library_path == wrapper2.library_path
 94 |     assert wrapper.data_path == wrapper2.data_path
 95 |     assert wrapper.voice == wrapper2.voice
 96 | 
 97 | 
 98 | def test_pickle_en_us():
 99 |     _test_pickle('en-us')
100 | 
101 | 
102 | @pytest.mark.skipif(
103 |     not EspeakMbrolaBackend.is_available() or
104 |     not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
105 |     reason='mbrola or mb-fr1 voice not installed')
106 | def test_pickle_mb_fr1():
107 |     _test_pickle('mb-fr1')
108 | 
109 | 
110 | def test_twice():
111 |     wrapper1 = EspeakWrapper()
112 |     wrapper2 = EspeakWrapper()
113 | 
114 |     assert wrapper1.data_path == wrapper2.data_path
115 |     assert wrapper1.version == wrapper2.version
116 |     assert wrapper1.library_path == wrapper2.library_path
117 | 
118 |     wrapper1.set_voice('fr-fr')
119 |     assert wrapper1.voice.language == 'fr-fr'
120 |     wrapper2.set_voice('en-us')
121 |     assert wrapper2.voice.language == 'en-us'
122 |     assert wrapper1.voice.language == 'fr-fr'
123 | 
124 |     # pylint: disable=protected-access
125 |     assert wrapper1._espeak._tempdir != wrapper2._espeak._tempdir
126 | 
127 | 
128 | @pytest.mark.skipif(sys.platform == 'win32', reason='not supported on Windows')
129 | def test_deletion():
130 |     # pylint: disable=protected-access
131 |     wrapper = EspeakWrapper()
132 |     path = pathlib.Path(wrapper._espeak._tempdir)
133 |     del wrapper
134 |     assert not path.exists()
135 | 


--------------------------------------------------------------------------------
/test/test_festival.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the festival backend"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | 
 19 | 
 20 | import os
 21 | import pathlib
 22 | import shutil
 23 | 
 24 | import pytest
 25 | 
 26 | from phonemizer.separator import Separator
 27 | from phonemizer.backend import FestivalBackend
 28 | 
 29 | 
 30 | def _test(text, separator=Separator(
 31 |         word=' ', syllable='|', phone='-')):
 32 |     backend = FestivalBackend('en-us')
 33 |     # pylint: disable=protected-access
 34 |     return backend._phonemize_aux(text, 0, separator, True)
 35 | 
 36 | 
 37 | @pytest.mark.skipif(
 38 |     FestivalBackend.version() <= (2, 1),
 39 |     reason='festival-2.1 gives different results than further versions '
 40 |     'for syllable boundaries')
 41 | def test_hello():
 42 |     assert _test(['hello world']) == ['hh-ax|l-ow w-er-l-d']
 43 |     assert _test(['hello', 'world']) == ['hh-ax|l-ow', 'w-er-l-d']
 44 | 
 45 | 
 46 | @pytest.mark.parametrize('text', ['', ' ', '  ', '(', '()', '"', "'"])
 47 | def test_bad_input(text):
 48 |     assert _test(text) == []
 49 | 
 50 | 
 51 | def test_quote():
 52 |     assert _test(["it's"]) == ['ih-t-s']
 53 |     assert _test(["its"]) == ['ih-t-s']
 54 |     assert _test(["it s"]) == ['ih-t eh-s']
 55 |     assert _test(['it "s']) == ['ih-t eh-s']
 56 | 
 57 | 
 58 | def test_im():
 59 |     sep = Separator(word=' ', syllable='', phone='')
 60 |     assert _test(["I'm looking for an image"], sep) \
 61 |         == ['aym luhkaxng faor axn ihmaxjh']
 62 |     assert _test(["Im looking for an image"], sep) \
 63 |         == ['ihm luhkaxng faor axn ihmaxjh']
 64 | 
 65 | 
 66 | @pytest.mark.skipif(
 67 |     not shutil.which('festival'), reason='festival not in PATH')
 68 | def test_path_good():
 69 |     try:
 70 |         binary = shutil.which('festival')
 71 |         FestivalBackend.set_executable(binary)
 72 |         assert FestivalBackend('en-us').executable() == pathlib.Path(binary)
 73 |     # restore the festival path to default
 74 |     finally:
 75 |         FestivalBackend.set_executable(None)
 76 | 
 77 | 
 78 | @pytest.mark.skipif(
 79 |     'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
 80 |     reason='environment variable precedence')
 81 | def test_path_bad():
 82 |     try:
 83 |         # corrupt the default espeak path, try to use python executable instead
 84 |         binary = shutil.which('python')
 85 |         FestivalBackend.set_executable(binary)
 86 | 
 87 |         with pytest.raises(RuntimeError):
 88 |             FestivalBackend('en-us').phonemize(['hello'])
 89 |         with pytest.raises(RuntimeError):
 90 |             FestivalBackend.version()
 91 | 
 92 |         with pytest.raises(RuntimeError):
 93 |             FestivalBackend.set_executable(__file__)
 94 | 
 95 |     # restore the festival path to default
 96 |     finally:
 97 |         FestivalBackend.set_executable(None)
 98 | 
 99 | 
100 | @pytest.mark.skipif(
101 |     'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
102 |     reason='cannot modify environment')
103 | def test_path_venv():
104 |     try:
105 |         os.environ['PHONEMIZER_FESTIVAL_EXECUTABLE'] = shutil.which('python')
106 |         with pytest.raises(RuntimeError):
107 |             FestivalBackend('en-us').phonemize(['hello'])
108 |         with pytest.raises(RuntimeError):
109 |             FestivalBackend.version()
110 | 
111 |         os.environ['PHONEMIZER_FESTIVAL_EXECUTABLE'] = __file__
112 |         with pytest.raises(RuntimeError):
113 |             FestivalBackend.version()
114 | 
115 |     finally:
116 |         try:
117 |             del os.environ['PHONEMIZER_FESTIVAL_EXECUTABLE']
118 |         except KeyError:
119 |             pass
120 | 


--------------------------------------------------------------------------------
/test/test_import.py:
--------------------------------------------------------------------------------
 1 | """Tests to import the phonemize function"""
 2 | 
 3 | # pylint: disable=missing-docstring
 4 | # pylint: disable=import-outside-toplevel
 5 | 
 6 | 
 7 | def test_relative():
 8 |     from phonemizer import phonemize
 9 |     assert phonemize('a') == 'eɪ '
10 | 
11 | 
12 | def test_absolute():
13 |     from phonemizer.phonemize import phonemize
14 |     assert phonemize('a') == 'eɪ '
15 | 


--------------------------------------------------------------------------------
/test/test_main.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the command line interface"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | 
 19 | import os
 20 | import pathlib
 21 | import tempfile
 22 | import shlex
 23 | import sys
 24 | 
 25 | import pytest
 26 | 
 27 | from phonemizer.backend import EspeakMbrolaBackend
 28 | from phonemizer import main, backend, logger
 29 | 
 30 | 
 31 | def _test(text, expected_output, args=''):
 32 |     with tempfile.TemporaryDirectory() as tmpdir:
 33 |         input_file = pathlib.Path(tmpdir) / 'input.txt'
 34 |         output_file = pathlib.Path(tmpdir) / 'output.txt'
 35 |         with open(input_file, 'wb') as finput:
 36 |             finput.write(text.encode('utf8'))
 37 | 
 38 |         sys.argv = ['unused', f'{input_file}', '-o', f'{output_file}']
 39 |         if args:
 40 |             sys.argv += shlex.split(args)
 41 |         main.main()
 42 | 
 43 |         with open(output_file, 'rb') as foutput:
 44 |             output = foutput.read().decode()
 45 | 
 46 |         # silly fix for windows
 47 |         assert output.replace('\r', '').strip(os.linesep) \
 48 |             == expected_output.replace('\r', '')
 49 | 
 50 | 
 51 | def test_help():
 52 |     sys.argv = ['foo', '-h']
 53 |     with pytest.raises(SystemExit):
 54 |         main.main()
 55 | 
 56 | 
 57 | def test_version():
 58 |     sys.argv = ['foo', '--version']
 59 |     main.main()
 60 | 
 61 | 
 62 | def test_list_languages():
 63 |     sys.argv = ['foo', '--list-languages']
 64 |     main.main()
 65 | 
 66 | 
 67 | def test_readme():
 68 |     _test('hello world', 'həloʊ wɜːld ', '--verbose')
 69 |     _test('hello world', 'həloʊ wɜːld ', '--quiet')
 70 |     _test('hello world', 'hello world | həloʊ wɜːld ', '--prepend-text')
 71 |     _test('hello world', 'hhaxlow werld', '-b festival --strip')
 72 |     _test('bonjour le monde', 'bɔ̃ʒuʁ lə mɔ̃d ', '-l fr-fr')
 73 |     _test('bonjour le monde', 'b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ',
 74 |           '-l fr-fr -p " " -w ";eword "')
 75 | 
 76 | 
 77 | @pytest.mark.skipif(
 78 |     '2.1' in backend.FestivalBackend.version(),
 79 |     reason='festival-2.1 gives different results than further versions '
 80 |     'for syllable boundaries')
 81 | def test_readme_festival_syll():
 82 |     _test('hello world',
 83 |           'hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword ',
 84 |           "-p ' ' -s ';esyll ' -w ';eword ' -b festival -l en-us")
 85 | 
 86 | 
 87 | @pytest.mark.parametrize('njobs', [1, 6])
 88 | def test_njobs(njobs):
 89 |     _test(
 90 |         os.linesep.join((
 91 |             'hello world',
 92 |             'goodbye',
 93 |             'third line',
 94 |             'yet another')),
 95 |         os.linesep.join((
 96 |             'h-ə-l-oʊ w-ɜː-l-d',
 97 |             'ɡ-ʊ-d-b-aɪ',
 98 |             'θ-ɜː-d l-aɪ-n',
 99 |             'j-ɛ-t ɐ-n-ʌ-ð-ɚ')),
100 |         f'--strip -j {njobs} -l en-us -b espeak -p "-" -s "|" -w " "')
101 | 
102 | 
103 | def test_unicode():
104 |     _test('untuʼule', 'untṵːle ', '-l yucatec -b segments')
105 | 
106 | 
107 | def test_logger():
108 |     with pytest.raises(RuntimeError):
109 |         logger.get_logger(verbosity=1)
110 | 
111 | 
112 | @pytest.mark.skipif(
113 |     not EspeakMbrolaBackend.is_available() or
114 |     not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
115 |     reason='mbrola or mb-fr1 voice not installed')
116 | def test_espeak_mbrola():
117 |     _test('coucou toi!', 'k u k u t w a ',
118 |           '-b espeak-mbrola -l mb-fr1 -p" " --preserve-punctuation')
119 | 
120 | 
121 | def test_espeak_path():
122 |     espeak = pathlib.Path(backend.EspeakBackend.library())
123 |     if sys.platform == 'win32':
124 |         espeak = str(espeak).replace('\\', '\\\\').replace(' ', '\\ ')
125 |     _test('hello world', 'həloʊ wɜːld ', f'--espeak-library={espeak}')
126 | 
127 | 
128 | def test_festival_path():
129 |     festival = pathlib.Path(backend.FestivalBackend.executable())
130 |     if sys.platform == 'win32':
131 |         festival = str(festival).replace('\\', '\\\\').replace(' ', '\\ ')
132 | 
133 |     _test('hello world', 'hhaxlow werld ',
134 |           f'--festival-executable={festival} -b festival')
135 | 


--------------------------------------------------------------------------------
/test/test_mbrola.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the espeak-mbrola backend"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | # pylint: disable=redefined-outer-name
 19 | 
 20 | import pytest
 21 | 
 22 | from phonemizer.backend import EspeakMbrolaBackend
 23 | from phonemizer.separator import Separator
 24 | 
 25 | 
 26 | @pytest.fixture(scope='session')
 27 | def backend():
 28 |     return EspeakMbrolaBackend('mb-fr1')
 29 | 
 30 | 
 31 | @pytest.mark.skipif(
 32 |     not EspeakMbrolaBackend.is_available() or
 33 |     not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
 34 |     reason='mbrola or mb-fr1 voice not installed')
 35 | @pytest.mark.parametrize(
 36 |     'text, expected',
 37 |     [
 38 |         # plosives
 39 |         ('pont', 'po~'),
 40 |         ('bon', 'bo~'),
 41 |         ('temps', 'ta~'),
 42 |         ('dans', 'da~'),
 43 |         ('quand', 'ka~'),
 44 |         ('gant', 'ga~'),
 45 |         # fricatives
 46 |         ('femme', 'fam'),
 47 |         ('vent', 'va~'),
 48 |         ('sans', 'sa~'),
 49 |         ('champ', 'Sa~'),
 50 |         ('gens', 'Za~'),
 51 |         ('ion', 'jo~'),
 52 |         # nasals
 53 |         ('mont', 'mo~'),
 54 |         ('nom', 'no~'),
 55 |         ('oignon', 'onjo~'),
 56 |         ('ping', 'piN'),
 57 |         # liquid glides
 58 |         ('long', 'lo~'),
 59 |         ('rond', 'Ro~'),
 60 |         ('coin', 'kwe~'),
 61 |         ('juin', 'Zye~'),
 62 |         ('pierre', 'pjER'),
 63 |         # vowels
 64 |         ('si', 'si'),
 65 |         ('ses', 'se'),
 66 |         ('seize', 'sEz'),
 67 |         ('patte', 'pat'),
 68 |         ('pâte', 'pat'),
 69 |         ('comme', 'kOm'),
 70 |         ('gros', 'gRo'),
 71 |         ('doux', 'du'),
 72 |         ('du', 'dy'),
 73 |         ('deux', 'd2'),
 74 |         ('neuf', 'n9f'),
 75 |         ('justement', 'Zystma~'),
 76 |         ('vin', 've~'),
 77 |         ('vent', 'va~'),
 78 |         ('bon', 'bo~'),
 79 |         ('brun', 'bR9~')])
 80 | def test_sampa_fr(backend, text, expected):
 81 |     assert expected == backend.phonemize(
 82 |         [text], strip=True, separator=Separator(phone=''))[0]
 83 | 
 84 | 
 85 | @pytest.mark.skipif(
 86 |     not EspeakMbrolaBackend.is_available() or
 87 |     not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
 88 |     reason='mbrola or mb-fr1 voice not installed')
 89 | def test_french_sampa(backend):
 90 |     text = ['bonjour le monde']
 91 |     sep = Separator(word=None, phone=' ')
 92 | 
 93 |     expected = ['b o~ Z u R l @ m o~ d ']
 94 |     out = backend.phonemize(text, separator=sep, strip=False)
 95 |     assert out == expected
 96 | 
 97 |     expected = ['b o~ Z u R l @ m o~ d']
 98 |     out = backend.phonemize(text, separator=sep, strip=True)
 99 |     assert out == expected
100 | 
101 |     assert backend.phonemize([''], separator=sep, strip=True) == ['']
102 |     assert backend.phonemize(['"'], separator=sep, strip=True) == ['']
103 | 
104 | 
105 | @pytest.mark.skipif(
106 |     not EspeakMbrolaBackend.is_available(),
107 |     reason='mbrola not installed')
108 | def test_mbrola_bad_language():
109 |     assert not EspeakMbrolaBackend.is_supported_language('foo-bar')
110 | 


--------------------------------------------------------------------------------
/test/test_phonemize.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the phonemizer.phonemize function"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | 
 19 | import os
 20 | import pytest
 21 | 
 22 | from phonemizer.phonemize import phonemize
 23 | from phonemizer.separator import Separator
 24 | from phonemizer.backend import EspeakBackend, EspeakMbrolaBackend
 25 | 
 26 | 
 27 | def test_bad_backend():
 28 |     with pytest.raises(RuntimeError):
 29 |         phonemize('', backend='fetiv')
 30 | 
 31 |     with pytest.raises(RuntimeError):
 32 |         phonemize('', backend='foo')
 33 | 
 34 |     with pytest.raises(RuntimeError):
 35 |         phonemize('', tie=True, backend='festival')
 36 |     with pytest.raises(RuntimeError):
 37 |         phonemize('', tie=True, backend='mbrola')
 38 |     with pytest.raises(RuntimeError):
 39 |         phonemize('', tie=True, backend='segments')
 40 |     with pytest.raises(RuntimeError):
 41 |         phonemize(
 42 |             '', tie=True, backend='espeak',
 43 |             separator=Separator(' ', None, '-'))
 44 | 
 45 | 
 46 | def test_bad_language():
 47 |     with pytest.raises(RuntimeError):
 48 |         phonemize('', language='fr-fr', backend='festival')
 49 | 
 50 |     with pytest.raises(RuntimeError):
 51 |         phonemize('', language='ffr', backend='espeak')
 52 | 
 53 |     with pytest.raises(RuntimeError):
 54 |         phonemize('', language='/path/to/nonexisting/file', backend='segments')
 55 | 
 56 |     with pytest.raises(RuntimeError):
 57 |         phonemize('', language='creep', backend='segments')
 58 | 
 59 | 
 60 | def test_text_type():
 61 |     text1 = ['one two', 'three', 'four five']
 62 |     text2 = os.linesep.join(text1)
 63 | 
 64 |     phn1 = phonemize(text1, language='en-us', backend='espeak', strip=True)
 65 |     phn2 = phonemize(text2, language='en-us', backend='espeak', strip=True)
 66 |     out3 = phonemize(text2, language='en-us', backend='espeak', strip=True,
 67 |                      prepend_text=True)
 68 |     text3 = [o[0] for o in out3]
 69 |     phn3 = [o[1] for o in out3]
 70 | 
 71 |     assert isinstance(phn1, list)
 72 |     assert isinstance(phn2, str)
 73 |     assert os.linesep.join(phn1) == phn2
 74 |     assert os.linesep.join(phn3) == phn2
 75 |     assert text3 == text1
 76 | 
 77 | 
 78 | @pytest.mark.skipif(
 79 |     not EspeakBackend.is_espeak_ng(),
 80 |     reason='language switch only exists for espeak-ng')
 81 | def test_lang_switch():
 82 |     text = ['bonjour apple', 'bonjour toi']
 83 |     out = phonemize(
 84 |         text,
 85 |         language='fr-fr',
 86 |         backend='espeak',
 87 |         prepend_text=True,
 88 |         language_switch='remove-utterance')
 89 |     assert out == [('bonjour apple', ''), ('bonjour toi', 'bɔ̃ʒuʁ twa ')]
 90 | 
 91 | 
 92 | @pytest.mark.parametrize('njobs', [2, 4])
 93 | def test_espeak(njobs):
 94 |     text = ['one two', 'three', 'four five']
 95 | 
 96 |     out = phonemize(
 97 |         text, language='en-us', backend='espeak',
 98 |         strip=True, njobs=njobs)
 99 |     assert out == ['wʌn tuː', 'θɹiː', 'foːɹ faɪv']
100 | 
101 |     out = phonemize(
102 |         ' '.join(text), language='en-us', backend='espeak',
103 |         strip=False, njobs=njobs)
104 |     assert out == ' '.join(['wʌn tuː', 'θɹiː', 'foːɹ faɪv '])
105 | 
106 |     out = phonemize(
107 |         os.linesep.join(text), language='en-us', backend='espeak',
108 |         strip=False, njobs=njobs)
109 |     assert out == os.linesep.join(['wʌn tuː ', 'θɹiː ', 'foːɹ faɪv '])
110 | 
111 | 
112 | @pytest.mark.skipif(
113 |     not EspeakMbrolaBackend.is_available() or
114 |     not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
115 |     reason='mbrola or mb-fr1 voice not installed')
116 | @pytest.mark.parametrize('njobs', [2, 4])
117 | def test_espeak_mbrola(caplog, njobs):
118 |     text = ['un deux', 'trois', 'quatre cinq']
119 | 
120 |     out = phonemize(
121 |         text,
122 |         language='mb-fr1',
123 |         backend='espeak-mbrola',
124 |         njobs=njobs,
125 |         preserve_punctuation=True)
126 |     assert out == ['9~d2', 'tRwa', 'katRse~k']
127 | 
128 |     messages = [msg[2] for msg in caplog.record_tuples]
129 |     assert 'espeak-mbrola backend cannot preserve punctuation' in messages
130 |     assert 'espeak-mbrola backend cannot preserve word separation' in messages
131 | 
132 | 
133 | @pytest.mark.parametrize('njobs', [2, 4])
134 | def test_festival(njobs):
135 |     text = ['one two', 'three', 'four five']
136 | 
137 |     out = phonemize(
138 |         text, language='en-us', backend='festival',
139 |         strip=False, njobs=njobs)
140 |     assert out == ['wahn tuw ', 'thriy ', 'faor fayv ']
141 | 
142 |     out = phonemize(
143 |         ' '.join(text), language='en-us', backend='festival',
144 |         strip=True, njobs=njobs)
145 |     assert out == ' '.join(['wahn tuw', 'thriy', 'faor fayv'])
146 | 
147 |     out = phonemize(
148 |         os.linesep.join(text), language='en-us', backend='festival',
149 |         strip=True, njobs=njobs)
150 |     assert out == os.linesep.join(['wahn tuw', 'thriy', 'faor fayv'])
151 | 
152 | 
153 | def test_festival_bad():
154 |     # cannot use options valid for espeak only
155 |     text = ['one two', 'three', 'four five']
156 | 
157 |     with pytest.raises(RuntimeError):
158 |         phonemize(
159 |             text, language='en-us', backend='festival', with_stress=True)
160 | 
161 |     with pytest.raises(RuntimeError):
162 |         phonemize(
163 |             text, language='en-us', backend='festival',
164 |             language_switch='remove-flags')
165 | 
166 | 
167 | @pytest.mark.parametrize('njobs', [2, 4])
168 | def test_segments(njobs):
169 |     # one two three four five in Maya Yucatec
170 |     text = ['untuʼuleʼ kaʼapʼeʼel', 'oʼoxpʼeʼel', 'kantuʼuloʼon chincho']
171 | 
172 |     out = phonemize(
173 |         text, language='yucatec', backend='segments',
174 |         strip=False, njobs=njobs)
175 |     assert out == [
176 |         'untṵːlḛ ka̰ːpʼḛːl ', 'o̰ːʃpʼḛːl ', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo ']
177 |     out = phonemize(
178 |         ' '.join(text), language='yucatec', backend='segments',
179 |         strip=False, njobs=njobs)
180 |     assert out == ' '.join(
181 |         ['untṵːlḛ ka̰ːpʼḛːl', 'o̰ːʃpʼḛːl', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo '])
182 | 
183 |     out = phonemize(
184 |         os.linesep.join(text), language='yucatec', backend='segments',
185 |         strip=True, njobs=njobs)
186 |     assert out == os.linesep.join(
187 |         ['untṵːlḛ ka̰ːpʼḛːl', 'o̰ːʃpʼḛːl', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo'])
188 | 


--------------------------------------------------------------------------------
/test/test_punctuation.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the punctuation processing"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | 
 19 | import pytest
 20 | 
 21 | from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
 22 | from phonemizer.punctuation import Punctuation
 23 | from phonemizer.phonemize import phonemize
 24 | 
 25 | # True if we are using espeak>=1.50
 26 | ESPEAK_150 = (EspeakBackend.version() >= (1, 50))
 27 | 
 28 | # True if we are using espeak>=1.49.3
 29 | ESPEAK_143 = (EspeakBackend.version() >= (1, 49, 3))
 30 | 
 31 | # True if we are using festival>=2.5
 32 | FESTIVAL_25 = (FestivalBackend.version() >= (2, 5))
 33 | 
 34 | 
 35 | @pytest.mark.parametrize(
 36 |     'inp, out', [
 37 |         ('a, b,c.', 'a b c'),
 38 |         ('abc de', 'abc de'),
 39 |         ('!d.d. dd??  d!', 'd d dd d')])
 40 | def test_remove(inp, out):
 41 |     assert Punctuation().remove(inp) == out
 42 | 
 43 | 
 44 | @pytest.mark.parametrize(
 45 |     'inp', [
 46 |         ['.a.b.c.'],
 47 |         ['a, a?', 'b, b'],
 48 |         ['a, a?', 'b, b', '!'],
 49 |         ['a, a?', '!?', 'b, b'],
 50 |         ['!?', 'a, a?', 'b, b'],
 51 |         ['a, a, a'],
 52 |         ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd??  d!'],
 53 |         ['Truly replied, "Yes".'],
 54 |         ['hi; ho,"'],
 55 |         ["!?"],
 56 |         ["!'"]])
 57 | def test_preserve(inp):
 58 |     punct = Punctuation()
 59 |     text, marks = punct.preserve(inp)
 60 |     assert inp == punct.restore(text, marks)
 61 | 
 62 | 
 63 | @pytest.mark.parametrize(
 64 |     'text, expected', [
 65 |         (['hi; ho,"'], ['haɪ ; hoʊ ,']),
 66 |         (['hi; "ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_143 else ['haɪ ;  hoʊ ,']),
 67 |         (['"hi; ho,'], ['haɪ ; hoʊ ,'] if ESPEAK_143 else [' haɪ ; hoʊ ,'])])
 68 | def test_preserve_2(text, expected):
 69 |     marks = ".!;:,?"
 70 |     punct = Punctuation(marks=marks)
 71 |     assert text == punct.restore(*punct.preserve(text))
 72 | 
 73 |     output = phonemize(
 74 |         text, backend="espeak",
 75 |         preserve_punctuation=True, punctuation_marks=marks)
 76 |     assert output == expected
 77 | 
 78 | 
 79 | def test_custom():
 80 |     punct = Punctuation()
 81 |     assert set(punct.marks) == set(punct.default_marks())
 82 |     assert punct.remove('a,b.c') == 'a b c'
 83 | 
 84 |     with pytest.raises(ValueError):
 85 |         punct.marks = ['?', '.']
 86 |     punct.marks = '?.'
 87 |     assert len(punct.marks) == 2
 88 |     assert punct.remove('a,b.c') == 'a,b c'
 89 | 
 90 | 
 91 | def test_espeak():
 92 |     text = 'hello, world!'
 93 |     expected1 = 'həloʊ wɜːld'
 94 |     expected2 = 'həloʊ, wɜːld!'
 95 |     expected3 = 'həloʊ wɜːld '
 96 |     expected4 = 'həloʊ , wɜːld !'
 97 | 
 98 |     out1 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(
 99 |         [text], strip=True)[0]
100 |     assert out1 == expected1
101 | 
102 |     out2 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(
103 |         [text], strip=True)[0]
104 |     assert out2 == expected2
105 | 
106 |     out3 = EspeakBackend('en-us', preserve_punctuation=False).phonemize(
107 |         [text], strip=False)[0]
108 |     assert out3 == expected3
109 | 
110 |     out4 = EspeakBackend('en-us', preserve_punctuation=True).phonemize(
111 |         [text], strip=False)[0]
112 |     assert out4 == expected4
113 | 
114 | 
115 | def test_festival():
116 |     text = 'hello, world!'
117 |     expected1 = 'hhaxlow werld'
118 |     expected2 = 'hhaxlow, werld!'
119 |     expected3 = 'hhaxlow werld '
120 |     expected4 = 'hhaxlow , werld !'
121 | 
122 |     out1 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(
123 |         [text], strip=True)[0]
124 |     assert out1 == expected1
125 | 
126 |     out2 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(
127 |         [text], strip=True)[0]
128 |     assert out2 == expected2
129 | 
130 |     out3 = FestivalBackend('en-us', preserve_punctuation=False).phonemize(
131 |         [text], strip=False)[0]
132 |     assert out3 == expected3
133 | 
134 |     out4 = FestivalBackend('en-us', preserve_punctuation=True).phonemize(
135 |         [text], strip=False)[0]
136 |     assert out4 == expected4
137 | 
138 | 
139 | def test_segments():
140 |     text = 'achi, acho!'
141 |     expected1 = 'ʌtʃɪ ʌtʃʊ'
142 |     expected2 = 'ʌtʃɪ, ʌtʃʊ!'
143 |     expected3 = 'ʌtʃɪ ʌtʃʊ '
144 |     expected4 = 'ʌtʃɪ , ʌtʃʊ !'
145 | 
146 |     out1 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(
147 |         [text], strip=True)[0]
148 |     assert out1 == expected1
149 | 
150 |     out2 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(
151 |         [text], strip=True)[0]
152 |     assert out2 == expected2
153 | 
154 |     out3 = SegmentsBackend('cree', preserve_punctuation=False).phonemize(
155 |         [text], strip=False)[0]
156 |     assert out3 == expected3
157 | 
158 |     out4 = SegmentsBackend('cree', preserve_punctuation=True).phonemize(
159 |         [text], strip=False)[0]
160 |     assert out4 == expected4
161 | 
162 | 
163 | # see https://github.com/bootphon/phonemizer/issues/54
164 | @pytest.mark.parametrize(
165 |     'text', ["!'", "'!", "!'!", "'!'"])
166 | def test_issue_54(text):
167 |     output = phonemize(
168 |         [text], language='en-us', backend='espeak',
169 |         preserve_punctuation=True)[0]
170 |     assert text.replace("'", '') == output
171 | 
172 | 
173 | # see https://github.com/bootphon/phonemizer/issues/55
174 | @pytest.mark.parametrize(
175 |     'backend, marks, text, expected', [
176 |         ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ ! "', '"heɪ ,"']),
177 |         ('espeak', '.!;:,?', ['"Hey! "', '"hey,"'],
178 |          ['heɪ ! ', 'heɪ ,'] if ESPEAK_150 else [' heɪ ! ', ' heɪ ,']),
179 |         ('espeak', 'default', ['! ?', 'hey!'], ['! ?', 'heɪ !']),
180 |         ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ !']),
181 |         ('segments', 'default', ['! ?', 'hey!'], ['! ?', 'heːj !']),
182 |         ('segments', '!', ['! ?', 'hey!'], ValueError),
183 |         ('festival', 'default', ['! ?', 'hey!'], ['! ?', 'hhey !']),
184 |         ('festival', '!', ['! ?', 'hey!'], ['!  ', 'hhey !'])])
185 | def test_issue55(backend, marks, text, expected):
186 |     if marks == 'default':
187 |         marks = Punctuation.default_marks()
188 |     language = 'cree' if backend == 'segments' else 'en-us'
189 | 
190 |     try:
191 |         with pytest.raises(expected):
192 |             phonemize(
193 |                 text, language=language, backend=backend,
194 |                 preserve_punctuation=True, punctuation_marks=marks)
195 |     except TypeError:
196 |         try:
197 |             assert expected == phonemize(
198 |                 text, language=language, backend=backend,
199 |                 preserve_punctuation=True, punctuation_marks=marks)
200 |         except RuntimeError:
201 |             if backend == 'festival':
202 |                 # TODO on some installations festival fails to phonemize "?".
203 |                 # It ends with a segmentation fault. This seems to only appear
204 |                 # with festival-2.5 (but is working on travis and docker image)
205 |                 pass
206 | 


--------------------------------------------------------------------------------
/test/test_segments.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015-2021 Mathieu Bernard
  2 | #
  3 | # This file is part of phonemizer: you can redistribute it and/or
  4 | # modify it under the terms of the GNU General Public License as
  5 | # published by the Free Software Foundation, either version 3 of the
  6 | # License, or (at your option) any later version.
  7 | #
  8 | # Phonemizer is distributed in the hope that it will be useful, but
  9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 11 | # General Public License for more details.
 12 | #
 13 | # You should have received a copy of the GNU General Public License
 14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
 15 | """Test of the segments backend"""
 16 | 
 17 | # pylint: disable=missing-docstring
 18 | 
 19 | import os
 20 | import pkg_resources
 21 | import pytest
 22 | 
 23 | from phonemizer.separator import Separator, default_separator
 24 | from phonemizer.backend import SegmentsBackend
 25 | 
 26 | 
 27 | def test_multiline():
 28 |     backend = SegmentsBackend('cree')
 29 |     assert backend.language == 'cree'
 30 | 
 31 |     assert backend.phonemize(['a']) == [u'ʌ ']
 32 |     assert backend.phonemize(['aa']) == [u'ʌʌ ']
 33 |     assert backend.phonemize(['a\n']) == [u'ʌ ']
 34 |     assert backend.phonemize(['a\na']) == [u'ʌ ʌ ']
 35 |     assert backend.phonemize(['a\na\n']) == [u'ʌ ʌ ']
 36 |     assert backend.phonemize(['a', 'a']) == [u'ʌ ', 'ʌ ']
 37 |     assert backend.phonemize(['a\n', 'a\n']) == [u'ʌ ', 'ʌ ']
 38 | 
 39 | 
 40 | def test_bad_morpheme():
 41 |     backend = SegmentsBackend('cree')
 42 |     with pytest.raises(ValueError):
 43 |         backend.phonemize('A')
 44 | 
 45 | 
 46 | def test_separator():
 47 |     backend = SegmentsBackend('cree')
 48 |     text = ['achi acho']
 49 | 
 50 |     sep = default_separator
 51 |     assert backend.phonemize(text, separator=sep) == [u'ʌtʃɪ ʌtʃʊ ']
 52 |     assert backend.phonemize(text, separator=sep, strip=True) == [u'ʌtʃɪ ʌtʃʊ']
 53 | 
 54 | 
 55 | def test_separator_2():
 56 |     backend = SegmentsBackend('cree')
 57 |     text = ['achi acho']
 58 | 
 59 |     sep = Separator(word='_', phone=' ')
 60 |     assert backend.phonemize(text, separator=sep) == [u'ʌ tʃ ɪ _ʌ tʃ ʊ _']
 61 |     assert backend.phonemize(text, separator=sep, strip=True) \
 62 |         == [u'ʌ tʃ ɪ_ʌ tʃ ʊ']
 63 | 
 64 | 
 65 | def test_separator_3():
 66 |     backend = SegmentsBackend('cree')
 67 |     text = ['achi acho']
 68 | 
 69 |     sep = Separator(word=' ', syllable=None, phone='_')
 70 |     assert backend.phonemize(text, separator=sep) == [u'ʌ_tʃ_ɪ_ ʌ_tʃ_ʊ_ ']
 71 |     assert backend.phonemize(text, separator=sep, strip=True) \
 72 |         == [u'ʌ_tʃ_ɪ ʌ_tʃ_ʊ']
 73 | 
 74 | 
 75 | def test_separator_4():
 76 |     backend = SegmentsBackend('cree')
 77 |     text = ['achi acho']
 78 | 
 79 |     # TODO bug when sep.phone == ' ' with no sep.word
 80 |     sep = Separator(phone=' ', word='')
 81 |     assert backend.phonemize(text, separator=sep) == [u'ʌ tʃ ɪ ʌ tʃ ʊ ']
 82 |     assert backend.phonemize(text, separator=sep, strip=True) \
 83 |         == [u'ʌ tʃ ɪʌ tʃ ʊ']
 84 | 
 85 | 
 86 | def test_separator_5():
 87 |     backend = SegmentsBackend('cree')
 88 |     text = ['achi acho']
 89 | 
 90 |     sep = Separator(phone=' ', word='_')
 91 |     assert backend.phonemize(text, separator=sep) == [u'ʌ tʃ ɪ _ʌ tʃ ʊ _']
 92 |     assert backend.phonemize(text, separator=sep, strip=True) \
 93 |         == [u'ʌ tʃ ɪ_ʌ tʃ ʊ']
 94 | 
 95 | 
 96 | def test_language(tmpdir):
 97 |     # check languages by name
 98 |     assert SegmentsBackend.is_supported_language('cree')
 99 |     assert not SegmentsBackend.is_supported_language('unexisting')
100 | 
101 |     # check languages by g2p file
102 |     directory = pkg_resources.resource_filename(
103 |         pkg_resources.Requirement.parse('phonemizer'),
104 |         'phonemizer/share/segments')
105 |     assert SegmentsBackend.is_supported_language(
106 |         os.path.join(directory, 'cree.g2p'))
107 |     assert not SegmentsBackend.is_supported_language(
108 |         os.path.join(directory, 'cree'))
109 |     assert not SegmentsBackend.is_supported_language(
110 |         os.path.join(directory, 'unexisting.g2p'))
111 | 
112 |     # bad syntax in g2p file
113 |     g2p = tmpdir.join('foo.g2p')
114 |     g2p.write('\n'.join(['a a', 'b b b', 'c']))
115 |     assert not SegmentsBackend.is_supported_language(g2p)
116 | 


--------------------------------------------------------------------------------
/test/test_separator.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
 2 | #
 3 | # This file is part of phonemizer: you can redistribute it and/or
 4 | # modify it under the terms of the GNU General Public License as
 5 | # published by the Free Software Foundation, either version 3 of the
 6 | # License, or (at your option) any later version.
 7 | #
 8 | # Phonemizer is distributed in the hope that it will be useful, but
 9 | # WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 | # General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
15 | """Test of the Separator class"""
16 | 
17 | # pylint: disable=missing-docstring
18 | 
19 | import pytest
20 | 
21 | from phonemizer.separator import Separator, default_separator
22 | 
23 | 
24 | def test_prop():
25 |     # read only attributes
26 |     with pytest.raises(AttributeError):
27 |         default_separator.phone = 'a'
28 | 
29 |     with pytest.raises(AttributeError):
30 |         default_separator.syllable = 'a'
31 | 
32 |     with pytest.raises(AttributeError):
33 |         default_separator.word = 'a'
34 | 
35 | 
36 | @pytest.mark.parametrize('val', [None, '', False])
37 | def test_empty(val):
38 |     s = Separator(val, val, val)
39 |     assert s.phone == ''
40 |     assert s.syllable == ''
41 |     assert s.word == ''
42 | 
43 | 
44 | def test_same():
45 |     with pytest.raises(ValueError):  # same value given to two fields
46 |         Separator(word=' ', phone=' ')
47 | 
48 | 
49 | def test_str():
50 |     separator = Separator(word='w', syllable='s', phone='p')
51 |     assert str(separator) == '(phone: "p", syllable: "s", word: "w")'
52 |     assert str(default_separator) == '(phone: "", syllable: "", word: " ")'
53 | 
54 | 
55 | def test_equal():
56 |     assert Separator() == Separator()  # distinct instances, same fields
57 |     assert default_separator == Separator(phone='', syllable='', word=' ')
58 |     assert Separator(word='  ') != default_separator  # two spaces vs one
59 | 
60 | 
61 | def test_field_separator():
62 |     sep = Separator(word='w', syllable='s', phone='p')
63 |     assert 'w' in sep
64 |     assert 'p' in sep
65 |     assert 'wp' not in sep
66 |     assert ' ' not in sep
67 | 
68 |     assert sep.input_output_separator(False) is False
69 |     assert sep.input_output_separator(None) is False
70 |     assert sep.input_output_separator('') is False
71 |     assert sep.input_output_separator(True) == '|'
72 |     assert sep.input_output_separator('io') == 'io'
73 | 
74 |     with pytest.raises(RuntimeError) as err:
75 |         sep.input_output_separator([1, 2])
76 |     assert 'invalid input/output separator' in str(err)
77 |     with pytest.raises(RuntimeError) as err:
78 |         sep.input_output_separator('w')
79 |     assert 'cannot prepend input with "w"' in str(err)
80 | 
81 |     sep = Separator(phone='|', syllable='||', word='|||')
82 |     assert sep.input_output_separator(True) == '||||'
83 | 


--------------------------------------------------------------------------------
/test/test_utils.py:
--------------------------------------------------------------------------------
 1 | """Test of the phonemizer.utils module"""
 2 | 
 3 | # pylint: disable=missing-docstring
 4 | import os
 5 | 
 6 | from phonemizer.utils import chunks, cumsum, str2list, list2str
 7 | 
 8 | 
 9 | def test_cumsum():
10 |     assert cumsum([]) == []
11 |     assert cumsum([0]) == [0]
12 |     assert cumsum([1, 2, 3]) == [1, 3, 6]
13 | 
14 | 
15 | def test_list2str():
16 |     assert list2str('') == ''
17 |     assert list2str([]) == ''
18 |     assert list2str(['']) == ''
19 |     assert list2str(['abc']) == 'abc'
20 |     assert list2str(['a', 'b', 'c']) == os.linesep.join('abc')
21 | 
22 | 
23 | def test_str2list():
24 |     assert str2list('') == ['']
25 |     assert str2list('a') == ['a']
26 |     assert str2list('ab') == ['ab']
27 |     assert str2list('a b') == ['a b']
28 |     assert str2list(f'a{os.linesep}b') == ['a', 'b']
29 |     assert str2list(
30 |         f'a{os.linesep}{os.linesep}b{os.linesep}') == ['a', '', 'b']
31 | 
32 | 
33 | def test_chunks():
34 |     for i in range(1, 5):
35 |         assert chunks(['a'], i) == ([['a']], [0])
36 | 
37 |     assert chunks(['a', 'a'], 1) == ([['a', 'a']], [0])
38 |     assert chunks(['a', 'a'], 2) == ([['a'], ['a']], [0, 1])
39 |     assert chunks(['a', 'a'], 10) == ([['a'], ['a']], [0, 1])
40 | 
41 |     assert chunks(['a', 'a', 'a'], 1) == ([['a', 'a', 'a']], [0])
42 |     assert chunks(['a', 'a', 'a'], 2) == ([['a'], ['a', 'a']], [0, 1])
43 |     assert chunks(['a', 'a', 'a'], 3) == ([['a'], ['a'], ['a']], [0, 1, 2])
44 |     assert chunks(['a', 'a', 'a'], 10) == ([['a'], ['a'], ['a']], [0, 1, 2])
45 | 
46 |     assert chunks(['a', 'a', 'a', 'a'], 1) == ([['a', 'a', 'a', 'a']], [0])
47 |     assert chunks(['a', 'a', 'a', 'a'], 2) == (
48 |         [['a', 'a'], ['a', 'a']], [0, 2])
49 |     assert chunks(['a', 'a', 'a', 'a'], 3) == (
50 |         [['a'], ['a'], ['a', 'a']], [0, 1, 2])
51 |     assert chunks(['a', 'a', 'a', 'a'], 10) == (
52 |         [['a'], ['a'], ['a'], ['a']], [0, 1, 2, 3])
53 | 


--------------------------------------------------------------------------------