├── .directory
├── .gitignore
├── README.md
├── bin
    └── p2fa
├── changes.txt
├── images
    └── error1.png
├── p2fa
    ├── __init__.py
    ├── _tmp
    │   ├── ploppy.png
    │   └── ploppy_state.png
    ├── align.py
    ├── examples
    │   ├── ploppy.TextGrid
    │   ├── ploppy.txt
    │   └── ploppy.wav
    └── model
    │   ├── 8000
    │       ├── config
    │       ├── hmmdefs
    │       └── macros
    │   ├── 11025
    │       ├── config
    │       ├── hmmdefs
    │       └── macros
    │   ├── 16000
    │       ├── config
    │       ├── hmmdefs
    │       └── macros
    │   ├── dict
    │   ├── dict_original
    │   └── monophones
├── readme.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_p2fa.py


/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | PreviewsShown=false
3 | Timestamp=2018,8,22,14,0,20
4 | Version=4
5 | ViewMode=2
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | py2fa/__pycache__/
 2 | *.py[cod]
 3 | *$py.class
 4 | .ipynb_checkpoints
 5 | *~
 6 | \#*\#
 7 | .DS_Store
 8 | *.zip
 9 | sample
10 | sox*
11 | py2fa/tmp
12 | .idea/
13 | .directory/
14 | 
15 | # Personal
16 | .vscode
17 | HTK-3.4.1.tar.gz
18 | htk


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # P2FA for Python3.x
  2 | 
  3 | This is a modified version of P2FA for Python3 compatibility.
  4 | Everything else remains the same as the original P2FA.
  5 | Forced alignment aligns linguistic units (e.g., phonemes or
  6 | words) with the corresponding sound file. All you need is a
  7 | sound file and its transcription file.
  8 | The output is a .TextGrid file with time-aligned phone, word and
  9 | optionally state-level tiers.
 10 | 
 11 | This was tested on macOS Ventura and Arch Linux.
 12 | 
 13 | ## 1. Install HTK
 14 | First, you need to download HTK source code (http://htk.eng.cam.ac.uk/).
 15 | This HTK installation guide is retrieved from
 16 | [Link](https://github.com/prosodylab/Prosodylab-Aligner).
 17 | (2021-04-13) Installation is based on macOS Sierra.
 18 | (2023-05-06) Installation is based on macOS Ventura 13.3.1 (a) - Apple M1 Max
 19 | 
 20 | ### 1.1 For Arch Linux
 21 | I couldn't run HTK-3.4.1 on Arch Linux. I switched to 3.4.0
 22 | and everything works fine. Installation of HTK is the same as the one
 23 | described below.
 24 | 
 25 | Unzip HTK-3.4.1.tar.gz file. I unzipped htk under the current repository so as to keep it in the same directory.
 26 | 
 27 | ```bash
 28 | $ tar -xvf HTK-3.4.1.tar.gz
 29 | ```
 30 | 
 31 | After extracting the tar file, switch to htk directory.
 32 | 
 33 | ```bash
 34 | $ cd htk
 35 | ```
 36 | 
 37 | Compile HTK in the htk directory.
 38 | 
 39 | ```bash
 40 | $ export CPPFLAGS=-UPHNALG
 41 | $ ./configure --disable-hlmtools --disable-hslab
 42 | $ make clean    # necessary if you're not starting from scratch
 43 | $ make all
 44 | $ sudo make install # use "sudo" to make htk functions available for all users
 45 | ```
 46 | 
 47 | ### 1.2 For Ubuntu 20.04.5 LTS
 48 | (Tested as of 2023-05-06)
 49 | ```bash
 50 | # Because I assume 64-bit platform, you may need to install 32-bit headers and libraries first (See: https://stackoverflow.com/a/54082790/7170059)
 51 | $ sudo apt-get install gcc-multilib
 52 | 
 53 | $ ./configure --disable-hlmtools --disable-hslab
 54 | $ make clean    # necessary if you're not starting from scratch
 55 | $ make all
 56 | $ sudo make install # use "sudo" to make htk functions available for all users
 57 | 
 58 | # Quick test if HVite works to confirm that htk functions are installed correctly
 59 | $ HVite
 60 | ```
 61 | 
 62 | ### 1.3 For macOS
 63 | (Tested as of 2023-05-06)
 64 | You may need to follow these steps before compiling HTK:
 65 | 
 66 | ```bash
 67 | # Add CPPFLAGS, LIBRARY_PATH
 68 | $ export CPPFLAGS=-I/opt/X11/include
 69 | $ export LIBRARY_PATH=/opt/X11/lib
 70 | 
 71 | # If the above doesn't work, do 
 72 | $ ln -s /opt/X11/include/X11 /usr/local/include/X11
 73 | 
 74 | # Replace line 21 (#include <malloc.h>) of HTKLib/strarr.c as below
 75 | #   include <malloc/malloc.h> 
 76 | 
 77 | # Replace line 1650 (labid != splabid) of HTKLib/HRec.c as below
 78 | #   labpr != splabid
 79 | # This step will prevent "ERROR [+8522] LatFromPaths: Align have dur<=0"
 80 | # See: https://speechtechie.wordpress.com/2009/06/12/using-htk-3-4-1-on-mac-os-10-5/
 81 | 
 82 | # Compile with options if necessary
 83 | $ ./configure 
 84 | $ make all
 85 | $ sudo make install  # use "sudo" to make htk functions available for all users
 86 | 
 87 | # Quick test if HVite works to confirm that htk functions are installed correctly
 88 | $ HVite
 89 | ```
 90 | 
 91 | ### 1.4 Troubleshooting
 92 | * If the "make" command generates errors such as:
 93 |     ```
 94 |     HGraf.c:73:10: fatal error: 'X11/Xlib.h' file not found
 95 |     ```
 96 |     * you may need to install XQuartz. Download XQuartz from [this site](https://www.xquartz.org/). You will have `/opt/X11` folder generated with necessary files. You need to manually compile HTKLib with the following command:
 97 |     ```bash
 98 |     $ cd HTKLib
 99 |     # Compile
100 |     $ gcc  -ansi -g -O2 -DNO_AUDIO -D'ARCH="darwin"' -I/usr/include/malloc -Wall -Wno-switch -g -O2 -I. -DPHNALG   -c -o HGraf.o HGraf.c -I /opt/X11/include
101 |     $ cd ..
102 |     # Set the path
103 |     $ export LIBRARY_PATH=/opt/X11/lib
104 |     # Run it again
105 |     $ make all
106 |     $ make install
107 |     (See: http://unixnme.blogspot.com/2018/01/build-htk-on-macos.html)
108 |     ```
109 | * If you encounter `strarr.c:21:10: fatal error: 'malloc.h' file not found`, then comment out `#include <malloc.h>` and add `#include <stdlib.h>` instead as follows.
110 |     (See: https://github.com/JoFrhwld/FAVE/issues/48#issue-602099201) ![error1](./images/error1.png) 
111 | 
112 | * If you see errors like `HTrain.c implicitly declaring library function 'finite'`, replace all `finite` functions in `HTKLib/HTrain.c` with `isfinite`. (See: https://trac.macports.org/ticket/61614)
113 | 
114 | * Architecture errors like `esignal.c:1184:25: error: use of undeclared identifier 'ARCH'` require replacing `ARCH` with the right architecture specifier. Open `HTKLib/esignal.c` and replace `ARCH` in lines such as `(strcmp(architecture, ARCH) == 0)` with `"darwin"`, as in `(strcmp(architecture, "darwin") == 0)` (See: https://wstyler.ucsd.edu/posts/p2fa_mac.html)
115 | 
116 | 
117 | ## 2. Install sox
118 | 
119 | ```bash
120 | $ sudo apt-get install sox
121 | 
122 | # or in Arch
123 | 
124 | $ sudo pacman -S sox
125 | 
126 | # or using brew
127 | 
128 | $ brew install sox
129 | ```
130 | 
131 | ## 3. Run
132 | 
133 | ### stand alone
134 | 
135 | ```bash
136 | $ python align.py examples/ploppy.wav examples/ploppy.txt examples/ploppy.TextGrid
137 | ```
138 | 
139 | ### as part of your code
140 | 
141 | You can invoke the aligner from your code:
142 | 
143 | ```python
144 | from p2fa import align
145 | 
146 | phoneme_alignments, word_alignments, av_score_per_frame = align.align('WAV_FILE_PATH', 'TRANSCRIPTION_FILE_PATH')
147 | 
148 | # or 
149 | 
150 | phoneme_alignments, word_alignments, state_alignments, av_score_per_frame = align.align('WAV_FILE_PATH', 'TRANSCRIPTION_FILE_PATH', state_align=True)
151 | ```
152 | 
153 | ## 4. Result
154 | 
155 | ![image_of_ploppy_dot_png](p2fa/_tmp/ploppy.png)
156 | 
157 | With state-alignments
158 | 
159 | ![image_of_ploppy_dot_png](p2fa/_tmp/ploppy_state.png)
160 | 
161 | ## TODO
162 | * [x] Updated installation guide
163 | * [ ] Refactor `align.py`
164 | 
165 | ## References
166 | - http://www.ling.upenn.edu/phonetics/p2fa/
167 | - Jiahong Yuan and Mark Liberman. 2008. Speaker identification on the SCOTUS corpus. Proceedings of Acoustics '08.
168 | - https://github.com/prosodylab/Prosodylab-Aligner (P2FA seems better than Prosodylab-Aligner based on my qualitative evaluation)
169 | - English HMM-state level aligner: [Link](https://github.com/jaekookang/p2fa_state_aligner)
170 | - Korean Forced Aligner: [Link](https://github.com/EMCSlabs/Programs/tree/master/Korean_FA) from EMCSLabs.
171 | - Installing p2fa on Mac [Link](https://wstyler.ucsd.edu/posts/p2fa_mac.html)
172 | 


--------------------------------------------------------------------------------
/bin/p2fa:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import getopt
 5 | 
 6 | from p2fa import align
 7 | 
 8 | def getopt2(name, options, default=None):
 9 |     val = [v for n, v in options if n == name]
10 |     if len(val) == 0:
11 |         return default
12 |     return value[0]
13 | 
14 | #try:
opts, args = getopt.getopt(sys.argv[1:], "r:s:e:", ["model="])

# get the three mandatory arguments
if len(args) != 3:
    raise ValueError(
        "Specify wavefile, a transcript file, and an output file!")

wav_file, trs_file, out_file = args

# getopt hands every value back as a string, but align() validates the
# sample rate against a list of integers, so convert it here.
sr_override_ = getopt2("-r", opts, None)
if sr_override_ is not None:
    sr_override_ = int(sr_override_)
wave_start_ = getopt2("-s", opts, "0.0")
wave_end_ = getopt2("-e", opts, None)

mypath = getopt2("--model", opts, None)
align.align(wav_file, trs_file, out_file, wave_start_, wave_end_, sr_override_, mypath)
30 | #except Exception as e:
31 | #    print(__doc__)
32 | #    (type, value, traceback) = sys.exc_info()
33 | #    print(value)
34 | #    sys.exit(0)


--------------------------------------------------------------------------------
/changes.txt:
--------------------------------------------------------------------------------
 1 | July 10, 2009 - Version 1.002
 2 | 
 3 | KME:  Added more punctuation symbols to the ones that align.py discards.
 4 | 
 5 | KME:  Fixed a bug in how align.py handles hyphenated words (now properly handles lines with both truncated and hyphenated words).
 6 | 
 7 | July 6, 2009
 8 | 
 9 | Removed the YMMV line in the readme about the choice
10 | of Linux distro.
11 | 
12 | Added make_package.sh, a script to build the tgz
13 | package. Must edit it to change version number.
14 | 
15 | June 30, 2009 - Version 1.001
16 | 
17 | Removed the -b, -p options since they are confusing.
18 | Noted that one must use HTK 3.4.1 with align.py.
19 | 
20 | June 17, 2009 - Version 1.0
21 | 
22 | Initial release of P2FA.
23 | 


--------------------------------------------------------------------------------
/images/error1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaekookang/p2fa_py3/d95bd403ede38cec40d8e02904a4a6fb5ded807a/images/error1.png


--------------------------------------------------------------------------------
/p2fa/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is a placeholder to allow this directory
2 | # to serve as a Python package, thus allowing e.g.:
3 | #
4 | # from p2tk.python.syllabify import syllabifier
5 | #
6 | # so long as this directory is in your module search
7 | # path (i.e. PYTHONPATH environment variable) or, say,
8 | # symlinked in the current directory.
9 | 


--------------------------------------------------------------------------------
/p2fa/_tmp/ploppy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaekookang/p2fa_py3/d95bd403ede38cec40d8e02904a4a6fb5ded807a/p2fa/_tmp/ploppy.png


--------------------------------------------------------------------------------
/p2fa/_tmp/ploppy_state.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaekookang/p2fa_py3/d95bd403ede38cec40d8e02904a4a6fb5ded807a/p2fa/_tmp/ploppy_state.png


--------------------------------------------------------------------------------
/p2fa/align.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """ Command-line usage:
  4 |       python align.py [options] wave_file transcript_file output_file
  5 |       where options may include:
  6 |         -r sampling_rate -- override which sample rate model to use, one of 8000, 11025, and 16000
  7 |         -s start_time    -- start of portion of wavfile to align (in seconds, default 0)
  8 |         -e end_time      -- end of portion of wavfile to align (in seconds, default to end)
  9 |         -t state_align   -- align HMM states (eg. s1, s2, s3) additionally; default=0
 10 |         -v verbose       -- print HCopy and HVite commandline; default=0
 11 | 
 12 |     You can also import this file as a module and use the functions directly.
 13 | 
 14 |     2018-02-22 JK, This file was modified for Python3.x
 15 |     2018-08-21  papagandalf, This file was modified so that it can be called from Python code
 16 |     2020-06-20 JK, command-line option fixed; 
 17 |                    verbose option added for debugging;
 18 |                    state-level alignment added;
 19 | """
 20 | 
 21 | import os
 22 | import wave
 23 | import re
 24 | import shutil
 25 | import tempfile
 26 | import argparse
 27 | 
 28 | 
 29 | TEMP_DIR = os.path.join(tempfile.gettempdir(), 'p2fa')
 30 | LOG_LIKELIHOOD_REGEX = r'.+==\s+\[\d+ frames\]\s+(-?\d+.\d+)'
 31 | 
 32 | 
 33 | def prep_wav(orig_wav, out_wav, sr_override, wave_start, wave_end, sr_models):
 34 |     if os.path.exists(out_wav) and False:
 35 |         f = wave.open(out_wav, 'r')
 36 |         sr = f.getframerate()
 37 |         f.close()
 38 |         print("Already re-sampled the wav file to " + str(sr))
 39 |         return sr
 40 | 
 41 |     f = wave.open(orig_wav, 'r')
 42 |     SR = f.getframerate()
 43 |     f.close()
 44 | 
 45 |     soxopts = ""
 46 |     if float(wave_start) != 0.0 or wave_end is not None:
 47 |         soxopts += " trim " + wave_start
 48 |         if wave_end is not None:
 49 |             soxopts += " " + str(float(wave_end) - float(wave_start))
 50 | 
 51 |     if (sr_models is not None and SR not in sr_models) or (
 52 |             sr_override is not None and SR != sr_override) or soxopts != "":
 53 |         new_sr = 11025
 54 |         if sr_override is not None:
 55 |             new_sr = sr_override
 56 | 
 57 |         print("Resampling wav file from " + str(SR) +
 58 |               " to " + str(new_sr) + soxopts + "...")
 59 |         SR = new_sr
 60 |         os.system("sox " + orig_wav + " -r " + str(SR) +
 61 |                   " " + out_wav + " " + soxopts)
 62 |     else:
 63 |         # print("Using wav file, already at sampling rate " + str(SR) + ".")
 64 |         os.system("cp -f " + orig_wav + " " + out_wav)
 65 | 
 66 |     return SR
 67 | 
 68 | 
 69 | def prep_mlf(trsfile, mlffile, word_dictionary, surround, between):
 70 |     # Read in the dictionary to ensure all of the words
 71 |     # we put in the MLF file are in the dictionary. Words
 72 |     # that are not are skipped with a warning.
 73 |     f = open(word_dictionary, 'r')
 74 |     the_dict = {}  # build hash table
 75 |     for line in f.readlines():
 76 |         if line != "\n" and line != "":
 77 |             the_dict[line.split()[0]] = True
 78 |     f.close()
 79 | 
 80 |     f = open(trsfile, 'r')
 81 |     lines = f.readlines()
 82 |     f.close()
 83 | 
 84 |     words = []
 85 | 
 86 |     if surround is not None:
 87 |         words += surround.split(',')
 88 | 
 89 |     i = 0
 90 | 
 91 |     # this pattern matches hyphenated words, such as TWENTY-TWO;
 92 |     # however, it doesn't work with longer things like SOMETHING-OR-OTHER
 93 |     hyphen_pat = re.compile(r'([A-Z]+)-([A-Z]+)')
 94 | 
 95 |     while i < len(lines):
 96 |         txt = lines[i].replace('\n', '')
 97 |         txt = txt.replace('{breath}', '{BR}').replace('&lt;noise&gt;', '{NS}')
 98 |         txt = txt.replace('{laugh}', '{LG}').replace('{laughter}', '{LG}')
 99 |         txt = txt.replace('{cough}', '{CG}').replace('{lipsmack}', '{LS}')
100 | 
101 |         for pun in [',', '.', ':', ';', '!', '?', '"', '%', '(', ')', '--', '---']:
102 |             txt = txt.replace(pun, '')
103 | 
104 |         txt = txt.upper()
105 | 
106 |         # break up any hyphenated words into two separate words
107 |         txt = re.sub(hyphen_pat, r'\1 \2', txt)
108 | 
109 |         txt = txt.split()
110 | 
111 |         for wrd in txt:
112 |             if wrd in the_dict:
113 |                 words.append(wrd)
114 |                 if between is not None:
115 |                     words.append(between)
116 |             else:
117 |                 print("SKIPPING WORD", wrd)
118 | 
119 |         i += 1
120 | 
121 |     # remove the last 'between' token from the end
122 |     if between is not None:
123 |         words.pop()
124 | 
125 |     if surround is not None:
126 |         words += surround.split(',')
127 | 
128 |     write_input_mlf(mlffile, words)
129 | 
130 | 
def write_input_mlf(mlffile, words):
    """Write *words* to *mlffile* as a single-entry MLF.

    The file consists of the ``#!MLF!#`` header, the ``"*/tmp.lab"`` label
    pattern, one word per line, and a terminating ``.`` line.

    Args:
        mlffile: Path of the file to (over)write.
        words: Iterable of word strings, written in order.
    """
    # The context manager closes the file even if a write fails.
    with open(mlffile, 'w') as fw:
        fw.write('#!MLF!#\n')
        fw.write('"*/tmp.lab"\n')
        fw.writelines(wrd + '\n' for wrd in words)
        fw.write('.\n')
139 | 
140 | 
141 | # def read_aligned_mlf(mlffile, sr, wave_start):
142 | #     # This reads a MLFalignment output  file with phone and word
143 | #     # alignments and returns a list of words, each word is a list containing
144 | #     # the word label followed by the phones, each phone is a tuple
145 | #     # (phone, start_time, end_time) with times in seconds.
146 | 
147 | #     f = open(mlffile, 'r')
148 | #     lines = [l.rstrip() for l in f.readlines()]
149 | #     f.close()
150 | 
151 | #     if len(lines) < 3:
152 | #         raise ValueError("Alignment did not complete succesfully.")
153 | 
154 | #     j = 2
155 | #     ret = []
156 | #     while lines[j] != '.':
157 | #         # Is this the start of a word; do we have a word label?
158 | #         if len(lines[j].split()) == 5:
159 | #             # Make a new word list in ret and put the word label at the beginning
160 | #             wrd = lines[j].split()[4]
161 | #             ret.append([wrd])
162 | 
163 | #         # Append this phone to the latest word (sub-)list
164 | #         ph = lines[j].split()[2]
165 | #         if sr == 11025:
166 | #             st = (float(lines[j].split()[0]) /
167 | #                   10000000.0 + 0.0125) * (11000.0 / 11025.0)
168 | #             en = (float(lines[j].split()[1]) /
169 | #                   10000000.0 + 0.0125) * (11000.0 / 11025.0)
170 | #         else:
171 | #             st = float(lines[j].split()[0]) / 10000000.0 + 0.0125
172 | #             en = float(lines[j].split()[1]) / 10000000.0 + 0.0125
173 | #         if st < en:
174 | #             ret[-1].append([ph, st + wave_start, en + wave_start])
175 | 
176 | #         j += 1
177 | 
178 | #     return ret
179 | 
def read_aligned_mlf(mlffile, SR, wave_start):
    """Parse an aligned MLF into per-word (or per-phone) segment lists.

    The first two lines of the file are skipped; every following line up to
    the terminating ``.`` is split on whitespace. Fields 0 and 1 are the start
    and end times in units of 100 ns, field 2 is the segment label, and a line
    with at least five fields starts a new group labelled by field 4.

    Args:
        mlffile: Path of the aligned MLF file.
        SR: Sample rate of the aligned audio; 11025 triggers a time rescale.
        wave_start: Offset in seconds (float) added to every time stamp.

    Returns:
        A list of groups ``[label, [segment, start, end], ...]`` with times in
        seconds. Segments whose start is not before their end are dropped.

    Raises:
        ValueError: If the file has fewer than three lines or lacks the
            terminating ``.`` line.
    """
    # TODO: extract log-likelihood score

    with open(mlffile, 'r') as f:
        lines = [l.rstrip() for l in f]

    if len(lines) < 3:
        raise ValueError("Alignment did not complete succesfully.")

    # NOTE(review): the 0.0125 s offset and the 11000/11025 factor are kept
    # from the original code; presumably they compensate for the analysis
    # window and the 11025 Hz model's frame rate - confirm.
    rescale = SR == 11025

    ret = []
    for line in lines[2:]:
        if line == '.':
            break
        fields = line.split()
        if len(fields) >= 5:
            # Start of a new group: put its label at the beginning.
            ret.append([fields[4]])

        # Append this segment to the latest group.
        ph = fields[2]
        st = float(fields[0]) / 10000000.0 + 0.0125
        en = float(fields[1]) / 10000000.0 + 0.0125
        if rescale:
            st = st * (11000.0 / 11025.0)
            en = en * (11000.0 / 11025.0)
        if st < en:
            ret[-1].append([ph, st + wave_start, en + wave_start])
    else:
        # No "." terminator: the file is truncated. The original ran off the
        # end of the list with a bare IndexError here.
        raise ValueError("Alignment did not complete succesfully.")

    return ret
218 | 
219 | 
def make_alignment_lists(word_alignments):
    """Split grouped alignments into a flat phone list and a word list.

    Each input element has the form
    ``["word label", ["phone1", start, end], ["phone2", start, end], ...]``.

    Returns:
        A ``(phons, wrds)`` tuple: ``phons`` holds every phone entry in order,
        ``wrds`` holds ``[label, first phone start, last phone end]`` for each
        word that has at least one phone.
    """
    # Everything after the label is a phone entry.
    phons = [phone for entry in word_alignments for phone in entry[1:]]

    # An entry consisting of the label only is an optional word (like a
    # pause) that was not actually realized, so it yields no word interval.
    wrds = [
        [entry[0], entry[1][1], entry[-1][2]]
        for entry in word_alignments
        if len(entry) != 1
    ]
    return phons, wrds
238 | 
239 | 
def get_av_log_likelihood_per_frame(file_path):
    """Return the score found on the last line of the file at *file_path*.

    The last line must match ``LOG_LIKELIHOOD_REGEX``; its captured group is
    converted to float.

    Raises:
        ValueError: If the file is empty or its last line does not match,
            instead of the bare IndexError / AttributeError the unchecked
            lookup used to produce.
    """
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()

    match = re.match(LOG_LIKELIHOOD_REGEX, lines[-1]) if lines else None
    if match is None:
        raise ValueError(
            "No log-likelihood score found in " + str(file_path))

    return float(match.group(1))
247 | 
248 | 
def write_text_grid(outfile, word_alignments, state_alignments=None):
    """Write the alignment to *outfile* in the "ooTextFile short" TextGrid format.

    Args:
        outfile: Path of the TextGrid file to write.
        word_alignments: List of
            ``["word label", ["phone1", start, end], ...]`` entries.
        state_alignments: Optional list of
            ``["phone label", ["state", start, end], ...]`` entries. When
            given, a "state" tier is written before the "phone" and "word"
            tiers.

    Raises:
        IndexError: If there are no phone (or, when requested, state)
            segments. The whole content is assembled before the file is
            opened, so no truncated file is left behind in that case.
    """
    # Flat list of phone segments (skip each word label).
    phons = [ph for wrd in word_alignments for ph in wrd[1:]]

    # Flat list of state segments (skip each phone label).
    states = None
    if state_alignments is not None:
        states = [st for sts in state_alignments for st in sts[1:]]

    # Word label, first phone start time, last phone end time. A word without
    # phones was an optional word (like a pause) that wasn't realized.
    wrds = [[wrd[0], wrd[1][1], wrd[-1][2]]
            for wrd in word_alignments if len(wrd) != 1]

    xmin = str(phons[0][1])
    xmax = str(phons[-1][-1])

    out = [
        'File type = "ooTextFile short"',
        '"TextGrid"',
        '',
        xmin,
        xmax,
        '<exists>',
        '3' if states is not None else '2',
    ]

    # state interval tier
    if states is not None:
        out += ['"IntervalTier"', '"state"',
                str(states[0][1]), str(states[-1][-1]), str(len(states))]
        for seg in states:
            out += [str(seg[1]), str(seg[2]), '"' + seg[0] + '"']

    # phone interval tier
    out += ['"IntervalTier"', '"phone"', xmin, xmax, str(len(phons))]
    for seg in phons:
        out += [str(seg[1]), str(seg[2]), '"' + seg[0] + '"']

    # word interval tier: each word ends where the next one starts, and the
    # last word ends with the last phone.
    out += ['"IntervalTier"', '"word"', xmin, xmax, str(len(wrds))]
    for cur, nxt in zip(wrds, wrds[1:]):
        out += [str(cur[1]), str(nxt[1]), '"' + cur[0] + '"']
    out += [str(wrds[-1][1]), str(phons[-1][2]), '"' + wrds[-1][0] + '"']

    with open(outfile, 'w') as fw:
        fw.write('\n'.join(out) + '\n')
322 | 
323 | 
def prep_working_directory():
    """Recreate TEMP_DIR from scratch: remove any old copy, then make it."""
    # Clear leftovers of a previous run first so mkdir starts from nothing.
    delete_working_directory()
    os.mkdir(TEMP_DIR)
327 | 
328 | 
def delete_working_directory():
    """Remove TEMP_DIR and its contents; an OSError during removal is ignored."""
    try:
        shutil.rmtree(TEMP_DIR)
    except OSError:
        # Best effort: e.g. the directory does not exist yet.
        return
334 | 
335 | 
def prep_scp(wavfile):
    """Write the two script files in TEMP_DIR that list the files to process.

    ``codetr.scp`` gets one line "<wavfile> <TEMP_DIR>/tmp.plp" and
    ``test.scp`` gets one line "<TEMP_DIR>/tmp.plp".

    Args:
        wavfile: Path of the prepared wav file.
    """
    plp_path = os.path.join(TEMP_DIR, 'tmp.plp')
    # Context managers close the files even if a write fails.
    with open(os.path.join(TEMP_DIR, 'codetr.scp'), 'w') as fw:
        fw.write(wavfile + ' ' + plp_path + '\n')
    with open(os.path.join(TEMP_DIR, 'test.scp'), 'w') as fw:
        fw.write(plp_path + '\n')
343 | 
344 | 
def create_plp(hcopy_config, verbose=False):
    """Run ``HCopy`` with *hcopy_config* on the files listed in codetr.scp.

    Args:
        hcopy_config: Path of the HCopy configuration file.
        verbose: When true, print the command line before running it.

    Raises:
        FileNotFoundError: If the ``HCopy`` executable cannot be found.
    """
    import subprocess  # local import: not among the module-level imports

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [
        'HCopy', '-T', '1',
        '-C', str(hcopy_config),
        '-S', os.path.join(TEMP_DIR, 'codetr.scp'),
    ]
    if verbose:
        print('creating plp...\n', ' '.join(cmd))

    subprocess.run(cmd)
356 | 
357 | 
def viterbi(input_mlf, word_dictionary, output_mlf, phoneset, hmmdir, state_align=False, verbose=False):
    """Run ``HVite`` and store its standard output in TEMP_DIR/aligned.results.

    Args:
        input_mlf: Path of the word-level input MLF (``-I``).
        word_dictionary: Path of the pronunciation dictionary.
        output_mlf: Path of the aligned MLF that HVite writes (``-i``).
        phoneset: Path of the model list file.
        hmmdir: Directory containing the ``macros`` and ``hmmdefs`` files.
        state_align: When true, add ``-f -y lab`` to the command line.
        verbose: When true, print the command line before running it.

    Raises:
        FileNotFoundError: If the ``HVite`` executable cannot be found.
    """
    import subprocess  # local import: not among the module-level imports

    results_path = os.path.join(TEMP_DIR, 'aligned.results')

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = ['HVite', '-T', '1', '-a', '-m']
    if state_align:
        cmd += ['-f', '-y', 'lab']
    cmd += [
        '-I', str(input_mlf),
        '-H', os.path.join(hmmdir, 'macros'),
        '-H', os.path.join(hmmdir, 'hmmdefs'),
        '-S', os.path.join(TEMP_DIR, 'test.scp'),
        '-i', str(output_mlf),
        '-p', '0.0', '-s', '5.0',
        str(word_dictionary),
        str(phoneset),
    ]

    if verbose:
        print('running viterbi...\n', ' '.join(cmd) + ' > ' + results_path)

    # Replaces the shell's "> aligned.results" redirection.
    with open(results_path, 'w') as results:
        subprocess.run(cmd, stdout=results)
381 | 
382 | 
def align(wavfile, trsfile, outfile=None, wave_start='0.0', wave_end=None, sr_override=None, model_path=None, custom_dict=None, state_align=False, verbose=False):
    """Force-align *trsfile* against *wavfile* and return the alignments.

    Args:
        wavfile: Path of the wav file to align.
        trsfile: Path of the transcript text file.
        outfile: Optional path of a TextGrid file to write.
        wave_start: Start of the portion to align, in seconds (str or number).
        wave_end: End of the portion to align, in seconds, or None for the end.
        sr_override: Sample rate to resample to (int or numeric string), or
            None. With the bundled models it must be 8000, 11025 or 16000.
        model_path: Directory with the acoustic model; defaults to the
            ``model`` directory next to this file.
        custom_dict: Optional path of an extra dictionary file appended to the
            model dictionary. Without it, ``dict.local`` in the current
            directory is appended when present.
        state_align: When true, also compute state-level alignments.
        verbose: When true, print the HCopy / HVite command lines.

    Returns:
        ``(phoneme_alignments, word_alignments, av_score_per_frame)``, or
        ``(phoneme_alignments, word_alignments, state_alignments,
        av_score_per_frame)`` when *state_align* is true.

    Raises:
        Exception: If *sr_override* is not one of the bundled model rates.
        FileNotFoundError: If a dictionary file does not exist.
    """
    surround_token = "sp"
    between_token = "sp"

    # Normalise values that command-line front ends hand over as strings (or
    # library callers as numbers) so the checks below behave consistently.
    if sr_override is not None:
        sr_override = int(sr_override)
    wave_start = str(wave_start)

    # If no model directory was said explicitly, get directory containing this script.
    hmmsubdir = ""
    sr_models = None
    if model_path is None:
        model_path = os.path.dirname(os.path.realpath(__file__)) + "/model"
        hmmsubdir = "FROM-SR"
        # sample rates for which there are acoustic models set up, otherwise
        # the signal must be resampled to one of these rates.
        sr_models = [8000, 11025, 16000]

    if sr_override is not None and sr_models is not None and sr_override not in sr_models:
        raise Exception("invalid sample rate: not an acoustic model available")

    word_dictionary = os.path.join(TEMP_DIR, 'dict')
    input_mlf = os.path.join(TEMP_DIR, 'tmp.mlf')
    output_mlf = os.path.join(TEMP_DIR, 'aligned.mlf')
    results_mlf = os.path.join(TEMP_DIR, 'aligned.results')
    if state_align:
        state_mlf = os.path.join(TEMP_DIR, 'aligned_state.mlf')
    else:
        state_mlf = None

    # create working directory
    prep_working_directory()

    # Create the working dictionary by concatenating the model dictionary with
    # a custom or local one. Done in Python rather than with a shell "cat" so
    # it works without a POSIX shell and with paths containing spaces.
    dict_parts = [os.path.join(model_path, 'dict')]
    if custom_dict is not None:
        dict_parts.append(custom_dict)
    elif os.path.exists("dict.local"):
        dict_parts.append("dict.local")
    with open(word_dictionary, 'wb') as merged:
        for part in dict_parts:
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, merged)

    # prepare wavefile: do a resampling if necessary
    tmpwav = os.path.join(TEMP_DIR, 'sound.wav')
    sr = prep_wav(wavfile, tmpwav, sr_override, wave_start, wave_end, sr_models)

    if hmmsubdir == "FROM-SR":
        hmmsubdir = str(sr)

    # prepare mlfile
    prep_mlf(trsfile, input_mlf, word_dictionary,
             surround_token, between_token)

    # prepare scp files
    prep_scp(tmpwav)

    # generate the plp file using a given configuration file for HCopy
    create_plp(os.path.join(model_path, hmmsubdir, 'config'), verbose=verbose)

    # run Viterbi decoding
    mpfile = os.path.join(model_path, 'monophones')
    if not os.path.exists(mpfile):
        mpfile = os.path.join(model_path, 'hmmnames')

    hmmdir = os.path.join(model_path, hmmsubdir)
    viterbi(input_mlf, word_dictionary, output_mlf, mpfile, hmmdir, verbose=verbose)
    if state_align:
        viterbi(input_mlf, word_dictionary, state_mlf, mpfile, hmmdir, state_align=True, verbose=verbose)
        state_alignments = read_aligned_mlf(state_mlf, sr, float(wave_start))
    else:
        state_alignments = None

    _alignments = read_aligned_mlf(output_mlf, sr, float(wave_start))
    phoneme_alignments, word_alignments = make_alignment_lists(_alignments)

    av_score_per_frame = get_av_log_likelihood_per_frame(results_mlf)

    # output the alignment as a Praat TextGrid
    if outfile is not None:
        write_text_grid(outfile, _alignments, state_alignments=state_alignments)

    # clean directory
    delete_working_directory()
    if not state_align:
        return phoneme_alignments, word_alignments, av_score_per_frame
    else:
        return phoneme_alignments, word_alignments, state_alignments, av_score_per_frame
466 | 
467 | 
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run a single alignment.
    cli = argparse.ArgumentParser(
        description='P2FA for Python3 (https://github.com/jaekookang/p2fa_py3)')

    # Three mandatory positional paths.
    for positional, help_text in (
            ('wavfile', 'Provide wav file with valid path'),
            ('trsfile', 'Provide transcription file (txt) with valid path'),
            ('outfile', 'Provide output filename (TextGrid) with valid path')):
        cli.add_argument(positional, type=str, help=help_text)

    cli.add_argument(
        '-r', '--sampling_rate', type=int, default=11025,
        choices=[8000, 11025, 16000],
        help='override which sample rate model to use, one of 8000, 11025, and 16000')
    cli.add_argument(
        '-s', '--start_time', default='0.0',
        help='start of portion of wavfile to align (in seconds, default 0)')
    cli.add_argument(
        '-e', '--end_time', default=None,
        help='end of portion of wavfile to align (in seconds, defaul to end)')
    cli.add_argument(
        '-t', '--state_align', type=int, default=0, choices=[0, 1],
        help='align HMM states (eg. s1, s2, s3) additionally; default=0')
    cli.add_argument(
        '-v', '--verbose', type=int, default='0', choices=[0, 1],
        help='print HCopy and HVite commandlines; default=0')

    ns = cli.parse_args()

    align(
        ns.wavfile,
        ns.trsfile,
        outfile=ns.outfile,
        wave_start=ns.start_time,
        wave_end=ns.end_time,
        sr_override=ns.sampling_rate,
        model_path=None,
        custom_dict=None,
        state_align=int(ns.state_align),
        verbose=int(ns.verbose),
    )
491 | 


--------------------------------------------------------------------------------
/p2fa/examples/ploppy.TextGrid:
--------------------------------------------------------------------------------
  1 | File type = "ooTextFile short"
  2 | "TextGrid"
  3 | 
  4 | 0.012471655328798186
  5 | 2.147619047619048
  6 | <exists>
  7 | 2
  8 | "IntervalTier"
  9 | "phone"
 10 | 0.012471655328798186
 11 | 2.147619047619048
 12 | 19
 13 | 0.012471655328798186
 14 | 0.08231292517006802
 15 | "sp"
 16 | 0.08231292517006802
 17 | 0.2219954648526077
 18 | "AY1"
 19 | 0.2219954648526077
 20 | 0.2519274376417234
 21 | "AE1"
 22 | 0.2519274376417234
 23 | 0.3217687074829932
 24 | "M"
 25 | 0.3217687074829932
 26 | 0.41156462585034015
 27 | "T"
 28 | 0.41156462585034015
 29 | 0.48140589569160996
 30 | "R"
 31 | 0.48140589569160996
 32 | 0.5412698412698412
 33 | "AY1"
 34 | 0.5412698412698412
 35 | 0.611111111111111
 36 | "NG"
 37 | 0.611111111111111
 38 | 0.6909297052154195
 39 | "T"
 40 | 0.6909297052154195
 41 | 0.7408163265306121
 42 | "UW1"
 43 | 0.7408163265306121
 44 | 0.7807256235827664
 45 | "sp"
 46 | 0.7807256235827664
 47 | 0.8206349206349206
 48 | "S"
 49 | 0.8206349206349206
 50 | 0.900453514739229
 51 | "EY1"
 52 | 0.900453514739229
 53 | 0.9802721088435373
 54 | "P"
 55 | 0.9802721088435373
 56 | 1.0401360544217686
 57 | "L"
 58 | 1.0401360544217686
 59 | 1.1997732426303853
 60 | "AA1"
 61 | 1.1997732426303853
 62 | 1.299546485260771
 63 | "P"
 64 | 1.299546485260771
 65 | 1.519047619047619
 66 | "IH0"
 67 | 1.519047619047619
 68 | 2.147619047619048
 69 | "sp"
 70 | "IntervalTier"
 71 | "word"
 72 | 0.012471655328798186
 73 | 2.147619047619048
 74 | 9
 75 | 0.012471655328798186
 76 | 0.08231292517006802
 77 | "sp"
 78 | 0.08231292517006802
 79 | 0.2219954648526077
 80 | "I"
 81 | 0.2219954648526077
 82 | 0.3217687074829932
 83 | "AM"
 84 | 0.3217687074829932
 85 | 0.611111111111111
 86 | "TRYING"
 87 | 0.611111111111111
 88 | 0.7408163265306121
 89 | "TO"
 90 | 0.7408163265306121
 91 | 0.7807256235827664
 92 | "sp"
 93 | 0.7807256235827664
 94 | 0.900453514739229
 95 | "SAY"
 96 | 0.900453514739229
 97 | 1.519047619047619
 98 | "PLOPPY"
 99 | 1.519047619047619
100 | 2.147619047619048
101 | "sp"
102 | 


--------------------------------------------------------------------------------
/p2fa/examples/ploppy.txt:
--------------------------------------------------------------------------------
1 | I
2 | AM
3 | TRYING
4 | TO
5 | SAY
6 | PLOPPY


--------------------------------------------------------------------------------
/p2fa/examples/ploppy.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaekookang/p2fa_py3/d95bd403ede38cec40d8e02904a4a6fb5ded807a/p2fa/examples/ploppy.wav


--------------------------------------------------------------------------------
/p2fa/model/11025/config:
--------------------------------------------------------------------------------
 1 | # Coding parameters
 2 | SOURCEKIND = WAVEFORM
 3 | SOURCEFORMAT = WAVE
 4 | SOURCERATE = 907.02947845804988
 5 | TARGETKIND = PLP_0_D_A_Z
 6 | TARGETRATE = 100000.0
 7 | SAVECOMPRESSED = T
 8 | SAVEWITHCRC = T
 9 | WINDOWSIZE = 250000.0
10 | ZMEANSOURCE = T
11 | USEHAMMING = T
12 | PREEMCOEF = 0.97
13 | NUMCHANS = 20
14 | LPCORDER = 12 
15 | USEPOWER = T
16 | 


--------------------------------------------------------------------------------
/p2fa/model/11025/macros:
--------------------------------------------------------------------------------
1 | ~o
2 | <STREAMINFO> 1 39
3 | <VECSIZE> 39<NULLD><PLP_D_A_Z_0><DIAGC>
4 | ~v "varFloor1"
5 | <VARIANCE> 39
6 |  2.970232e-03 3.081554e-03 3.337499e-03 4.222610e-03 4.197491e-03 3.755180e-03 3.401211e-03 3.156109e-03 2.829444e-03 2.476874e-03 1.801175e-03 1.400571e-03 4.726708e-03 1.402909e-04 1.383319e-04 1.553502e-04 2.128327e-04 2.107100e-04 2.003327e-04 2.263938e-04 2.249473e-04 2.067962e-04 1.757082e-04 1.399256e-04 1.028699e-04 1.197369e-04 2.207970e-05 2.272787e-05 2.571406e-05 3.619217e-05 3.745446e-05 3.682210e-05 4.203814e-05 4.217610e-05 3.967129e-05 3.367268e-05 2.703490e-05 1.971991e-05 1.748702e-05
7 | 


--------------------------------------------------------------------------------
/p2fa/model/16000/config:
--------------------------------------------------------------------------------
 1 | # Coding parameters (PLP_0_D_A_Z)
 2 | SOURCEKIND = WAVEFORM
 3 | SOURCEFORMAT = WAVE
 4 | SOURCERATE = 625.0
 5 | TARGETKIND = PLP_0_D_A_Z
 6 | TARGETRATE = 100000.0
 7 | SAVECOMPRESSED = T
 8 | SAVEWITHCRC = T
 9 | WINDOWSIZE = 250000.0
10 | ZMEANSOURCE = T
11 | USEHAMMING = T
12 | PREEMCOEF = 0.97
13 | NUMCHANS = 20
14 | LPCORDER = 12 
15 | USEPOWER = T
16 | 


--------------------------------------------------------------------------------
/p2fa/model/16000/macros:
--------------------------------------------------------------------------------
1 | ~o
2 | <STREAMINFO> 1 39
3 | <VECSIZE> 39<NULLD><PLP_D_A_Z_0><DIAGC>
4 | ~v "varFloor1"
5 | <VARIANCE> 39
6 |  2.970232e-03 3.081554e-03 3.337499e-03 4.222610e-03 4.197491e-03 3.755180e-03 3.401211e-03 3.156109e-03 2.829444e-03 2.476874e-03 1.801175e-03 1.400571e-03 4.726708e-03 1.402909e-04 1.383319e-04 1.553502e-04 2.128327e-04 2.107100e-04 2.003327e-04 2.263938e-04 2.249473e-04 2.067962e-04 1.757082e-04 1.399256e-04 1.028699e-04 1.197369e-04 2.207970e-05 2.272787e-05 2.571406e-05 3.619217e-05 3.745446e-05 3.682210e-05 4.203814e-05 4.217610e-05 3.967129e-05 3.367268e-05 2.703490e-05 1.971991e-05 1.748702e-05
7 | 


--------------------------------------------------------------------------------
/p2fa/model/8000/config:
--------------------------------------------------------------------------------
 1 | # Coding parameters
 2 | SOURCEKIND = WAVEFORM
 3 | SOURCEFORMAT = WAVE
 4 | SOURCERATE = 1250
 5 | TARGETKIND = PLP_0_D_A_Z
 6 | TARGETRATE = 100000.0
 7 | SAVECOMPRESSED = T
 8 | SAVEWITHCRC = T
 9 | WINDOWSIZE = 250000.0
10 | ZMEANSOURCE = T
11 | USEHAMMING = T
12 | PREEMCOEF = 0.97
13 | NUMCHANS = 20
14 | LPCORDER = 12 
15 | USEPOWER = T
16 | 


--------------------------------------------------------------------------------
/p2fa/model/8000/macros:
--------------------------------------------------------------------------------
1 | ~o
2 | <STREAMINFO> 1 39
3 | <VECSIZE> 39<NULLD><PLP_D_A_Z_0><DIAGC>
4 | ~v "varFloor1"
5 | <VARIANCE> 39
6 |  2.320759e-03 3.364773e-03 2.644561e-03 4.602237e-03 4.153211e-03 3.535625e-03 3.436818e-03 3.055576e-03 2.946933e-03 2.210875e-03 1.983593e-03 1.391166e-03 5.161191e-03 1.195636e-04 1.395769e-04 1.410736e-04 2.242859e-04 2.118236e-04 2.178820e-04 2.484023e-04 2.270718e-04 2.155360e-04 1.773744e-04 1.613469e-04 1.159174e-04 1.315518e-04 1.986226e-05 2.259619e-05 2.456991e-05 3.887276e-05 3.827550e-05 4.066243e-05 4.655687e-05 4.391165e-05 4.144727e-05 3.483306e-05 3.158762e-05 2.273686e-05 1.879711e-05
7 | 


--------------------------------------------------------------------------------
/p2fa/model/monophones:
--------------------------------------------------------------------------------
 1 | EH2
 2 | K
 3 | S
 4 | L
 5 | AH0
 6 | M
 7 | EY1
 8 | SH
 9 | N
10 | P
11 | OY2
12 | T
13 | OW1
14 | Z
15 | W
16 | D
17 | AH1
18 | B
19 | EH1
20 | V
21 | IH1
22 | AA1
23 | R
24 | AY1
25 | ER0
26 | AE1
27 | AE2
28 | AO1
29 | NG
30 | G
31 | IH0
32 | TH
33 | IY2
34 | F
35 | DH
36 | IY1
37 | HH
38 | UH1
39 | IY0
40 | OY1
41 | OW2
42 | CH
43 | UW1
44 | IH2
45 | EH0
46 | AO2
47 | AA0
48 | AA2
49 | OW0
50 | EY0
51 | AE0
52 | AW2
53 | AW1
54 | EY2
55 | UW0
56 | AH2
57 | UW2
58 | AO0
59 | JH
60 | Y
61 | ZH
62 | AY2
63 | ER1
64 | UH2
65 | AY0
66 | ER2
67 | OY0
68 | UH0
69 | AW0
70 | br
71 | cg
72 | lg
73 | ls
74 | ns
75 | sil
76 | sp
77 | 


--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
  1 | University of Pennsylvania Department of Linguistics
  2 | Penn Phonetics Lab Forced Aligner Toolkit (P2FA)
  3 | http://www.ling.upenn.edu/phonetics/p2fa/
  4 | ============================================================
  5 | 
  6 | A. Introduction
  7 | ---------------
  8 | 
  9 | The Penn Phonetics Lab Forced Aligner (P2FA) is an automatic phonetic 
 10 | alignment toolkit based on HTK. It contains the acoustic models of 
 11 | American English, a Python script that can be used to do forced 
 12 | alignment, as well as this readme file and some examples. There is 
 13 | also an online processing system on the P2FA website with which you 
 14 | can submit a WAV file and transcript and get back a Praat TextGrid 
 15 | file by email.
 16 | 
 17 | The acoustic models included in the toolkit are GMM-based monophone 
 18 | HMMs. Each HMM state has 32 Gaussian mixture components on 39 PLP 
 19 | coefficients. Separate models were created for speech sampled at 8 KHz, 
 20 | 11,025 Hz, and 16 KHz. The acoustic model includes a robust short-pause 
 21 | ("sp") HMM inserted optionally between words which greatly improves
 22 | alignment accuracy.
 23 | 
 24 | P2FA can be cited as:
 25 |  Jiahong Yuan and Mark Liberman. 2008. Speaker identification on the
 26 |  SCOTUS corpus. Proceedings of Acoustics '08.
 27 | 
 28 | B. Included in the Toolkit
 29 | --------------------------
 30 | 
 31 | ./model/: The acoustic models, parameter files, and CMU 
 32 | pronunciation dictionary;
 33 | 
 34 | ./align.py: a python script that automates the procedure of doing 
 35 | forced alignment, creating a Praat TextGrid file from a WAV file and 
 36 | an orthographic transcript;
 37 | 
 38 | ./test/: files for testing the Toolkit;
 39 | 
 40 | ./examples/: the example files.
 41 | 
 42 | C. Prerequisites
 43 | ----------------
 44 | 
 45 | You will need the HTK toolkit version 3.4 already installed to do forced 
 46 | alignment. HTK can be found at http://htk.eng.cam.ac.uk/. Only version
 47 | 3.4 is supported by the align.py script. The newer version 3.4.1 has
 48 | a problem aligning the 'sp' model.
 49 | 
 50 | You need to have Python 2.5/2.6 (earlier/later versions have not been tested)
 51 | for the align.py script.
 52 | 
 53 | D. Using align.py
 54 | -----------------
 55 | 
 56 | 1. Run "python align.py wavfile trsfile output_file". If you are not
 57 |    running align.py from the toolkit directory, you will need to
 58 |    specify its path.
 59 |    
 60 |    wavfile is the path to a WAV file containing the audio to be 
 61 |    aligned. If it was not sampled at one of the three sampling
 62 |    rates used for the acoustic models, it will be automatically
 63 |    resampled to 11,025 Hz. This sampling rate is recommended.
 64 |    
 65 |    trsfile is a text file containing the transcript. Spaces or newlines 
 66 |    should separate words. If a word is not found in the CMU pronouncing 
 67 |    dictionary, an error will occur, but you can edit the file model/dict 
 68 |    and add new pronunciations as needed. You may include the following 
 69 |    labels in the transcript: '{SL}' for silence, '{LG}' for laughter, 
 70 |    '{NS}' for noise, '{CG}' for cough, '{BR}' for breath, and '{LS}' for 
 71 |    lipsmack.
 72 |    
 73 |    output_file is a Praat TextGrid file containing the result of
 74 |    the forced alignment.
 75 | 
 76 |    The toolkit contains example files for testing. You can align
 77 |    the test files with:
 78 |    
 79 |    python align.py ./test/BREY00538.wav ./test/BREY00538.txt ./test/BREY00538.TextGrid
 80 | 
 81 |    (the command must be entered as a single, unbroken line; do not
 82 |     split it across lines)
 83 |     
 84 |    The output file may contain 'sp' intervals between words where
 85 |    there was a pause.
 86 | 
 87 | 2. If you have more than one file to align, you can write a shell 
 88 |    script and call align.py in a loop. You can also follow the 
 89 |    instructions below.
 90 | 
 91 | 3. Several command-line options can also be included. They must
 92 |    precede the specification of the WAV file. They are:
 93 | 
 94 |   	-r sampling_rate	override which sample rate model to use,
 95 |   						one of 8000, 11025, and 16000. The default
 96 |   						is the sampling rate of the WAV file if it
 97 |   						is one of the three, otherwise 11025.
 98 | 	-s start_time		start of portion of wavfile to align (in
 99 | 						seconds, default 0)
100 | 	-e end_time			end of portion of wavfile to align (in seconds,
101 | 						default to end)
102 | 
103 | E. Doing Forced Alignment The Hard Way
104 | --------------------------------------
105 | 
106 | 1. Please refer to the files under ./examples/ for the right
107 |    formats for the files described in the next steps.
108 | 
109 | 2. Prepare speech files and their word transcription as described 
110 |    below. You can do forced alignment for very long speech files (e.g., 
111 |    one hour), or you can also align many files in one step.
112 | 
113 | 3. Create the reference transcription file, transcript.mlf. 
114 |    transcript.mlf is a HTK "master label file" containing the 
115 |    transcripts of all the files to be force-aligned. Below are the steps 
116 |    to generate the file:
117 | 
118 |     I. Capitalize all the letters of the words. If a word doesn’t 
119 |     appear in the CMU pronunciation dictionary, you can either 
120 |     manually add it to the dictionary or exclude it from forced 
121 |     alignment, depending on your goal. The alignment will fail if 
122 |     there is an unknown word in transcript.mlf.
123 | 
124 |     II. You may include the following labels in transcript.mlf: 
125 |     '{SL}' for silence, '{LG}' for laughter, '{NS}' for noise, 
126 |     '{CG}' for cough, '{BR}' for breath, and '{LS}' for lipsmack.
127 | 
128 |     III. You may want to insert an 'sp' between every two words. 'sp' 
129 |     stands for small pause. It can have zero length (no pause) from 
130 |     forced alignment.
131 | 
132 | 4. Create code.scp and test.scp. They contain the names of the files 
133 |    to be coded and aligned respectively.
134 | 
135 | 5. Extract acoustic features:
136 |                  
137 |    HCopy -T 1 -C ./model/your-sampling-rate/config -S code.scp
138 | 
139 |    The file config contains parameter settings for the speech files 
140 |    (.wav, .raw, sampling rate, etc.) and for the acoustic features 
141 |    (mfcc, plp, etc.). If your speech files have a different sampling 
142 |    rate other than 8,000Hz, 11,025Hz, or 16,000Hz, you can downsample 
143 |    or upsample to 11,025 Hz (which by far has the best performance). 
144 |    In our training procedure, we downsampled from 44,100 Hz to the 
145 |    target sampling rate using 'sox -polyphase'.
146 | 
147 | 6. Forced alignment:
148 | 
149 |    HVite -T 1 -a -m -I transcript.mlf -H ./model/your-sampling-rate/macros -H ./model/your-sampling-rate/hmmdefs -S test.scp -i ./align.mlf -p 0.0 -s 5.0 ./model/dict ./model/monophones
150 | 
151 |    align.mlf contains the forced alignment results. You can convert it to 
152 |    label files, e.g., Praat TextGrids. Please note that: a) the time 
153 |    unit in align.mlf is 100 ns (0.0000001 second); b) there is a 
154 |    rounding issue when the sampling rate is 11,025 Hz and the time 
155 |    step is 10 milliseconds, so you need to correct the time stamps in 
156 |    align.mlf. You can use the following formula to convert the time 
157 |    stamp x into seconds:
158 |         y = (x/10000000 + 0.0125)*(11000/11025).  
159 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="p2fa",
 8 |     version="0.0.2",
 9 |     author="",
10 |     author_email="",
11 |     description="Python wrapper for Penn Forced Aligner",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="",
15 |     packages=setuptools.find_packages(),
16 |     package_data={'p2fa': ['model/*', 'model/*/*']},
17 |     scripts=['bin/p2fa'],
18 |     classifiers=[
19 |         "Programming Language :: Python :: 3",
20 |         "Operating System :: OS Independent",
21 |     ],
22 | )
23 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaekookang/p2fa_py3/d95bd403ede38cec40d8e02904a4a6fb5ded807a/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_p2fa.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import unittest
 4 | import os
 5 | import subprocess
 6 | import tempfile
 7 | import filecmp
 8 | from p2fa import align
 9 | 
# Directory that holds the imported p2fa.align module; the example fixtures
# used by the tests are resolved relative to it.
module_path = os.path.dirname(align.__file__)
11 | 
12 | 
class P2FATest(unittest.TestCase):
    """End-to-end checks of the aligner on the bundled ``ploppy`` example.

    Each test aligns ``examples/ploppy.wav`` against ``examples/ploppy.txt``
    and compares the generated TextGrid with ``examples/ploppy.TextGrid``.
    """

    input_wav = os.path.join(module_path, 'examples', 'ploppy.wav')
    input_transcription = os.path.join(module_path, 'examples', 'ploppy.txt')
    outfile = os.path.join(tempfile.gettempdir(), 'ploppy.gen.TextGrid')
    true_alignment_file = os.path.join(module_path, 'examples', 'ploppy.TextGrid')

    def setUp(self):
        """Remove any output left by an earlier test or an earlier run.

        Both tests write to the same path, so a stale file could otherwise
        make a test pass even though the aligner produced nothing.
        """
        try:
            os.remove(self.outfile)
        except FileNotFoundError:
            pass

    def _assert_output_matches_reference(self):
        """Assert that the generated TextGrid equals the reference file."""
        self.assertTrue(os.path.isfile(self.outfile),
                        'aligner did not write ' + self.outfile)
        # shallow=False forces a byte-by-byte comparison of the contents.
        self.assertTrue(filecmp.cmp(self.outfile, self.true_alignment_file,
                                    shallow=False))

    def test_aligner(self):
        """Align through the Python API (``align.align``)."""
        align.align(self.input_wav, self.input_transcription, self.outfile)
        self._assert_output_matches_reference()

    def test_standalone_aligner(self):
        """Align by running ``align.py`` as a script in a subprocess."""
        import sys

        # Run the file of the module that was actually imported, with the
        # current interpreter, so the test depends neither on the working
        # directory nor on the script's executable bit. check=True turns a
        # non-zero exit status into a test error instead of ignoring it.
        subprocess.run([sys.executable, align.__file__, self.input_wav,
                        self.input_transcription, self.outfile], check=True)
        self._assert_output_matches_reference()
26 | 


--------------------------------------------------------------------------------