├── .github ├── .gitignore ├── CODE_OF_CONDUCT.md └── workflows │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── src ├── .gitignore ├── cpp11.cpp └── code.cpp ├── vignettes └── articles │ ├── .gitignore │ └── textgrid-specification.Rmd ├── .gitignore ├── man ├── figures │ ├── logo.png │ └── demo-textgrid.png ├── readtextgrid-package.Rd ├── example_textgrid.Rd ├── read_textgrid.Rd └── pivot_textgrid_tiers.Rd ├── inst ├── utf_16_be.TextGrid ├── Mary_John_bell.TextGrid ├── draw-tg-parts.praat ├── speaker-data │ ├── speaker001 │ │ ├── s2T04.TextGrid │ │ ├── s2T01.TextGrid │ │ ├── s2T02.TextGrid │ │ ├── s2T03.TextGrid │ │ └── s2T05.TextGrid │ └── speaker002 │ │ ├── s2T04.TextGrid │ │ ├── s2T01.TextGrid │ │ ├── s2T02.TextGrid │ │ ├── s2T03.TextGrid │ │ └── s2T05.TextGrid ├── make-logo.R ├── nested-intervals.TextGrid └── draw-tg-parts.Collection ├── tests ├── testthat.R └── testthat │ ├── test-data │ ├── short.TextGrid │ ├── praat-test │ │ ├── okay-digit-dot-space.TextGrid │ │ ├── fail-space-dot-digit.TextGrid │ │ ├── okay-plus-digit-or-minus-digit.TextGrid │ │ ├── fail-space-plus-dot-digit.TextGrid │ │ ├── okay-percents-fractions.TextGrid │ │ ├── okay-scientific-notation.TextGrid │ │ ├── fail-letters-digits.TextGrid │ │ ├── okay-real-with-trailing-characters.TextGrid │ │ ├── okay-hex-numbers.TextGrid │ │ ├── okay-percents.TextGrid │ │ └── okay-fractions.TextGrid │ ├── comment.TextGrid │ ├── points.TextGrid │ ├── Mary_John_bell.TextGrid │ ├── elan.TextGrid │ ├── quoted.TextGrid │ ├── hard-to-parse-normalized.TextGrid │ ├── hard-to-parse.TextGrid │ └── nested-intervals.TextGrid │ └── test-read-textgrid.R ├── _pkgdown.yml ├── cran-comments.md ├── R ├── readtextgrid-package.R ├── cpp11.R ├── pivot.R ├── pure-r-parser.R ├── legacy.R └── readtextgrid.R ├── NAMESPACE ├── .Rbuildignore ├── readtextgrid.Rproj ├── DESCRIPTION ├── NEWS.md ├── README.Rmd ├── README.md └── LICENSE.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 
| *.html 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /vignettes/articles/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | docs 6 | Praat.exe 7 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tjmahr/readtextgrid/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /inst/utf_16_be.TextGrid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tjmahr/readtextgrid/HEAD/inst/utf_16_be.TextGrid -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(readtextgrid) 3 | 4 | test_check("readtextgrid") 5 | -------------------------------------------------------------------------------- /man/figures/demo-textgrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tjmahr/readtextgrid/HEAD/man/figures/demo-textgrid.png -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: 
https://www.tjmahr.com/readtextgrid/ 2 | template: 3 | bootstrap: 5 4 | theme: arrow-light 5 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | Tested on local Windows, five default GitHub actions environments, and 2 | Devel on WinBuilder 3 | -------------------------------------------------------------------------------- /R/readtextgrid-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @useDynLib readtextgrid, .registration = TRUE 6 | ## usethis namespace: end 7 | NULL 8 | -------------------------------------------------------------------------------- /R/cpp11.R: -------------------------------------------------------------------------------- 1 | # Generated by cpp11: do not edit by hand 2 | 3 | cpp_tg_scan_tokens <- function(src) { 4 | .Call(`_readtextgrid_cpp_tg_scan_tokens`, src) 5 | } 6 | 7 | cpp_parse_praat_numbers <- function(x) { 8 | .Call(`_readtextgrid_cpp_parse_praat_numbers`, x) 9 | } 10 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(example_textgrid) 4 | export(legacy_read_textgrid) 5 | export(legacy_read_textgrid_lines) 6 | export(pivot_textgrid_tiers) 7 | export(read_textgrid) 8 | export(read_textgrid_lines) 9 | useDynLib(readtextgrid, .registration = TRUE) 10 | -------------------------------------------------------------------------------- /tests/testthat/test-data/short.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 0 4 | 2.3 5 | 6 | 3 7 | "IntervalTier" 8 | "Mary" 9 | 0 10 | 2.3 11 | 1 12 | 0 
13 | 2.3 14 | "" 15 | "IntervalTier" 16 | "John" 17 | 0 18 | 2.3 19 | 1 20 | 0 21 | 2.3 22 | "" 23 | "TextTier" 24 | "bell" 25 | 0 26 | 2.3 27 | 0 28 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | ^\.travis\.yml$ 6 | ^Praat.exe$ 7 | ^inst/draw-tg-parts.Collection$ 8 | ^inst/draw-tg-parts.praat$ 9 | ^inst/make-logo.R$ 10 | ^inst/tg-parts.png$ 11 | ^\.github$ 12 | ^cran-comments\.md$ 13 | ^CRAN-RELEASE$ 14 | ^CODE_OF_CONDUCT\.md$ 15 | ^CRAN-SUBMISSION$ 16 | ^_pkgdown\.yml$ 17 | ^docs$ 18 | ^pkgdown$ 19 | ^vignettes/articles$ 20 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-digit-dot-space.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 2 14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 1. 18 | text = "" 19 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/fail-space-dot-digit.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = .0 5 | xmax = 1 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 1. 
14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 1 18 | text = "" 19 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-plus-digit-or-minus-digit.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2 6 | tiers? 7 | size = +1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = -0.3 13 | xmax = +2 14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = +1.0 18 | text = "" 19 | -------------------------------------------------------------------------------- /readtextgrid.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: XeLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /tests/testthat/test-data/comment.TextGrid: -------------------------------------------------------------------------------- 1 | "ooTextFile" 2 | "TextGrid" 3 | 0 2.3 ! time domain of TextGrid 4 | 5 | 3 tiers 6 | "IntervalTier" "Mary" ! type and name of tier 1 7 | 0 2.3 ! time domain of tier 1 8 | 1 interval coming 9 | 0 2.3 "" ! interval 1 on tier 1 10 | "IntervalTier" "John" ! type and name of tier 2 11 | 0 2.3 ! time domain of tier 2 12 | 1 interval coming 13 | 0 2.3 "" ! interval 1 on tier 2 14 | "TextTier" "bell" ! type and name of tier 3 15 | 0 2.3 ! 
time domain of tier 3 16 | 0 points coming 17 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/fail-space-plus-dot-digit.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = +.0 5 | xmax = 1 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 1.0 14 | intervals: size = 2 15 | intervals [1]: 16 | xmin = 0.0 17 | xmax = 0.5 18 | text = "zz" 19 | intervals [2]: 20 | xmin = 0.5 21 | xmax = 1 22 | text = "zz" 23 | -------------------------------------------------------------------------------- /tests/testthat/test-data/points.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "TextTier" 11 | name = "test" 12 | xmin = 0 13 | xmax = 1 14 | points: size = 3 15 | points [1]: 16 | number = 0.10000000000000007 17 | mark = "point 1" 18 | points [2]: 19 | number = 0.5 20 | mark = "point 2" 21 | points [3]: 22 | number = 0.8000000000000003 23 | mark = "point 3" 24 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-percents-fractions.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 00000.0 5 | xmax = 3 6 | tiers? 
7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "TextTier" 11 | name = "points" 12 | xmin = 0 13 | xmax = 1 14 | points: size = 3 15 | points [1]: 16 | number = 2/200% 17 | mark = "2/200% -> 1.0" 18 | points [2]: 19 | number = 300%/2 20 | mark = "300%/2 -> 1.5" 21 | points [3]: 22 | number = 400%/200% 23 | mark = "400%/200% -> 2.0" -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-scientific-notation.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2e1 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 2E1 14 | intervals: size = 3 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 5e-1 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.5 21 | xmax = 1e+1 22 | text = "" 23 | intervals [3]: 24 | xmin = 1.0e1 25 | xmax = 20 26 | text = "" 27 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/fail-letters-digits.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = nope0 5 | xmax = 2e1 6 | tiers? 
7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0e 13 | xmax = 2E 14 | intervals: size = 3E 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 5e-1 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.5 21 | xmax = 1e+1 22 | text = "" 23 | intervals [3]: 24 | xmin = 1.0e1 25 | xmax = 20 26 | text = "" -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-real-with-trailing-characters.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0ignored 5 | xmax = 2e1ignored 6 | tiers? 7 | size = 1ignored 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0e 13 | xmax = 2E1ignored 14 | intervals: size = 3E 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 5e-1ignored 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.5ignored 21 | xmax = 1e+1ignored 22 | text = "" 23 | intervals [3]: 24 | xmin = 1.0e1ignored 25 | xmax = 20 26 | text = "" 27 | -------------------------------------------------------------------------------- /inst/Mary_John_bell.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1 6 | tiers? 
7 | size = 3 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 1 14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 1 18 | text = "" 19 | item [2]: 20 | class = "IntervalTier" 21 | name = "John" 22 | xmin = 0 23 | xmax = 1 24 | intervals: size = 1 25 | intervals [1]: 26 | xmin = 0 27 | xmax = 1 28 | text = "" 29 | item [3]: 30 | class = "TextTier" 31 | name = "bell" 32 | xmin = 0 33 | xmax = 1 34 | points: size = 0 35 | -------------------------------------------------------------------------------- /tests/testthat/test-data/Mary_John_bell.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1 6 | tiers? 7 | size = 3 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = 1 14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 1 18 | text = "" 19 | item [2]: 20 | class = "IntervalTier" 21 | name = "John" 22 | xmin = 0 23 | xmax = 1 24 | intervals: size = 1 25 | intervals [1]: 26 | xmin = 0 27 | xmax = 1 28 | text = "" 29 | item [3]: 30 | class = "TextTier" 31 | name = "bell" 32 | xmin = 0 33 | xmax = 1 34 | points: size = 0 35 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-hex-numbers.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 00000.0 5 | xmax = 0x3 6 | tiers? 7 | size = 1 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "Mary" 12 | xmin = 0 13 | xmax = +0x3. 
14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0x0.8 18 | text = "0 to 0.5" 19 | intervals [2]: 20 | xmin = 0x1P-1 21 | xmax = 0x1.8p+0 22 | text = "0.5 to 1.5" 23 | intervals [3]: 24 | xmin = +0x1.8P0 25 | xmax = 0x1.4p+1 26 | text = "1.5 to 2.5" 27 | intervals [4]: 28 | xmin = 0x1.4P1 29 | xmax = 0x3 30 | text = "2.5 to 3" 31 | -------------------------------------------------------------------------------- /tests/testthat/test-data/elan.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0.0 5 | xmax = 1.41 6 | tiers? 7 | size = 1 8 | item []: 9 | item[1]: 10 | class = "IntervalTier" 11 | name = "default" 12 | xmin = 0.0 13 | xmax = 1.41 14 | intervals: size = 5 15 | intervals [1] 16 | xmin = 0.0 17 | xmax = 0.25 18 | text = "" 19 | intervals [2] 20 | xmin = 0.25 21 | xmax = 0.54 22 | text = "bird" 23 | intervals [3] 24 | xmin = 0.54 25 | xmax = 0.56 26 | text = "" 27 | intervals [4] 28 | xmin = 0.56 29 | xmax = 0.8 30 | text = "house" 31 | intervals [5] 32 | xmin = 0.8 33 | xmax = 1.41 34 | text = "" 35 | -------------------------------------------------------------------------------- /inst/draw-tg-parts.praat: -------------------------------------------------------------------------------- 1 | Read from file: "draw-tg-parts.Collection" 2 | 3 | Erase all 4 | Black 5 | selectObject: "TextGrid left-marginal-text" 6 | Select outer viewport: 1, 2, 0, 4.5 7 | Draw: 0, 0, "no", "yes", "no" 8 | White 9 | selectObject: "TextGrid left-marginal-text-mask" 10 | Draw: 0, 0, "no", "yes", "no" 11 | Black 12 | 13 | Select outer viewport: 5.25, 7, 0, 4.5 14 | selectObject: "TextGrid right-marginal-text" 15 | Draw: 0, 0, "no", "yes", "no" 16 | White 17 | selectObject: "TextGrid right-marginal-text-mask" 18 | Draw: 0, 0, "no", "yes", "no" 19 | Black 20 | 21 | Select outer viewport: 1, 6.5, 0, 4.5 22 | selectObject: "TextGrid Mary_John_bell" 23 
| Draw: 0, 0, "no", "yes", "no" 24 | Select outer viewport: 1, 6.5, 3.5, 5.5 25 | selectObject: "TextGrid blue-text" 26 | Blue 27 | Draw: 0, 0, "no", "yes", "no" 28 | selectObject: "TextGrid blue-text-mask" 29 | White 30 | Draw: 0, 0, "no", "yes", "no" 31 | selectObject: "TextGrid forehead-and-chin" 32 | Black 33 | Draw: 0, 0, "no", "yes", "no" 34 | 35 | Select outer viewport: 1, 6.5, 0, 2 36 | selectObject: "TextGrid forehead-and-chin" 37 | Black 38 | Draw: 0, 0, "no", "yes", "no" 39 | 40 | Select outer viewport: 1, 7, 0, 5.5 41 | Save as 300-dpi PNG file: "tg-parts.png" 42 | 43 | -------------------------------------------------------------------------------- /src/cpp11.cpp: -------------------------------------------------------------------------------- 1 | // Generated by cpp11: do not edit by hand 2 | // clang-format off 3 | 4 | 5 | #include "cpp11/declarations.hpp" 6 | #include <R_ext/Visibility.h> 7 | 8 | // code.cpp 9 | list cpp_tg_scan_tokens(std::string src); 10 | extern "C" SEXP _readtextgrid_cpp_tg_scan_tokens(SEXP src) { 11 | BEGIN_CPP11 12 | return cpp11::as_sexp(cpp_tg_scan_tokens(cpp11::as_cpp<cpp11::decay_t<std::string>>(src))); 13 | END_CPP11 14 | } 15 | // code.cpp 16 | list cpp_parse_praat_numbers(strings x); 17 | extern "C" SEXP _readtextgrid_cpp_parse_praat_numbers(SEXP x) { 18 | BEGIN_CPP11 19 | return cpp11::as_sexp(cpp_parse_praat_numbers(cpp11::as_cpp<cpp11::decay_t<strings>>(x))); 20 | END_CPP11 21 | } 22 | 23 | extern "C" { 24 | static const R_CallMethodDef CallEntries[] = { 25 | {"_readtextgrid_cpp_parse_praat_numbers", (DL_FUNC) &_readtextgrid_cpp_parse_praat_numbers, 1}, 26 | {"_readtextgrid_cpp_tg_scan_tokens", (DL_FUNC) &_readtextgrid_cpp_tg_scan_tokens, 1}, 27 | {NULL, NULL, 0} 28 | }; 29 | } 30 | 31 | extern "C" attribute_visible void R_init_readtextgrid(DllInfo* dll){ 32 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 33 | R_useDynamicSymbols(dll, FALSE); 34 | R_forceSymbols(dll, TRUE); 35 | } 36 | --------------------------------------------------------------------------------
/tests/testthat/test-data/praat-test/okay-percents.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 00000.0 5 | xmax = 3 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "intervals" 12 | xmin = 0 13 | xmax = 3 14 | intervals: size = 2 15 | intervals [1]: 16 | xmin = 0% 17 | xmax = 100% 18 | text = "0% to 100% (0 to 1)" 19 | intervals [2]: 20 | xmin = 1.5e2% 21 | xmax = +300% 22 | text = "1.5e2% to +300% (1.5 to 3.0)" 23 | item [2]: 24 | class = "TextTier" 25 | name = "points" 26 | xmin = 0 27 | xmax = 3 28 | points: size = 5 29 | points [1]: 30 | number = 10% 31 | mark = "10% -> 0.1" 32 | points [2]: 33 | number = 5000e-2% 34 | mark = "500e-2% -> 0.5" 35 | points [3]: 36 | number = 80%ms 37 | mark = "80%ms -> 0.8" 38 | points [4]: 39 | number = 0.90ms% 40 | mark = "0.90ms% -> 0.9" 41 | points [5]: 42 | number = 2E 43 | mark = "1e% -> 0.9" 44 | -------------------------------------------------------------------------------- /tests/testthat/test-data/quoted.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2.3 6 | tiers? 
7 | size = 3 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "sentence" 12 | xmin = 0 13 | xmax = 2.3 14 | intervals: size = 1 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 2.3 18 | text = "říkej ""ahoj"" dvakrát" 19 | item [2]: 20 | class = "IntervalTier" 21 | name = "phonemes" 22 | xmin = 0 23 | xmax = 2.3 24 | intervals: size = 3 25 | intervals [1]: 26 | xmin = 0 27 | xmax = 0.7 28 | text = "r̝iːkɛj" 29 | intervals [2]: 30 | xmin = 0.7 31 | xmax = 1.6 32 | text = "ʔaɦɔj" 33 | intervals [3]: 34 | xmin = 1.6 35 | xmax = 2.3 36 | text = "dʋakraːt" 37 | item [3]: 38 | class = "TextTier" 39 | name = "bell" 40 | xmin = 0 41 | xmax = 2.3 42 | points: size = 2 43 | points [1]: 44 | number = 0.9 45 | mark = "ding" 46 | points [2]: 47 | number = 1.3 48 | mark = "dong" 49 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: readtextgrid 2 | Type: Package 3 | Title: Read in a 'Praat' 'TextGrid' File 4 | Version: 0.2.0 5 | Authors@R: c( 6 | person("Tristan", "Mahr", role = c("aut", "cre"), 7 | email = "tristan.mahr@wisc.edu", 8 | comment = c(ORCID = "0000-0002-8890-5116")), 9 | person("Dan", "Villarreal", role = "ctb"), 10 | person("Jonathan", "Washington", role = "ctb"), 11 | person("Josef", "Fruehwald", role = "aut")) 12 | Description: 'Praat' <https://www.fon.hum.uva.nl/praat/> is a widely 13 | used tool for manipulating, annotating and analyzing speech and 14 | acoustic data. It stores annotation data in a format called a 15 | 'TextGrid'. This package provides a way to read these 16 | files into R.
17 | License: GPL-3 18 | Encoding: UTF-8 19 | Depends: R (>= 4.3.0) 20 | Suggests: 21 | testthat (>= 2.1.0) 22 | RoxygenNote: 7.3.3 23 | Imports: 24 | utils, 25 | stats, 26 | tibble, 27 | purrr, 28 | readr, 29 | stringr, 30 | dplyr, 31 | rlang, 32 | withr 33 | URL: https://github.com/tjmahr/readtextgrid, https://www.tjmahr.com/readtextgrid/ 34 | BugReports: https://github.com/tjmahr/readtextgrid/issues 35 | Roxygen: list(markdown = TRUE) 36 | LinkingTo: 37 | cpp11 38 | Config/Needs/website: rmarkdown 39 | -------------------------------------------------------------------------------- /man/readtextgrid-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readtextgrid-package.R 3 | \docType{package} 4 | \name{readtextgrid-package} 5 | \alias{readtextgrid} 6 | \alias{readtextgrid-package} 7 | \title{readtextgrid: Read in a 'Praat' 'TextGrid' File} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | 'Praat' \url{https://www.fon.hum.uva.nl/praat/} is a widely used tool for manipulating, annotating and analyzing speech and acoustic data. It stores annotation data in a format called a 'TextGrid'. This package provides a way to read these files into R. 
12 | } 13 | \seealso{ 14 | Useful links: 15 | \itemize{ 16 | \item \url{https://github.com/tjmahr/readtextgrid} 17 | \item \url{https://www.tjmahr.com/readtextgrid/} 18 | \item Report bugs at \url{https://github.com/tjmahr/readtextgrid/issues} 19 | } 20 | 21 | } 22 | \author{ 23 | \strong{Maintainer}: Tristan Mahr \email{tristan.mahr@wisc.edu} (\href{https://orcid.org/0000-0002-8890-5116}{ORCID}) 24 | 25 | Authors: 26 | \itemize{ 27 | \item Josef Fruehwald 28 | } 29 | 30 | Other contributors: 31 | \itemize{ 32 | \item Dan Villarreal [contributor] 33 | \item Jonathan Washington [contributor] 34 | } 35 | 36 | } 37 | \keyword{internal} 38 | -------------------------------------------------------------------------------- /tests/testthat/test-data/praat-test/okay-fractions.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 00000.0 5 | xmax = 3 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "intervals" 12 | xmin = 0 13 | xmax = 3 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0/1 17 | xmax = 1/2 18 | text = "0/1 to 1/2 (0 to 0.5)" 19 | intervals [2]: 20 | xmin = 1/2 21 | xmax = 6/4 22 | text = "1/2 to 6/4 (0.5 to 1.5)" 23 | intervals [3]: 24 | xmin = +3/2 25 | xmax = 5.0/+2.0 26 | text = "+3/2 to 5.0/+2.0 (1.5 to 2.5)" 27 | intervals [4]: 28 | xmin = -5/-2 29 | xmax = -3e0/-1e0 30 | text = "-5/-2 to -3e0/-1e0 (2.5 to 3)" 31 | item [2]: 32 | class = "TextTier" 33 | name = "points" 34 | xmin = 0 35 | xmax = 1 36 | points: size = 3 37 | points [1]: 38 | number = 1/10 39 | mark = "1/10 -> 0.1" 40 | points [2]: 41 | number = -1/-2 42 | mark = "-1/-2 -> 0.5" 43 | points [3]: 44 | number = 16e-1/2 45 | mark = "16e-1/2 -> 0.8" 46 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 
26 | -------------------------------------------------------------------------------- /man/example_textgrid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readtextgrid.R 3 | \name{example_textgrid} 4 | \alias{example_textgrid} 5 | \title{Locate the path of an example textgrid file} 6 | \usage{ 7 | example_textgrid(which = 1) 8 | } 9 | \arguments{ 10 | \item{which}{index of the textgrid to load} 11 | } 12 | \value{ 13 | Path of \code{"Mary_John_bell.TextGrid"} bundled with the \code{readtextgrid} 14 | package. 15 | } 16 | \description{ 17 | Locate the path of an example textgrid file 18 | } 19 | \details{ 20 | This function is a wrapper over \code{\link[=system.file]{system.file()}} to locate the 21 | paths to bundled textgrids. These files are used to test or demonstrate 22 | functionality of the package. 23 | 24 | Three files are included: 25 | \enumerate{ 26 | \item \code{"Mary_John_bell.TextGrid"} - the default TextGrid created by Praat's 27 | Create TextGrid command. This file is saved with UTF-8 encoding. 28 | \item \code{"utf_16_be.TextGrid"} - a TextGrid with some IPA characters entered using 29 | Praat's IPA character selector. This file is saved with UTF-16 encoding. 30 | \item \code{"nested-intervals.TextGrid"} - A textgrid containing an \code{"utterance"} 31 | tier, a \code{"words"} tier, and a \code{"phones"} tier. This file is typical of 32 | forced alignment textgrids where utterances contain words which contain 33 | speech segments. In this case, alignment was made by hand so that word 34 | and phone boundaries do not correspond exactly.
35 | } 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /tests/testthat/test-data/hard-to-parse-normalized.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2.3 6 | tiers? 7 | size = 4 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "! Fake Comment" 12 | xmin = 0 13 | xmax = 2.3 14 | intervals: size = 3 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 1 18 | text = "" 19 | intervals [2]: 20 | xmin = 1 21 | xmax = 2 22 | text = """""" 23 | intervals [3]: 24 | xmin = 2 25 | xmax = 2.3 26 | text = "Not a ! 
Comment" 27 | item [2]: 28 | class = "IntervalTier" 29 | name = "Embedded ""String"" here" 30 | xmin = -0.5 31 | xmax = 2.3 32 | intervals: size = 1 33 | intervals [1]: 34 | xmin = -0.1 35 | xmax = 2.8 36 | text = "this string 37 | has line breaks in it 38 | and a ! fake comment 39 | " 40 | item [3]: 41 | class = "TextTier" 42 | name = "point tier 1" 43 | xmin = 0 44 | xmax = 2.3 45 | points: size = 2 46 | points [1]: 47 | number = 0.1 48 | mark = "point label" 49 | points [2]: 50 | number = 0.8000000000000003 51 | mark = "deliberate extra spaces" 52 | item [4]: 53 | class = "TextTier" 54 | name = "TextTier" 55 | xmin = 0 56 | xmax = 2.3 57 | points: size = 0 58 | -------------------------------------------------------------------------------- /tests/testthat/test-data/hard-to-parse.TextGrid: -------------------------------------------------------------------------------- 1 | "ooTextFile" 2 | "TextGrid" 3 | ! A deliberate comment starts with a bang and ends on a new line 4 | 0 2.3 ! Here is a normal comment 5 | ! This comment has a stray number 1 6 | 7 | ! Extra unquoted text is also a comment like the word "tiers" below 8 | 4 tiers ! This comment has a stray quote " 9 | "IntervalTier" "! Fake Comment" 10 | 0 2.3 ! time domain of tier 1 11 | 3 interval coming 12 | 0 1 "" ! interval 1 on tier 1 13 | 1 2 """""" ! interval 2 on tier 1 14 | 2 2.3 "Not a ! Comment" ! interval 3 on tier 1 15 | "IntervalTier" "Embedded ""String"" here" ! type and name of tier 2 16 | I guess some negative numbers are fine and make it into Praat 17 | -0.5s 2.3 18 | 1 interval coming 19 | -0.1 2.8s "this string 20 | has line breaks in it 21 | and a ! fake comment 22 | " 23 | ! testing comments that touch strings and numbers 24 | ! 1. "string"! throws an error in Praat so don't test that 25 | "TextTier" "point tier 1" 26 | ! 2. [NUMBER]!comment is handled fine by Praat 27 | 0. 
2.3!touching a number 28 | okay this is weird it won't see letternumber like this10 29 | but it doesn't care about 2this 30 | 0.1 31 | "point label" 32 | 33 | ! Finally include a bracketed number 34 | points [3]: 35 | number = 0.8000000000000003 36 | mark = "deliberate extra spaces" 37 | 38 | "TextTier" "TextTier" 0. 2.3 39 | and now here is the weird part -10 ! points gets turned to 0 40 | 41 | 42 | ! and now here is another weird part. this doesn't get read because we said 43 | ! there are only 4 tiers 44 | "TextTier" "point tier 3" 0. 2.3 0 45 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # readtextgrid 0.2.0 2 | 3 | * `read_textgrid()` now manually parses textgrids and can handle short-format 4 | textgrids. (#4, #16, initial parser by @JoFrhwld). Part of the parsing is done 5 | in C++ so the new manual parser is faster than the legacy version. 6 | * The new parser is documented in a non-package supplemental vignette called 7 | "Textgrid specification" on the package website. 8 | * Original package functions are available in `legacy_read_textgrid()`. 9 | * `legacy_read_textgrid()` un-escapes `A ""quoted"" word` to `A "quoted" word`. 10 | * `legacy_read_textgrid()` can handle interval text with line breaks in it. 11 | * New function `pivot_textgrid_tiers()` to pivot out nested textgrid 12 | intervals into a wide dataframe. For example, if a forced aligner has a 13 | `words` and `phones` tier, we can pivot the `words` and `phones` 14 | intervals into a dataframe with one row per `phones` interval and with 15 | columns `words`, `words_xmin`, `words_xmax`, `phones`, `phones_xmin`, 16 | `phones_xmax`, etc. (#10, request of @stefanocoretta) 17 | * Testing suite includes a short-format textgrid, a short-format textgrid 18 | with inline comments, and a textgrid with escaped `"` characters. 
(@JoFrhwld) 19 | * Testing suite includes an adversarial textgrid to challenge parsing. 20 | * Support ELAN-generated textgrids. (#11, @djvill) 21 | * Raised required R version to 4.3.0. (April 2023) 22 | 23 | 24 | # readtextgrid 0.1.2 25 | 26 | * Add `encoding` argument to `read_textgrid()`. (#7, #8, #9, @jonorthwash) 27 | * Include example UTF-16 textgrid for testing. 28 | * Remove magrittr dependency. (#5, @JoFrhwld) 29 | 30 | 31 | # readtextgrid 0.1.1 32 | 33 | * Fixes for CRAN resubmission. 34 | 35 | 36 | # readtextgrid 0.1.0 37 | 38 | * Initial release. 39 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker001/s2T04.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.6841632653061225 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.6841632653061225 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.441 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.441 21 | xmax = 0.648 22 | text = "get" 23 | intervals [3]: 24 | xmin = 0.648 25 | xmax = 1.098 26 | text = "off" 27 | intervals [4]: 28 | xmin = 1.098 29 | xmax = 1.6841632653061225 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.6841632653061225 36 | intervals: size = 8 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.441 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.441 43 | xmax = 0.522 44 | text = "G" 45 | intervals [3]: 46 | xmin = 0.522 47 | xmax = 0.5940000000000001 48 | text = "EH1" 49 | intervals [4]: 50 | xmin = 0.5940000000000001 51 | xmax = 0.648 52 | text = "T" 53 | intervals [5]: 54 | xmin = 0.648 55 | xmax = 0.855 56 | text = "AO1" 57 | intervals [6]: 58 | xmin = 0.855 59 | xmax = 1.098 60 | text = "F" 61 | intervals [7]: 62 | xmin = 1.098 63 | xmax = 
1.665 64 | text = "sp" 65 | intervals [8]: 66 | xmin = 1.665 67 | xmax = 1.6841632653061225 68 | text = "" 69 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker002/s2T04.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.6841632653061225 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.6841632653061225 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.441 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.441 21 | xmax = 0.648 22 | text = "get" 23 | intervals [3]: 24 | xmin = 0.648 25 | xmax = 1.098 26 | text = "off" 27 | intervals [4]: 28 | xmin = 1.098 29 | xmax = 1.6841632653061225 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.6841632653061225 36 | intervals: size = 8 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.441 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.441 43 | xmax = 0.522 44 | text = "G" 45 | intervals [3]: 46 | xmin = 0.522 47 | xmax = 0.5940000000000001 48 | text = "EH1" 49 | intervals [4]: 50 | xmin = 0.5940000000000001 51 | xmax = 0.648 52 | text = "T" 53 | intervals [5]: 54 | xmin = 0.648 55 | xmax = 0.855 56 | text = "AO1" 57 | intervals [6]: 58 | xmin = 0.855 59 | xmax = 1.098 60 | text = "F" 61 | intervals [7]: 62 | xmin = 1.098 63 | xmax = 1.665 64 | text = "sp" 65 | intervals [8]: 66 | xmin = 1.665 67 | xmax = 1.6841632653061225 68 | text = "" 69 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker001/s2T01.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.3485714285714285 6 | tiers? 
7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.3485714285714285 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.29700000000000004 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.29700000000000004 21 | xmax = 0.522 22 | text = "bird" 23 | intervals [3]: 24 | xmin = 0.522 25 | xmax = 0.9720000000000001 26 | text = "house" 27 | intervals [4]: 28 | xmin = 0.9720000000000001 29 | xmax = 1.3485714285714285 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.3485714285714285 36 | intervals: size = 9 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.29700000000000004 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.29700000000000004 43 | xmax = 0.36000000000000004 44 | text = "B" 45 | intervals [3]: 46 | xmin = 0.36000000000000004 47 | xmax = 0.49500000000000005 48 | text = "ER1" 49 | intervals [4]: 50 | xmin = 0.49500000000000005 51 | xmax = 0.522 52 | text = "D" 53 | intervals [5]: 54 | xmin = 0.522 55 | xmax = 0.621 56 | text = "HH" 57 | intervals [6]: 58 | xmin = 0.621 59 | xmax = 0.783 60 | text = "AW1" 61 | intervals [7]: 62 | xmin = 0.783 63 | xmax = 0.9720000000000001 64 | text = "S" 65 | intervals [8]: 66 | xmin = 0.9720000000000001 67 | xmax = 1.332 68 | text = "sp" 69 | intervals [9]: 70 | xmin = 1.332 71 | xmax = 1.3485714285714285 72 | text = "" 73 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker002/s2T01.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.3485714285714285 6 | tiers? 
7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.3485714285714285 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.29700000000000004 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.29700000000000004 21 | xmax = 0.522 22 | text = "bird" 23 | intervals [3]: 24 | xmin = 0.522 25 | xmax = 0.9720000000000001 26 | text = "house" 27 | intervals [4]: 28 | xmin = 0.9720000000000001 29 | xmax = 1.3485714285714285 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.3485714285714285 36 | intervals: size = 9 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.29700000000000004 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.29700000000000004 43 | xmax = 0.36000000000000004 44 | text = "B" 45 | intervals [3]: 46 | xmin = 0.36000000000000004 47 | xmax = 0.49500000000000005 48 | text = "ER1" 49 | intervals [4]: 50 | xmin = 0.49500000000000005 51 | xmax = 0.522 52 | text = "D" 53 | intervals [5]: 54 | xmin = 0.522 55 | xmax = 0.621 56 | text = "HH" 57 | intervals [6]: 58 | xmin = 0.621 59 | xmax = 0.783 60 | text = "AW1" 61 | intervals [7]: 62 | xmin = 0.783 63 | xmax = 0.9720000000000001 64 | text = "S" 65 | intervals [8]: 66 | xmin = 0.9720000000000001 67 | xmax = 1.332 68 | text = "sp" 69 | intervals [9]: 70 | xmin = 1.332 71 | xmax = 1.3485714285714285 72 | text = "" 73 | -------------------------------------------------------------------------------- /man/read_textgrid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readtextgrid.R, R/legacy.R 3 | \name{read_textgrid} 4 | \alias{read_textgrid} 5 | \alias{read_textgrid_lines} 6 | \alias{legacy_read_textgrid} 7 | \alias{legacy_read_textgrid_lines} 8 | \title{Read a textgrid file into a tibble} 9 | \usage{ 10 | read_textgrid(path, file = NULL, encoding = NULL) 11 | 
12 | read_textgrid_lines(lines, file = NULL) 13 | 14 | legacy_read_textgrid(path, file = NULL, encoding = NULL) 15 | 16 | legacy_read_textgrid_lines(lines, file = NULL) 17 | } 18 | \arguments{ 19 | \item{path}{a path to a textgrid} 20 | 21 | \item{file}{an optional value to use for the \code{file} column. For 22 | \code{read_textgrid()}, the default is the base filename of the input file. For 23 | \code{read_textgrid_lines()}, the default is \code{NA}.} 24 | 25 | \item{encoding}{the encoding of the textgrid. The default value \code{NULL} uses 26 | \code{\link[readr:encoding]{readr::guess_encoding()}} to guess the encoding of the textgrid. If an 27 | encoding is provided, it is forwarded to \code{\link[readr:locale]{readr::locale()}} and 28 | \code{\link[readr:read_lines]{readr::read_lines()}}.} 29 | 30 | \item{lines}{alternatively, the lines of a textgrid file} 31 | } 32 | \value{ 33 | a tibble with one row per textgrid annotation 34 | } 35 | \description{ 36 | Read a textgrid file into a tibble 37 | } 38 | \details{ 39 | The \code{legacy_read_textgrid} functions are the original textgrid 40 | parsers provided by the package. They assume that the TextGrid file is a 41 | "long" format textgrid; this is the default format used by "Save a text 42 | file..." in Praat. 43 | 44 | The current \code{read_textgrid()} functions are more 45 | flexible and can read in "short" format textgrids and textgrids with 46 | comments. 47 | 48 | See \url{https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html} 49 | for a description of the textgrid file format. Note that this package does 50 | not strictly adhere to the format as described in this document. For example, 51 | the document says that numbers should be freestanding (surrounded by spaces 52 | or string boundaries), but Praat.exe can handle malformed numbers like 53 | \verb{100ms}. Therefore, we tried to implement a parser that matched what Praat 54 | actually handles. 
55 | } 56 | \examples{ 57 | tg <- system.file("Mary_John_bell.TextGrid", package = "readtextgrid") 58 | read_textgrid(tg) 59 | } 60 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker001/s2T02.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.5918163265306122 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.5918163265306122 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.29700000000000004 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.29700000000000004 21 | xmax = 0.7020000000000001 22 | text = "cowboy" 23 | intervals [3]: 24 | xmin = 0.7020000000000001 25 | xmax = 1.1700000000000002 26 | text = "boots" 27 | intervals [4]: 28 | xmin = 1.1700000000000002 29 | xmax = 1.5918163265306122 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.5918163265306122 36 | intervals: size = 11 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.29700000000000004 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.29700000000000004 43 | xmax = 0.378 44 | text = "K" 45 | intervals [3]: 46 | xmin = 0.378 47 | xmax = 0.5040000000000001 48 | text = "AW1" 49 | intervals [4]: 50 | xmin = 0.5040000000000001 51 | xmax = 0.5760000000000001 52 | text = "B" 53 | intervals [5]: 54 | xmin = 0.5760000000000001 55 | xmax = 0.7020000000000001 56 | text = "OY2" 57 | intervals [6]: 58 | xmin = 0.7020000000000001 59 | xmax = 0.774 60 | text = "B" 61 | intervals [7]: 62 | xmin = 0.774 63 | xmax = 0.918 64 | text = "UW1" 65 | intervals [8]: 66 | xmin = 0.918 67 | xmax = 1.035 68 | text = "T" 69 | intervals [9]: 70 | xmin = 1.035 71 | xmax = 1.1700000000000002 72 | text = "S" 73 | intervals [10]: 74 | xmin = 1.1700000000000002 75 | xmax = 1.575 76 | text = "sp" 77 | 
intervals [11]: 78 | xmin = 1.575 79 | xmax = 1.5918163265306122 80 | text = "" 81 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker002/s2T02.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.5918163265306122 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.5918163265306122 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.29700000000000004 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.29700000000000004 21 | xmax = 0.7020000000000001 22 | text = "cowboy" 23 | intervals [3]: 24 | xmin = 0.7020000000000001 25 | xmax = 1.1700000000000002 26 | text = "boots" 27 | intervals [4]: 28 | xmin = 1.1700000000000002 29 | xmax = 1.5918163265306122 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.5918163265306122 36 | intervals: size = 11 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.29700000000000004 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.29700000000000004 43 | xmax = 0.378 44 | text = "K" 45 | intervals [3]: 46 | xmin = 0.378 47 | xmax = 0.5040000000000001 48 | text = "AW1" 49 | intervals [4]: 50 | xmin = 0.5040000000000001 51 | xmax = 0.5760000000000001 52 | text = "B" 53 | intervals [5]: 54 | xmin = 0.5760000000000001 55 | xmax = 0.7020000000000001 56 | text = "OY2" 57 | intervals [6]: 58 | xmin = 0.7020000000000001 59 | xmax = 0.774 60 | text = "B" 61 | intervals [7]: 62 | xmin = 0.774 63 | xmax = 0.918 64 | text = "UW1" 65 | intervals [8]: 66 | xmin = 0.918 67 | xmax = 1.035 68 | text = "T" 69 | intervals [9]: 70 | xmin = 1.035 71 | xmax = 1.1700000000000002 72 | text = "S" 73 | intervals [10]: 74 | xmin = 1.1700000000000002 75 | xmax = 1.575 76 | text = "sp" 77 | intervals [11]: 78 | xmin = 1.575 79 | xmax = 
1.5918163265306122 80 | text = "" 81 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker001/s2T03.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.7316530612244898 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.7316530612244898 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.369 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.369 21 | xmax = 0.657 22 | text = "hug" 23 | intervals [3]: 24 | xmin = 0.657 25 | xmax = 0.6930000000000001 26 | text = "" 27 | intervals [4]: 28 | xmin = 0.6930000000000001 29 | xmax = 1.1520000000000001 30 | text = "daddy" 31 | intervals [5]: 32 | xmin = 1.1520000000000001 33 | xmax = 1.7316530612244898 34 | text = "" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "phones" 38 | xmin = 0 39 | xmax = 1.7316530612244898 40 | intervals: size = 11 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 0.369 44 | text = "sil" 45 | intervals [2]: 46 | xmin = 0.369 47 | xmax = 0.45 48 | text = "HH" 49 | intervals [3]: 50 | xmin = 0.45 51 | xmax = 0.5850000000000001 52 | text = "AH1" 53 | intervals [4]: 54 | xmin = 0.5850000000000001 55 | xmax = 0.657 56 | text = "G" 57 | intervals [5]: 58 | xmin = 0.657 59 | xmax = 0.6930000000000001 60 | text = "sp" 61 | intervals [6]: 62 | xmin = 0.6930000000000001 63 | xmax = 0.774 64 | text = "D" 65 | intervals [7]: 66 | xmin = 0.774 67 | xmax = 0.927 68 | text = "AE1" 69 | intervals [8]: 70 | xmin = 0.927 71 | xmax = 0.9720000000000001 72 | text = "D" 73 | intervals [9]: 74 | xmin = 0.9720000000000001 75 | xmax = 1.1520000000000001 76 | text = "IY0" 77 | intervals [10]: 78 | xmin = 1.1520000000000001 79 | xmax = 1.71 80 | text = "sp" 81 | intervals [11]: 82 | xmin = 1.71 83 | xmax = 1.7316530612244898 84 | text = "" 85 | 
-------------------------------------------------------------------------------- /inst/speaker-data/speaker002/s2T03.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.7316530612244898 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.7316530612244898 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.369 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.369 21 | xmax = 0.657 22 | text = "hug" 23 | intervals [3]: 24 | xmin = 0.657 25 | xmax = 0.6930000000000001 26 | text = "" 27 | intervals [4]: 28 | xmin = 0.6930000000000001 29 | xmax = 1.1520000000000001 30 | text = "daddy" 31 | intervals [5]: 32 | xmin = 1.1520000000000001 33 | xmax = 1.7316530612244898 34 | text = "" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "phones" 38 | xmin = 0 39 | xmax = 1.7316530612244898 40 | intervals: size = 11 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 0.369 44 | text = "sil" 45 | intervals [2]: 46 | xmin = 0.369 47 | xmax = 0.45 48 | text = "HH" 49 | intervals [3]: 50 | xmin = 0.45 51 | xmax = 0.5850000000000001 52 | text = "AH1" 53 | intervals [4]: 54 | xmin = 0.5850000000000001 55 | xmax = 0.657 56 | text = "G" 57 | intervals [5]: 58 | xmin = 0.657 59 | xmax = 0.6930000000000001 60 | text = "sp" 61 | intervals [6]: 62 | xmin = 0.6930000000000001 63 | xmax = 0.774 64 | text = "D" 65 | intervals [7]: 66 | xmin = 0.774 67 | xmax = 0.927 68 | text = "AE1" 69 | intervals [8]: 70 | xmin = 0.927 71 | xmax = 0.9720000000000001 72 | text = "D" 73 | intervals [9]: 74 | xmin = 0.9720000000000001 75 | xmax = 1.1520000000000001 76 | text = "IY0" 77 | intervals [10]: 78 | xmin = 1.1520000000000001 79 | xmax = 1.71 80 | text = "sp" 81 | intervals [11]: 82 | xmin = 1.71 83 | xmax = 1.7316530612244898 84 | text = "" 85 | 
-------------------------------------------------------------------------------- /inst/make-logo.R: -------------------------------------------------------------------------------- 1 | # install.packages("hexSticker") 2 | library(hexSticker) 3 | library(ggplot2) 4 | 5 | # dir.create("man/figures") 6 | # Top tier (row 2): the single label "read", centered at x = .5 7 | df <- tibble::tibble( 8 | row = 2, 9 | x = .5, 10 | label = "read" 11 | ) 12 | # Bottom tier (row 1): the eight letters of "textgrid", evenly spaced on [0, 1] 13 | df2 <- tibble::tibble( 14 | row = 1, 15 | x = seq(0, 1, length.out = 8), 16 | label = c("t", "e", "x", "t", "g", "r", "i", "d") 17 | ) 18 | # Interval boundaries: the midpoint between each pair of adjacent letters 19 | lines <- purrr::map2_dbl( 20 | df2$x[1:7], 21 | df2$x[2:8], 22 | function(x, y) median(c(x, y)) 23 | ) 24 | 25 | # # from actual praat 26 | # yellow <- "#FADF28" 27 | # red <- "#DE0805" 28 | # blue <- "#0000D3" 29 | 30 | text <- "#404e4d" 31 | line <- "#747e7d" 32 | yellow <- "#fde74c" 33 | red <- "#c3423f" 34 | blue <- "#4D85BD" 35 | # Left/right edges of the grid and the label size 36 | grid_min <- -.143 / 2 37 | grid_max <- 1 + .143 / 2 38 | text_size <- 15 39 | # Layers: tier rules, outer edges, yellow cell behind "r", blue boundaries, red boundary, labels 40 | p <- ggplot(df) + 41 | aes(x = x, y = row) + 42 | geom_segment( 43 | aes(x = x, y = y, xend = xend, yend = yend), 44 | data = tibble::tibble( 45 | x = grid_min, 46 | xend = grid_max, 47 | y = c(.5, 1.5, 2.5), 48 | yend = y 49 | ), 50 | color = line 51 | ) + 52 | geom_segment( 53 | aes(x = x, y = y, xend = xend, yend = yend), 54 | data = tibble::tibble( 55 | x = c(grid_min, grid_max), 56 | xend = x, 57 | y = .5, 58 | yend = 2.5 59 | ), 60 | linewidth = 2, 61 | lineend = "round", 62 | color = blue 63 | ) + 64 | geom_ribbon( 65 | aes(x = x, ymax = ymax, ymin = ymin), 66 | data = tibble::tibble( 67 | x = lines[5:6], 68 | ymin = .5, 69 | ymax = 1.5, 70 | row = .5 71 | ), 72 | fill = yellow 73 | ) + 74 | geom_segment( 75 | aes(x = x, y = y, xend = xend, yend = yend), 76 | data = tibble::tibble( 77 | x = lines[-5], 78 | xend = x, 79 | y = .5, 80 | yend = 1.5 81 | ), 82 | linewidth = 2, 83 | lineend = "round", 84 | color = blue 85 | ) + 86 | geom_segment( 87 | aes(x = x, y = y, xend = xend, yend = yend), 88 | data = tibble::tibble( 89 | x = 
lines[5], 90 | xend = x, 91 | y = .5, 92 | yend = 1.5 93 | ), 94 | linewidth = 2, 95 | lineend = "round", 96 | color = red 97 | ) + 98 | geom_text(aes(label = label), size = text_size, color = text) + 99 | geom_text( 100 | aes(label = label), 101 | data = df2[-6, ], 102 | size = text_size, 103 | color = text 104 | ) + 105 | geom_text( 106 | aes(label = label), 107 | data = df2[6, ], 108 | size = text_size, 109 | color = red 110 | ) + 111 | theme_void() + 112 | theme_transparent() 113 | # Wrap the plot in a hex sticker and write it to man/figures/logo.png 114 | sticker( 115 | p, 116 | package = "", 117 | s_x = 1, 118 | s_y = 1, 119 | s_width = 1.65, 120 | s_height = 1, 121 | filename = "man/figures/logo.png", 122 | h_fill = "white", 123 | h_color = red 124 | ) 125 | # Open the result with the platform's default handler (not only macOS `open`) 126 | utils::browseURL("man/figures/logo.png") 127 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker001/s2T05.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.6920816326530612 6 | tiers? 
7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.6920816326530612 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.333 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.333 21 | xmax = 0.7200000000000001 22 | text = "animal" 23 | intervals [3]: 24 | xmin = 0.7200000000000001 25 | xmax = 1.332 26 | text = "crackers" 27 | intervals [4]: 28 | xmin = 1.332 29 | xmax = 1.6920816326530612 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.6920816326530612 36 | intervals: size = 15 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.333 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.333 43 | xmax = 0.459 44 | text = "AE1" 45 | intervals [3]: 46 | xmin = 0.459 47 | xmax = 0.531 48 | text = "N" 49 | intervals [4]: 50 | xmin = 0.531 51 | xmax = 0.558 52 | text = "AH0" 53 | intervals [5]: 54 | xmin = 0.558 55 | xmax = 0.6030000000000001 56 | text = "M" 57 | intervals [6]: 58 | xmin = 0.6030000000000001 59 | xmax = 0.639 60 | text = "AH0" 61 | intervals [7]: 62 | xmin = 0.639 63 | xmax = 0.7200000000000001 64 | text = "L" 65 | intervals [8]: 66 | xmin = 0.7200000000000001 67 | xmax = 0.792 68 | text = "K" 69 | intervals [9]: 70 | xmin = 0.792 71 | xmax = 0.846 72 | text = "R" 73 | intervals [10]: 74 | xmin = 0.846 75 | xmax = 0.9540000000000001 76 | text = "AE1" 77 | intervals [11]: 78 | xmin = 0.9540000000000001 79 | xmax = 1.062 80 | text = "K" 81 | intervals [12]: 82 | xmin = 1.062 83 | xmax = 1.1880000000000002 84 | text = "ER0" 85 | intervals [13]: 86 | xmin = 1.1880000000000002 87 | xmax = 1.332 88 | text = "Z" 89 | intervals [14]: 90 | xmin = 1.332 91 | xmax = 1.6740000000000002 92 | text = "sp" 93 | intervals [15]: 94 | xmin = 1.6740000000000002 95 | xmax = 1.6920816326530612 96 | text = "" 97 | -------------------------------------------------------------------------------- /inst/speaker-data/speaker002/s2T05.TextGrid: 
-------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.6920816326530612 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.6920816326530612 14 | intervals: size = 4 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.333 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.333 21 | xmax = 0.7200000000000001 22 | text = "animal" 23 | intervals [3]: 24 | xmin = 0.7200000000000001 25 | xmax = 1.332 26 | text = "crackers" 27 | intervals [4]: 28 | xmin = 1.332 29 | xmax = 1.6920816326530612 30 | text = "" 31 | item [2]: 32 | class = "IntervalTier" 33 | name = "phones" 34 | xmin = 0 35 | xmax = 1.6920816326530612 36 | intervals: size = 15 37 | intervals [1]: 38 | xmin = 0 39 | xmax = 0.333 40 | text = "sil" 41 | intervals [2]: 42 | xmin = 0.333 43 | xmax = 0.459 44 | text = "AE1" 45 | intervals [3]: 46 | xmin = 0.459 47 | xmax = 0.531 48 | text = "N" 49 | intervals [4]: 50 | xmin = 0.531 51 | xmax = 0.558 52 | text = "AH0" 53 | intervals [5]: 54 | xmin = 0.558 55 | xmax = 0.6030000000000001 56 | text = "M" 57 | intervals [6]: 58 | xmin = 0.6030000000000001 59 | xmax = 0.639 60 | text = "AH0" 61 | intervals [7]: 62 | xmin = 0.639 63 | xmax = 0.7200000000000001 64 | text = "L" 65 | intervals [8]: 66 | xmin = 0.7200000000000001 67 | xmax = 0.792 68 | text = "K" 69 | intervals [9]: 70 | xmin = 0.792 71 | xmax = 0.846 72 | text = "R" 73 | intervals [10]: 74 | xmin = 0.846 75 | xmax = 0.9540000000000001 76 | text = "AE1" 77 | intervals [11]: 78 | xmin = 0.9540000000000001 79 | xmax = 1.062 80 | text = "K" 81 | intervals [12]: 82 | xmin = 1.062 83 | xmax = 1.1880000000000002 84 | text = "ER0" 85 | intervals [13]: 86 | xmin = 1.1880000000000002 87 | xmax = 1.332 88 | text = "Z" 89 | intervals [14]: 90 | xmin = 1.332 91 | xmax = 1.6740000000000002 92 | text = "sp" 93 | intervals [15]: 94 | xmin = 
1.6740000000000002 95 | xmax = 1.6920816326530612 96 | text = "" 97 | -------------------------------------------------------------------------------- /inst/nested-intervals.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.8634920634920635 6 | tiers? 7 | size = 3 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.8634920634920635 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.4193533480814183 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.4193533480814183 21 | xmax = 0.7609484705244939 22 | text = "hug" 23 | intervals [3]: 24 | xmin = 0.7609484705244939 25 | xmax = 0.8544577498524867 26 | text = "" 27 | intervals [4]: 28 | xmin = 0.8544577498524867 29 | xmax = 1.4422303627712987 30 | text = "daddy" 31 | intervals [5]: 32 | xmin = 1.4422303627712987 33 | xmax = 1.8634920634920635 34 | text = "" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "phones" 38 | xmin = 0 39 | xmax = 1.8634920634920635 40 | intervals: size = 11 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 0.4193533480814183 44 | text = "sil" 45 | intervals [2]: 46 | xmin = 0.4193533480814183 47 | xmax = 0.5243127432454918 48 | text = "HH" 49 | intervals [3]: 50 | xmin = 0.5243127432454918 51 | xmax = 0.6369055489669525 52 | text = "AH1" 53 | intervals [4]: 54 | xmin = 0.6369055489669525 55 | xmax = 0.7609484705244939 56 | text = "G" 57 | intervals [5]: 58 | xmin = 0.7609484705244939 59 | xmax = 0.8544577498524867 60 | text = "sp" 61 | intervals [6]: 62 | xmin = 0.8544577498524867 63 | xmax = 1.052926424344553 64 | text = "D" 65 | intervals [7]: 66 | xmin = 1.052926424344553 67 | xmax = 1.2304032198038046 68 | text = "AE1" 69 | intervals [8]: 70 | xmin = 1.2304032198038046 71 | xmax = 1.3220041464924508 72 | text = "D" 73 | intervals [9]: 74 | xmin = 1.3220041464924508 75 | xmax = 
1.4422303627712987 76 | text = "IY0" 77 | intervals [10]: 78 | xmin = 1.4422303627712987 79 | xmax = 1.785733837853721 80 | text = "sp" 81 | intervals [11]: 82 | xmin = 1.785733837853721 83 | xmax = 1.8634920634920635 84 | text = "" 85 | item [3]: 86 | class = "IntervalTier" 87 | name = "utterance" 88 | xmin = 0 89 | xmax = 1.8634920634920635 90 | intervals: size = 1 91 | intervals [1]: 92 | xmin = 0 93 | xmax = 1.8634920634920635 94 | text = "hug daddy" 95 | -------------------------------------------------------------------------------- /tests/testthat/test-data/nested-intervals.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 1.8634920634920635 6 | tiers? 7 | size = 3 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0 13 | xmax = 1.8634920634920635 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.4193533480814183 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.4193533480814183 21 | xmax = 0.7609484705244939 22 | text = "hug" 23 | intervals [3]: 24 | xmin = 0.7609484705244939 25 | xmax = 0.8544577498524867 26 | text = "" 27 | intervals [4]: 28 | xmin = 0.8544577498524867 29 | xmax = 1.4422303627712987 30 | text = "daddy" 31 | intervals [5]: 32 | xmin = 1.4422303627712987 33 | xmax = 1.8634920634920635 34 | text = "" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "phones" 38 | xmin = 0 39 | xmax = 1.8634920634920635 40 | intervals: size = 11 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 0.4193533480814183 44 | text = "sil" 45 | intervals [2]: 46 | xmin = 0.4193533480814183 47 | xmax = 0.5243127432454918 48 | text = "HH" 49 | intervals [3]: 50 | xmin = 0.5243127432454918 51 | xmax = 0.6369055489669525 52 | text = "AH1" 53 | intervals [4]: 54 | xmin = 0.6369055489669525 55 | xmax = 0.7609484705244939 56 | text = "G" 57 | intervals [5]: 58 | xmin = 0.7609484705244939 59 
| xmax = 0.8544577498524867 60 | text = "sp" 61 | intervals [6]: 62 | xmin = 0.8544577498524867 63 | xmax = 1.052926424344553 64 | text = "D" 65 | intervals [7]: 66 | xmin = 1.052926424344553 67 | xmax = 1.2304032198038046 68 | text = "AE1" 69 | intervals [8]: 70 | xmin = 1.2304032198038046 71 | xmax = 1.3220041464924508 72 | text = "D" 73 | intervals [9]: 74 | xmin = 1.3220041464924508 75 | xmax = 1.4422303627712987 76 | text = "IY0" 77 | intervals [10]: 78 | xmin = 1.4422303627712987 79 | xmax = 1.785733837853721 80 | text = "sp" 81 | intervals [11]: 82 | xmin = 1.785733837853721 83 | xmax = 1.8634920634920635 84 | text = "" 85 | item [3]: 86 | class = "IntervalTier" 87 | name = "utterance" 88 | xmin = 0 89 | xmax = 1.8634920634920635 90 | intervals: size = 1 91 | intervals [1]: 92 | xmin = 0 93 | xmax = 1.8634920634920635 94 | text = "hug daddy" 95 | -------------------------------------------------------------------------------- /man/pivot_textgrid_tiers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pivot.R 3 | \name{pivot_textgrid_tiers} 4 | \alias{pivot_textgrid_tiers} 5 | \title{Pivot a textgrid into wide format, respecting nested tiers} 6 | \usage{ 7 | pivot_textgrid_tiers(data, tiers, join_cols = "file") 8 | } 9 | \arguments{ 10 | \item{data}{a textgrid dataframe created with \code{\link[=read_textgrid]{read_textgrid()}}} 11 | 12 | \item{tiers}{character vector of tiers to pivot into wide format. When 13 | \code{tiers} has more than 1 element, the tiers are treated as nested. 
For 14 | example, if \code{tiers} is \code{c("utterance", "word", "phone")}, where 15 | \code{"utterance"} intervals contain \code{"word"} intervals which in turn contain 16 | \code{"phone"} intervals, the output will have one row per \code{"phone"} interval 17 | and include \verb{utterance_*} and \verb{word_*} columns for the utterance and word 18 | intervals that contain each phone interval. \code{tiers} should be ordered from 19 | broadest to narrowest (e.g, \code{"word"} preceding \code{"phone"}).} 20 | 21 | \item{join_cols}{character vector of the columns that will uniquely identify 22 | a textgrid file. Defaults to \code{"file"} because 23 | these columns have identical values for tiers read from the same textgrid 24 | file.} 25 | } 26 | \value{ 27 | a dataframe with just the intervals from tiers named in \code{tiers} 28 | converted into a wide format. Columns are renamed so that the \code{text} column 29 | is pivot into columns named after the tier names. For example, the \code{text} 30 | column in a \code{words} tier is renamed to \code{words}. The \code{xmax}, \code{xmin}, 31 | \code{annotation_num}, \code{tier_num}, \code{tier_type} are also prefixed with the tier 32 | name. For example, the \code{xmax} column in a \code{words} tier is renamed to 33 | \code{words_xmax}. An additional helper column \code{xmid} is added and prefixed 34 | appropriately. See examples below. 35 | } 36 | \description{ 37 | Pivot a textgrid into wide format, respecting nested tiers 38 | } 39 | \details{ 40 | For the joining nested intervals, two intervals \emph{a} and \emph{b} are combined into 41 | the same row if they match on the values in the \code{join_cols} columns and if 42 | the \code{a$xmin <= b$xmid} and \code{b$xmid <= a$xmax}. That is, if the midpoint of 43 | \emph{b} is contained inside the interval \emph{a}. 
44 | } 45 | \examples{ 46 | data <- example_textgrid(3) |> 47 | read_textgrid() 48 | data 49 | 50 | # With a single tier, we get just that tier with the columns prefixed with 51 | # the tier_name 52 | pivot_textgrid_tiers(data, "utterance") 53 | pivot_textgrid_tiers(data, "words") 54 | 55 | # With multiple tiers, intervals in one tier that contain intervals in 56 | # another tier are combined into the same row. 57 | a <- pivot_textgrid_tiers(data, c("utterance", "words")) 58 | cols <- c( 59 | "utterance", "utterance_xmin", "utterance_xmax", 60 | "words", "words_xmin", "words_xmax" 61 | ) 62 | a[cols] 63 | 64 | a <- pivot_textgrid_tiers(data, c("utterance", "words", "phones")) 65 | cols <- c(cols, "phones", "phones_xmin", "phones_xmax") 66 | a[cols] 67 | } 68 | -------------------------------------------------------------------------------- /R/pivot.R: -------------------------------------------------------------------------------- 1 | 2 | #' Pivot a textgrid into wide format, respecting nested tiers 3 | #' 4 | #' @param data a textgrid dataframe created with [read_textgrid()] 5 | #' @param tiers character vector of tiers to pivot into wide format. When 6 | #' `tiers` has more than 1 element, the tiers are treated as nested. For 7 | #' example, if `tiers` is `c("utterance", "word", "phone")`, where 8 | #' `"utterance"` intervals contain `"word"` intervals which in turn contain 9 | #' `"phone"` intervals, the output will have one row per `"phone"` interval 10 | #' and include `utterance_*` and `word_*` columns for the utterance and word 11 | #' intervals that contain each phone interval. `tiers` should be ordered from 12 | #' broadest to narrowest (e.g, `"word"` preceding `"phone"`). 13 | #' @param join_cols character vector of the columns that will uniquely identify 14 | #' a textgrid file. Defaults to `"file"` because 15 | #' these columns have identical values for tiers read from the same textgrid 16 | #' file. 
#' @return a dataframe with just the intervals from tiers named in `tiers`
#'   converted into a wide format. Columns are renamed so that the `text` column
#'   is pivoted into a column named after the tier. For example, the `text`
#'   column in a `words` tier is renamed to `words`. The `xmax`, `xmin`,
#'   `annotation_num`, `tier_num`, `tier_type` columns are prefixed with the
#'   tier name. For example, the `xmax` column in a `words` tier is renamed to
#'   `words_xmax`. An additional helper column `xmid` is added and prefixed
#'   in the same way. See examples below.
#' @export
#'
#' @details
#' When joining nested intervals, two intervals *a* and *b* are combined into
#' the same row if they match on the values in the `join_cols` columns and if
#' `a$xmin <= b$xmid` and `b$xmid <= a$xmax`. That is, if the midpoint of
#' *b* is contained inside the interval *a*.
#'
#'
#' @examples
#' data <- example_textgrid(3) |>
#'   read_textgrid()
#' data
#'
#' # With a single tier, we get just that tier with the columns prefixed with
#' # the tier_name
#' pivot_textgrid_tiers(data, "utterance")
#' pivot_textgrid_tiers(data, "words")
#'
#' # With multiple tiers, intervals in one tier that contain intervals in
#' # another tier are combined into the same row.
#' a <- pivot_textgrid_tiers(data, c("utterance", "words"))
#' cols <- c(
#'   "utterance", "utterance_xmin", "utterance_xmax",
#'   "words", "words_xmin", "words_xmax"
#' )
#' a[cols]
#'
#' a <- pivot_textgrid_tiers(data, c("utterance", "words", "phones"))
#' cols <- c(cols, "phones", "phones_xmin", "phones_xmax")
#' a[cols]
pivot_textgrid_tiers <- function(
    data,
    tiers,
    join_cols = "file"
) {
  known_tiers <- unique(data[["tier_name"]])
  stopifnot(
    `tier names must be used in textgrid` = all(tiers %in% known_tiers)
  )

  # TODO:
  # allow only point tiers "TextTier" at last point in nesting

  tiers <- unique(tiers)
  data <- data[data[["tier_name"]] %in% tiers, ]

  # The tier-level bounds always take part in the join.
  join_cols <- unique(c(join_cols, "tier_xmin", "tier_xmax"))

  # One wide table per requested tier, in the order given by `tiers`.
  tier_tables <- split(data, ~tier_name)[tiers]
  wide_tiers <- lapply(tier_tables, pivot_single_tier, join_cols)

  # Fold the tables together, joining each tier onto the result so far.
  l <- Reduce(
    function(x, y) left_join_nested_tiers(x, y, join_cols),
    wide_tiers
  )

  l[["tier_name"]] <- NULL
  l
}

# Rename the annotation columns of a single tier so they carry the tier name:
# `text` becomes `<tier>` and the other annotation columns become
# `<tier>_<column>`. The join columns and `tier_name` keep their names.
pivot_single_tier <- function(data, join_cols) {
  tier_name <- unique(data[["tier_name"]])
  stopifnot(length(tier_name) == 1)

  # Midpoint of each interval
  half_width <- (data[["xmax"]] - data[["xmin"]]) / 2
  data[["xmid"]] <- data[["xmin"]] + half_width

  names_end <- c("tier_name", "tier_xmin", "tier_xmax")
  names_front <- setdiff(join_cols, names_end)
  names_mid <- c(
    "xmin", "xmax", "xmid",
    "annotation_num", "tier_num", "tier_type"
  )

  old_names <- c(names_front, "text", names_mid, names_end)
  new_names <- c(
    names_front,
    tier_name,
    paste0(tier_name, "_", names_mid),
    names_end
  )

  stats::setNames(data[old_names], new_names)
}

# For the dplyr::join_by() syntax
utils::globalVariables(c("x", "y"))

left_join_nested_tiers <- function(data_parent, data_child, join_cols) {
117 | x_names <- data_parent[["tier_name"]][1] |> 118 | paste0("_", c("xmin", "xmax")) 119 | 120 | y_names <- data_child[["tier_name"]][1] |> 121 | paste0("_", "xmid") 122 | 123 | e <- rlang::expr( 124 | dplyr::between( 125 | `$`(y, !! y_names), 126 | `$`(x, !! x_names[1]), 127 | `$`(x, !! x_names[2]) 128 | ) 129 | ) 130 | 131 | data_parent[["tier_name"]] <- NULL 132 | 133 | dplyr::left_join( 134 | data_parent, 135 | data_child, 136 | dplyr::join_by(!!! join_cols, !! e), 137 | relationship = "one-to-many" 138 | ) 139 | } 140 | -------------------------------------------------------------------------------- /src/code.cpp: -------------------------------------------------------------------------------- 1 | #include "cpp11.hpp" 2 | using namespace cpp11; 3 | #include // for std::strtod 4 | 5 | // Generated by asking ChatGPT to translate an R-based, no-AI version 6 | // into cpp11. I reviewed the generated code, clarified the meaning of some 7 | // lines with AI, and made code comment notes. 8 | 9 | // early declaration to satisfy RStudio warning 10 | list cpp_parse_praat_numbers(strings x); 11 | 12 | [[cpp11::register]] 13 | list cpp_tg_scan_tokens(std::string src) { 14 | // final space required 15 | if (src.empty() || src.back() != 0x20) src.push_back(0x20); 16 | 17 | const size_t nbytes = src.size(); 18 | 19 | writable::strings tokens; 20 | writable::logicals tokens_is_string; 21 | 22 | bool in_comment = false; 23 | bool in_string = false; 24 | // are we in a "" sequence (escaped quotes)? 25 | bool esc_next = false; 26 | 27 | bool have_token = false; 28 | size_t tok_start_byte = 0; 29 | 30 | size_t prev_char_byte = 0; 31 | size_t curr_char_byte = 0; 32 | 33 | // Is this a UTF-8 continuation byte? (10xxxxxx) 34 | auto is_cont = [](unsigned char b)->bool { 35 | // Are the first two bits 10? 36 | return (b & 0xC0) == 0x80; 37 | }; 38 | // Is this an ASCII whitespace? 
39 | auto is_ws = [](unsigned char b)->bool { 40 | // space, tab, CR, LF 41 | return b == 0x20 || b == 0x09 || b == 0x0D || b == 0x0A; 42 | }; 43 | 44 | for (size_t i = 0; i < nbytes; ++i) { 45 | // char might be signed on some systems so make sure we have 46 | // a simple unsigned char 47 | unsigned char b = static_cast(src[i]); 48 | if (is_cont(b)) continue; 49 | 50 | prev_char_byte = curr_char_byte; 51 | curr_char_byte = i; 52 | 53 | if (in_comment) { 54 | if (b == 0x0A) in_comment = false; 55 | continue; 56 | } 57 | if (!in_string && b == 0x21) { // '!' 58 | in_comment = true; 59 | continue; 60 | } 61 | if (esc_next) { esc_next = false; continue; } 62 | 63 | if (!in_string && is_ws(b)) { 64 | if (have_token) { 65 | size_t start = tok_start_byte; 66 | size_t end = (curr_char_byte == 0 ? 0 : prev_char_byte); 67 | size_t len = (end >= start) ? (end - start + 1) : 0; 68 | if (len > 0) { 69 | // do we have a string (start and end with ") 70 | bool q = (static_cast(src[start]) == 0x22) && 71 | (static_cast(src[end]) == 0x22); 72 | tokens.push_back(src.substr(start, len)); 73 | tokens_is_string.push_back(q); 74 | } 75 | have_token = false; 76 | } 77 | continue; 78 | } 79 | 80 | if (b == 0x22) { // '"' 81 | // peek ahead to see if we have a double "" escapement 82 | size_t j = i + 1; 83 | // We need the next character, not just the next byte, so we skip 84 | // continuation characters. 85 | while (j < nbytes && is_cont(static_cast(src[j]))) ++j; 86 | // Use `0x00` dummy character if we are at then end of the string 87 | unsigned char nextb = (j < nbytes) ? 
static_cast(src[j]) : 0x00; 88 | 89 | if (in_string && nextb == 0x22) { 90 | esc_next = true; // consume next '"' once 91 | } else { 92 | in_string = !in_string; 93 | } 94 | } 95 | 96 | if (!have_token) { 97 | have_token = true; 98 | tok_start_byte = curr_char_byte; 99 | } 100 | } 101 | 102 | list number_data = cpp_parse_praat_numbers(tokens); 103 | writable::list out(4); 104 | out[0] = tokens; // strings 105 | out[1] = tokens_is_string; // writable::logicals 106 | out[2] = number_data[0]; // prefix_len (integers) 107 | out[3] = number_data[1]; // value (doubles) 108 | out.attr("names") = writable::strings({"tokens", "is_string", "num_prefix", "num_value"}); 109 | return out; 110 | } 111 | 112 | 113 | [[cpp11::register]] 114 | list cpp_parse_praat_numbers(strings x) { 115 | R_xlen_t n = x.size(); 116 | writable::doubles out_len(n); // store prefix length 117 | writable::doubles out_val(n); // store parsed value (for testing) 118 | 119 | for (R_xlen_t i = 0; i < n; i++) { 120 | if (x[i] == NA_STRING) { 121 | out_len[i] = NA_REAL; 122 | out_val[i] = NA_REAL; 123 | continue; 124 | } 125 | 126 | const char* str = Rf_translateCharUTF8(x[i]); 127 | 128 | // Reject leading '.' per Praat rule (".4" invalid) 129 | if (str[0] == '.' 
) { 130 | out_len[i] = 0; 131 | out_val[i] = NA_REAL; 132 | continue; 133 | } 134 | 135 | // Allow optional sign 136 | const char* p = str; 137 | if (*p == '+' || *p == '-') ++p; 138 | if (!(*p >= '0' && *p <= '9')) { 139 | out_len[i] = 0; 140 | out_val[i] = NA_REAL; 141 | continue; 142 | } 143 | 144 | char* endptr = nullptr; 145 | double val = std::strtod(str, &endptr); 146 | 147 | if (endptr == str) { 148 | out_len[i] = 0; 149 | out_val[i] = NA_REAL; 150 | } else { 151 | out_len[i] = static_cast(endptr - str); 152 | out_val[i] = val; 153 | } 154 | } 155 | 156 | // Return as a data.frame-like list 157 | writable::list res; 158 | res.push_back(out_len); 159 | res.push_back(out_val); 160 | res.attr("names") = writable::strings({"prefix_len", "value"}); 161 | return res; 162 | } 163 | -------------------------------------------------------------------------------- /R/pure-r-parser.R: -------------------------------------------------------------------------------- 1 | # Implementation of the textgrid parsing written in pure R. 2 | # This was ported to C++ for speed but it's important to have around 3 | # for unit tests as a reference implementation. 

# Pure-R counterpart of read_textgrid(): read the file at `path` with the
# given (or guessed) encoding and parse its lines.
r_read_textgrid <- function(path, file = NULL, encoding = NULL) {
  if (is.null(file)) {
    file <- basename(path)
  }

  if (is.null(encoding)) {
    encoding <- readr::guess_encoding(path)$encoding[1]
  }
  file_locale <- readr::locale(encoding = encoding)

  path |>
    readr::read_lines(locale = file_locale) |>
    r_read_textgrid_lines(file = file)
}

# Pure-R counterpart of read_textgrid_lines(): parse the lines of a textgrid
# and prepend a `file` column (NA when `file` is not given).
r_read_textgrid_lines <- function(lines, file = NULL) {
  if (is.null(file)) {
    file <- NA_character_
  }

  stopifnot(str_detect_any(lines, "ooTextFile"))

  lines |>
    r_parse_textgrid_lines() |>
    tibble::add_column(file = file, .before = 1) |>
    tibble::as_tibble()
}

# Tokenize the lines character by character, parse each tier from the tokens
# and bind the tiers into one dataframe with typed columns.
r_parse_textgrid_lines <- function(lines) {
  tg_characters <- lines |>
    # collapse into one string
    stringr::str_c(collapse = "\n") |>
    # concat one trailing space
    stringr::str_c(" ") |>
    # split into individual characters
    stringr::str_split("") |>
    unlist()

  tg_tokens <- r_tokenize_textgrid_chars(tg_characters)
  tier_indices <- find_tier_boundaries(tg_tokens)
  tier_types <- tg_tokens[tier_indices$start] |> unlist()

  tier_info_df <- data.frame(
    tier_num = seq_along(tier_types),
    tier_type = tier_types,
    tier_start = tier_indices$start,
    tier_end = tier_indices$end
  )

  data <- tier_info_df |>
    split(~tier_num) |>
    lapply(parse_tier, tg_tokens = tg_tokens) |>
    dplyr::bind_rows()

  data[["tier_xmin"]] <- as.numeric(data[["tier_xmin"]])
  data[["tier_xmax"]] <- as.numeric(data[["tier_xmax"]])
  data[["xmin"]] <- as.numeric(data[["xmin"]])
  data[["xmax"]] <- as.numeric(data[["xmax"]])
  data[["tier_num"]] <- as.integer(data[["tier_num"]])
  data[["annotation_num"]] <- as.integer(data[["annotation_num"]])
  data[["text"]] <- as.character(data[["text"]])
  data
}

# Turn a vector of single characters into a list of values: numbers (numeric)
# and strings (character, unquoted and with "" unescaped). The last token is
# only collected if the input ends in whitespace; r_parse_textgrid_lines()
# appends a trailing space for that reason.
r_tokenize_textgrid_chars <- function(all_char) {
  # The parser rules here follow the textgrid specifications
  # EXCEPT
  # when they contradict the behavior of Praat.exe. For example, the specs says
  # the main literals are freestanding strings and numbers, where freestanding
  # means that they have a whitespace or boundary (newline or file start/end).
  # But Praat.exe can handle numbers like "10.00!comment". So, this parser
  # gathers freestanding literals but only keeps ones that are strings or
  # start with a valid number (the non-numeric characters are lopped off.)

  # Same delimiters as the C++ tokenizer: space, tab, CR, LF.
  # NOTE(review): "\r\n" is listed as well because stringr::str_split(x, "")
  # splits on character boundaries, which are expected to keep a CR LF pair
  # together as one element - confirm against the stringr documentation.
  whitespace_chars <- c(" ", "\t", "\r", "\n", "\r\n")
  newline_chars <- c("\n", "\r\n")
  n_char <- length(all_char)

  in_strong_comment <- FALSE # Comment mode: ! to new line \n
  in_string <- FALSE         # String mode: "Quote to quote"
  in_escaped_quote <- FALSE  # Escaped quote: "" inside of a string

  token_start <- integer(0)        # Start of current token
  values <- vector(mode = "list")  # Collects completed values

  for (i in seq_along(all_char)) {
    cur_value_ready <- length(token_start) != 0
    c <- all_char[i]
    c_is_whitespace <- c %in% whitespace_chars
    c_starts_string <- c == "\""

    # Comments start with ! and end with \n. Skip characters in this mode.
    if (!in_string & c == "!") {
      in_strong_comment <- TRUE
      next
    }
    if (in_strong_comment) {
      if (c %in% newline_chars) in_strong_comment <- FALSE
      next
    }

    # Whitespace delimits values so collect values if we see whitespace
    if (c_is_whitespace & !in_string) {
      # Skip whitespace if no values collected so far
      if (!cur_value_ready) next

      total_value <- all_char[seq(token_start, i - 1)] |>
        paste0(collapse = "")
      is_string <- all_char[token_start] == "\"" && all_char[i - 1] == "\""

      # Collect only numbers and strings
      if (r_tg_parse_is_number(total_value)) {
        # Keep only the numeric part.
        total_value <- total_value |> r_tg_parse_extract_number()
        values <- c(values, total_value)
      } else if (is_string) {
        values <- c(values, total_value)
      }
      token_start <- integer(0)
      next
    }

    # Skip the second quote of a "" pair
    if (in_escaped_quote) {
      in_escaped_quote <- !in_escaped_quote
      next
    }

    # Start or close string mode if we see "
    if (c_starts_string) {
      # Check for "" escapes. Past the last character there is nothing to
      # peek at, so use an empty string instead of indexing out of bounds.
      peek_c <- if (i < n_char) all_char[i + 1] else ""
      if (peek_c == "\"" & in_string) {
        in_escaped_quote <- TRUE
      } else {
        in_string <- !in_string
      }
    }

    if (!cur_value_ready) {
      token_start <- i
    }
  }

  values |>
    lapply(r_tg_parse_convert_value)
}

# A numeric token is:
#   string start
#   (optional plus or minus sign)
#   digit(s)
#   (optional decimal point and optional digit(s))
#   (optional exponent: e or E, optional sign, digit(s))
# Anything after the numeric part is ignored.
# NOTE(review): unlike the strtod()-based C++ parser, this pattern does not
# recognize hexadecimal literals such as "0xA" (it matches only the "0").
r_tg_parse_is_number <- function(x) {
  .NUM_RE <- "^[+-]?\\d+(?:\\.\\d*)?(?:[eE][+-]?\\d+)?"
  stringr::str_detect(x, .NUM_RE)
}

# Extract the numeric part at the start of `x` (same pattern as
# r_tg_parse_is_number()) and convert it to a number.
r_tg_parse_extract_number <- function(x) {
  .NUM_RE <- "^[+-]?\\d+(?:\\.\\d*)?(?:[eE][+-]?\\d+)?"
  x |>
    stringr::str_extract(.NUM_RE) |>
    as.numeric()
}

# Strings lose their surrounding quotes and have "" turned back into ".
# Numbers are returned unchanged.
r_tg_parse_convert_value <- function(x) {
  if (is.character(x)) {
    # unquote strings
    x <- substr(x, 2, nchar(x) - 1)
    # undo "" escapement
    x <- stringr::str_replace_all(x, "\"\"", "\"")
  }
  x
}

--------------------------------------------------------------------------------
/R/legacy.R:
--------------------------------------------------------------------------------
# Old version of parsing code


#' @export
#' @rdname read_textgrid
#' @order 3
legacy_read_textgrid <- function(path, file = NULL, encoding = NULL) {
  if (is.null(file)) {
    file <- basename(path)
  }

  if (is.null(encoding)) {
    encoding <- readr::guess_encoding(path)$encoding[1]
  }
  file_locale <- readr::locale(encoding = encoding)

  path |>
    readr::read_lines(locale = file_locale) |>
    legacy_read_textgrid_lines(file = file)
}

#' @export
#' @rdname read_textgrid
#' @order 4
legacy_read_textgrid_lines <- function(lines, file = NULL) {
  if (is.null(file)) {
    file <- NA_character_
  }

  stopifnot(str_detect_any(lines, "ooTextFile"))

  df <- lines |>
    .v1_parse_textgrid_lines() |>
    tibble::as_tibble() |>
    tibble::add_column(file = file, .before = 1)

  df[["tier_name"]] <- .v1_str_unescape_quote(df[["tier_name"]])
  df[["text"]] <- .v1_str_unescape_quote(df[["text"]])
  df
}

# Parse each "item [n]:" section (one per tier) and bind the results.
.v1_parse_textgrid_lines <- function(lines) {
  lines |>
    .v1_slice_sections("item") |>
    purrr::map(.v1_parse_item_lines) |>
    dplyr::bind_rows()
}

# Split `lines` into a list of sections. A section starts at an indented
# "<section_head> [n]:" line and runs up to the line before the next such
# header (the last section runs to the end of `lines`).
.v1_slice_sections <- function(lines, section_head) {
  re <- sprintf("^\\s+%s ?\\[\\d+\\]:?", section_head)
  starts <- stringr::str_which(lines, re)
  ends <- c(starts[-1] - 1, length(lines))
  purrr::map2(starts, ends, function(x, y) lines[seq(x, y, by = 1)])
}

# Parse the lines of one tier: read the tier-level fields, parse the
# annotations according to the tier class and prepend the tier columns.
.v1_parse_item_lines <- function(lines_items) {
  item_num <- lines_items[1] |>
    stringr::str_extract("\\d+") |>
    as.numeric()

  tier_type <- .v1_get_field(lines_items, "class")
  tier_name <- .v1_get_field(lines_items, "name")
  tier_xmin <- .v1_get_field_dbl(lines_items, "xmin")
  tier_xmax <- .v1_get_field_dbl(lines_items, "xmax")

  stopifnot(tier_type %in% c("IntervalTier", "TextTier"))

  if (tier_type == "IntervalTier") {
    df <- .v1_parse_interval_tier(lines_items)
  } else {
    df <- .v1_parse_point_tier(lines_items)
  }

  df[["xmin"]] <- as.numeric(df[["xmin"]])
  df[["xmax"]] <- as.numeric(df[["xmax"]])

  tibble::add_column(
    .data = df,
    tier_num = item_num,
    tier_name = tier_name,
    tier_type = tier_type,
    tier_xmin = tier_xmin,
    tier_xmax = tier_xmax,
    .before = 1
  )
}

# One row per "intervals [n]:" section with its xmin, xmax and text fields.
.v1_parse_interval_tier <- function(lines_interval_tier) {
  lines_interval_tier |>
    .v1_slice_sections("intervals") |>
    purrr::map(.v1_combine_text_lines) |>
    purrr::map(.v1_get_field_list, fields = c("xmin", "xmax", "text")) |>
    purrr::imap(.v1_add_annotation_num) |>
    dplyr::bind_rows()
}

# If the text field spans multiple lines, combine them into one string
.v1_combine_text_lines <- function(lines_annotation) {
  loc_mark_start <- lines_annotation |> .v1_which_field("mark")
  loc_text_start <- lines_annotation |> .v1_which_field("text")
  # Keep only the first matching line. A continuation line of a multi-line
  # text can itself contain "text = " or "mark = ", which used to put a
  # vector of length > 1 into the `if ()` condition below (an error in
  # R >= 4.2). If no line matches, this is NA and the `if ()` still fails.
  loc_text_start <- sort(c(loc_text_start, loc_mark_start))[1]

  if (loc_text_start != length(lines_annotation)) {
    loc_text_rest <- seq(loc_text_start + 1, length(lines_annotation), by = 1)
    loc_text_full <- c(loc_text_start, loc_text_rest)
    lines_annotation[loc_text_start] <- lines_annotation[loc_text_full] |>
      paste0(collapse = "\n")
    lines_annotation <- lines_annotation[-loc_text_rest]
  }
  lines_annotation
}

# Parse the points of a "TextTier" into the same columns used for intervals
# (xmin, xmax, text, annotation_num).
.v1_parse_point_tier <- function(lines_point_tier) {
  no_points <- .v1_str_detect_any(lines_point_tier, "points: size = 0")

  if (!no_points) {
    df <- lines_point_tier |>
      .v1_slice_sections("points") |>
      purrr::map(.v1_get_field_list, fields = c("number", "mark")) |>
      purrr::imap(.v1_add_annotation_num) |>
      dplyr::bind_rows()

    # We treat points as zero-width intervals
    df[["xmin"]] <- df[["number"]]
    df[["xmax"]] <- df[["number"]]
    df[["text"]] <- df[["mark"]]
    df[["mark"]] <- NULL
    df[["number"]] <- NULL
    df <- df[c("xmin", "xmax", "text", "annotation_num")]
  } else {
    # A point interval with no points should be represented in the results.
    df <- data.frame(
      xmin = NA,
      xmax = NA,
      text = NA_character_,
      annotation_num = NA,
      stringsAsFactors = FALSE
    )
  }

  df
}

# Callback for purrr::imap(): store the position `y` of annotation `x` in its
# `annotation_num` element.
.v1_add_annotation_num <- function(x, y) {
  x[["annotation_num"]] <- y
  x
}

# Look up several fields at once, returning a list named after `fields`.
.v1_get_field_list <- function(lines, fields) {
  stats::setNames(
    lapply(fields, function(x) .v1_get_field(lines, x)),
    fields
  )
}

# Find first match of "[field] = [value]", returning [value] as a trimmed,
# unquoted string
.v1_get_field <- function(lines, field) {
  re <- paste0("(?<=", field, " = ).+") |>
    # "text = .*" needs to capture newlines too
    stringr::regex(dotall = TRUE)

  lines |>
    stringr::str_extract(re) |>
    .v1_remove_na() |>
    utils::head(1) |>
    stringr::str_trim() |>
    .v1_str_unquote()
}

# Find the positions of all lines that contain "[field] = [value]"
.v1_which_field <- function(lines, field) {
  re <- paste0("(?<=", field, " = ).+")
  lines |>
    stringr::str_which(re)
}

# Find first match of "[field] = [value]", returning [value] as a number
.v1_get_field_dbl <- function(lines, field) {
  as.numeric(.v1_get_field(lines, field))
}

# Drop NA elements
.v1_remove_na <- function(xs) {
  xs[!is.na(xs)]
}

# Remove one leading and one trailing double quote
.v1_str_unquote <- function(xs) {
  gsub("^\"|\"$", "", xs)
}

# Turn escaped quotes ("") back into single quotes (")
.v1_str_unescape_quote <- function(xs) {
  gsub('""', '"', xs, perl = TRUE)
}

# Does any element of `xs` match `pattern`?
.v1_str_detect_any <- function(xs, pattern) {
  any(stringr::str_detect(xs, pattern))
}

--------------------------------------------------------------------------------
/tests/testthat/test-read-textgrid.R:
--------------------------------------------------------------------------------
test_that("reading in point tiers", {
  path <- testthat::test_path("test-data/points.TextGrid")
  tg <- read_textgrid(path)
  expect_equal(nrow(tg), 3)

  # Validate against v1 and pure-r version
  tg2 <- legacy_read_textgrid(path)
  expect_equal(tg, tg2)
  tg3 <- r_read_textgrid(path)
  expect_equal(tg, tg3)
})

test_that("reading in empty point tiers", {
  path <- testthat::test_path("test-data/Mary_John_bell.TextGrid")
  tg <- read_textgrid(path)
  expect_equal(nrow(tg), 3)

  # Validate against v1 and pure-r version
  tg2 <- legacy_read_textgrid(path)
  expect_equal(tg, tg2)
  tg3 <- r_read_textgrid(path)
  expect_equal(tg, tg3)
})

test_that("result is a tibble", {
  path <- testthat::test_path("test-data/Mary_John_bell.TextGrid")
  tg <- read_textgrid(path)
  testthat::expect_s3_class(tg, "tbl")
})

test_that("we can parse numbers supported by Praat.exe", {
  # Files here are minimal tests for numbers that can be parsed by Praat.exe.
  # We need to match what Praat.exe supports, not what they say they support.
  path <- testthat::test_path("test-data/praat-test/okay-digit-dot-space.TextGrid")
  # if (interactive()) writeLines(readLines(path))
  tg <- read_textgrid(path)
  expect_equal(tg$xmax, 1.0)

  path <- testthat::test_path("test-data/praat-test/okay-plus-digit-or-minus-digit.TextGrid")
  # if (interactive()) writeLines(readLines(path))
  tg <- read_textgrid(path)
  expect_equal(tg$tier_xmin, -0.3)
  expect_equal(tg$tier_xmax, 2.0)

  path <- testthat::test_path("test-data/praat-test/okay-scientific-notation.TextGrid")
  # if (interactive()) writeLines(readLines(path))
  tg <- read_textgrid(path)
  expect_equal(tg$tier_xmin, c(0, 0, 0))
  expect_equal(tg$tier_xmax, c(20, 20, 20))
  expect_equal(tg$xmin, c(0, 0.5, 10))
  expect_equal(tg$xmax, c(0.5, 10, 20))

  path <- testthat::test_path("test-data/praat-test/okay-hex-numbers.TextGrid")
  # if (interactive()) writeLines(readLines(path))
  tg <- read_textgrid(path)
  expect_equal(tg$tier_xmin, c(0, 0, 0, 0))
  expect_equal(tg$tier_xmax, c(3, 3, 3, 3))
  expect_equal(tg$xmin, c(0, 0.5, 1.5, 2.5))
  expect_equal(tg$xmax, c(0.5, 1.5, 2.5, 3))

  path <- testthat::test_path("test-data/praat-test/okay-real-with-trailing-characters.TextGrid")
  # if (interactive()) writeLines(readLines(path))
  tg <- read_textgrid(path)
  expect_equal(tg$tier_xmin, c(0, 0, 0))
  expect_equal(tg$tier_xmax, c(20, 20, 20))
  expect_equal(tg$xmin, c(0, 0.5, 10))
  expect_equal(tg$xmax, c(0.5, 10, 20))
})

test_that("example_textgrid works", {
  path <- example_textgrid()
  tg <- read_textgrid(path)
  expect_equal(nrow(tg), 3)

  # Validate against v1 and pure-r version
  tg2 <- legacy_read_textgrid(path)
  expect_equal(tg, tg2)
  tg3 <- r_read_textgrid(path)
  expect_equal(tg, tg3)
})

test_that("comment textgrid works", {
  path <- testthat::test_path("test-data/comment.TextGrid")
  tg <- read_textgrid(path)
  testthat::expect_s3_class(tg, "tbl")
  expect_equal(nrow(tg), 3)
})

test_that("short format textgrid works", {
  path <- testthat::test_path("test-data/short.TextGrid")
  tg <- read_textgrid(path)
  testthat::expect_s3_class(tg, "tbl")
  expect_equal(nrow(tg), 3)
})

test_that("escaped quotes (\"\") are converted to single (\")", {
  path <- testthat::test_path("test-data/quoted.TextGrid")
  tg <- read_textgrid(path)
  has_double <- any(grepl('""', tg$text))
  has_single <- any(grepl('"', tg$text))
  expect_false(has_double)
  expect_true(has_single)

  # Validate against v1 and pure-r version
  tg2 <- legacy_read_textgrid(path)
  expect_equal(tg, tg2)
  tg3 <- r_read_textgrid(path)
  expect_equal(tg, tg3)
})

test_that("can read in hard-to-parse file", {
  path <- testthat::test_path("test-data/hard-to-parse.TextGrid")
  tg <- read_textgrid(path)

  # Validate against pure r
  tg4 <- r_read_textgrid(path)
  tg4$file <- "hard-to-parse.TextGrid"
  expect_equal(tg, tg4)

  # a version of the TextGrid opened and saved by Praat to a long TextGrid
  path2 <- testthat::test_path("test-data/hard-to-parse-normalized.TextGrid")
  tg2 <- read_textgrid(path2)
  tg2$file <- "hard-to-parse.TextGrid"
  expect_equal(tg, tg2)

  # Validate against v1 opening the normalized version
  tg3 <- legacy_read_textgrid(path2)
  tg3$file <- "hard-to-parse.TextGrid"
  expect_equal(tg, tg3)
})



test_that("encoding support", {
  example_textgrid(1) |>
    read_textgrid(encoding = "UTF-8") |>
    nrow() |>
    expect_equal(3)

  example_textgrid(1) |>
    read_textgrid() |>
    nrow() |>
    expect_equal(3)

  example_textgrid(2) |>
    read_textgrid(encoding = "UTF-16") |>
    nrow() |>
    expect_equal(3)

  example_textgrid(2) |>
    read_textgrid() |>
    nrow() |>
    expect_equal(3)
})

test_that("reading in ELAN-generated textgrids (#11)", {
  path <- testthat::test_path("test-data/elan.TextGrid")

  path |>
    read_textgrid() |>
    nrow() |>
    expect_equal(5)
})

test_that("pivoting words on a single tier", {
  path <- testthat::test_path("test-data/nested-intervals.TextGrid")

  data <- path |>
    read_textgrid()

  p1 <- data |> pivot_textgrid_tiers("utterance")
  expect_equal(p1$utterance, "hug daddy")

  p1 |>
    hasName(c("utterance_xmin", "utterance_xmid", "utterance_xmax")) |>
    all() |>
    expect_true()

  data |>
    pivot_textgrid_tiers("fake name") |>
    expect_error("must be used")
})

test_that("pivoting works with multiple tiers", {
  path <- testthat::test_path("test-data/nested-intervals.TextGrid")
  phones <- c("sil", "HH", "AH1", "G", "sp", "D", "AE1", "D", "IY0", "sp", "")
  words <- rep(c("", "hug", "", "daddy", ""), c(1, 3, 1, 4, 2))

  data <- path |>
    read_textgrid()

  p2 <- data |> pivot_textgrid_tiers(c("words", "phones"))

  p2$words |>
    expect_equal(words)

  p2$phones |>
    expect_equal(phones)

  p2 |>
    hasName(c("words_xmin", "words_xmid", "words_xmax")) |>
    all() |>
    expect_true()

  p2 |>
    hasName(c("phones_xmin", "phones_xmid", "phones_xmax")) |>
    all() |>
    expect_true()

})



test_that("we match Praat.exe's parsing behavior", {
  c("+1.0", "000", "3e", "3E", "-2", "0xA", ".5", "+.0") |>
    cpp_parse_praat_numbers() |>
    _$value |>
    expect_equal(c(1, 0, 3, 3, -2, 10, NA_real_, NA_real_))
})

--------------------------------------------------------------------------------
/R/readtextgrid.R:
-------------------------------------------------------------------------------- 1 | 2 | #' Read a textgrid file into a tibble 3 | #' 4 | #' @rdname read_textgrid 5 | #' @param path a path to a textgrid 6 | #' @param lines alternatively, the lines of a textgrid file 7 | #' @param encoding the encoding of the textgrid. The default value `NULL` uses 8 | #' [readr::guess_encoding()] to guess the encoding of the textgrid. If an 9 | #' encoding is provided, it is forwarded to [readr::locale()] and 10 | #' [readr::read_lines()]. 11 | #' @param file an optional value to use for the `file` column. For 12 | #' `read_textgrid()`, the default is the base filename of the input file. For 13 | #' `read_textgrid_lines()`, the default is `NA`. 14 | #' @return a tibble with one row per textgrid annotation 15 | #' 16 | #' @details The `legacy_read_textgrid` functions are the original textgrid 17 | #' parsers provided by the package. They assume that the TextGrid file is a 18 | #' "long" format textgrid; this is the default format used by "Save as text 19 | #' file..." in Praat. 20 | #' 21 | #' The current `read_textgrid()` functions are more 22 | #' flexible and can read in "short" format textgrids and textgrids with 23 | #' comments. 24 | #' 25 | #' See [TextGrid file formats](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) 26 | #' for a description of the textgrid file format. Note that this package does 27 | #' not strictly adhere to the format as described in this document. For example, 28 | #' the document says that numbers should be freestanding (surrounded by spaces 29 | #' or string boundaries), but Praat.exe can handle malformed numbers like 30 | #' `100ms`. Therefore, we tried to implement a parser that matched what Praat 31 | #' actually handles. 
#'
#' @export
#' @order 1
#' @examples
#' tg <- system.file("Mary_John_bell.TextGrid", package = "readtextgrid")
#' read_textgrid(tg)
read_textgrid <- function(path, file = NULL, encoding = NULL) {
  if (is.null(file)) {
    file <- basename(path)
  }

  if (is.null(encoding)) {
    encoding <- readr::guess_encoding(path)$encoding[1]
    # The guess can come back without any candidate, which leaves `encoding`
    # as NA -- not a usable value for readr::locale(). Fall back to readr's
    # own default encoding instead of failing on the NA.
    if (is.na(encoding)) {
      encoding <- "UTF-8"
    }
  }
  file_locale <- readr::locale(encoding = encoding)

  path |>
    readr::read_lines(locale = file_locale) |>
    read_textgrid_lines(file = file)
}

#' @rdname read_textgrid
#' @order 2
#' @export
read_textgrid_lines <- function(lines, file = NULL) {
  if (is.null(file)) {
    file <- NA_character_
  }

  # Cheap sanity check that the input is a Praat text file: its header names
  # the file type "ooTextFile".
  stopifnot(str_detect_any(lines, "ooTextFile"))

  lines |>
    parse_textgrid_lines() |>
    tibble::add_column(file = file, .before = 1) |>
    tibble::as_tibble()
}

# Convert the lines of a textgrid into a data frame with one row per
# annotation. The `file` column is added by the caller.
parse_textgrid_lines <- function(lines) {
  tg_text <- lines |>
    # collapse into one string
    stringr::str_c(collapse = "\n") |>
    # concat one trailing space
    stringr::str_c(" ")

  tg_tokens <- tokenize_textgrid(tg_text)
  tier_indices <- find_tier_boundaries(tg_tokens)
  tier_types <- tg_tokens[tier_indices$start] |> unlist()

  # One row per tier: where its tokens start and end in `tg_tokens`
  tier_info_df <- data.frame(
    tier_num = seq_along(tier_types),
    tier_type = tier_types,
    tier_start = tier_indices$start,
    tier_end = tier_indices$end
  )

  data <- tier_info_df |>
    split(~tier_num) |>
    lapply(parse_tier, tg_tokens = tg_tokens) |>
    dplyr::bind_rows()

  # Pin the column types so every textgrid yields the same schema
  data[["tier_xmin"]] <- as.numeric(data[["tier_xmin"]])
  data[["tier_xmax"]] <- as.numeric(data[["tier_xmax"]])
  data[["xmin"]] <- as.numeric(data[["xmin"]])
  data[["xmax"]] <- as.numeric(data[["xmax"]])
  data[["tier_num"]] <- as.integer(data[["tier_num"]])
  data[["annotation_num"]] <- as.integer(data[["annotation_num"]])
  data[["text"]] <- as.character(data[["text"]])
  data
}


# Build the data frame for one tier.
#
# `tier_info` is a one-row data frame with `tier_num`, `tier_type`,
# `tier_start` and `tier_end`; the last two index into `tg_tokens`.
parse_tier <- function(tier_info, tg_tokens) {
  tier_tokens <- tg_tokens[tier_info$tier_start:tier_info$tier_end]

  # Every tier starts with 5 tier-level tokens:
  #   class, tier name, tier xmin, tier xmax, number of intervals/points
  # A tier with no annotations consists of just these 5 tokens. An interval
  # tier normally has at least one interval, i.e. at least 8 tokens:
  #   the 5 above plus interval xmin, interval xmax, interval text
  N_TIER_HEADER_TOKENS <- 5

  if (length(tier_tokens) == N_TIER_HEADER_TOKENS) {
    # No annotations: represent the tier with a single row of NA values
    outer_df <- data.frame(
      tier_num = tier_info[["tier_num"]],
      tier_name = tier_tokens[[2]],
      tier_type = tier_tokens[[1]],
      tier_xmin = tier_tokens[[3]],
      tier_xmax = tier_tokens[[4]],
      xmin = NA_real_,
      xmax = NA_real_,
      text = NA_character_,
      annotation_num = NA_integer_
    )
    return(outer_df)
  }

  if (tier_info$tier_type == "IntervalTier") {
    marks_df <- make_intervals(tier_tokens, tg_tokens)
  } else if (tier_info$tier_type == "TextTier") {
    marks_df <- make_points(tier_tokens, tg_tokens)
  } else {
    # Previously an unknown type fell through both branches and failed with
    # "object 'marks_df' not found".
    rlang::abort("TextGrid appears misformatted")
  }

  marks_df[["tier_num"]] <- tier_info[["tier_num"]]
  marks_df
}


# Build one row per interval from the tokens of an interval tier.
#
# `tg_tokens` is unused; it is kept so the signature matches `make_points()`.
# `tier_num` is a placeholder that `parse_tier()` overwrites.
make_intervals <- function(tier_tokens, tg_tokens) {
  # Skip first five elements (tier-level data)
  interval_data <- tier_tokens[-(1:5)]
  # Intervals are consecutive (xmin, xmax, text) triples
  start_idx <- seq(1, length(interval_data) - 2, by = 3)

  data.frame(
    tier_num = NA_integer_,
    tier_name = tier_tokens[[2]],
    tier_type = tier_tokens[[1]],
    tier_xmin = tier_tokens[[3]],
    tier_xmax = tier_tokens[[4]],
    xmin = interval_data[start_idx] |> unlist(),
    xmax = interval_data[start_idx + 1] |> unlist(),
    text = interval_data[start_idx + 2] |> unlist(),
    annotation_num = seq_along(start_idx)
  )
}

# Build one row per point from the tokens of a point tier (TextTier).
#
# `tg_tokens` is unused; it is kept so the signature matches
# `make_intervals()`. `tier_num` is a placeholder that the caller overwrites.
make_points <- function(tier_tokens, tg_tokens) {
  # Skip first five elements (tier-level data)
  point_data <- tier_tokens[-(1:5)]
  # Points are consecutive (time, text) pairs
  start_idx <- seq(1, length(point_data) - 1, by = 2)

  data.frame(
    tier_num = NA_integer_,
    tier_name = tier_tokens[[2]],
    tier_type = tier_tokens[[1]],
    tier_xmin = tier_tokens[[3]],
    tier_xmax = tier_tokens[[4]],
    # a point has a single time, so it is used for both xmin and xmax
    xmin = point_data[start_idx] |> unlist(),
    xmax = point_data[start_idx] |> unlist(),
    text = point_data[start_idx + 1] |> unlist(),
    annotation_num = seq_along(start_idx)
  )
}

# Scan the textgrid text and return a list holding only its string and number
# tokens, in file order. Strings are unquoted; numbers are numeric values.
tokenize_textgrid <- function(tg_text) {
  # C++ scan for tokens, run under the C numeric locale
  res <- withr::with_locale(
    c(LC_NUMERIC = "C"),
    cpp_tg_scan_tokens(tg_text)
  )
  toks <- res$tokens
  is_string <- res$is_string
  # NOTE(review): `num_prefix` presumably describes the numeric prefix the
  # scanner found in the token -- confirm against src/code.cpp
  is_number <- (res$num_prefix != 0) & !is_string
  keep <- is_number | is_string

  toks <- toks[keep]
  out <- vector("list", length(toks))

  strings <- toks[is_string[keep]]
  # drop the surrounding quotes, then unescape doubled quotes
  strings <- substring(strings, 2L, nchar(strings) - 1L)
  strings <- gsub('""', '"', strings, fixed = TRUE)
  out[is_string[keep]] <- strings
  out[is_number[keep]] <- res$num_value[is_number]

  out
}



# Locate the first and last token of each tier in `tg_tokens`.
#
# Returns `list(start, end)` with one entry per tier; both are empty when the
# textgrid declares zero tiers. Aborts with "TextGrid appears misformatted"
# when the token stream is too short, a tier count or size is not a number, or
# a tier does not start with "IntervalTier" or "TextTier".
find_tier_boundaries <- function(tg_tokens) {
  # TODO:
  # TextGrid_checkInvariants_e() in Praat source provides strong and weak
  # invariants
  # https://github.com/praat/praat.github.io/blob/master/fon/TextGrid.cpp#L1402

  abort_misformatted <- function() {
    rlang::abort("TextGrid appears misformatted")
  }
  is_number <- function(x) {
    is.numeric(x) && length(x) == 1L && !is.na(x)
  }
  is_tier_type <- function(x) {
    is.character(x) && length(x) == 1L &&
      is.element(x, c("IntervalTier", "TextTier"))
  }

  # A textgrid interval might legitimately have the text "Tier" in it so
  # don't use regexes. Just consume tokens.
  # The fifth token is the number of tiers; the first tier starts at token 6.
  n_tokens <- length(tg_tokens)
  if (n_tokens < 5L || !is_number(tg_tokens[[5]]) || tg_tokens[[5]] < 0) {
    abort_misformatted()
  }

  num_tiers <- tg_tokens[[5]]
  tier_starts <- integer(num_tiers)
  tier_ends <- integer(num_tiers)
  tier_start <- 6L

  for (tier_i in seq_len(num_tiers)) {
    # Tier header: class, name, xmin, xmax, size
    size_index <- tier_start + 4
    if (size_index > n_tokens) {
      abort_misformatted()
    }

    type <- tg_tokens[[tier_start]]
    size <- tg_tokens[[size_index]]
    if (!is_tier_type(type) || !is_number(size)) {
      abort_misformatted()
    }
    # promote negative size to 0
    size <- max(c(0, size))

    if (type == "IntervalTier") {
      # 3 tokens per interval
      tier_end <- size_index + 3 * size
    } else {
      # 2 tokens per point but they can have size 0
      tier_end <- size_index + 2 * size
    }
    if (tier_end > n_tokens) {
      abort_misformatted()
    }

    # Only write slots that exist, so `start` and `end` always have one entry
    # per tier (the old unconditional `tier_starts[1] <- 6L` grew the vector
    # when there were no tiers).
    tier_starts[tier_i] <- tier_start
    tier_ends[tier_i] <- tier_end
    tier_start <- tier_end + 1
  }

  list(start = tier_starts, end = tier_ends)
}


# TRUE if any element of `xs` matches `pattern`
str_detect_any <- function(xs, pattern) {
  any(stringr::str_detect(xs, pattern))
}


#' Locate the path of an example textgrid file
#'
#' Locate the path of an example textgrid file
#'
#' @param which index of the textgrid to load
#' @return Path of the selected example textgrid (by default
#'   `"Mary_John_bell.TextGrid"`) bundled with the `readtextgrid` package.
#'
#' @details This function is a wrapper over [`system.file()`] to locate the
#'   paths to bundled textgrids. These files are used to test or demonstrate
#'   functionality of the package.
#'
#' Three files are included:
#'
#' 1. `"Mary_John_bell.TextGrid"` - the default TextGrid created by Praat's
#'    Create TextGrid command. This file is saved with UTF-8 encoding.
#' 2. `"utf_16_be.TextGrid"` - a TextGrid with some IPA characters entered using
#'    Praat's IPA character selector.
#'    This file is saved with UTF-16 encoding.
#' 3. `"nested-intervals.TextGrid"` - A textgrid containing an `"utterance"`
#'    tier, a `"words"` tier, and a `"phones"` tier. This file is typical of
#'    forced alignment textgrids where utterances contain words which contain
#'    speech segments. In this case, alignment was made by hand so that word
#'    and phone boundaries do not correspond exactly.
#'
#' @export
example_textgrid <- function(which = 1) {
  # Bundled example files; `which` indexes into this vector.
  bundled <- c("Mary_John_bell.TextGrid", "utf_16_be.TextGrid", "nested-intervals.TextGrid")
  selected <- bundled[which]

  # Resolve the file name inside the installed package
  system.file(selected, package = "readtextgrid")
}
-------------------------------------------------------------------------------- /inst/draw-tg-parts.Collection: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "Collection" 3 | 4 | size = 8 5 | item []: 6 | item [1]: 7 | class = "TextGrid" 8 | name = "Mary_John_bell" 9 | xmin = 0 10 | xmax = 1 11 | tiers? 12 | size = 3 13 | item []: 14 | item [1]: 15 | class = "IntervalTier" 16 | name = "Mary" 17 | xmin = 0 18 | xmax = 1 19 | intervals: size = 1 20 | intervals [1]: 21 | xmin = 0 22 | xmax = 1 23 | text = "" 24 | item [2]: 25 | class = "IntervalTier" 26 | name = "John" 27 | xmin = 0 28 | xmax = 1 29 | intervals: size = 1 30 | intervals [1]: 31 | xmin = 0 32 | xmax = 1 33 | text = "" 34 | item [3]: 35 | class = "TextTier" 36 | name = "bell" 37 | xmin = 0 38 | xmax = 1 39 | points: size = 0 40 | item [2]: 41 | class = "TextGrid" 42 | name = "blue-text" 43 | xmin = 0 44 | xmax = 1 45 | tiers? 
46 | size = 3 47 | item []: 48 | item [1]: 49 | class = "IntervalTier" 50 | name = "Mary" 51 | xmin = 0 52 | xmax = 1 53 | intervals: size = 1 54 | intervals [1]: 55 | xmin = 0 56 | xmax = 1 57 | text = "" 58 | item [2]: 59 | class = "IntervalTier" 60 | name = "John" 61 | xmin = 0 62 | xmax = 1 63 | intervals: size = 3 64 | intervals [1]: 65 | xmin = 0 66 | xmax = 0.1850158439093439 67 | text = "\s{0}" 68 | intervals [2]: 69 | xmin = 0.1850158439093439 70 | xmax = 0.7817226699265979 71 | text = "" 72 | intervals [3]: 73 | xmin = 0.7817226699265979 74 | xmax = 1 75 | text = "\s{1.000000}" 76 | item [3]: 77 | class = "IntervalTier" 78 | name = "" 79 | xmin = 0 80 | xmax = 1 81 | intervals: size = 1 82 | intervals [1]: 83 | xmin = 0 84 | xmax = 1 85 | text = "" 86 | item [3]: 87 | class = "TextGrid" 88 | name = "forehead-and-chin" 89 | xmin = 0 90 | xmax = 1 91 | tiers? 92 | size = 3 93 | item []: 94 | item [1]: 95 | class = "IntervalTier" 96 | name = "Mary" 97 | xmin = 0 98 | xmax = 1 99 | intervals: size = 1 100 | intervals [1]: 101 | xmin = 0 102 | xmax = 1 103 | text = "\s{1.00000 (1.000 / s)}" 104 | item [2]: 105 | class = "IntervalTier" 106 | name = "John" 107 | xmin = 0 108 | xmax = 1 109 | intervals: size = 1 110 | intervals [1]: 111 | xmin = 0 112 | xmax = 1 113 | text = "\s{Visible part 1.000000 seconds}" 114 | item [3]: 115 | class = "IntervalTier" 116 | name = "" 117 | xmin = 0 118 | xmax = 1 119 | intervals: size = 1 120 | intervals [1]: 121 | xmin = 0 122 | xmax = 1 123 | text = "\s{Total duration 1.000000 seconds}" 124 | item [4]: 125 | class = "TextGrid" 126 | name = "right-marginal-text" 127 | xmin = 0 128 | xmax = 1 129 | tiers? 
130 | size = 3 131 | item []: 132 | item [1]: 133 | class = "IntervalTier" 134 | name = "Mary" 135 | xmin = 0 136 | xmax = 1 137 | intervals: size = 1 138 | intervals [1]: 139 | xmin = 0 140 | xmax = 1 141 | text = "\s{Mary} 142 | \s{(1)}" 143 | item [2]: 144 | class = "IntervalTier" 145 | name = "John" 146 | xmin = 0 147 | xmax = 1 148 | intervals: size = 1 149 | intervals [1]: 150 | xmin = 0 151 | xmax = 1 152 | text = "\s{John} 153 | \s{(1)}" 154 | item [3]: 155 | class = "IntervalTier" 156 | name = "" 157 | xmin = 0 158 | xmax = 1 159 | intervals: size = 1 160 | intervals [1]: 161 | xmin = 0 162 | xmax = 1 163 | text = "\s{bell} 164 | \s{(0)}" 165 | item [5]: 166 | class = "TextGrid" 167 | name = "left-marginal-text" 168 | xmin = 0 169 | xmax = 1 170 | tiers? 171 | size = 3 172 | item []: 173 | item [1]: 174 | class = "IntervalTier" 175 | name = "Mary" 176 | xmin = 0 177 | xmax = 1 178 | intervals: size = 1 179 | intervals [1]: 180 | xmin = 0 181 | xmax = 1 182 | text = "1" 183 | item [2]: 184 | class = "IntervalTier" 185 | name = "John" 186 | xmin = 0 187 | xmax = 1 188 | intervals: size = 1 189 | intervals [1]: 190 | xmin = 0 191 | xmax = 1 192 | text = "2" 193 | item [3]: 194 | class = "IntervalTier" 195 | name = "" 196 | xmin = 0 197 | xmax = 1 198 | intervals: size = 1 199 | intervals [1]: 200 | xmin = 0 201 | xmax = 1 202 | text = "3" 203 | item [6]: 204 | class = "TextGrid" 205 | name = "blue-text-mask" 206 | xmin = 0 207 | xmax = 1 208 | tiers? 
209 | size = 3 210 | item []: 211 | item [1]: 212 | class = "IntervalTier" 213 | name = "Mary" 214 | xmin = 0 215 | xmax = 1 216 | intervals: size = 1 217 | intervals [1]: 218 | xmin = 0 219 | xmax = 1 220 | text = "" 221 | item [2]: 222 | class = "IntervalTier" 223 | name = "John" 224 | xmin = 0 225 | xmax = 1 226 | intervals: size = 3 227 | intervals [1]: 228 | xmin = 0 229 | xmax = 0.1850158439093439 230 | text = "" 231 | intervals [2]: 232 | xmin = 0.1850158439093439 233 | xmax = 0.7817226699265979 234 | text = "" 235 | intervals [3]: 236 | xmin = 0.7817226699265979 237 | xmax = 1 238 | text = "" 239 | item [3]: 240 | class = "IntervalTier" 241 | name = "" 242 | xmin = 0 243 | xmax = 1 244 | intervals: size = 1 245 | intervals [1]: 246 | xmin = 0 247 | xmax = 1 248 | text = "" 249 | item [7]: 250 | class = "TextGrid" 251 | name = "left-marginal-text-mask" 252 | xmin = 0 253 | xmax = 1 254 | tiers? 255 | size = 3 256 | item []: 257 | item [1]: 258 | class = "IntervalTier" 259 | name = "Mary" 260 | xmin = 0 261 | xmax = 1 262 | intervals: size = 1 263 | intervals [1]: 264 | xmin = 0 265 | xmax = 1 266 | text = "" 267 | item [2]: 268 | class = "IntervalTier" 269 | name = "John" 270 | xmin = 0 271 | xmax = 1 272 | intervals: size = 1 273 | intervals [1]: 274 | xmin = 0 275 | xmax = 1 276 | text = "" 277 | item [3]: 278 | class = "IntervalTier" 279 | name = "" 280 | xmin = 0 281 | xmax = 1 282 | intervals: size = 1 283 | intervals [1]: 284 | xmin = 0 285 | xmax = 1 286 | text = "" 287 | item [8]: 288 | class = "TextGrid" 289 | name = "right-marginal-text-mask" 290 | xmin = 0 291 | xmax = 1 292 | tiers? 
293 | size = 3 294 | item []: 295 | item [1]: 296 | class = "IntervalTier" 297 | name = "Mary" 298 | xmin = 0 299 | xmax = 1 300 | intervals: size = 1 301 | intervals [1]: 302 | xmin = 0 303 | xmax = 1 304 | text = "" 305 | item [2]: 306 | class = "IntervalTier" 307 | name = "John" 308 | xmin = 0 309 | xmax = 1 310 | intervals: size = 1 311 | intervals [1]: 312 | xmin = 0 313 | xmax = 1 314 | text = "" 315 | item [3]: 316 | class = "IntervalTier" 317 | name = "" 318 | xmin = 0 319 | xmax = 1 320 | intervals: size = 1 321 | intervals [1]: 322 | xmin = 0 323 | xmax = 1 324 | text = "" 325 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # readtextgrid 17 | 18 | 19 | [![CRAN status](https://www.r-pkg.org/badges/version/readtextgrid)](https://CRAN.R-project.org/package=readtextgrid) 20 | [![readtextgrid status badge](https://tjmahr.r-universe.dev/readtextgrid/badges/version)](https://tjmahr.r-universe.dev/readtextgrid) 21 | [![R-CMD-check](https://github.com/tjmahr/readtextgrid/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tjmahr/readtextgrid/actions/workflows/R-CMD-check.yaml) 22 | 23 | 24 | readtextgrid parses [Praat](https://www.fon.hum.uva.nl/praat/) textgrids 25 | into tidy R dataframes. 26 | 27 | ## Features 28 | 29 | - **Simple**: Minimal package with two core functions (`read_textgrid()` and 30 | `read_textgrid_lines()`). 31 | - **Tidy**: Returns rectangular tibbles ready for downstream processing with 32 | dplyr and tidyr. 33 | - **Flexible**: Supports both long and short textgrid file formats. 34 | - **Fast**: Uses a compiled C++ tokenizer for high-throughput parsing. 
35 | 36 | 37 | ## Installation 38 | 39 | Install readtextgrid from CRAN: 40 | 41 | ``` r 42 | install.packages("readtextgrid") 43 | ``` 44 | 45 | **Development version**. Install a precompiled version of readtextgrid from 46 | R-universe: 47 | 48 | ``` r 49 | install.packages( 50 | "readtextgrid", 51 | repos = c("https://tjmahr.r-universe.dev", "https://cloud.r-project.org") 52 | ) 53 | ``` 54 | 55 | 56 | 57 | ## Basic usage 58 | 59 | Here is the example textgrid created by Praat. It was created using 60 | `New > Create TextGrid...` with default settings in Praat. 61 | 62 | Textgrid drawing from Praat with three tiers (Mary, John, and Bell) 63 | 64 | This textgrid is bundled with this R package. We can locate the file with 65 | `example_textgrid()`. We read in the textgrid with `read_textgrid()`. 66 | 67 | ```{r example, R.options = list(tibble.width = 100)} 68 | library(readtextgrid) 69 | 70 | # Locates path to an example textgrid bundled with this package 71 | tg <- example_textgrid() 72 | 73 | read_textgrid(path = tg) 74 | ``` 75 | 76 | The dataframe contains one row per annotation: one row for each interval on an 77 | interval tier and one row for each point on a point tier. If a point tier has no 78 | points, it is represented with a single row with `NA` values. 79 | 80 | The columns encode the following information: 81 | 82 | - `file` filename of the textgrid. By default this column uses the filename in 83 | `path`. A user can override this value by setting the `file` argument in 84 | `read_textgrid(path, file)`, which can be useful if textgrids are stored in 85 | speaker-specific folders. 86 | - `tier_num` the number of the tier (as in the left margin of Praat's textgrid 87 | editor) 88 | - `tier_name` the name of the tier (as in the right margin of Praat's textgrid 89 | editor) 90 | - `tier_type` the type of the tier. `"IntervalTier"` for interval tiers and 91 | `"TextTier"` for point tiers (this is the terminology used inside of the 92 | textgrid file format). 
93 | - `tier_xmin`, `tier_xmax` start and end times of the tier in seconds 94 | - `xmin`, `xmax` start and end times of the textgrid interval or point tier 95 | annotation in seconds 96 | - `text` the text in the annotation 97 | - `annotation_num` the number of the annotation in that tier (1 for the first 98 | annotation, etc.) 99 | 100 | ## Reading in directories of textgrids 101 | 102 | Suppose we have data on multiple speakers with one folder of textgrids per 103 | speaker. As an example, this package has a folder called `speaker-data` bundled 104 | with it representing five textgrids from each of 2 speakers. 105 | 106 | ``` 107 | 📂 speaker-data 108 | ├── 📂 speaker001 109 | │ ├── s2T01.TextGrid 110 | │ ├── s2T02.TextGrid 111 | │ ├── s2T03.TextGrid 112 | │ ├── s2T04.TextGrid 113 | │ └── s2T05.TextGrid 114 | └── 📂 speaker002 115 | ├── s2T01.TextGrid 116 | ├── s2T02.TextGrid 117 | ├── s2T03.TextGrid 118 | ├── s2T04.TextGrid 119 | └── s2T05.TextGrid 120 | ``` 121 | 122 | First, we create a vector of file-paths to read into R. 123 | 124 | ```{r} 125 | # Get the path of the folder bundled with the package 126 | data_dir <- system.file(package = "readtextgrid", "speaker-data") 127 | 128 | # Get the full paths to all the textgrids 129 | paths <- list.files( 130 | path = data_dir, 131 | pattern = "TextGrid$", 132 | full.names = TRUE, 133 | recursive = TRUE 134 | ) 135 | ``` 136 | 137 | We can use `purrr::map()`---*map* the `read_textgrid()` function over the 138 | `paths`---to read all these textgrids into R and combine them from a list to a 139 | single dataframe with `purrr::list_rbind()`. But note that this way doesn't 140 | track any speaker information. 141 | 142 | ```{r, R.options = list(tibble.width = 100)} 143 | library(purrr) 144 | 145 | paths |> 146 | map(read_textgrid) |> 147 | list_rbind() 148 | ``` 149 | 150 | By default, `read_textgrid()` uses the file basename (the file-path minus the 151 | directory part) for the `file` column. 
But we can manually set the `file` value. 152 | Here, we use `purrr::map2()` to map the `read_textgrid(path, file)` function 153 | over `path` and `file` pairs. Then we add the speaker information with 154 | some dataframe manipulation functions. 155 | 156 | ```{r, R.options = list(tibble.width = 100), message = FALSE, warning = FALSE} 157 | library(dplyr) 158 | 159 | # This tells read_textgrid() to set the file column to the full path 160 | data <- map2(paths, paths, read_textgrid) |> 161 | list_rbind() |> 162 | mutate( 163 | # basename() removes the folder part from a path, 164 | # dirname() removes the file part from a path 165 | speaker = basename(dirname(file)), 166 | file = basename(file), 167 | ) |> 168 | select( 169 | speaker, everything() 170 | ) 171 | 172 | data 173 | ``` 174 | 175 | Another strategy would be to read the textgrid dataframes into a list column and 176 | `tidyr::unnest()` them. 177 | 178 | ```{r} 179 | # Read dataframes into a list column 180 | data_nested <- tibble( 181 | speaker = basename(dirname(paths)), 182 | data = map(paths, read_textgrid) 183 | ) 184 | 185 | # We have one row per textgrid dataframe because `data` is a list column 186 | data_nested 187 | 188 | # promote the nested dataframes into the main dataframe 189 | tidyr::unnest(data_nested, "data") 190 | ``` 191 | 192 | 193 | 194 | 195 | ## Pivoting nested intervals in textgrids 196 | 197 | In the textgrids above, there is a natural nesting or hierarchy to the tiers. 198 | Intervals in the `words` tier contain intervals in the `phones` tier. It is often 199 | necessary to group intervals by their parent intervals (group phones by words). 200 | This package provides the `pivot_textgrid_tiers()` function to convert textgrids 201 | into a wide format in a way that respects the nesting/hierarchy of tiers. 
202 | 203 | ```{r} 204 | data_wide <- pivot_textgrid_tiers( 205 | data, 206 | tiers = c("words", "phones"), 207 | join_cols = c("speaker", "file") 208 | ) 209 | 210 | data_wide 211 | 212 | # more clearly 213 | data_wide |> 214 | select( 215 | speaker, file, words, phones, 216 | words_xmin, words_xmax, phones_xmin, phones_xmax 217 | ) 218 | ``` 219 | 220 | Some remarks: 221 | 222 | - Each tier in `tiers` becomes a batch of columns. For example, the rows for the 223 | `words` tier become the batch of columns `words` (the original `text` 224 | value), `words_xmin`, `words_xmax`, etc. 225 | - The columns in `join_cols` should uniquely identify a textgrid file, so the 226 | combination of `speaker` and `file` is needed in the case where different 227 | speakers have files with the same name. 228 | - The tier names in `tiers` should be given in the order of their nesting from 229 | outside to inside (e.g., `words` contain `phones`). Behind the scenes, 230 | `dplyr::left_join(..., relationship = "one-to-many")` is used to constrain 231 | how intervals are combined. 232 | 233 | This function also works on a single `tiers` value. In this case, the function 234 | returns just the intervals in that tier with the columns renamed and prefixed. 235 | 236 | ```{r} 237 | data |> 238 | pivot_textgrid_tiers( 239 | tiers = "words", 240 | join_cols = c("speaker", "file") 241 | ) 242 | ``` 243 | 244 | 245 | ## Speeding things up 246 | 247 | Do you have thousands of textgrids to read? The following workflow can speed 248 | things up. We are going to **read the textgrids in parallel**. Below are two 249 | approaches: 250 | 251 | - future backend and furrr frontend 252 | - mirai backend and purrr frontend 253 | 254 | The backend manages the parallel computation, and the frontend provides the 255 | syntax for calling a function with parallelism. 
256 | 257 | **Approach 1**: We tell future to use a `multisession` `plan` for parallelism, 258 | so the computations are done on separate R sessions in the background. The 259 | syntax is like the above purrr code, but we replace `map()` with `future_map()`. 260 | 261 | ```{r, warning = FALSE} 262 | library(future) 263 | library(furrr) 264 | plan(multisession, workers = 4) 265 | 266 | data_nested <- tibble( 267 | speaker = basename(dirname(paths)), 268 | data = future_map(paths, read_textgrid) 269 | ) 270 | ``` 271 | 272 | **Approach 2**: We have mirai set up 4 daemons (background processes), and then 273 | we use purrr's `in_parallel()` helper to signal to `map()` that the function 274 | should be run in parallel. We need to give *all* the information needed for the 275 | daemons to run the function, so we 1) provide a complete function definition 276 | (including `function(x) ...`) and 2) spell out the package namespace 277 | `readtextgrid::read_textgrid()`. 278 | 279 | ```{r, warning = FALSE} 280 | mirai::daemons(4) 281 | data_nested <- tibble( 282 | speaker = basename(dirname(paths)), 283 | data = map(paths, in_parallel(function(x) readtextgrid::read_textgrid(x))) 284 | ) 285 | mirai::daemons(0) 286 | ``` 287 | 288 | Another way to eke out performance is to set the encoding. By default, 289 | readtextgrid uses `readr::guess_encoding()` to determine the encoding of the 290 | textgrid before reading it in. But if you know the encoding beforehand, you can 291 | skip this guessing. In my limited testing, I found that **setting the encoding** 292 | could reduce benchmark times by 3--4% compared to guessing the encoding. 293 | 294 | Here, we read 100 textgrids using different approaches to benchmark the 295 | results. 
296 | 297 | ```{r} 298 | paths_bench <- withr::with_seed(1, sample(paths, 100, replace = TRUE)) 299 | 300 | mirai::daemons(4) 301 | bench::mark( 302 | lapply_guess = lapply(paths_bench, read_textgrid), 303 | lapply_set = lapply(paths_bench, read_textgrid, encoding = "UTF-8"), 304 | future_guess = future_map(paths_bench, read_textgrid), 305 | future_set = future_map(paths_bench, read_textgrid, encoding = "UTF-8"), 306 | mirai_guess = purrr::map( 307 | paths_bench, 308 | in_parallel(function(x) readtextgrid::read_textgrid(x)) 309 | ), 310 | mirai_set = purrr::map( 311 | paths_bench, 312 | in_parallel(function(x) readtextgrid::read_textgrid(x, encoding = "UTF-8")) 313 | ), 314 | check = TRUE 315 | ) 316 | mirai::daemons(0) 317 | ``` 318 | 319 | ## Legacy behavior and supported textgrid formats 320 | 321 | ```{r, include = FALSE} 322 | examples <- c( 323 | 'File type = "ooTextFile" 324 | Object class = "TextGrid" 325 | 326 | xmin = 0 327 | xmax = 1 328 | tiers? 329 | size = 1 330 | item []: 331 | item [1]: 332 | class = "IntervalTier" 333 | name = "Mary" 334 | xmin = 0 335 | xmax = 1 336 | intervals: size = 1 337 | intervals [1]: 338 | xmin = 0 339 | xmax = 1 340 | text = "" 341 | ', 342 | 343 | 'File type = "ooTextFile" 344 | Object class = "TextGrid" 345 | 346 | 0 347 | 1 348 | 349 | 1 350 | "IntervalTier" 351 | "Mary" 352 | 0 353 | 1 354 | 1 355 | 0 356 | 1 357 | "" 358 | ', 359 | 'File type = "ooTextFile" 360 | Object class = "TextGrid" 361 | 362 | ! info about the grid 363 | 0s 1s 1 364 | ! info about the tier 365 | "IntervalTier" "Mary" 0s 1s 1 ! type, name, xmin, xmax, size 366 | 0s 1s "" ! interval xmin, xmax, size 367 | ' 368 | ) 369 | # make sure these don't actually error 370 | example_tgs <- lapply(examples, read_textgrid_lines) 371 | ``` 372 | 373 | 374 | The original version of this package assumed the textgrid text files followed a 375 | "long" format with helpful labels and annotations. 
For example, in the following 376 | textgrid, each number has a label that makes it easy and fast to parse the 377 | textgrid with regular expressions: 378 | 379 | ```{r, echo = FALSE, comment = ""} 380 | writeLines(examples[1]) 381 | ``` 382 | 383 | The original version of the parser designed for this textgrid format is still 384 | provided with the `legacy_read_textgrid()` and `legacy_read_textgrid_lines()` 385 | functions. 386 | 387 | Version 0.2.0 of readtextgrid added a C++ based parser that can handle many more 388 | textgrid formats. For example, it can read "short" format textgrids like the 389 | following: 390 | 391 | ```{r, echo = FALSE, comment = ""} 392 | writeLines(examples[2]) 393 | ``` 394 | 395 | The "long" format textgrids are outputted in Praat with `Save > Save as text 396 | file...`, and the "short" format textgrids are outputted with `Save > Save as 397 | short text file...`. 398 | 399 | readtextgrid's parser can also handle [esoteric 400 | features](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) 401 | like comments (that start with `!`) or arbitrary text attached to a number, as 402 | in the following example: 403 | 404 | ```{r, echo = FALSE, comment = ""} 405 | writeLines(examples[3]) 406 | ``` 407 | 408 | Because the new parser uses C++ for tokenization---that is, the part that scans the 409 | contents character by character and determines whether the inputs are strings, 410 | numbers, or skipped---it is much faster than the legacy version. 
411 | 412 | ```{r} 413 | paths_bench <- withr::with_seed(2, sample(paths, 10, replace = TRUE)) 414 | 415 | bench::mark( 416 | current = lapply(paths_bench, read_textgrid), 417 | legacy = lapply(paths_bench, legacy_read_textgrid), 418 | min_iterations = 10, 419 | filter_gc = FALSE, 420 | check = TRUE 421 | ) 422 | ``` 423 | 424 | ## Other tips 425 | 426 | ### Helpful columns 427 | 428 | The following columns are often helpful: 429 | 430 | - `duration` of an interval 431 | - `xmid` midpoint of an interval 432 | - `total_annotations` total number of annotations on a tier 433 | 434 | Here is how to create them: 435 | 436 | ```{r} 437 | data |> 438 | # grouping needed for counting annotations per tier per file per speaker 439 | group_by(speaker, file, tier_num) |> 440 | mutate( 441 | duration = xmax - xmin, 442 | xmid = xmin + (xmax - xmin) / 2, 443 | total_annotations = sum(!is.na(annotation_num)) 444 | ) |> 445 | ungroup() |> 446 | glimpse() 447 | ``` 448 | 449 | 450 | ### Launching Praat 451 | 452 | *This tip is written from the perspective of a Windows user who uses git bash 453 | for a terminal*. 454 | 455 | To open textgrids in Praat, you can tell R to call Praat from 456 | the command line. You have to know the location of the Praat binary, 457 | though. I like to keep a copy in my project directories. So, assuming that 458 | Praat.exe is in my working folder, the following would open the 10 textgrids in 459 | `paths` in Praat. 460 | 461 | ```{r, eval = FALSE} 462 | system2( 463 | command = "./Praat.exe", 464 | args = c("--open", paths), 465 | wait = FALSE 466 | ) 467 | ``` 468 | 469 | 470 | ## Acknowledgments 471 | 472 | readtextgrid was created to process data from the [WISC Lab 473 | project](https://kidspeech.wisc.edu/). Thus, development of this package was 474 | supported by NIH R01DC009411 and NIH R01DC015653. 
475 | 476 | *** 477 | 478 | Please note that the 'readtextgrid' project is released with a 479 | [Contributor Code of Conduct](https://www.contributor-covenant.org/version/1/0/0/code-of-conduct.html). 480 | By contributing to this project, you agree to abide by its terms. 481 | -------------------------------------------------------------------------------- /vignettes/articles/textgrid-specification.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Textgrid specification" 3 | author: Tristan Mahr 4 | --- 5 | 6 | ```{r, include = FALSE} 7 | knitr::opts_chunk$set( 8 | collapse = TRUE, 9 | comment = "#>" 10 | ) 11 | 12 | examples <- c( 13 | 'File type = "ooTextFile" 14 | Object class = "TextGrid" 15 | 16 | xmin = 0 17 | xmax = 1 18 | tiers? 19 | size = 1 20 | item []: 21 | item [1]: 22 | class = "IntervalTier" 23 | name = "Mary" 24 | xmin = 0 25 | xmax = 1 26 | intervals: size = 1 27 | intervals [1]: 28 | xmin = 0 29 | xmax = 1 30 | text = "" 31 | ', 32 | 33 | 'File type = "ooTextFile" 34 | Object class = "TextGrid" 35 | 36 | 0 37 | 1 38 | 39 | 1 40 | "IntervalTier" 41 | "Mary" 42 | 0 43 | 1 44 | 1 45 | 0 46 | 1 47 | "" 48 | ', 49 | 'File type = "ooTextFile" 50 | Object class = "TextGrid" 51 | 52 | ! info about the grid 53 | 0s 1s 1 54 | ! info about the tier 55 | "IntervalTier" "Mary" 0s 1s 1 ! type, name, xmin, xmax, size 56 | 0s 1s "" ! interval xmin, xmax, size 57 | ' 58 | ) 59 | # make sure these don't actually error 60 | example_tgs <- lapply(examples, readtextgrid::read_textgrid_lines) 61 | stopifnot(length(unique(example_tgs)) == 1) 62 | ``` 63 | 64 | 65 | _This vignette documents the internal parsing logic of the **readtextgrid** 66 | package. 
It is intended for developers maintaining the parser or for developers 67 | in other languages, not for end users of the package._ 68 | 69 | In this article, I describe the specification of the `.TextGrid` file format used 70 | in this package, note how it differs from the documented specification provided 71 | by Praat, and provide a high-level overview of R code and a C++ translation that 72 | can parse `.TextGrid` files. 73 | 74 | ## Example `.TextGrid` file contents 75 | 76 | The `.TextGrid` file format used by Praat is very flexible. Below are three 77 | different `.TextGrid` files representing the same Praat textgrid. 78 | 79 | Long format: 80 | 81 | ```{r, echo = FALSE, comment = ""} 82 | writeLines(examples[1]) 83 | ``` 84 | 85 | Short format: 86 | 87 | ```{r, echo = FALSE, comment = ""} 88 | writeLines(examples[2]) 89 | ``` 90 | 91 | Custom format with comments and other noise: 92 | 93 | ```{r, echo = FALSE, comment = ""} 94 | writeLines(examples[3]) 95 | ``` 96 | 97 | readtextgrid can handle all three of these files in the same way because the 98 | Praat textgrid specification is simple---once you figure it out. I developed the 99 | readtextgrid specification by reading Praat's description of the format, testing 100 | various edge cases in the format and testing whether Praat would open the test 101 | file. If Praat could handle the file, it had to be supported by this package's 102 | textgrid parser. 103 | 104 | 105 | ## Package design 106 | 107 | To read in a `.TextGrid` file, we do the following: 108 | 109 | - read it in with the proper character encoding 110 | - tokenize the file contents from a sequence of characters into a list of 111 | Praat strings and Praat numbers 112 | - identify the start and end tokens of each textgrid tier 113 | - split those tokens up into batches of data and assemble dataframes 114 | 115 | This document concerns the tokenization step. 
The remaining parsing steps 116 | follow straightforward split-apply-combine programming in R. 117 | 118 | 119 | ## Documented `.TextGrid` file format specification 120 | 121 | First, let's start with [Paul Boersma's own 122 | description](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) 123 | of the file format. He notes that the long format contains several comments to 124 | help a person read the file, and that these are ultimately ignored by Praat. 125 | Instead, there are only a few important tokens: 126 | 127 | > Praat will consider as data only the following types of information in the 128 | > file: 129 | > 130 | > - free-standing numbers, such as `0` and `2.3` above, but not `[1]` or 131 | > `[3]`; 132 | > - free-standing text enclosed within double quotes, such as `"TextGrid"` and 133 | > `""` above; 134 | > - free-standing flags, such as `<exists>` above (this is the only flag that 135 | > appears in TextGrid files [...]). 136 | > 137 | > In this list, "free-standing" means that the number, text or flag is preceded 138 | > by the beginning of the file, the beginning of a line, or a space, and that it 139 | > is followed by the end of the file, the end of a line, or a space. 140 | 141 | He also mentions additional features of the format: 142 | 143 | - `!` comments: "everything that follows an exclamation mark on the same line 144 | is considered a comment". 145 | - `""` escapement by doubling: "a double quote that appears in a text 146 | [*i.e.*, a string] is written as a *doubled* double quote in the text 147 | file." 148 | - ignore the `<exists>` tokens anyway: "The flag `<exists>`, which tells us that 149 | this TextGrid contains tiers (this value would be `<absent>` if the TextGrid 150 | contained no tiers, in which case the file would end here; however, you 151 | cannot really create TextGrid objects without tiers in Praat, so this issue 152 | can be ignored)." 
153 | 154 | These details are mostly accurate and simple enough, but they don't specify 155 | what to do with `.1`, for example (Praat treats it as an error). 156 | 157 | 158 | ## Our specification of the `.TextGrid` file format 159 | 160 | After testing, I developed the following specification for this R package. 161 | 162 | - There are two kinds of tokens: strings and numbers. 163 | 164 | - **Strings** start and end with a `"`. If a string is supposed to have a 165 | double-quote character `"` inside of it, double the quote characters 166 | instead. The textgrid interval text *He said "hello" to me* would have the 167 | string `"He said ""hello"" to me"`. Everything inside of the `"` pair 168 | belongs to the string, even line breaks and comments. 169 | 170 | - A string is fully "free-standing". It should be preceded and followed by a 171 | space, newline, or the start or end of a file. `I said"Hello"` does not 172 | contain a string because there is no space before the `"` character. 173 | 174 | - **Numbers** start with a plus, minus or digit. Decimal, hexadecimal, and 175 | scientific notation are supported. Fractions are supported. A number ending 176 | with a `%` (a percentage) is divided by 100. Numbers use a `.` for the 177 | decimal point character. `.5` is not a number because it doesn't start with 178 | a plus, minus or digit. 179 | 180 | - A number is "left free-standing" (my terminology). It must be preceded by a 181 | space or newline. (Using the file start doesn't make sense for a boundary). 182 | From a valid start of a number, characters are read until the sequence of 183 | characters would no longer yield a number. Any additional characters until 184 | the next space, newline, or file boundary are ignored. In `100ms` and 185 | `+100e1ms`, for example, the final `ms` characters are ignored. 186 | 187 | - Praat does not support real numbers with a stranded exponent (`1e`). 
These 188 | kinds of numbers are an exception to the left-free-standing feature 189 | described earlier. 190 | 191 | - Everything else is a comment and ignored. I differentiate between two kinds 192 | of comments. This is my terminology, not Praat's. 193 | 194 | - **"Strong" comments** start with a `!` and end with a newline (`\n`). 195 | 196 | - **"Weak" comments** would be any token that does not start like a string or 197 | number. In the long format textgrid, `size = 1` would be two ignored weak 198 | comments (`size`, `=`) and a number (`1`). 199 | 200 | The allowance for characters on the right side of numbers is the major 201 | difference between the description of the Praat format and the one used in this 202 | package. 203 | 204 | 205 | 206 | 207 | 208 | ## Reference R implementation for textgrid tokenization 209 | 210 | Given a vector of characters from a Praat `.TextGrid` file, we want a list of 211 | strings and numbers contained in the file. For example, here are the characters 212 | from the short textgrid file and the output of the R-based tokenization: 213 | 214 | ```{r} 215 | tg_characters <- examples[2] |> 216 | strsplit("") |> 217 | unlist() 218 | 219 | tg_characters 220 | 221 | tg_characters |> 222 | readtextgrid:::r_tokenize_textgrid_chars() |> 223 | str() 224 | ``` 225 | 226 | Some comments about this function: 227 | 228 | - `r_tokenize_textgrid_chars()` is not an exported or supported function. That 229 | is why it needs to be accessed with the triple colon namespace operator 230 | `:::`. 231 | - The function was the intended implementation for the package until I 232 | converted the implementation to C++. I keep this R version around as a 233 | reference implementation for testing the current C++ implementation. 234 | - Don't use it. 235 | 236 | 237 | The big ideas in `r_tokenize_textgrid_chars()` are the following: 238 | 239 | - We have three special states: `in_strong_comment`, `in_string`, and 240 | `in_escaped_quote`. 
These determine how we interpret spaces, newlines, and 241 | `"` characters. When `in_strong_comment` is true, we skip the character 242 | iteration loop with `next` until we see a newline. When `in_escaped_quote` 243 | is true, we skip the next iteration of the loop (so that the second `"` of a doubled `""` does not close the string). 244 | When `in_string` is true, we keep collecting characters for the current 245 | token until we see a closing `"`. 246 | 247 | - When these states are all false *and* we see a space or newline, then we 248 | have the end of the current token. We extract the characters for the current 249 | token, combine them into a single value, check the value and keep it if it 250 | is a Praat string or Praat number. Then we reset the current token position 251 | and advance. 252 | 253 | Everything else is book-keeping to check for a special state or initialize a new 254 | token. 255 | 256 | The complete code is given below. It is fairly well-commented but you don't 257 | have to read it---just knowing the high-level details is sufficient. 258 | 259 | ```{r, eval = FALSE} 260 | function(all_char) { 261 | # The parser rules here follow the textgrid specifications 262 | # EXCEPT 263 | # when they contradict the behavior of Praat.exe. For example, the specs says 264 | # the main literals are freestanding strings and numbers, where freestanding 265 | # means that they have a whitespace or boundary (newline or file start/end). 266 | # But Praat.exe can handle numbers like "10.00!comment". So, this parser 267 | # gathers freestanding literals but only keeps ones that are strings or 268 | # start with a valid number (the non-numeric characters are lopped off.) 269 | 270 | in_strong_comment <- FALSE # Comment mode: ! 
to new line \n 271 | in_string <- FALSE # String mode: "Quote to quote" 272 | in_escaped_quote <- FALSE # Escaped quote: "" inside of a string 273 | 274 | token_start <- integer(0) # Start of current token 275 | values <- vector(mode = "list") # Collects completed values 276 | 277 | for (i in seq_along(all_char)) { 278 | cur_value_ready <- length(token_start) != 0 279 | c <- all_char[i] 280 | c_is_whitespace <- c %in% c(" ", "\n") 281 | c_starts_string <- c == "\"" 282 | 283 | # Comments start with ! and end with \n. Skip characters in this mode. 284 | if (!in_string & c == "!") { 285 | in_strong_comment <- TRUE 286 | next 287 | } 288 | if (in_strong_comment) { 289 | if (c == "\n") in_strong_comment <- FALSE 290 | next 291 | } 292 | 293 | # Whitespace delimits values so collect values if we see whitespace 294 | if (c_is_whitespace & !in_string) { 295 | # Skip whitespace if no values collected so far 296 | if (!cur_value_ready) next 297 | 298 | total_value <- all_char[seq(token_start, i - 1)] |> 299 | paste0(collapse = "") 300 | is_string <- all_char[token_start] == "\"" && all_char[i - 1] == "\"" 301 | 302 | # Collect only numbers and strings 303 | if (r_tg_parse_is_number(total_value)) { 304 | # Keep only the numeric part. 
305 | total_value <- total_value |> r_tg_parse_extract_number() 306 | values <- c(values, total_value) 307 | } else if (is_string) { 308 | values <- c(values, total_value) 309 | } 310 | token_start <- integer(0) 311 | next 312 | } 313 | 314 | # Store character if ending an escaped quote 315 | if (in_escaped_quote) { 316 | in_escaped_quote <- !in_escaped_quote 317 | next 318 | } 319 | 320 | # Start or close string mode if we see " 321 | if (c_starts_string) { 322 | # Check for "" escapes 323 | peek_c <- all_char[i + 1] 324 | if (peek_c == "\"" & in_string) { 325 | in_escaped_quote <- TRUE 326 | } else { 327 | in_string <- !in_string 328 | } 329 | } 330 | 331 | if (!cur_value_ready) { 332 | token_start <- i 333 | } 334 | } 335 | 336 | values |> 337 | lapply(r_tg_parse_convert_value) 338 | } 339 | ``` 340 | 341 | 342 | ## C++ implementation 343 | 344 | Given the simple nature of the R code and its relatively slow performance 345 | compared to the legacy version of the parser, I used ChatGPT to help convert 346 | the R code into a C++ implementation built on the cpp11 package. I tried to make 347 | sure I understood every line and made my own comments to help my understanding. 348 | 349 | The C++ code is a straightforward translation of the R version. For example, 350 | here is the part of the function that collects tokens when we see 351 | a space or newline: 352 | 353 | ```c++ 354 | if (!in_string && is_ws(b)) { 355 | if (have_token) { 356 | size_t start = tok_start_byte; 357 | size_t end = (curr_char_byte == 0 ? 0 : prev_char_byte); 358 | size_t len = (end >= start) ? 
(end - start + 1) : 0; 359 | if (len > 0) { 360 | // do we have a string (start and end with ") 361 | bool q = (static_cast(src[start]) == 0x22) && 362 | (static_cast(src[end]) == 0x22); 363 | tokens.push_back(src.substr(start, len)); 364 | tokens_is_string.push_back(q); 365 | } 366 | have_token = false; 367 | } 368 | continue; 369 | } 370 | ``` 371 | 372 | Some details are different: The C++ version extracts tokens with a substring 373 | (`.substr()`) method, delays checking whether the token is a number until 374 | later on, and accumulates results into lists (`tokens` and 375 | `tokens_is_string`). But the underlying logic is the same as the R version. 376 | 377 | The C++ function takes a single character value (one whole string) for the file 378 | contents and returns a list of the tokens in the file, whether each token is a 379 | Praat string, the number of characters of each token that form a number, and 380 | the value of that token's number: 381 | 382 | ```{r} 383 | examples[2] |> 384 | readtextgrid:::cpp_tg_scan_tokens() |> 385 | as.data.frame() 386 | ``` 387 | 388 | Before I had figured out how to parse numbers with C++, I originally was going 389 | to use R code on the `token` column to figure out whether each token is a legal 390 | number or not. That's why this function returns a list of vectors with 391 | information about the tokens. 392 | 393 | Back in the R layer, the final tokens are selected using really basic vector 394 | operations: 395 | 396 | ```{r} 397 | readtextgrid:::tokenize_textgrid 398 | ``` 399 | 400 | An important part of this function is the `withr::with_locale(c(LC_NUMERIC = 401 | "C"), ... )` call. We are setting the locale for numbers to the C locale which 402 | means that `.` is the decimal point character, and not a comma as in some 403 | locales. 404 | 405 | Parsing numbers is also handled by C++. 
I discovered that the standard 406 | library `strtod()` function does exactly what we need: 407 | 408 | > Interprets a floating-point value in a byte string pointed to by `str`. 409 | > 410 | > Function discards any whitespace characters (as determined by `isspace`) until 411 | > first non-whitespace character is found. Then it takes as many characters as 412 | > possible to form a valid floating-point representation and converts them to a 413 | > floating-point value. 414 | > 415 | > -- https://en.cppreference.com/w/c/string/byte/strtof 416 | 417 | We include some additional logic to make sure that `.4` is illegal and to output `NA_real_` for missing values, but otherwise, `strtod()` does the work for us. 418 | 419 | One consequence of this approach is that we can parse other kinds of numbers 420 | like hexadecimal with exponents. It turns out that Praat can also parse these 421 | numbers in a `.TextGrid` file. 422 | 423 | The number-parsing logic has its own function, so we can test how 424 | specific tokens are parsed: 425 | 426 | ```{r} 427 | test_nums <- c("+1.0", "000ms", "-2", "0xA", ".5", "+.0") 428 | 429 | as.data.frame(c( 430 | test_nums = list(test_nums), 431 | readtextgrid:::cpp_parse_praat_numbers(test_nums) 432 | )) 433 | ``` 434 | 435 | There are two limitations with the number parser used in this package: 436 | 437 | - We do not support fractions and percentages. (Praat does.) 438 | - We accept stranded exponents. (Praat does not.) 439 | 440 | ```{r} 441 | test_nums <- c("1e", "1E", "20/10", "1000%") 442 | expected <- c(NA_real_, NA_real_, 2.0, 10.0) 443 | 444 | as.data.frame(c( 445 | test_nums = list(test_nums), 446 | readtextgrid:::cpp_parse_praat_numbers(test_nums), 447 | expected_value = list(expected) 448 | )) 449 | ``` 450 | 451 | These are not high-priority limitations until we find a case where a software 452 | program writes out `.TextGrid` files that use these features. 
453 | 454 | ## Notes on testing 455 | 456 | The package's folder `tests/testthat/test-data` includes a series of `.TextGrid` 457 | files for testing the parsing functions. One of these, `hard-to-parse.TextGrid`, 458 | collects as many edge cases as I can imagine. 459 | 460 | The C++ implementation is tested against the legacy parser on easy long-format 461 | textgrid files and against the pure R implementation on other test textgrid 462 | files, including `hard-to-parse.TextGrid`. 463 | 464 | The folder `tests/testthat/test-data/praat-test` includes some tests of whether 465 | Praat can open a file or not. Files that fail to open start with `fail-` and 466 | files that open start with `okay-`. We support only the syntactic 467 | features in the `okay-` files. 468 | 469 | ## Notes on the Praat source code 470 | 471 | I did not rely on the Praat source code but I tried! The Praat source code has 472 | to read in all kinds of text files so there is not an obvious 473 | `read_textgrid()`-like function for parsing a `.TextGrid` file. Still, I was 474 | able to find how numbers are read in from a text file. 475 | 476 | The primitive data types of Praat are defined in the `Melder` folder. The 477 | `abcio.cpp` file has functions like `getReal()` for reading a float from text. 478 | `getReal()` calls the `Melder_a8tof()` function in `melder_atof.cpp` to convert 479 | strings into numbers, and this function in turn calls `findEndOfNumericString()` 480 | which processes numbers character by character. 
481 | 482 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # readtextgrid 5 | 6 | 7 | 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/readtextgrid)](https://CRAN.R-project.org/package=readtextgrid) 10 | [![readtextgrid status 11 | badge](https://tjmahr.r-universe.dev/readtextgrid/badges/version)](https://tjmahr.r-universe.dev/readtextgrid) 12 | [![R-CMD-check](https://github.com/tjmahr/readtextgrid/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tjmahr/readtextgrid/actions/workflows/R-CMD-check.yaml) 13 | 14 | 15 | readtextgrid parses [Praat](https://www.fon.hum.uva.nl/praat/) textgrids 16 | into tidy R dataframes. 17 | 18 | ## Features 19 | 20 | - **Simple**: Minimal package with two core functions (`read_textgrid()` 21 | and `read_textgrid_lines()`). 22 | - **Tidy**: Returns rectangular tibbles ready for downstream processing 23 | with dplyr and tidyr. 24 | - **Flexible**: Supports both long and short textgrid file formats. 25 | - **Fast**: Uses a compiled C++ tokenizer for high-throughput parsing. 26 | 27 | ## Installation 28 | 29 | Install readtextgrid from CRAN: 30 | 31 | ``` r 32 | install.packages("readtextgrid") 33 | ``` 34 | 35 | **Development version**. Install precompiled version of readtextgrid 36 | from R-universe: 37 | 38 | ``` r 39 | install.packages( 40 | "readtextgrid", 41 | repos = c("https://tjmahr.r-universe.dev", "https://cloud.r-project.org") 42 | ) 43 | ``` 44 | 45 | ## Basic usage 46 | 47 | Here is the example textgrid created by Praat. It was created using 48 | `New > Create TextGrid...` with default settings in Praat. 49 | 50 | Textgrid drawing from Praat with three tiers (Mary, John, and Bell) 51 | 52 | This textgrid is bundled with this R package. We can locate the file 53 | with `example_textgrid()`. We read in the textgrid with 54 | `read_textgrid()`. 
55 | 56 | ``` r 57 | library(readtextgrid) 58 | 59 | # Locates path to an example textgrid bundled with this package 60 | tg <- example_textgrid() 61 | 62 | read_textgrid(path = tg) 63 | #> # A tibble: 3 × 10 64 | #> file tier_num tier_name tier_type tier_xmin tier_xmax 65 | #> 66 | #> 1 Mary_John_bell.TextGrid 1 Mary IntervalTier 0 1 67 | #> 2 Mary_John_bell.TextGrid 2 John IntervalTier 0 1 68 | #> 3 Mary_John_bell.TextGrid 3 bell TextTier 0 1 69 | #> xmin xmax text annotation_num 70 | #> 71 | #> 1 0 1 "" 1 72 | #> 2 0 1 "" 1 73 | #> 3 NA NA NA 74 | ``` 75 | 76 | The dataframe contains one row per annotation: one row for each interval 77 | on an interval tier and one row for each point on a point tier. If a 78 | point tier has no points, it is represented with a single row with `NA` 79 | values. 80 | 81 | The columns encode the following information: 82 | 83 | - `file` filename of the textgrid. By default this column uses the 84 | filename in `path`. A user can override this value by setting the 85 | `file` argument in `read_textgrid(path, file)`, which can be useful if 86 | textgrids are stored in speaker-specific folders. 87 | - `tier_num` the number of the tier (as in the left margin of Praat’s 88 | textgrid editor) 89 | - `tier_name` the name of the tier (as in the right margin of Praat’s 90 | textgrid editor) 91 | - `tier_type` the type of the tier. `"IntervalTier"` for interval tiers 92 | and `"TextTier"` for point tiers (this is the terminology used inside 93 | of the textgrid file format). 94 | - `tier_xmin`, `tier_xmax` start and end times of the tier in seconds 95 | - `xmin`, `xmax` start and end times of the textgrid interval or point 96 | tier annotation in seconds 97 | - `text` the text in the annotation 98 | - `annotation_num` the number of the annotation in that tier (1 for the 99 | first annotation, etc.) 
100 | 101 | ## Reading in directories of textgrids 102 | 103 | Suppose we have data on multiple speakers with one folder of textgrids 104 | per speaker. As an example, this package has a folder called 105 | `speaker-data` bundled with it containing 5 textgrids from each of 2 106 | speakers. 107 | 108 | 📂 speaker-data 109 | ├── 📂 speaker001 110 | │ ├── s2T01.TextGrid 111 | │ ├── s2T02.TextGrid 112 | │ ├── s2T03.TextGrid 113 | │ ├── s2T04.TextGrid 114 | │ └── s2T05.TextGrid 115 | └── 📂 speaker002 116 | ├── s2T01.TextGrid 117 | ├── s2T02.TextGrid 118 | ├── s2T03.TextGrid 119 | ├── s2T04.TextGrid 120 | └── s2T05.TextGrid 121 | 122 | First, we create a vector of file-paths to read into R. 123 | 124 | ``` r 125 | # Get the path of the folder bundled with the package 126 | data_dir <- system.file(package = "readtextgrid", "speaker-data") 127 | 128 | # Get the full paths to all the textgrids 129 | paths <- list.files( 130 | path = data_dir, 131 | pattern = "TextGrid$", 132 | full.names = TRUE, 133 | recursive = TRUE 134 | ) 135 | ``` 136 | 137 | We can use `purrr::map()`—*map* the `read_textgrid()` function over the 138 | `paths`—to read all these textgrids into R and combine them from a list 139 | to a single dataframe with `purrr::list_rbind()`. But note that this way 140 | doesn’t track any speaker information. 
141 | 142 | ``` r 143 | library(purrr) 144 | 145 | paths |> 146 | map(read_textgrid) |> 147 | list_rbind() 148 | #> # A tibble: 150 × 10 149 | #> file tier_num tier_name tier_type tier_xmin tier_xmax xmin 150 | #> 151 | #> 1 s2T01.TextGrid 1 words IntervalTier 0 1.35 0 152 | #> 2 s2T01.TextGrid 1 words IntervalTier 0 1.35 0.297 153 | #> 3 s2T01.TextGrid 1 words IntervalTier 0 1.35 0.522 154 | #> 4 s2T01.TextGrid 1 words IntervalTier 0 1.35 0.972 155 | #> 5 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0 156 | #> 6 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0.297 157 | #> 7 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0.36 158 | #> 8 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0.495 159 | #> 9 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0.522 160 | #> 10 s2T01.TextGrid 2 phones IntervalTier 0 1.35 0.621 161 | #> xmax text annotation_num 162 | #> 163 | #> 1 0.297 "" 1 164 | #> 2 0.522 "bird" 2 165 | #> 3 0.972 "house" 3 166 | #> 4 1.35 "" 4 167 | #> 5 0.297 "sil" 1 168 | #> 6 0.36 "B" 2 169 | #> 7 0.495 "ER1" 3 170 | #> 8 0.522 "D" 4 171 | #> 9 0.621 "HH" 5 172 | #> 10 0.783 "AW1" 6 173 | #> # ℹ 140 more rows 174 | ``` 175 | 176 | By default, `read_textgrid()` uses the file basename (the file-path 177 | minus the directory part) for the `file` column. But we can manually set 178 | the `file` value. Here, we use `purrr::map2()` to map 179 | `read_textgrid(path, file)` over `path` and `file` pairs. Then we add 180 | the speaker information with some dataframe manipulation functions. 
181 | 182 | ``` r 183 | library(dplyr) 184 | 185 | # This tells read_textgrid() to set the file column to the full path 186 | data <- map2(paths, paths, read_textgrid) |> 187 | list_rbind() |> 188 | mutate( 189 | # basename() removes the folder part from a path, 190 | # dirname() removes the file part from a path 191 | speaker = basename(dirname(file)), 192 | file = basename(file), 193 | ) |> 194 | select( 195 | speaker, everything() 196 | ) 197 | 198 | data 199 | #> # A tibble: 150 × 11 200 | #> speaker file tier_num tier_name tier_type tier_xmin tier_xmax 201 | #> 202 | #> 1 speaker001 s2T01.TextGrid 1 words IntervalTier 0 1.35 203 | #> 2 speaker001 s2T01.TextGrid 1 words IntervalTier 0 1.35 204 | #> 3 speaker001 s2T01.TextGrid 1 words IntervalTier 0 1.35 205 | #> 4 speaker001 s2T01.TextGrid 1 words IntervalTier 0 1.35 206 | #> 5 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 207 | #> 6 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 208 | #> 7 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 209 | #> 8 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 210 | #> 9 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 211 | #> 10 speaker001 s2T01.TextGrid 2 phones IntervalTier 0 1.35 212 | #> xmin xmax text annotation_num 213 | #> 214 | #> 1 0 0.297 "" 1 215 | #> 2 0.297 0.522 "bird" 2 216 | #> 3 0.522 0.972 "house" 3 217 | #> 4 0.972 1.35 "" 4 218 | #> 5 0 0.297 "sil" 1 219 | #> 6 0.297 0.36 "B" 2 220 | #> 7 0.36 0.495 "ER1" 3 221 | #> 8 0.495 0.522 "D" 4 222 | #> 9 0.522 0.621 "HH" 5 223 | #> 10 0.621 0.783 "AW1" 6 224 | #> # ℹ 140 more rows 225 | ``` 226 | 227 | Another strategy would be to read the textgrid dataframes into a list 228 | column and `tidyr::unnest()` them. 
229 | 230 | ``` r 231 | # Read dataframes into a list column 232 | data_nested <- tibble( 233 | speaker = basename(dirname(paths)), 234 | data = map(paths, read_textgrid) 235 | ) 236 | 237 | # We have one row per textgrid dataframe because `data` is a list column 238 | data_nested 239 | #> # A tibble: 10 × 2 240 | #> speaker data 241 | #> 242 | #> 1 speaker001 243 | #> 2 speaker001 244 | #> 3 speaker001 245 | #> 4 speaker001 246 | #> 5 speaker001 247 | #> 6 speaker002 248 | #> 7 speaker002 249 | #> 8 speaker002 250 | #> 9 speaker002 251 | #> 10 speaker002 252 | 253 | # promote the nested dataframes into the main dataframe 254 | tidyr::unnest(data_nested, "data") 255 | #> # A tibble: 150 × 11 256 | #> speaker file tier_num tier_name tier_type tier_xmin tier_xmax xmin xmax 257 | #> 258 | #> 1 speaker001 s2T0… 1 words Interval… 0 1.35 0 0.297 259 | #> 2 speaker001 s2T0… 1 words Interval… 0 1.35 0.297 0.522 260 | #> 3 speaker001 s2T0… 1 words Interval… 0 1.35 0.522 0.972 261 | #> 4 speaker001 s2T0… 1 words Interval… 0 1.35 0.972 1.35 262 | #> 5 speaker001 s2T0… 2 phones Interval… 0 1.35 0 0.297 263 | #> 6 speaker001 s2T0… 2 phones Interval… 0 1.35 0.297 0.36 264 | #> 7 speaker001 s2T0… 2 phones Interval… 0 1.35 0.36 0.495 265 | #> 8 speaker001 s2T0… 2 phones Interval… 0 1.35 0.495 0.522 266 | #> 9 speaker001 s2T0… 2 phones Interval… 0 1.35 0.522 0.621 267 | #> 10 speaker001 s2T0… 2 phones Interval… 0 1.35 0.621 0.783 268 | #> # ℹ 140 more rows 269 | #> # ℹ 2 more variables: text , annotation_num 270 | ``` 271 | 272 | ## Pivoting nested intervals in textgrids 273 | 274 | In the textgrids above, there is a natural nesting or hierarchy to the 275 | tiers. Intervals in `words` tier contain intervals in the `phones` tier. 276 | It is often necessary to group intervals by their parent intervals 277 | (group phones by words). 
This package provides the 278 | `pivot_textgrid_tiers()` function to convert textgrids into a wide 279 | format in a way that respects the nesting/hierarchy of tiers. 280 | 281 | ``` r 282 | data_wide <- pivot_textgrid_tiers( 283 | data, 284 | tiers = c("words", "phones"), 285 | join_cols = c("speaker", "file") 286 | ) 287 | 288 | data_wide 289 | #> # A tibble: 108 × 18 290 | #> speaker file words words_xmin words_xmax words_xmid words_annotation_num 291 | #> 292 | #> 1 speaker001 s2T01… "" 0 0.297 0.149 1 293 | #> 2 speaker001 s2T01… "bir… 0.297 0.522 0.410 2 294 | #> 3 speaker001 s2T01… "bir… 0.297 0.522 0.410 2 295 | #> 4 speaker001 s2T01… "bir… 0.297 0.522 0.410 2 296 | #> 5 speaker001 s2T01… "hou… 0.522 0.972 0.747 3 297 | #> 6 speaker001 s2T01… "hou… 0.522 0.972 0.747 3 298 | #> 7 speaker001 s2T01… "hou… 0.522 0.972 0.747 3 299 | #> 8 speaker001 s2T01… "" 0.972 1.35 1.16 4 300 | #> 9 speaker001 s2T01… "" 0.972 1.35 1.16 4 301 | #> 10 speaker001 s2T02… "" 0 0.297 0.149 1 302 | #> # ℹ 98 more rows 303 | #> # ℹ 11 more variables: words_tier_num , words_tier_type , 304 | #> # tier_xmin , tier_xmax , phones , phones_xmin , 305 | #> # phones_xmax , phones_xmid , phones_annotation_num , 306 | #> # phones_tier_num , phones_tier_type 307 | 308 | # more clearly 309 | data_wide |> 310 | select( 311 | speaker, file, words, phones, 312 | words_xmin, words_xmax, phones_xmin, phones_xmax 313 | ) 314 | #> # A tibble: 108 × 8 315 | #> speaker file words phones words_xmin words_xmax phones_xmin phones_xmax 316 | #> 317 | #> 1 speaker001 s2T01.… "" "sil" 0 0.297 0 0.297 318 | #> 2 speaker001 s2T01.… "bir… "B" 0.297 0.522 0.297 0.36 319 | #> 3 speaker001 s2T01.… "bir… "ER1" 0.297 0.522 0.36 0.495 320 | #> 4 speaker001 s2T01.… "bir… "D" 0.297 0.522 0.495 0.522 321 | #> 5 speaker001 s2T01.… "hou… "HH" 0.522 0.972 0.522 0.621 322 | #> 6 speaker001 s2T01.… "hou… "AW1" 0.522 0.972 0.621 0.783 323 | #> 7 speaker001 s2T01.… "hou… "S" 0.522 0.972 0.783 0.972 324 | #> 8 speaker001 
s2T01.… "" "sp" 0.972 1.35 0.972 1.33 325 | #> 9 speaker001 s2T01.… "" "" 0.972 1.35 1.33 1.35 326 | #> 10 speaker001 s2T02.… "" "sil" 0 0.297 0 0.297 327 | #> # ℹ 98 more rows 328 | ``` 329 | 330 | Some remarks: 331 | 332 | - Each tier in `tiers` becomes a batch of columns. For example, the rows for the 333 | `words` tier become the batch of columns `words` (the original `text` 334 | value), `words_xmin`, `words_xmax`, etc. 335 | - The columns in `join_cols` should uniquely identify a textgrid file, 336 | so the combination of `speaker` and `file` is needed in the case where 337 | different speakers have the same filename. 338 | - The tier names in `tiers` should be given in the order of their 339 | nesting from outside to inside (e.g., `words` contain `phones`). 340 | Behind the scenes, 341 | `dplyr::left_join(..., relationship = "one-to-many")` is used to 342 | constrain how intervals are combined. 343 | 344 | This function also works on a single `tiers` value. In this case, the 345 | function returns just the intervals in that tier with the columns 346 | renamed and prefixed. 
347 | 348 | ``` r 349 | data |> 350 | pivot_textgrid_tiers( 351 | tiers = "words", 352 | join_cols = c("speaker", "file") 353 | ) 354 | #> # A tibble: 42 × 11 355 | #> speaker file words words_xmin words_xmax words_xmid words_annotation_num 356 | #> 357 | #> 1 speaker001 s2T01… "" 0 0.297 0.149 1 358 | #> 2 speaker001 s2T01… "bir… 0.297 0.522 0.410 2 359 | #> 3 speaker001 s2T01… "hou… 0.522 0.972 0.747 3 360 | #> 4 speaker001 s2T01… "" 0.972 1.35 1.16 4 361 | #> 5 speaker001 s2T02… "" 0 0.297 0.149 1 362 | #> 6 speaker001 s2T02… "cow… 0.297 0.702 0.500 2 363 | #> 7 speaker001 s2T02… "boo… 0.702 1.17 0.936 3 364 | #> 8 speaker001 s2T02… "" 1.17 1.59 1.38 4 365 | #> 9 speaker001 s2T03… "" 0 0.369 0.184 1 366 | #> 10 speaker001 s2T03… "hug" 0.369 0.657 0.513 2 367 | #> # ℹ 32 more rows 368 | #> # ℹ 4 more variables: words_tier_num , words_tier_type , 369 | #> # tier_xmin , tier_xmax 370 | ``` 371 | 372 | ## Speeding things up 373 | 374 | Do you have thousands of textgrids to read? The following workflow can 375 | speed things up. We are going to **read the textgrids in parallel**. 376 | Below are two approaches: 377 | 378 | - future backend and furrr frontend 379 | - mirai backend and purrr frontend 380 | 381 | The backend manages the parallel computation, and the frontend provides 382 | the syntax for calling a function with parallelism. 383 | 384 | **Approach 1**: We tell future to use a `multisession` `plan` for 385 | parallelism, so the computations are done on separate R sessions in the 386 | background. The syntax is like the above purrr code, but we replace 387 | `map()` with `future_map()`. 
388 | 389 | ``` r 390 | library(future) 391 | library(furrr) 392 | plan(multisession, workers = 4) 393 | 394 | data_nested <- tibble( 395 | speaker = basename(dirname(paths)), 396 | data = future_map(paths, read_textgrid) 397 | ) 398 | ``` 399 | 400 | **Approach 2**: We have mirai set up 4 daemons (background processes), 401 | and then we use purrr’s `in_parallel()` helper to signal to `map()` that 402 | the function should be run in parallel. We need to give *all* the 403 | information needed for the daemons to run the function, so we 1) provide 404 | a complete function definition (including `function(x) ...`) and 2) 405 | spell out the package namespace `readtextgrid::read_textgrid()`. 406 | 407 | ``` r 408 | mirai::daemons(4) 409 | data_nested <- tibble( 410 | speaker = basename(dirname(paths)), 411 | data = map(paths, in_parallel(function(x) readtextgrid::read_textgrid(x))) 412 | ) 413 | mirai::daemons(0) 414 | ``` 415 | 416 | Another way to eke out performance is to set the encoding. By default, 417 | readtextgrid uses `readr::guess_encoding()` to determine the encoding of 418 | the textgrid before reading it in. But if you know the encoding 419 | beforehand, you can skip this guessing. In my limited testing, I found 420 | that **setting the encoding** could reduce benchmark times by 3–4% 421 | compared to guessing the encoding. 422 | 423 | Here, we read 100 textgrids using different approaches to benchmark the 424 | results. 
425 | 426 | ``` r 427 | paths_bench <- withr::with_seed(1, sample(paths, 100, replace = TRUE)) 428 | 429 | mirai::daemons(4) 430 | bench::mark( 431 | lapply_guess = lapply(paths_bench, read_textgrid), 432 | lapply_set = lapply(paths_bench, read_textgrid, encoding = "UTF-8"), 433 | future_guess = future_map(paths_bench, read_textgrid), 434 | future_set = future_map(paths_bench, read_textgrid, encoding = "UTF-8"), 435 | mirai_guess = purrr::map( 436 | paths_bench, 437 | in_parallel(function(x) readtextgrid::read_textgrid(x)) 438 | ), 439 | mirai_set = purrr::map( 440 | paths_bench, 441 | in_parallel(function(x) readtextgrid::read_textgrid(x, encoding = "UTF-8")) 442 | ), 443 | check = TRUE 444 | ) 445 | #> Warning: Some expressions had a GC in every iteration; so filtering is 446 | #> disabled. 447 | #> # A tibble: 6 × 6 448 | #> expression min median `itr/sec` mem_alloc `gc/sec` 449 | #> 450 | #> 1 lapply_guess 1.17s 1.17s 0.852 13.32MB 5.96 451 | #> 2 lapply_set 883.69ms 883.69ms 1.13 5.41MB 6.79 452 | #> 3 future_guess 407.83ms 421.37ms 2.37 627.53KB 2.37 453 | #> 4 future_set 356.49ms 358.09ms 2.79 627.53KB 2.79 454 | #> 5 mirai_guess 315.85ms 338.82ms 2.95 1006.66KB 0 455 | #> 6 mirai_set 258.63ms 259.45ms 3.85 1006.66KB 0 456 | mirai::daemons(0) 457 | ``` 458 | 459 | ## Legacy behavior and supported textgrid formats 460 | 461 | The original version of this package assumed the textgrid text files 462 | followed a “long” format with helpful labels and annotations. For 463 | example, in the following textgrid, each number has a label that makes 464 | it easy and fast to parse the textgrid with regular expressions: 465 | 466 | File type = "ooTextFile" 467 | Object class = "TextGrid" 468 | 469 | xmin = 0 470 | xmax = 1 471 | tiers? <exists> 
472 | size = 1 473 | item []: 474 | item [1]: 475 | class = "IntervalTier" 476 | name = "Mary" 477 | xmin = 0 478 | xmax = 1 479 | intervals: size = 1 480 | intervals [1]: 481 | xmin = 0 482 | xmax = 1 483 | text = "" 484 | 485 | The original version of the parser designed for this textgrid format is 486 | still provided with the `legacy_read_textgrid()` and 487 | `legacy_read_textgrid_lines()` functions. 488 | 489 | Version 0.2.0 of readtextgrid added a C++ based parser that can handle 490 | many more textgrid formats. For example, it can read “short” format textgrids 491 | like the following: 492 | 493 | File type = "ooTextFile" 494 | Object class = "TextGrid" 495 | 496 | 0 497 | 1 498 | <exists> 499 | 1 500 | "IntervalTier" 501 | "Mary" 502 | 0 503 | 1 504 | 1 505 | 0 506 | 1 507 | "" 508 | 509 | The “long” format textgrids are outputted in Praat with 510 | `Save > Save as text file...`, and the “short” format textgrids are 511 | outputted with `Save > Save as short textfile...`. 512 | 513 | readtextgrid’s parser can also handle [esoteric 514 | features](https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html) 515 | like comments (that start with `!`) or arbitrary text attached to a 516 | number, as in the following example: 517 | 518 | File type = "ooTextFile" 519 | Object class = "TextGrid" 520 | 521 | ! info about the grid 522 | 0s 1s <exists> 1 523 | ! info about the tier 524 | "IntervalTier" "Mary" 0s 1s 1 ! type, name, xmin, xmax, size 525 | 0s 1s "" ! interval xmin, xmax, text 526 | 527 | Because the new parser uses C++ for tokenization—that is, the part that scans 528 | the contents character by character and determines whether the inputs 529 | are strings, numbers, or skipped—it is much faster than the legacy version. 
530 | 531 | ``` r 532 | paths_bench <- withr::with_seed(2, sample(paths, 10, replace = TRUE)) 533 | 534 | bench::mark( 535 | current = lapply(paths_bench, read_textgrid), 536 | legacy = lapply(paths_bench, legacy_read_textgrid), 537 | min_iterations = 10, 538 | filter_gc = FALSE, 539 | check = TRUE 540 | ) 541 | #> # A tibble: 2 × 6 542 | #> expression min median `itr/sec` mem_alloc `gc/sec` 543 | #> 544 | #> 1 current 114ms 123ms 8.11 1.31MB 4.86 545 | #> 2 legacy 332ms 342ms 2.89 19.57MB 6.06 546 | ``` 547 | 548 | ## Other tips 549 | 550 | ### Helpful columns 551 | 552 | The following columns are often helpful: 553 | 554 | - `duration` of an interval 555 | - `xmid` midpoint of an interval 556 | - `total_annotations` total number of annotations on a tier 557 | 558 | Here is how to create them: 559 | 560 | ``` r 561 | data |> 562 | # grouping needed for counting annotations per tier per file per speaker 563 | group_by(speaker, file, tier_num) |> 564 | mutate( 565 | duration = xmax - xmin, 566 | xmid = xmin + (xmax - xmin) / 2, 567 | total_annotations = sum(!is.na(annotation_num)) 568 | ) |> 569 | ungroup() |> 570 | glimpse() 571 | #> Rows: 150 572 | #> Columns: 14 573 | #> $ speaker "speaker001", "speaker001", "speaker001", "speaker00… 574 | #> $ file "s2T01.TextGrid", "s2T01.TextGrid", "s2T01.TextGrid"… 575 | #> $ tier_num 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2… 576 | #> $ tier_name "words", "words", "words", "words", "phones", "phone… 577 | #> $ tier_type "IntervalTier", "IntervalTier", "IntervalTier", "Int… 578 | #> $ tier_xmin 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0… 579 | #> $ tier_xmax 1.348571, 1.348571, 1.348571, 1.348571, 1.348571, 1.… 580 | #> $ xmin 0.000, 0.297, 0.522, 0.972, 0.000, 0.297, 0.360, 0.4… 581 | #> $ xmax 0.297000, 0.522000, 0.972000, 1.348571, 0.297000, 0.… 582 | #> $ text "", "bird", "house", "", "sil", "B", "ER1", "D", "HH… 583 | #> $ annotation_num 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 1… 584 | #> 
$ duration 0.29700000, 0.22500000, 0.45000000, 0.37657143, 0.29… 585 | #> $ xmid 0.148500, 0.409500, 0.747000, 1.160286, 0.148500, 0.… 586 | #> $ total_annotations 4, 4, 4, 4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 4, 4, 4, 4, 1… 587 | ``` 588 | 589 | ### Launching Praat 590 | 591 | *This tip is written from the perspective of a Windows user who uses git 592 | bash for a terminal*. 593 | 594 | To open textgrids in Praat, you can tell R to call Praat from the 595 | command line. You have to know where the Praat binary is located, 596 | though. I like to keep a copy in my project directories. So, assuming 597 | that Praat.exe is in my working folder, the following would open the 10 598 | textgrids in `paths` in Praat. 599 | 600 | ``` r 601 | system2( 602 | command = "./Praat.exe", 603 | args = c("--open", paths), 604 | wait = FALSE 605 | ) 606 | ``` 607 | 608 | ## Acknowledgments 609 | 610 | readtextgrid was created to process data from the [WISC Lab 611 | project](https://kidspeech.wisc.edu/). Thus, development of this package 612 | was supported by NIH R01DC009411 and NIH R01DC015653. 613 | 614 | ------------------------------------------------------------------------ 615 | 616 | Please note that the ‘readtextgrid’ project is released with a 617 | [Contributor Code of 618 | Conduct](https://www.contributor-covenant.org/version/1/0/0/code-of-conduct.html). 619 | By contributing to this project, you agree to abide by its terms. 620 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | GNU General Public License 2 | ========================== 3 | 4 | _Version 3, 29 June 2007_ 5 | _Copyright © 2007 Free Software Foundation, Inc. <>_ 6 | 7 | Everyone is permitted to copy and distribute verbatim copies of this license 8 | document, but changing it is not allowed. 
9 | 10 | ## Preamble 11 | 12 | The GNU General Public License is a free, copyleft license for software and other 13 | kinds of works. 14 | 15 | The licenses for most software and other practical works are designed to take away 16 | your freedom to share and change the works. By contrast, the GNU General Public 17 | License is intended to guarantee your freedom to share and change all versions of a 18 | program--to make sure it remains free software for all its users. We, the Free 19 | Software Foundation, use the GNU General Public License for most of our software; it 20 | applies also to any other work released this way by its authors. You can apply it to 21 | your programs, too. 22 | 23 | When we speak of free software, we are referring to freedom, not price. Our General 24 | Public Licenses are designed to make sure that you have the freedom to distribute 25 | copies of free software (and charge for them if you wish), that you receive source 26 | code or can get it if you want it, that you can change the software or use pieces of 27 | it in new free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you these rights or 30 | asking you to surrender the rights. Therefore, you have certain responsibilities if 31 | you distribute copies of the software, or if you modify it: responsibilities to 32 | respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether gratis or for a fee, 35 | you must pass on to the recipients the same freedoms that you received. You must make 36 | sure that they, too, receive or can get the source code. And you must show them these 37 | terms so they know their rights. 38 | 39 | Developers that use the GNU GPL protect your rights with two steps: **(1)** assert 40 | copyright on the software, and **(2)** offer you this License giving you legal permission 41 | to copy, distribute and/or modify it. 
42 | 43 | For the developers' and authors' protection, the GPL clearly explains that there is 44 | no warranty for this free software. For both users' and authors' sake, the GPL 45 | requires that modified versions be marked as changed, so that their problems will not 46 | be attributed erroneously to authors of previous versions. 47 | 48 | Some devices are designed to deny users access to install or run modified versions of 49 | the software inside them, although the manufacturer can do so. This is fundamentally 50 | incompatible with the aim of protecting users' freedom to change the software. The 51 | systematic pattern of such abuse occurs in the area of products for individuals to 52 | use, which is precisely where it is most unacceptable. Therefore, we have designed 53 | this version of the GPL to prohibit the practice for those products. If such problems 54 | arise substantially in other domains, we stand ready to extend this provision to 55 | those domains in future versions of the GPL, as needed to protect the freedom of 56 | users. 57 | 58 | Finally, every program is threatened constantly by software patents. States should 59 | not allow patents to restrict development and use of software on general-purpose 60 | computers, but in those that do, we wish to avoid the special danger that patents 61 | applied to a free program could make it effectively proprietary. To prevent this, the 62 | GPL assures that patents cannot be used to render the program non-free. 63 | 64 | The precise terms and conditions for copying, distribution and modification follow. 65 | 66 | ## TERMS AND CONDITIONS 67 | 68 | ### 0. Definitions 69 | 70 | “This License” refers to version 3 of the GNU General Public License. 71 | 72 | “Copyright” also means copyright-like laws that apply to other kinds of 73 | works, such as semiconductor masks. 74 | 75 | “The Program” refers to any copyrightable work licensed under this 76 | License. Each licensee is addressed as “you”. 
“Licensees” and 77 | “recipients” may be individuals or organizations. 78 | 79 | To “modify” a work means to copy from or adapt all or part of the work in 80 | a fashion requiring copyright permission, other than the making of an exact copy. The 81 | resulting work is called a “modified version” of the earlier work or a 82 | work “based on” the earlier work. 83 | 84 | A “covered work” means either the unmodified Program or a work based on 85 | the Program. 86 | 87 | To “propagate” a work means to do anything with it that, without 88 | permission, would make you directly or secondarily liable for infringement under 89 | applicable copyright law, except executing it on a computer or modifying a private 90 | copy. Propagation includes copying, distribution (with or without modification), 91 | making available to the public, and in some countries other activities as well. 92 | 93 | To “convey” a work means any kind of propagation that enables other 94 | parties to make or receive copies. Mere interaction with a user through a computer 95 | network, with no transfer of a copy, is not conveying. 96 | 97 | An interactive user interface displays “Appropriate Legal Notices” to the 98 | extent that it includes a convenient and prominently visible feature that **(1)** 99 | displays an appropriate copyright notice, and **(2)** tells the user that there is no 100 | warranty for the work (except to the extent that warranties are provided), that 101 | licensees may convey the work under this License, and how to view a copy of this 102 | License. If the interface presents a list of user commands or options, such as a 103 | menu, a prominent item in the list meets this criterion. 104 | 105 | ### 1. Source Code 106 | 107 | The “source code” for a work means the preferred form of the work for 108 | making modifications to it. “Object code” means any non-source form of a 109 | work. 
110 | 111 | A “Standard Interface” means an interface that either is an official 112 | standard defined by a recognized standards body, or, in the case of interfaces 113 | specified for a particular programming language, one that is widely used among 114 | developers working in that language. 115 | 116 | The “System Libraries” of an executable work include anything, other than 117 | the work as a whole, that **(a)** is included in the normal form of packaging a Major 118 | Component, but which is not part of that Major Component, and **(b)** serves only to 119 | enable use of the work with that Major Component, or to implement a Standard 120 | Interface for which an implementation is available to the public in source code form. 121 | A “Major Component”, in this context, means a major essential component 122 | (kernel, window system, and so on) of the specific operating system (if any) on which 123 | the executable work runs, or a compiler used to produce the work, or an object code 124 | interpreter used to run it. 125 | 126 | The “Corresponding Source” for a work in object code form means all the 127 | source code needed to generate, install, and (for an executable work) run the object 128 | code and to modify the work, including scripts to control those activities. However, 129 | it does not include the work's System Libraries, or general-purpose tools or 130 | generally available free programs which are used unmodified in performing those 131 | activities but which are not part of the work. For example, Corresponding Source 132 | includes interface definition files associated with source files for the work, and 133 | the source code for shared libraries and dynamically linked subprograms that the work 134 | is specifically designed to require, such as by intimate data communication or 135 | control flow between those subprograms and other parts of the work. 
136 | 137 | The Corresponding Source need not include anything that users can regenerate 138 | automatically from other parts of the Corresponding Source. 139 | 140 | The Corresponding Source for a work in source code form is that same work. 141 | 142 | ### 2. Basic Permissions 143 | 144 | All rights granted under this License are granted for the term of copyright on the 145 | Program, and are irrevocable provided the stated conditions are met. This License 146 | explicitly affirms your unlimited permission to run the unmodified Program. The 147 | output from running a covered work is covered by this License only if the output, 148 | given its content, constitutes a covered work. This License acknowledges your rights 149 | of fair use or other equivalent, as provided by copyright law. 150 | 151 | You may make, run and propagate covered works that you do not convey, without 152 | conditions so long as your license otherwise remains in force. You may convey covered 153 | works to others for the sole purpose of having them make modifications exclusively 154 | for you, or provide you with facilities for running those works, provided that you 155 | comply with the terms of this License in conveying all material for which you do not 156 | control copyright. Those thus making or running the covered works for you must do so 157 | exclusively on your behalf, under your direction and control, on terms that prohibit 158 | them from making any copies of your copyrighted material outside their relationship 159 | with you. 160 | 161 | Conveying under any other circumstances is permitted solely under the conditions 162 | stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 163 | 164 | ### 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law 165 | 166 | No covered work shall be deemed part of an effective technological measure under any 167 | applicable law fulfilling obligations under article 11 of the WIPO copyright treaty 168 | adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention 169 | of such measures. 170 | 171 | When you convey a covered work, you waive any legal power to forbid circumvention of 172 | technological measures to the extent such circumvention is effected by exercising 173 | rights under this License with respect to the covered work, and you disclaim any 174 | intention to limit operation or modification of the work as a means of enforcing, 175 | against the work's users, your or third parties' legal rights to forbid circumvention 176 | of technological measures. 177 | 178 | ### 4. Conveying Verbatim Copies 179 | 180 | You may convey verbatim copies of the Program's source code as you receive it, in any 181 | medium, provided that you conspicuously and appropriately publish on each copy an 182 | appropriate copyright notice; keep intact all notices stating that this License and 183 | any non-permissive terms added in accord with section 7 apply to the code; keep 184 | intact all notices of the absence of any warranty; and give all recipients a copy of 185 | this License along with the Program. 186 | 187 | You may charge any price or no price for each copy that you convey, and you may offer 188 | support or warranty protection for a fee. 189 | 190 | ### 5. Conveying Modified Source Versions 191 | 192 | You may convey a work based on the Program, or the modifications to produce it from 193 | the Program, in the form of source code under the terms of section 4, provided that 194 | you also meet all of these conditions: 195 | 196 | * **a)** The work must carry prominent notices stating that you modified it, and giving a 197 | relevant date. 
198 | * **b)** The work must carry prominent notices stating that it is released under this 199 | License and any conditions added under section 7. This requirement modifies the 200 | requirement in section 4 to “keep intact all notices”. 201 | * **c)** You must license the entire work, as a whole, under this License to anyone who 202 | comes into possession of a copy. This License will therefore apply, along with any 203 | applicable section 7 additional terms, to the whole of the work, and all its parts, 204 | regardless of how they are packaged. This License gives no permission to license the 205 | work in any other way, but it does not invalidate such permission if you have 206 | separately received it. 207 | * **d)** If the work has interactive user interfaces, each must display Appropriate Legal 208 | Notices; however, if the Program has interactive interfaces that do not display 209 | Appropriate Legal Notices, your work need not make them do so. 210 | 211 | A compilation of a covered work with other separate and independent works, which are 212 | not by their nature extensions of the covered work, and which are not combined with 213 | it such as to form a larger program, in or on a volume of a storage or distribution 214 | medium, is called an “aggregate” if the compilation and its resulting 215 | copyright are not used to limit the access or legal rights of the compilation's users 216 | beyond what the individual works permit. Inclusion of a covered work in an aggregate 217 | does not cause this License to apply to the other parts of the aggregate. 218 | 219 | ### 6. 
Conveying Non-Source Forms 220 | 221 | You may convey a covered work in object code form under the terms of sections 4 and 222 | 5, provided that you also convey the machine-readable Corresponding Source under the 223 | terms of this License, in one of these ways: 224 | 225 | * **a)** Convey the object code in, or embodied in, a physical product (including a 226 | physical distribution medium), accompanied by the Corresponding Source fixed on a 227 | durable physical medium customarily used for software interchange. 228 | * **b)** Convey the object code in, or embodied in, a physical product (including a 229 | physical distribution medium), accompanied by a written offer, valid for at least 230 | three years and valid for as long as you offer spare parts or customer support for 231 | that product model, to give anyone who possesses the object code either **(1)** a copy of 232 | the Corresponding Source for all the software in the product that is covered by this 233 | License, on a durable physical medium customarily used for software interchange, for 234 | a price no more than your reasonable cost of physically performing this conveying of 235 | source, or **(2)** access to copy the Corresponding Source from a network server at no 236 | charge. 237 | * **c)** Convey individual copies of the object code with a copy of the written offer to 238 | provide the Corresponding Source. This alternative is allowed only occasionally and 239 | noncommercially, and only if you received the object code with such an offer, in 240 | accord with subsection 6b. 241 | * **d)** Convey the object code by offering access from a designated place (gratis or for 242 | a charge), and offer equivalent access to the Corresponding Source in the same way 243 | through the same place at no further charge. You need not require recipients to copy 244 | the Corresponding Source along with the object code. 
If the place to copy the object 245 | code is a network server, the Corresponding Source may be on a different server 246 | (operated by you or a third party) that supports equivalent copying facilities, 247 | provided you maintain clear directions next to the object code saying where to find 248 | the Corresponding Source. Regardless of what server hosts the Corresponding Source, 249 | you remain obligated to ensure that it is available for as long as needed to satisfy 250 | these requirements. 251 | * **e)** Convey the object code using peer-to-peer transmission, provided you inform 252 | other peers where the object code and Corresponding Source of the work are being 253 | offered to the general public at no charge under subsection 6d. 254 | 255 | A separable portion of the object code, whose source code is excluded from the 256 | Corresponding Source as a System Library, need not be included in conveying the 257 | object code work. 258 | 259 | A “User Product” is either **(1)** a “consumer product”, which 260 | means any tangible personal property which is normally used for personal, family, or 261 | household purposes, or **(2)** anything designed or sold for incorporation into a 262 | dwelling. In determining whether a product is a consumer product, doubtful cases 263 | shall be resolved in favor of coverage. For a particular product received by a 264 | particular user, “normally used” refers to a typical or common use of 265 | that class of product, regardless of the status of the particular user or of the way 266 | in which the particular user actually uses, or expects or is expected to use, the 267 | product. A product is a consumer product regardless of whether the product has 268 | substantial commercial, industrial or non-consumer uses, unless such uses represent 269 | the only significant mode of use of the product. 
270 | 271 | “Installation Information” for a User Product means any methods, 272 | procedures, authorization keys, or other information required to install and execute 273 | modified versions of a covered work in that User Product from a modified version of 274 | its Corresponding Source. The information must suffice to ensure that the continued 275 | functioning of the modified object code is in no case prevented or interfered with 276 | solely because modification has been made. 277 | 278 | If you convey an object code work under this section in, or with, or specifically for 279 | use in, a User Product, and the conveying occurs as part of a transaction in which 280 | the right of possession and use of the User Product is transferred to the recipient 281 | in perpetuity or for a fixed term (regardless of how the transaction is 282 | characterized), the Corresponding Source conveyed under this section must be 283 | accompanied by the Installation Information. But this requirement does not apply if 284 | neither you nor any third party retains the ability to install modified object code 285 | on the User Product (for example, the work has been installed in ROM). 286 | 287 | The requirement to provide Installation Information does not include a requirement to 288 | continue to provide support service, warranty, or updates for a work that has been 289 | modified or installed by the recipient, or for the User Product in which it has been 290 | modified or installed. Access to a network may be denied when the modification itself 291 | materially and adversely affects the operation of the network or violates the rules 292 | and protocols for communication across the network. 
293 | 294 | Corresponding Source conveyed, and Installation Information provided, in accord with 295 | this section must be in a format that is publicly documented (and with an 296 | implementation available to the public in source code form), and must require no 297 | special password or key for unpacking, reading or copying. 298 | 299 | ### 7. Additional Terms 300 | 301 | “Additional permissions” are terms that supplement the terms of this 302 | License by making exceptions from one or more of its conditions. Additional 303 | permissions that are applicable to the entire Program shall be treated as though they 304 | were included in this License, to the extent that they are valid under applicable 305 | law. If additional permissions apply only to part of the Program, that part may be 306 | used separately under those permissions, but the entire Program remains governed by 307 | this License without regard to the additional permissions. 308 | 309 | When you convey a copy of a covered work, you may at your option remove any 310 | additional permissions from that copy, or from any part of it. (Additional 311 | permissions may be written to require their own removal in certain cases when you 312 | modify the work.) You may place additional permissions on material, added by you to a 313 | covered work, for which you have or can give appropriate copyright permission. 
314 | 315 | Notwithstanding any other provision of this License, for material you add to a 316 | covered work, you may (if authorized by the copyright holders of that material) 317 | supplement the terms of this License with terms: 318 | 319 | * **a)** Disclaiming warranty or limiting liability differently from the terms of 320 | sections 15 and 16 of this License; or 321 | * **b)** Requiring preservation of specified reasonable legal notices or author 322 | attributions in that material or in the Appropriate Legal Notices displayed by works 323 | containing it; or 324 | * **c)** Prohibiting misrepresentation of the origin of that material, or requiring that 325 | modified versions of such material be marked in reasonable ways as different from the 326 | original version; or 327 | * **d)** Limiting the use for publicity purposes of names of licensors or authors of the 328 | material; or 329 | * **e)** Declining to grant rights under trademark law for use of some trade names, 330 | trademarks, or service marks; or 331 | * **f)** Requiring indemnification of licensors and authors of that material by anyone 332 | who conveys the material (or modified versions of it) with contractual assumptions of 333 | liability to the recipient, for any liability that these contractual assumptions 334 | directly impose on those licensors and authors. 335 | 336 | All other non-permissive additional terms are considered “further 337 | restrictions” within the meaning of section 10. If the Program as you received 338 | it, or any part of it, contains a notice stating that it is governed by this License 339 | along with a term that is a further restriction, you may remove that term. If a 340 | license document contains a further restriction but permits relicensing or conveying 341 | under this License, you may add to a covered work material governed by the terms of 342 | that license document, provided that the further restriction does not survive such 343 | relicensing or conveying. 
344 | 345 | If you add terms to a covered work in accord with this section, you must place, in 346 | the relevant source files, a statement of the additional terms that apply to those 347 | files, or a notice indicating where to find the applicable terms. 348 | 349 | Additional terms, permissive or non-permissive, may be stated in the form of a 350 | separately written license, or stated as exceptions; the above requirements apply 351 | either way. 352 | 353 | ### 8. Termination 354 | 355 | You may not propagate or modify a covered work except as expressly provided under 356 | this License. Any attempt otherwise to propagate or modify it is void, and will 357 | automatically terminate your rights under this License (including any patent licenses 358 | granted under the third paragraph of section 11). 359 | 360 | However, if you cease all violation of this License, then your license from a 361 | particular copyright holder is reinstated **(a)** provisionally, unless and until the 362 | copyright holder explicitly and finally terminates your license, and **(b)** permanently, 363 | if the copyright holder fails to notify you of the violation by some reasonable means 364 | prior to 60 days after the cessation. 365 | 366 | Moreover, your license from a particular copyright holder is reinstated permanently 367 | if the copyright holder notifies you of the violation by some reasonable means, this 368 | is the first time you have received notice of violation of this License (for any 369 | work) from that copyright holder, and you cure the violation prior to 30 days after 370 | your receipt of the notice. 371 | 372 | Termination of your rights under this section does not terminate the licenses of 373 | parties who have received copies or rights from you under this License. If your 374 | rights have been terminated and not permanently reinstated, you do not qualify to 375 | receive new licenses for the same material under section 10. 376 | 377 | ### 9. 
Acceptance Not Required for Having Copies 378 | 379 | You are not required to accept this License in order to receive or run a copy of the 380 | Program. Ancillary propagation of a covered work occurring solely as a consequence of 381 | using peer-to-peer transmission to receive a copy likewise does not require 382 | acceptance. However, nothing other than this License grants you permission to 383 | propagate or modify any covered work. These actions infringe copyright if you do not 384 | accept this License. Therefore, by modifying or propagating a covered work, you 385 | indicate your acceptance of this License to do so. 386 | 387 | ### 10. Automatic Licensing of Downstream Recipients 388 | 389 | Each time you convey a covered work, the recipient automatically receives a license 390 | from the original licensors, to run, modify and propagate that work, subject to this 391 | License. You are not responsible for enforcing compliance by third parties with this 392 | License. 393 | 394 | An “entity transaction” is a transaction transferring control of an 395 | organization, or substantially all assets of one, or subdividing an organization, or 396 | merging organizations. If propagation of a covered work results from an entity 397 | transaction, each party to that transaction who receives a copy of the work also 398 | receives whatever licenses to the work the party's predecessor in interest had or 399 | could give under the previous paragraph, plus a right to possession of the 400 | Corresponding Source of the work from the predecessor in interest, if the predecessor 401 | has it or can get it with reasonable efforts. 402 | 403 | You may not impose any further restrictions on the exercise of the rights granted or 404 | affirmed under this License. 
For example, you may not impose a license fee, royalty, 405 | or other charge for exercise of rights granted under this License, and you may not 406 | initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging 407 | that any patent claim is infringed by making, using, selling, offering for sale, or 408 | importing the Program or any portion of it. 409 | 410 | ### 11. Patents 411 | 412 | A “contributor” is a copyright holder who authorizes use under this 413 | License of the Program or a work on which the Program is based. The work thus 414 | licensed is called the contributor's “contributor version”. 415 | 416 | A contributor's “essential patent claims” are all patent claims owned or 417 | controlled by the contributor, whether already acquired or hereafter acquired, that 418 | would be infringed by some manner, permitted by this License, of making, using, or 419 | selling its contributor version, but do not include claims that would be infringed 420 | only as a consequence of further modification of the contributor version. For 421 | purposes of this definition, “control” includes the right to grant patent 422 | sublicenses in a manner consistent with the requirements of this License. 423 | 424 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license 425 | under the contributor's essential patent claims, to make, use, sell, offer for sale, 426 | import and otherwise run, modify and propagate the contents of its contributor 427 | version. 428 | 429 | In the following three paragraphs, a “patent license” is any express 430 | agreement or commitment, however denominated, not to enforce a patent (such as an 431 | express permission to practice a patent or covenant not to sue for patent 432 | infringement). To “grant” such a patent license to a party means to make 433 | such an agreement or commitment not to enforce a patent against the party. 
434 | 435 | If you convey a covered work, knowingly relying on a patent license, and the 436 | Corresponding Source of the work is not available for anyone to copy, free of charge 437 | and under the terms of this License, through a publicly available network server or 438 | other readily accessible means, then you must either **(1)** cause the Corresponding 439 | Source to be so available, or **(2)** arrange to deprive yourself of the benefit of the 440 | patent license for this particular work, or **(3)** arrange, in a manner consistent with 441 | the requirements of this License, to extend the patent license to downstream 442 | recipients. “Knowingly relying” means you have actual knowledge that, but 443 | for the patent license, your conveying the covered work in a country, or your 444 | recipient's use of the covered work in a country, would infringe one or more 445 | identifiable patents in that country that you have reason to believe are valid. 446 | 447 | If, pursuant to or in connection with a single transaction or arrangement, you 448 | convey, or propagate by procuring conveyance of, a covered work, and grant a patent 449 | license to some of the parties receiving the covered work authorizing them to use, 450 | propagate, modify or convey a specific copy of the covered work, then the patent 451 | license you grant is automatically extended to all recipients of the covered work and 452 | works based on it. 453 | 454 | A patent license is “discriminatory” if it does not include within the 455 | scope of its coverage, prohibits the exercise of, or is conditioned on the 456 | non-exercise of one or more of the rights that are specifically granted under this 457 | License. 
You may not convey a covered work if you are a party to an arrangement with 458 | a third party that is in the business of distributing software, under which you make 459 | payment to the third party based on the extent of your activity of conveying the 460 | work, and under which the third party grants, to any of the parties who would receive 461 | the covered work from you, a discriminatory patent license **(a)** in connection with 462 | copies of the covered work conveyed by you (or copies made from those copies), or **(b)** 463 | primarily for and in connection with specific products or compilations that contain 464 | the covered work, unless you entered into that arrangement, or that patent license 465 | was granted, prior to 28 March 2007. 466 | 467 | Nothing in this License shall be construed as excluding or limiting any implied 468 | license or other defenses to infringement that may otherwise be available to you 469 | under applicable patent law. 470 | 471 | ### 12. No Surrender of Others' Freedom 472 | 473 | If conditions are imposed on you (whether by court order, agreement or otherwise) 474 | that contradict the conditions of this License, they do not excuse you from the 475 | conditions of this License. If you cannot convey a covered work so as to satisfy 476 | simultaneously your obligations under this License and any other pertinent 477 | obligations, then as a consequence you may not convey it at all. For example, if you 478 | agree to terms that obligate you to collect a royalty for further conveying from 479 | those to whom you convey the Program, the only way you could satisfy both those terms 480 | and this License would be to refrain entirely from conveying the Program. 481 | 482 | ### 13. 
Use with the GNU Affero General Public License 483 | 484 | Notwithstanding any other provision of this License, you have permission to link or 485 | combine any covered work with a work licensed under version 3 of the GNU Affero 486 | General Public License into a single combined work, and to convey the resulting work. 487 | The terms of this License will continue to apply to the part which is the covered 488 | work, but the special requirements of the GNU Affero General Public License, section 489 | 13, concerning interaction through a network will apply to the combination as such. 490 | 491 | ### 14. Revised Versions of this License 492 | 493 | The Free Software Foundation may publish revised and/or new versions of the GNU 494 | General Public License from time to time. Such new versions will be similar in spirit 495 | to the present version, but may differ in detail to address new problems or concerns. 496 | 497 | Each version is given a distinguishing version number. If the Program specifies that 498 | a certain numbered version of the GNU General Public License “or any later 499 | version” applies to it, you have the option of following the terms and 500 | conditions either of that numbered version or of any later version published by the 501 | Free Software Foundation. If the Program does not specify a version number of the GNU 502 | General Public License, you may choose any version ever published by the Free 503 | Software Foundation. 504 | 505 | If the Program specifies that a proxy can decide which future versions of the GNU 506 | General Public License can be used, that proxy's public statement of acceptance of a 507 | version permanently authorizes you to choose that version for the Program. 508 | 509 | Later license versions may give you additional or different permissions. However, no 510 | additional obligations are imposed on any author or copyright holder as a result of 511 | your choosing to follow a later version. 512 | 513 | ### 15. 
Disclaimer of Warranty 514 | 515 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 516 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 517 | PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER 518 | EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 519 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE 520 | QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE 521 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 522 | 523 | ### 16. Limitation of Liability 524 | 525 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY 526 | COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS 527 | PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, 528 | INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 529 | PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE 530 | OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE 531 | WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 532 | POSSIBILITY OF SUCH DAMAGES. 533 | 534 | ### 17. Interpretation of Sections 15 and 16 535 | 536 | If the disclaimer of warranty and limitation of liability provided above cannot be 537 | given local legal effect according to their terms, reviewing courts shall apply local 538 | law that most closely approximates an absolute waiver of all civil liability in 539 | connection with the Program, unless a warranty or assumption of liability accompanies 540 | a copy of the Program in return for a fee. 
541 | 542 | _END OF TERMS AND CONDITIONS_ 543 | 544 | ## How to Apply These Terms to Your New Programs 545 | 546 | If you develop a new program, and you want it to be of the greatest possible use to 547 | the public, the best way to achieve this is to make it free software which everyone 548 | can redistribute and change under these terms. 549 | 550 | To do so, attach the following notices to the program. It is safest to attach them 551 | to the start of each source file to most effectively state the exclusion of warranty; 552 | and each file should have at least the “copyright” line and a pointer to 553 | where the full notice is found. 554 | 555 | <one line to give the program's name and a brief idea of what it does.> 556 | Copyright (C) 2019 Tristan Mahr 557 | 558 | This program is free software: you can redistribute it and/or modify 559 | it under the terms of the GNU General Public License as published by 560 | the Free Software Foundation, either version 3 of the License, or 561 | (at your option) any later version. 562 | 563 | This program is distributed in the hope that it will be useful, 564 | but WITHOUT ANY WARRANTY; without even the implied warranty of 565 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 566 | GNU General Public License for more details. 567 | 568 | You should have received a copy of the GNU General Public License 569 | along with this program. If not, see <https://www.gnu.org/licenses/>. 570 | 571 | Also add information on how to contact you by electronic and paper mail. 572 | 573 | If the program does terminal interaction, make it output a short notice like this 574 | when it starts in an interactive mode: 575 | 576 | readtextgrid Copyright (C) 2019 Tristan Mahr 577 | This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'. 578 | This is free software, and you are welcome to redistribute it 579 | under certain conditions; type 'show c' for details. 580 | 581 | The hypothetical commands `show w` and `show c` should show the appropriate parts of 582 | the General Public License.
Of course, your program's commands might be different; 583 | for a GUI interface, you would use an “about box”. 584 | 585 | You should also get your employer (if you work as a programmer) or school, if any, to 586 | sign a “copyright disclaimer” for the program, if necessary. For more 587 | information on this, and how to apply and follow the GNU GPL, see 588 | <https://www.gnu.org/licenses/>. 589 | 590 | The GNU General Public License does not permit incorporating your program into 591 | proprietary programs. If your program is a subroutine library, you may consider it 592 | more useful to permit linking proprietary applications with the library. If this is 593 | what you want to do, use the GNU Lesser General Public License instead of this 594 | License. But first, please read 595 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 596 | --------------------------------------------------------------------------------