├── .editorconfig ├── .github └── workflows │ └── build.yaml ├── .gitignore ├── AUTHORS ├── CMakeLists.txt ├── COPYING ├── README.md ├── autogen.sh ├── cmake.sh ├── configure ├── lttoolbox.pc.in ├── lttoolbox ├── CMakeLists.txt ├── acx.cc ├── acx.h ├── acx.rng ├── alphabet.cc ├── alphabet.h ├── att_compiler.cc ├── att_compiler.h ├── buffer.h ├── check-cstdint.cc ├── cli.cc ├── cli.h ├── compiler.cc ├── compiler.h ├── compression.cc ├── compression.h ├── deserialiser.h ├── dix.dtd ├── dix.rnc ├── dix.rng ├── entry_token.cc ├── entry_token.h ├── exception.h ├── expander.cc ├── expander.h ├── file_utils.cc ├── file_utils.h ├── fst_processor.cc ├── fst_processor.h ├── icx.rng ├── input_file.cc ├── input_file.h ├── lt-append.1 ├── lt-comp.1 ├── lt-compose.1 ├── lt-expand.1 ├── lt-merge.1 ├── lt-paradigm.1 ├── lt-print.1 ├── lt-proc.1 ├── lt-tmxcomp.1 ├── lt-tmxproc.1 ├── lt-trim.1 ├── lt_append.cc ├── lt_apply_acx.cc ├── lt_comp.cc ├── lt_compose.cc ├── lt_expand.cc ├── lt_invert.cc ├── lt_locale.cc ├── lt_locale.h ├── lt_merge.cc ├── lt_paradigm.cc ├── lt_print.cc ├── lt_proc.cc ├── lt_restrict.cc ├── lt_tmxcomp.cc ├── lt_tmxproc.cc ├── lt_trim.cc ├── match_exe.cc ├── match_exe.h ├── match_node.cc ├── match_node.h ├── match_state.cc ├── match_state.h ├── my_stdio.h ├── node.cc ├── node.h ├── pattern_list.cc ├── pattern_list.h ├── pool.h ├── rcx.rng ├── regexp_compiler.cc ├── regexp_compiler.h ├── serialiser.h ├── sorted_vector.cc ├── sorted_vector.h ├── sorted_vector.hpp ├── state.cc ├── state.h ├── stream_reader.cc ├── stream_reader.h ├── string_utils.cc ├── string_utils.h ├── symbol_iter.cc ├── symbol_iter.h ├── tmx_compiler.cc ├── tmx_compiler.h ├── trans_exe.cc ├── trans_exe.h ├── transducer.cc ├── transducer.h ├── ustring.cc ├── ustring.h ├── win32 │ ├── libgen.c │ ├── libgen.h │ ├── regex.c │ ├── regex.h │ └── unistd.h ├── xml_parse_util.cc ├── xml_parse_util.h ├── xml_walk_util.cc ├── xml_walk_util.h └── xsd │ ├── acx.xsd │ └── dix.xsd ├── python ├── 
CMakeLists.txt ├── lttoolbox.i.in └── setup.py.in └── tests ├── README ├── basictest.py ├── data ├── a2b.dix ├── alphabet.att ├── alphabetic-after-group-bi.dix ├── alphabetic-after-group-mono.dix ├── apostrophe.att ├── append1.dix ├── append2.dix ├── arabic-punct.att ├── baregroup-mono.dix ├── basic.acx ├── basic.lsx ├── bidix-epsilons-bi.dix ├── bidix-epsilons-mono.dix ├── bidixpardef-bi.dix ├── bidixpardef-mono.dix ├── big-mono.dix ├── biproc-skips-tags-mono.dix ├── cat-epsilon-loop.att ├── cat-epsilon-to-final.att ├── cat-multiple-fst.att ├── cat-weight-final.att ├── cat-weight-heavy.att ├── cat-weight-initial.att ├── cat-weight-middle.att ├── cat-weight-negative.att ├── cat-weight.att ├── cmp-bi.dix ├── cmp-mono.dix ├── compose1.dix ├── diverging-paths-bi.dix ├── diverging-paths-mono.dix ├── double-clitics-bi.dix ├── double-clitics-mono.dix ├── empty-bi.dix ├── empty-mono.dix ├── entirely-empty.dix ├── entry-weights.dix ├── expand-re.dix ├── final-epsilons-bi.dix ├── final-epsilons-mono.dix ├── gardenpath-mwe.dix ├── group-after-join-bi.dix ├── group-after-join-mono.dix ├── group-bi.dix ├── group-mono.dix ├── intergen.dix ├── left-unbalanced-epsilons-bi.dix ├── left-unbalanced-epsilons-mono.dix ├── lemma-entry-weights.dix ├── lhs-empty-mono.dix ├── lhs-ws-mono.dix ├── longleft-bi.dix ├── longleft-mono.dix ├── merging-paths-bi.dix ├── merging-paths-mono.dix ├── minimal-bi.dix ├── minimal-mono.dix ├── morpheme-boundaries.dix ├── multichar.att ├── non-bmp.att ├── non-bmp.dix ├── numbers.tmx ├── oci-pgen.dix ├── pass-through.lsx ├── plus-lemma-bi.dix ├── plus-lemma-mono.dix ├── postgen-overlap.dix ├── postgen-short.dix ├── postgen.dix ├── pp2p.dix ├── rhs-empty-mono.dix ├── rhs-ws-mono.dix ├── sectiondupes.dix ├── sections.dix ├── simple.tmx ├── slash-tags.dix ├── space-eof-incond.dix ├── spcmp.dix ├── unbalanced-epsilons-bi.dix ├── unbalanced-epsilons-mono.dix ├── underscore.dix ├── upp2up.dix ├── variants.dix ├── walk-weight.att └── wordbound-blank.dix ├── 
lt_append └── __init__.py ├── lt_apply_acx └── __init__.py ├── lt_comp └── __init__.py ├── lt_compose └── __init__.py ├── lt_expand └── __init__.py ├── lt_merge └── __init__.py ├── lt_paradigm └── __init__.py ├── lt_print └── __init__.py ├── lt_proc ├── __init__.py └── null_flush_invalid_stream_format.py ├── lt_tmxproc └── __init__.py ├── lt_trim └── __init__.py └── run_tests.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org/ 2 | root = true 3 | 4 | [*] 5 | charset = utf-8 6 | end_of_line = lf 7 | indent_size = 4 8 | indent_style = tab 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | 12 | [**.cc] 13 | indent_size = 2 14 | indent_style = space 15 | 16 | [**.h] 17 | indent_size = 2 18 | indent_style = space 19 | 20 | [**.py] 21 | indent_size = 4 22 | indent_style = space 23 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Lttoolbox CI Build 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: dependencies 11 | run: | 12 | sudo apt-get -qy update 13 | sudo apt-get -qfy install --no-install-recommends build-essential cmake pkg-config libutfcpp-dev libxml2-dev libxml2-utils python3-dev python3-setuptools swig 14 | - name: configure 15 | run: cmake -DENABLE_PYTHON_BINDINGS=ON . 
16 | - name: build 17 | run: make -j4 V=1 VERBOSE=1 18 | - name: tests 19 | run: make test 20 | - name: make install 21 | run: sudo make install 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /build*/ 3 | /.ccls-cache/ 4 | /*.pc 5 | Makefile 6 | CMakeCache.txt 7 | CMakeFiles 8 | CTestTestfile.cmake 9 | cmake_install.cmake 10 | install_manifest.txt 11 | 12 | # Prerequisites 13 | *.d 14 | 15 | # Compiled Object files 16 | *.slo 17 | *.lo 18 | *.o 19 | *.obj 20 | 21 | # Precompiled Headers 22 | *.gch 23 | *.pch 24 | 25 | # Compiled Dynamic libraries 26 | *.so 27 | *.dylib 28 | *.dll 29 | lttoolbox/liblttoolbox.so.* 30 | 31 | # Fortran module files 32 | *.mod 33 | *.smod 34 | 35 | # Compiled Static libraries 36 | *.lai 37 | *.la 38 | *.a 39 | *.lib 40 | 41 | # Executables 42 | *.exe 43 | *.out 44 | *.app 45 | 46 | /compile_commands.json 47 | /lttoolbox/lt-comp 48 | /lttoolbox/lt-compose 49 | /lttoolbox/lt-proc 50 | /lttoolbox/lt-merge 51 | /lttoolbox/lt-trim 52 | /lttoolbox/Makefile 53 | /lttoolbox/Makefile.in 54 | /lttoolbox/lt-tmxcomp 55 | /lttoolbox/lt-print 56 | /lttoolbox/stamp-h1 57 | /lttoolbox/lttoolbox_config.h.in 58 | /lttoolbox/lttoolbox_config.h 59 | /lttoolbox/lt-tmxproc 60 | /lttoolbox/lt-expand 61 | /lttoolbox/lt-append 62 | /lttoolbox/lsx-comp 63 | /lttoolbox/lsx-comp.1 64 | /lttoolbox/lt-paradigm 65 | /lttoolbox/lt-invert 66 | /lttoolbox/lt-restrict 67 | /lttoolbox/lt-apply-acx 68 | /python/Makefile 69 | /python/Makefile.in 70 | /python/lttoolbox.i 71 | /python/lttoolbox_wrap.cpp 72 | /python/lttoolbox.py 73 | /python/setup.py 74 | /python/build* 75 | *.egg-info/ 76 | *.egg 77 | **/.mypy_cache/ 78 | *~ 79 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | (c) 2005 Universitat 
d'Alacant / Universidad de Alicante. 2 | (c) 2007-2008 Prompsit Language Engineering S.L. 3 | 4 | 2007-2020, Francis M. Tyers 5 | 2009-2020, Kevin Brubeck Unhammer 6 | 2015-2020, Tino Didriksen 7 | 2019-2020, Daniel Swanson 8 | 2020, Tanmai Khanna 9 | 2018-2019, Xavi Ivars 10 | 2019, Amr Keleg 11 | 2019, Bruno Baruffaldi 12 | 2019, Lokendra Singh 13 | 2019, Marc Riera Irigoyen 14 | 2019, Tommi A Pirinen 15 | 2018, Abinash Senapati 16 | 2018, Anthony J. Bentley 17 | 2018, Flammie Pirinen 18 | 2018, Kartik Mistry 19 | 2018, Sushain Cherivirala 20 | 2008-2017, Jim O'Regan 21 | 2017, Himanshu Sekhar Nayak 22 | 2017, Tommi Pirinen 23 | 2016, Frankie Robertson 24 | 2014-2015, Hrvoje Peradin 25 | 2007-2013, Sergio Ortiz Rojas 26 | 2011, Pim Otte 27 | 2011, Sjur Nørstebø Moshagen 28 | 2010, Trond Trosterud 29 | 2009, Pasquale Minervini 30 | 2008, Felipe Sánchez Martínez 31 | 2008, Jacob Nordfalk 32 | 2008, Wynand Winterbach 33 | 2007, Stephen Paulger 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | lttoolbox 4 | =============================================================================== 5 | 6 | lttoolbox contains finite state tools for lexical processing, 7 | morphological analysis and generation of words. Analysis is the 8 | process of splitting a word like `cats` into its lemma `cat` and the 9 | grammatical information `<n><pl>`. Generation is the opposite process. 10 | 11 | The three main programs are lt-comp, the compiler, lt-proc, the 12 | processor, and lt-expand, which generates all possible mappings 13 | between surface forms and lexical forms in the dictionary. 14 | 15 | Executables built by this package: 16 | 17 | * `lt-comp`: compiler, execute without parameters to show usage 18 | instructions. 
19 | 20 | * `lt-proc`: processor, typical options are -a (lexical analyser, 21 | default option), -g (lexical generator) and -p (lexical 22 | post-generator). Using -h will show all flags. 23 | 24 | * `lt-expand`: generates all the pairs of transductions of a given 25 | dictionary. Execute without parameters to show the instructions of 26 | use. 27 | 28 | * `lt-trim`: trims a compiled analyser to only contain entries which 29 | would pass through a compiled bidix, creating a new compiled and 30 | trimmed analyser. 31 | 32 | * `lt-compose`: composes two compiled transducers (applying output of 33 | the first to input of the second), with support for flipping labels 34 | and allowing incomplete matches. 35 | 36 | * `lt-print`: prints the arcs of a transducer in [ATT format][3]. 37 | 38 | * `lt-append`: merges two compiled dictionaries. 39 | 40 | * `lt-paradigm`: extracts all paths from a compiled dictionary 41 | matching an input pattern. 42 | 43 | * `lsx-comp`: an alias of `lt-comp`. 44 | 45 | There is also a C++ API that you can link to (see how [apertium][1] or 46 | [apertium-lex-tools][2] do this). 47 | 48 | See https://wiki.apertium.org/wiki/Lttoolbox for usage examples and 49 | more information. 50 | 51 | Installation 52 | =============================================================================== 53 | 54 | There are binaries available for Debian, Ubuntu, Fedora, CentOS, OpenSUSE, 55 | Windows, and macOS. We package both nightly builds and releases. 56 | See https://wiki.apertium.org/wiki/Installation for more information. 57 | Only build from source if you either want to change this tool's behavior, 58 | or are on a platform we don't yet package for. 
59 | 60 | Requirements: 61 | 62 | * A C++ compiler capable of C++17 63 | * CMake >= 3.12 64 | * libxml2 >= 2.6.17 65 | * ICU 66 | * utfcpp 67 | 68 | Building & installing: 69 | 70 | * ./cmake.sh 71 | * make 72 | * make install 73 | 74 | [1]: https://github.com/apertium/apertium 75 | [2]: https://github.com/apertium/apertium-lex-tools 76 | [3]: https://wiki.apertium.org/wiki/ATT_format 77 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | cmake.sh -------------------------------------------------------------------------------- /cmake.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | args=() 4 | 5 | while [[ $# -gt 0 ]]; 6 | do 7 | case "$1" in 8 | --prefix) 9 | args+=("-DCMAKE_INSTALL_PREFIX=$2") 10 | shift 2 11 | ;; 12 | --prefix=*) 13 | args+=("-DCMAKE_INSTALL_PREFIX=${1#*=}") 14 | shift 15 | ;; 16 | --enable-python-bindings) 17 | args+=("-DENABLE_PYTHON_BINDINGS=ON") 18 | shift 19 | ;; 20 | *) 21 | args+=("$1") 22 | shift 23 | ;; 24 | esac 25 | done 26 | 27 | set -- "${args[@]}" 28 | 29 | D=$(dirname "$0") 30 | 31 | echo "- rm -rf CMake caches" 32 | rm -rf install_manifest.txt CMakeCache.txt *.cmake CMakeFiles [sp]*/CMakeFiles [sp]*/*.cmake _CPack_Packages Testing 33 | echo "- cmake " "$@" "$D" 34 | cmake "$@" "$D" 35 | echo "- You may now perform: make -j8" 36 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | cmake.sh -------------------------------------------------------------------------------- /lttoolbox.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | 6 | Name: lttoolbox 7 | Description: 
Augmented letter transducer tools for natural language processing 8 | Version: @VERSION@ 9 | Cflags: -I${includedir} 10 | Libs: -L${libdir} -l@PACKAGE_NAME@ 11 | -------------------------------------------------------------------------------- /lttoolbox/acx.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #include 18 | #include 19 | 20 | const xmlChar* CHAR_NODE = (const xmlChar*)"char"; 21 | const xmlChar* EQUIV_NODE = (const xmlChar*)"equiv-char"; 22 | const char* VALUE_ATTR = "value"; 23 | 24 | int32_t get_val(xmlNode* node) 25 | { 26 | UString s = getattr(node, VALUE_ATTR); 27 | if (s.empty()) { 28 | error_and_die(node, "Missing value attribute."); 29 | } 30 | std::vector v; 31 | ustring_to_vec32(s, v); 32 | if (v.size() > 1) { 33 | error_and_die(node, "Expected a single character in value attribute, but found %d.", v.size()); 34 | } 35 | return v[0]; 36 | } 37 | 38 | std::map> readACX(const char* file) 39 | { 40 | std::map> acx; 41 | xmlNode* top_node = load_xml(file); 42 | for (auto char_node : children(top_node)) { 43 | if (!xmlStrEqual(char_node->name, CHAR_NODE)) { 44 | error_and_die(char_node, "Expected but found <%s>.", 45 | (const char*)char_node->name); 46 | } 47 | int32_t key = get_val(char_node); 48 | sorted_vector vec; 49 | for (auto equiv_node : children(char_node)) { 50 | if (!xmlStrEqual(equiv_node->name, EQUIV_NODE)) { 51 | error_and_die(char_node, "Expected but found <%s>.", 52 | (const char*)equiv_node->name); 53 | } 54 | vec.insert(get_val(equiv_node)); 55 | } 56 | if (!vec.empty()) { 57 | acx.insert({key, vec}); 58 | } 59 | } 60 | return acx; 61 | } 62 | -------------------------------------------------------------------------------- /lttoolbox/acx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #ifndef _ACXPARSEUTIL_ 18 | #define _ACXPARSEUTIL_ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | std::map> readACX(const char* file); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /lttoolbox/acx.rng: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 1 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /lttoolbox/check-cstdint.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() { 6 | static_assert(!std::is_same::value, "size_t == uint32_t"); 7 | static_assert(!std::is_same::value, "size_t == uint64_t"); 8 | } 9 | -------------------------------------------------------------------------------- /lttoolbox/cli.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | class CLI { 24 | private: 25 | struct CLIOption { 26 | char short_opt; 27 | std::string long_opt; 28 | std::string desc; 29 | bool is_bool; 30 | std::string var; 31 | }; 32 | 33 | std::string description; 34 | std::string version; 35 | std::string epilog; 36 | 37 | std::vector options; 38 | std::vector> file_args; 39 | size_t min_file_args = 0; 40 | 41 | std::map> strs; 42 | std::map bools; 43 | std::vector files; 44 | 45 | std::string prog_name; 46 | 47 | public: 48 | CLI(std::string desc, std::string version); 49 | CLI(std::string desc); 50 | ~CLI(); 51 | void add_str_arg(char short_flag, std::string long_flag, std::string desc, 52 | std::string arg); 53 | void add_bool_arg(char short_flag, std::string long_flag, std::string desc); 54 | void add_file_arg(std::string name, bool optional = true); 55 | void set_epilog(std::string e); 56 | void print_usage(std::ostream& out = std::cerr); 57 | void parse_args(int argc, char* argv[]); 58 | std::map>& get_strs(); 59 | std::map& get_bools(); 60 | std::vector& get_files(); 61 | }; 62 | -------------------------------------------------------------------------------- /lttoolbox/entry_token.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #include 18 | 19 | 20 | EntryToken::EntryToken() : 21 | type(paradigm) 22 | { 23 | } 24 | 25 | EntryToken::~EntryToken() 26 | { 27 | destroy(); 28 | } 29 | 30 | EntryToken::EntryToken(EntryToken const &e) 31 | { 32 | copy(e); 33 | } 34 | 35 | EntryToken & 36 | EntryToken::operator =(EntryToken const &e) 37 | { 38 | if(this != &e) 39 | { 40 | destroy(); 41 | copy(e); 42 | } 43 | 44 | return *this; 45 | } 46 | 47 | void 48 | EntryToken::copy(EntryToken const &e) 49 | { 50 | type = e.type; 51 | weight = e.weight; 52 | leftSide = e.leftSide; 53 | rightSide = e.rightSide; 54 | parName = e.parName; 55 | myregexp = e.myregexp; 56 | } 57 | 58 | void 59 | EntryToken::destroy() 60 | { 61 | } 62 | 63 | void 64 | EntryToken::setParadigm(UStringView np) 65 | { 66 | parName = np; 67 | type = paradigm; 68 | } 69 | 70 | void 71 | EntryToken::setSingleTransduction(std::vector const &pi, std::vector const &pd, double const ew) 72 | { 73 | weight = ew; 74 | leftSide = pi; 75 | rightSide = pd; 76 | type = single_transduction; 77 | } 78 | 79 | void 80 | EntryToken::setRegexp(UStringView r) 81 | { 82 | myregexp.clear(); 83 | ustring_to_vec32(r, myregexp); 84 | type = regexp; 85 | } 86 | 87 | void 88 | EntryToken::setRegexp(const std::vector& r) 89 | { 90 | myregexp = r; 91 | type = regexp; 92 | } 93 | 94 | void 95 | EntryToken::readRegexp(xmlTextReaderPtr reader) 96 | { 97 | XMLParseUtil::readValueInto32(reader, myregexp); 98 | type = regexp; 99 | } 100 | 101 | bool 102 | EntryToken::isParadigm() const 103 | { 104 | return type == paradigm; 105 | } 106 | 107 | bool 108 | 
EntryToken::isSingleTransduction() const 109 | { 110 | return type == single_transduction; 111 | } 112 | 113 | bool 114 | EntryToken::isRegexp() const 115 | { 116 | return type == regexp; 117 | } 118 | 119 | UString const & 120 | EntryToken::paradigmName() const 121 | { 122 | return parName; 123 | } 124 | 125 | std::vector const & 126 | EntryToken::left() const 127 | { 128 | return leftSide; 129 | } 130 | 131 | std::vector const & 132 | EntryToken::right() const 133 | { 134 | return rightSide; 135 | } 136 | 137 | std::vector const & 138 | EntryToken::regExp() const 139 | { 140 | return myregexp; 141 | } 142 | 143 | double const & 144 | EntryToken::entryWeight() const 145 | { 146 | return weight; 147 | } 148 | -------------------------------------------------------------------------------- /lttoolbox/exception.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #ifndef __EXCEPTION_ 18 | #define __EXCEPTION_ 19 | 20 | #include 21 | #include 22 | 23 | class Exception 24 | : public std::exception 25 | { 26 | public: 27 | Exception(const char* _msg) throw () 28 | : std::exception(), msg(_msg) 29 | { 30 | } 31 | 32 | virtual ~Exception() throw () 33 | { 34 | } 35 | 36 | const char* what() const throw () 37 | { 38 | return msg.c_str(); 39 | } 40 | 41 | private: 42 | std::string msg; 43 | }; 44 | 45 | class IOException : public Exception { 46 | public: 47 | IOException(const char* _msg) throw () : Exception(_msg) {}; 48 | }; 49 | 50 | class SerialisationException : public IOException { 51 | public: 52 | SerialisationException(const char* _msg) throw () : IOException(_msg) {}; 53 | }; 54 | 55 | class DeserialisationException : public IOException { 56 | public: 57 | DeserialisationException(const char* _msg) throw () : IOException(_msg) {}; 58 | }; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /lttoolbox/file_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | 18 | #ifndef __FILE_UTILS_H__ 19 | #define __FILE_UTILS_H__ /* completes the include guard opened on the previous line */ 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | UFILE* openOutTextFile(const std::string& fname); 27 | FILE* openOutBinFile(const std::string& fname); 28 | FILE* openInBinFile(const std::string& fname); 29 | 30 | void writeTransducerSet(FILE* output, UStringView letters, 31 | Alphabet& alpha, 32 | std::map& trans); 33 | void writeTransducerSet(FILE* output, const std::set& letters, 34 | Alphabet& alpha, 35 | std::map& trans); 36 | void readTransducerSet(FILE* input, std::set& letters, 37 | Alphabet& alpha, 38 | std::map& trans); 39 | void readTransducerSet(FILE* input, std::set& letters, 40 | Alphabet& alpha, 41 | std::map& trans); 42 | 43 | #endif // __FILE_UTILS_H__ 44 | -------------------------------------------------------------------------------- /lttoolbox/icx.rng: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 1 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /lttoolbox/input_file.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | 18 | #ifndef _LT_INPUT_FILE_H_ 19 | #define _LT_INPUT_FILE_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | class InputFile 26 | { 27 | private: 28 | FILE* infile; 29 | UChar32 ubuffer[3]; 30 | char cbuffer[4]; 31 | int buffer_size; 32 | void internal_read(); 33 | public: 34 | InputFile(); 35 | ~InputFile(); 36 | bool open(const char* fname = nullptr); 37 | #if HAVE_DECL_FMEMOPEN 38 | bool open_in_memory(char* input_buffer); 39 | #endif 40 | void open_or_exit(const char* fname = nullptr); 41 | void close(); 42 | void wrap(FILE* newinfile); 43 | UChar32 get(); 44 | UChar32 peek(); 45 | void unget(UChar32 c); 46 | bool eof(); 47 | void rewind(); 48 | // assumes that start has already been read 49 | // returns string from start to end inclusive 50 | // respects backslash escapes 51 | UString readBlock(const UChar32 start, const UChar32 end); 52 | // assumes [[ has already been read, reads to ]] 53 | // returns entire string, including brackets 54 | UString finishWBlank(); 55 | // read until ^ or \0 56 | // if readwblank == false, also stop at [[ 57 | // Note: relies on the fact that ubuffer has length >= 2 58 | UString readBlank(bool readwblank = false); 59 | }; 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /lttoolbox/lt-append.1: -------------------------------------------------------------------------------- 1 | .Dd March 30, 2022 2 | .Dt LT-APPEND 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-append 6 | .Nd combine two compiled dictionary transducers 7 | .Sh SYNOPSIS 8 | .Nm lt-append 9 | .Ar input_a 10 | .Ar input_b 11 | .Ar output 12 | .Sh DESCRIPTION 13 | .Nm lt-append 14 | will combine two compiled dictionaries as if they had been compiled 15 | from one big XML file, keeping sections separate. 
If 16 | .Ar input_a 17 | has sections 18 | .Dq main 19 | and 20 | .Dq final 21 | and 22 | .Ar input_b 23 | has section 24 | .Dq regex 25 | then 26 | .Ar output 27 | will have sections \[lq]main\[rq], \[lq]final\[rq] and \[lq]regex\[rq] 28 | (there is no cross-section minimisation, so internally there is no 29 | union, but the behaviour of running the transducer will be as if we 30 | had done the union). 31 | .Sh FILES 32 | .Bl -tag -width Ds 33 | .It Ar input_a 34 | The first input binary (a finite state transducer). 35 | .It Ar input_b 36 | The second input binary (a finite state transducer). 37 | .It Ar output 38 | The output binary with the combination of inputs (a finite state transducer). 39 | .El 40 | .Sh SEE ALSO 41 | .Xr apertium 1 , 42 | .Xr lt-comp 1 , 43 | .Xr lt-expand 1 , 44 | .Xr lt-print 1 , 45 | .Xr lt-proc 1 46 | .Sh AUTHOR 47 | Copyright \(co 2022 Apertium. 48 | This is free software. 49 | You may redistribute copies of it under the terms of 50 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 51 | .Sh BUGS 52 | Many... lurking in the dark and waiting for you! 53 | -------------------------------------------------------------------------------- /lttoolbox/lt-comp.1: -------------------------------------------------------------------------------- 1 | .Dd March 8, 2006 2 | .Dt LT-COMP 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-comp 6 | .Nd augmented letter transducer compiler for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-comp 9 | .Op Fl a | v | l | r | m | H | S | j | h 10 | .Cm lr | rl 11 | .Ar dictionary_file 12 | .Ar output_file 13 | .Op Ar acx_file 14 | .Sh DESCRIPTION 15 | .Nm lt-comp 16 | is the application responsible for compiling dictionaries used by 17 | .Xr lt-proc 1 18 | in Apertium into a compact and efficient representation 19 | (a class of finite-state transducers called augmented letter transducers). 
20 | .Sh OPTIONS 21 | .Bl -tag -width Ds 22 | .It Fl a , Fl Fl alt 23 | Sets the value of the 24 | .Sy alt 25 | attribute to use in compilation. 26 | .Pp 27 | Note that if no value is set, all entries containing an \fIalt\fR 28 | attribute are omitted. 29 | .It Fl v , Fl Fl var 30 | Sets the value of the 31 | .Sy v 32 | attribute to use in compilation. 33 | This should only be used with monodixes; for bidixes, see 34 | .Fl l 35 | and 36 | .Fl r . 37 | .Pp 38 | Note that if no value is set, all entries containing a 39 | .Sy v 40 | attribute are considered to be 41 | .Em left-to-right . 42 | .It Fl l , Fl Fl var-left 43 | Sets the value of the 44 | .Sy vl 45 | attribute for use in compilation of bidixes. 46 | .Dq Left 47 | here refers to the side of the dictionary, so this option is only valid in 48 | .Cm rl 49 | mode. 50 | .It Fl r , Fl Fl var-right 51 | Sets the value of the 52 | .Sy vr 53 | attribute for use in compilation of bidixes. 54 | .Dq Right 55 | here refers to the side of the dictionary, so this option is only valid in 56 | .Cm lr 57 | mode. 58 | .It Fl m , Fl Fl keep-boundaries 59 | Keeps any morpheme boundaries defined by the '<m/>' symbol. 60 | .It Fl H , Fl Fl hfst 61 | Expects HFST symbols. 62 | .It Fl S , Fl Fl no-split 63 | Does not attempt to split into word and punctuation transducers. 64 | .It Fl j , Fl Fl jobs 65 | Parallelise minimisation by using one CPU core per section. By 66 | default, this also creates a new section after 50,000 entries. You can 67 | override this number by setting the environment variable 68 | LT_MAX_SECTION_ENTRIES to some number. If set to 0, sections are never 69 | split (but kept exactly as in the dix file). You can also set the 70 | environment variable LT_JOBS=true if you always want parallel 71 | minimisation even if lt-comp was called without this option. 72 | .It Fl h , Fl Fl help 73 | Prints a short help message. 74 | .It Cm lr 75 | The resulting transducer will process dictionary entries 76 | .Em left-to-right . 
77 | .It Cm rl 78 | The resulting transducer will process dictionary entries 79 | .Em right-to-left . 80 | .El 81 | .Sh FILES 82 | .Bl -tag -width Ds 83 | .It Ar dictionary_file 84 | The input dictionary. 85 | .It Ar output_file 86 | The compiled dictionary (a finite state transducer). 87 | .It Ar acx_file 88 | Optional XML file of equivalent characters in monodices. 89 | .El 90 | .Sh SEE ALSO 91 | .Xr apertium 1 , 92 | .Xr apertium-tagger 1 , 93 | .Xr lt-expand 1 , 94 | .Xr lt-proc 1 95 | .Sh COPYRIGHT 96 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 97 | This is free software. 98 | You may redistribute copies of it under the terms of 99 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 100 | .Sh BUGS 101 | Many... lurking in the dark and waiting for you! 102 | -------------------------------------------------------------------------------- /lttoolbox/lt-compose.1: -------------------------------------------------------------------------------- 1 | .Dd September 25, 2022 2 | .Dt LT-COMPOSE 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-compose 6 | .Nd compiled dictionary composition for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-compose 9 | .Ar transducer1_binary 10 | .Ar transducer2_binary 11 | .Ar composed_binary 12 | .Sh DESCRIPTION 13 | .Nm lt-compose 14 | is the application responsible for composing two compiled 15 | dictionaries, matching the output-side of transducer1 with the 16 | input-side of transducer2. By default, matches are anchored to 17 | initial/final states, so the transducer2 has to match full paths (in 18 | regex terms, transducer2 is implicitly surrounded by ^ and $). But 19 | there is also support for letting transducer2 match sub-paths of 20 | transducer1 (in which matches become optional, making the composition 21 | a superset of transducer1). 
Matching sub-paths means that transducer2 22 | can start matching in the midst of paths of transducer1 (in regex 23 | terms, transducer2 is implicitly surrounded by .* on both sides). 24 | .Sh OPTIONS 25 | .Bl -tag -width Ds 26 | .It Fl i , Fl Fl inverted 27 | Apply transducer2 to the input-side (left) of transducer1 instead of 28 | the output-side. You would do this when altering the forms of an 29 | analyser. 30 | .It Fl a , Fl Fl anywhere 31 | Allow transducer2 to match sub-paths instead of requiring matching 32 | initial/final states. Matches then become optional. 33 | .It Fl j , Fl Fl jobs 34 | Parallelise composition by using one cpu core per section of 35 | transducer1. You can also set the environment variable LT_JOBS=true if 36 | you always want parallelisation where available in lttoolbox. 37 | .Sh FILES 38 | .Bl -tag -width Ds 39 | .It Ar transducer1_binary 40 | a finite state transducer 41 | .It Ar transducer2_binary 42 | a finite state transducer 43 | .It Ar composed_binary 44 | a finite state transducer 45 | .El 46 | .Sh SEE ALSO 47 | .Xr apertium 1 , 48 | .Xr apertium-tagger 1 , 49 | .Xr lt-comp 1 , 50 | .Xr lt-expand 1 , 51 | .Xr lt-print 1 , 52 | .Xr lt-trim 1 , 53 | .Xr lt-proc 1 54 | .Sh AUTHOR 55 | Copyright \(co 2005-2022 Universitat d'Alacant / Universidad de Alicante. 56 | This is free software. 57 | You may redistribute copies of it under the terms of 58 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 59 | .Sh BUGS 60 | Many... lurking in the dark and waiting for you! 
61 | -------------------------------------------------------------------------------- /lttoolbox/lt-expand.1: -------------------------------------------------------------------------------- 1 | .Dd March 8, 2006 2 | .Dt LT-EXPAND 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-expand 6 | .Nd dictionary expander for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-expand 9 | .Op Fl a | v | l | r | m | h 10 | .Ar dictionary_file 11 | .Op Ar output_file 12 | .Sh DESCRIPTION 13 | .Nm lt-expand 14 | is the application responsible for expanding a dictionary 15 | into a simple list of input string-output string pairs 16 | by eliminating paradigms through substitution and unfolding. 17 | .Pp 18 | The output goes to 19 | .Ar output_file 20 | if it is present or to standard output if it is missing. 21 | .Sh OPTIONS 22 | .Bl -tag -width Ds 23 | .It Fl a , Fl Fl alt 24 | Sets the value of the 25 | .Sy alt 26 | attribute to use in expansion 27 | .It Fl v , Fl Fl var 28 | Sets the value of the 29 | .Sy v 30 | attribute to use in expansion of monodixes 31 | .It Fl l , Fl Fl var-left 32 | Sets the value of the 33 | .Sy vl 34 | attribute to use in expansion of bidixes 35 | .It Fl r , Fl Fl var-right 36 | Sets the value of the 37 | .Sy vr 38 | attribute to use in expansion of bidixes 39 | .It Fl m , Fl Fl keep-boundaries 40 | Keep any morpheme boundaries defined by the symbol 41 | .It Fl h , Fl Fl help 42 | Prints a short help message 43 | .El 44 | .Sh FILES 45 | .Bl -tag -width Ds 46 | .It Ar dictionary_file 47 | The input dictionary to expand. 48 | .It Ar output_file 49 | Text containing the expanded dictionary information. 50 | .El 51 | .Sh SEE ALSO 52 | .Xr apertium 1 , 53 | .Xr apertium-tagger 1 , 54 | .Xr lt-comp 1 , 55 | .Xr lt-proc 1 56 | .Sh COPYRIGHT 57 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 58 | This is free software. 
59 | You may redistribute copies of it under the terms of 60 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 61 | .Sh BUGS 62 | Many... lurking in the dark and waiting for you! 63 | -------------------------------------------------------------------------------- /lttoolbox/lt-merge.1: -------------------------------------------------------------------------------- 1 | .Dd December 10, 2024 2 | .Dt LT-MERGE 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-merge 6 | .Nd lexical merger for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-merge 9 | .Op Fl u 10 | .Op Ar input_file Op Ar output_file 11 | .Sh DESCRIPTION 12 | .Nm lt-merge 13 | is the application responsible for merging and unmerging 14 | lexical units 15 | .Pp 16 | It accomplishes this. 17 | .Sh OPTIONS 18 | .Bl -tag -width Ds 19 | .It Fl u , Fl Fl unmerge 20 | Run in reverse, this splits previously merged words. 21 | .It Fl v , Fl Fl version 22 | Display the version number. 23 | .It Fl h , Fl Fl help 24 | Display this help. 25 | .El 26 | \" .Sh FILES 27 | \" .Bl -tag -width Ds 28 | \" .It Ar input_file 29 | \" The input compiled dictionary. 30 | \" .El 31 | .Sh SEE ALSO 32 | .Xr apertium 1 , 33 | .Xr lt-proc 1 34 | .Sh COPYRIGHT 35 | Copyright \(co 2024 Universitat d'Alacant / Universidad de Alicante. 36 | This is free software. 37 | You may redistribute copies of it under the terms of 38 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 39 | .Sh BUGS 40 | Many... lurking in the dark and waiting for you! 
41 | -------------------------------------------------------------------------------- /lttoolbox/lt-paradigm.1: -------------------------------------------------------------------------------- 1 | .Dd June 30, 2022 2 | .Dt LT-PARADIGM 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-paradigm 6 | .Nd generate listings from a compiled transducer 7 | .Sh SYNOPSIS 8 | .Nm lt-paradigm 9 | .Op Fl a | s | z | h 10 | .Op Fl e Ar TAG 11 | .Ar fst_file 12 | .Op Ar input_file Op Ar output_file 13 | .Sh DESCRIPTION 14 | .Nm lt-paradigm 15 | prints paths matching input patterns from a transducer 16 | .Bl -tag -width Ds 17 | .It Ar fst_file 18 | The compiled transducer 19 | .It Ar input_file 20 | A list of patterns to be extracted, separated by newlines or nulls 21 | .It Ar output_file 22 | All paths matching the patterns in input_file. Each path is terminated by a newline and groups are separated by the separator used in the input. 23 | .El 24 | .Sh OPTIONS 25 | .Bl -tag -width Ds 26 | .It Fl a Fl Fl analyser 27 | Match patterns on the right side of the transducer rather than the left. 28 | .It Fl e Ar TAG Fl Fl exclude Ar TAG 29 | When expanding <*>, do not use 30 | .Ar TAG 31 | .It Fl s Fl Fl sort 32 | Sort the output for each pattern. 33 | .It Fl z Fl Fl null-flush 34 | No-op, included for compatibility. 35 | .It Fl h Fl Fl help 36 | Prints a short help message. 37 | .El 38 | .Sh SEE ALSO 39 | .Xr lt-expand 1 , 40 | .Xr hfst-expand 1 41 | .Sh COPYRIGHT 42 | Copyright \(co 2022 Apertium 43 | This is free software. 44 | You may redistribute copies of it under the terms of 45 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 46 | .Sh BUGS 47 | Many... lurking in the dark and waiting for you! 
48 | -------------------------------------------------------------------------------- /lttoolbox/lt-print.1: -------------------------------------------------------------------------------- 1 | .Dd March 8, 2006 2 | .Dt LT-PRINT 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-print 6 | .Nd compiled dictionary printer for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-print 9 | .Op Fl a | H 10 | .Ar bin_file 11 | .Op Ar output_file 12 | .Sh DESCRIPTION 13 | .Nm lt-print 14 | is the application responsible for printing compiled dictionaries in ATT format. 15 | .Bl -tag -width Ds 16 | .It Ar bin_file 17 | The compiled input file. 18 | .It Ar output_file 19 | The transducer in ATT format. 20 | .El 21 | .Sh OPTIONS 22 | .Bl -tag -width Ds 23 | .It Fl a, Fl Fl alpha 24 | print transducer alphabet instead of transducers 25 | .It 26 | .It Fl H , Fl Fl hfst 27 | use HFST-compatible character escapes, e.g. @_SPACE_@ for spaces and @0@ for epsilons. 28 | .It Fl h , Fl Fl help 29 | Prints a short help message. 30 | .El 31 | .Sh SEE ALSO 32 | .Xr apertium 1 , 33 | .Xr apertium-tagger 1 , 34 | .Xr lt-comp 1 , 35 | .Xr lt-expand 1 , 36 | .Xr lt-proc 1 37 | .Sh COPYRIGHT 38 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 39 | This is free software. 40 | You may redistribute copies of it under the terms of 41 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 42 | .Sh BUGS 43 | Currently requires a UTF-8 locale (and doesn't crash if it doesn't have one). 
44 | -------------------------------------------------------------------------------- /lttoolbox/lt-tmxcomp.1: -------------------------------------------------------------------------------- 1 | .Dd March 8, 2006 2 | .Dt LT-TMXCOMP 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-tmxcomp 6 | .Nd translation memories compiler for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-tmxcomp 9 | .Ar lang1 Ns - Ns Ar lang2 10 | .Ar tmx_file 11 | .Ar output_file 12 | .Sh DESCRIPTION 13 | .Nm lt-tmxcomp 14 | is the application responsible for compiling translation memories in 15 | the TMX format used by 16 | .Xr lt-tmxproc 1 17 | in Apertium into a compact and efficient representation 18 | (a class of finite-state transducers called augmented letter transducers). 19 | .Sh OPTIONS 20 | .Bl -tag -width Ds 21 | .It Ar lang1 22 | Input language 23 | .It Ar lang2 24 | Output language 25 | .El 26 | .Sh FILES 27 | .Bl -tag -width Ds 28 | .It Ar tmx_file 29 | The input translation memory, in TMX format. 30 | .It Ar output_file 31 | The compiled translation memory (a finite state transducer). 32 | .El 33 | .Sh SEE ALSO 34 | .Xr apertium 1 , 35 | .Xr lt-tmxproc 1 36 | .Sh AUTHOR 37 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 38 | This is free software. 39 | You may redistribute copies of it under the terms of 40 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 41 | .Sh BUGS 42 | Many... lurking in the dark and waiting for you! 
43 | -------------------------------------------------------------------------------- /lttoolbox/lt-tmxproc.1: -------------------------------------------------------------------------------- 1 | .Dd March 23, 2006 2 | .Dt LT-TMXPROC 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-tmxproc 6 | .Nd translation stream processor for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-tmxproc 9 | .Ar fst_file 10 | .Op Ar input_file Op Ar output_file 11 | .Sh DESCRIPTION 12 | .Nm lt-tmxproc 13 | is the application responsible for preprocessing the translation 14 | stream in Apertium using a compiled translation memory. 15 | .Pp 16 | It accomplishes these tasks by reading binary files containing a 17 | compact and efficient representation of dictionaries (a class of 18 | finite-state transducers called augmented letter transducers). 19 | These files are generated by 20 | .Xr lt-tmxcomp 1 . 21 | .Sh FILES 22 | .Bl -tag -width Ds 23 | .It Ar input_file 24 | The input compiled dictionary. 25 | .El 26 | .Sh SEE ALSO 27 | .Xr apertium 1 , 28 | .Xr lt-tmxcomp 1 29 | .Sh AUTHOR 30 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 31 | This is free software. 32 | You may redistribute copies of it under the terms of 33 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 34 | .Sh BUGS 35 | Many... lurking in the dark and waiting for you! 36 | -------------------------------------------------------------------------------- /lttoolbox/lt-trim.1: -------------------------------------------------------------------------------- 1 | .Dd February 7, 2014 2 | .Dt LT-TRIM 1 3 | .Os Apertium 4 | .Sh NAME 5 | .Nm lt-trim 6 | .Nd compiled dictionary trimmer for Apertium 7 | .Sh SYNOPSIS 8 | .Nm lt-trim 9 | .Ar analyser_binary 10 | .Ar bidix_binary 11 | .Ar trimmed_analyser_binary 12 | .Sh DESCRIPTION 13 | .Nm lt-trim 14 | is the application responsible for trimming compiled dictionaries. 
15 | The analyses (right-side when compiling lr) of analyser_binary are trimmed 16 | to the input side of bidix_binary (left-side when compiling lr, 17 | right-side when compiling rl), such that only analyses which would 18 | pass through 19 | .So 20 | .Xr lt-proc 1 21 | .Fl b Cm bidix_binary 22 | .Sc 23 | are kept. 24 | .Pp 25 | Both compound tags 26 | .Po 27 | .Dq , 28 | .Dq 29 | .Pc 30 | and join elements 31 | .Po 32 | .Dq 33 | in XML, 34 | .Dq + 35 | in the stream 36 | .Pc 37 | and the group element 38 | .Po 39 | .Dq 40 | in XML, 41 | .Dq # 42 | in the stream 43 | .Pc 44 | should be handled correctly, 45 | even combinations of + followed by # in monodix are handled. 46 | .Pp 47 | Some minor caveats: If you have the capitalised lemma 48 | .Dq Foo 49 | in the monodix, but 50 | .Dq foo 51 | in the bidix, an analysis 52 | .Dq \(a^Foo$ 53 | would pass through bidix when doing 54 | .Xr lt-proc 1 55 | .Fl b , 56 | but will not make it through trimming. 57 | Make sure your lemmas have the same capitalisation in the 58 | different dictionaries. 59 | Also, you should not have literal 60 | .Ql + 61 | or 62 | .Ql # 63 | in your lemmas. 64 | Since 65 | .Xr lt-comp 1 66 | doesn't escape these, 67 | .Nm 68 | cannot know that they are different from 69 | .Dq 70 | or 71 | .Dq , 72 | and you may get @-marked output this way. 73 | You can analyse 74 | .Ql + 75 | or 76 | .Ql # 77 | by having the literal symbol in the 78 | .Dq 79 | part and some other string (e.g., 80 | .Dq plus ) 81 | in the 82 | .Dq . 83 | .Pp 84 | You should not trim a generator unless you have a 85 | .Em very 86 | simple translator pipeline, 87 | since the output of bidix seldom goes unchanged through transfer. 88 | .Sh OPTIONS 89 | .Bl -tag -width Ds 90 | .It Fl s , Fl Fl match-section 91 | A section with this name (id@type) in the analyser will only be 92 | trimmed against a section with the same id in the bidix. 
(The default 93 | is to trim all sections of the analyser against all sections of the 94 | bidix.) Using this option can sometimes speed up trimming 95 | considerably. For example, if you have some complicated regular 96 | expressions, try putting them in a 97 | 98 | <section id="regex" type="standard">
99 | 100 | in both .dix files and passing 101 | .Dq regex@standard 102 | to \fI--match-section\fP. 103 | .Pp 104 | This argument may be used multiple times to specify multiple sections 105 | that must match by name. 106 | .Sh FILES 107 | .Bl -tag -width Ds 108 | .It Ar analyser_binary 109 | The untrimmed analyser dictionary (a finite state transducer). 110 | .It Ar bidix_binary 111 | The dictionary to use as trimmer (a finite state transducer). 112 | .It Ar trimmed_analyser_binary 113 | The trimmed analyser dictionary (a finite state transducer). 114 | .El 115 | .Sh SEE ALSO 116 | .Xr apertium 1 , 117 | .Xr apertium-tagger 1 , 118 | .Xr lt-comp 1 , 119 | .Xr lt-expand 1 , 120 | .Xr lt-print 1 , 121 | .Xr lt-proc 1 122 | .Sh AUTHOR 123 | Copyright \(co 2005, 2006 Universitat d'Alacant / Universidad de Alicante. 124 | This is free software. 125 | You may redistribute copies of it under the terms of 126 | .Lk https://www.gnu.org/licenses/gpl.html the GNU General Public License . 127 | .Sh BUGS 128 | Many... lurking in the dark and waiting for you! 129 | -------------------------------------------------------------------------------- /lttoolbox/lt_append.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
/*
 * lt-append entry point: reads two compiled transducer files, adds the
 * sections of the second one to the first, and writes the combined set
 * to the output file.
 *
 * Command line: [-k|--keep] [-s|--single] bin_file1 bin_file2 output_file
 * Always returns 0; failures to open files are left to the
 * openInBinFile/openOutBinFile helpers.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();
  CLI cli("add sections to a compiled transducer", PACKAGE_VERSION);
  cli.add_bool_arg('k', "keep", "in case of section name conflicts, keep the one from the first transducer");
  cli.add_bool_arg('s', "single", "treat input transducers as one-sided");
  cli.add_bool_arg('h', "help", "print this message and exit");
  // NOTE(review): the `false` flag presumably marks bin_file1 as
  // non-optional -- confirm against CLI::add_file_arg in cli.h.
  cli.add_file_arg("bin_file1", false);
  cli.add_file_arg("bin_file2");
  cli.add_file_arg("output_file");
  cli.parse_args(argc, argv);

  // "pairs" is simply the negation of --single; it is forwarded to
  // Transducer::updateAlphabet below.
  bool pairs = !cli.get_bools()["single"];
  bool keep = cli.get_bools()["keep"];

  FILE* input1 = openInBinFile(cli.get_files()[0]);
  FILE* input2 = openInBinFile(cli.get_files()[1]);
  FILE* output = openOutBinFile(cli.get_files()[2]);

  // NOTE(review): the template arguments of std::set / std::map are not
  // visible in this copy of the file (lost in extraction) -- restore them
  // from the repository before compiling.
  Alphabet alpha1, alpha2;
  std::set chars1, chars2;
  std::map trans1, trans2;

  readTransducerSet(input1, chars1, alpha1, trans1);
  readTransducerSet(input2, chars2, alpha2, trans2);

  // The output's alphabetic characters are the union of both inputs.
  for (auto& it : chars2) {
    chars1.insert(it);
  }
  UString chars(chars1.begin(), chars1.end());

  for (auto& it : trans2) {
    if (trans1.find(it.first) != trans1.end()) {
      // Section name exists in both inputs: with --keep the first
      // transducer's section wins, otherwise warn and overwrite it.
      if (keep) {
        continue;
      } else {
        std::cerr << "WARNING: section '" << it.first << "' appears in both transducers and will be overwritten!" << std::endl;
      }
    }
    // Re-express the section in terms of alpha1 before storing it, since
    // only alpha1 is written to the output.
    it.second.updateAlphabet(alpha2, alpha1, pairs);
    trans1[it.first] = it.second;
  }

  writeTransducerSet(output, chars, alpha1, trans1);

  fclose(input1);
  fclose(input2);
  fclose(output);

  return 0;
}
/*
 * lt-apply-acx entry point: reads a compiled transducer set, applies the
 * character equivalences from an ACX file to every section, and writes
 * the result.
 *
 * Command line: input_file acx_file output_file
 * Always returns 0.
 */
int main(int argc, char* argv[])
{
  LtLocale::tryToSetLocale();
  CLI cli("apply an ACX file to a compiled transducer", PACKAGE_VERSION);
  // NOTE(review): the `false` flag presumably marks input_file as
  // non-optional -- confirm against CLI::add_file_arg in cli.h.
  cli.add_file_arg("input_file", false);
  cli.add_file_arg("acx_file");
  cli.add_file_arg("output_file");
  cli.parse_args(argc, argv);

  FILE* input = openInBinFile(cli.get_files()[0]);
  auto acx = readACX(cli.get_files()[1].c_str());
  FILE* output = openOutBinFile(cli.get_files()[2]);

  // NOTE(review): the template arguments of std::set / std::map are not
  // visible in this copy of the file (lost in extraction) -- restore them
  // from the repository before compiling.
  Alphabet alpha;
  std::set letters;
  std::map trans;
  readTransducerSet(input, letters, alpha, trans);

  // Each section is rewritten in place; the alphabet is passed along so
  // applyACX can consult (and possibly extend) it.
  for (auto& it : trans) {
    it.second.applyACX(alpha, acx);
  }

  writeTransducerSet(output, letters, alpha, trans);

  fclose(input);
  fclose(output);
  return 0;
}
16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | LtLocale::tryToSetLocale(); 26 | CLI cli("expand the contents of a dictionary file", PACKAGE_VERSION); 27 | cli.add_bool_arg('m', "keep-boundaries", "keep morpheme boundaries"); 28 | cli.add_str_arg('v', "var", "set language variant", "VAR"); 29 | cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); 30 | cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); 31 | cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); 32 | cli.add_file_arg("dictionary_file", false); 33 | cli.add_file_arg("output_file"); 34 | cli.parse_args(argc, argv); 35 | 36 | Expander e; 37 | e.setKeepBoundaries(cli.get_bools()["keep-boundaries"]); 38 | auto args = cli.get_strs(); 39 | if (args.find("var") != args.end()) { 40 | e.setVariantValue(to_ustring(args["var"][0].c_str())); 41 | } 42 | if (args.find("alt") != args.end()) { 43 | e.setAltValue(to_ustring(args["alt"][0].c_str())); 44 | } 45 | if (args.find("var-left") != args.end()) { 46 | e.setVariantLeftValue(to_ustring(args["var-left"][0].c_str())); 47 | } 48 | if (args.find("var-right") != args.end()) { 49 | e.setVariantRightValue(to_ustring(args["var-right"][0].c_str())); 50 | } 51 | 52 | UFILE* output = openOutTextFile(cli.get_files()[1]); 53 | 54 | e.expand(cli.get_files()[0], output); 55 | u_fclose(output); 56 | 57 | return EXIT_SUCCESS; 58 | } 59 | -------------------------------------------------------------------------------- /lttoolbox/lt_invert.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
/*
 * lt-invert entry point: reads a compiled transducer set, inverts every
 * section, and writes the result.
 *
 * Command line: [-h|--help] in_bin out_bin
 * Returns EXIT_SUCCESS.
 */
int main(int argc, char* argv[])
{
  LtLocale::tryToSetLocale();

  CLI cli("reverse the direction of a compiled transducer", PACKAGE_VERSION);
  cli.add_bool_arg('h', "help", "print this message and exit");
  cli.add_file_arg("in_bin");
  cli.add_file_arg("out_bin");
  cli.parse_args(argc, argv);

  FILE* input = openInBinFile(cli.get_files()[0]);
  FILE* output = openOutBinFile(cli.get_files()[1]);

  // NOTE(review): the template arguments of std::set / std::map are not
  // visible in this copy of the file (lost in extraction) -- restore them
  // from the repository before compiling.
  Alphabet alphabet;
  std::set alphabetic_chars;
  std::map transducers;
  readTransducerSet(input, alphabetic_chars, alphabet, transducers);

  // invert() receives the shared alphabet, which is then written out
  // together with the inverted sections.
  for (auto& it : transducers) {
    it.second.invert(alphabet);
  }

  writeTransducerSet(output, alphabetic_chars, alphabet, transducers);

  fclose(input);
  fclose(output);
  return EXIT_SUCCESS;
}
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #ifdef __MINGW32__ 24 | #include 25 | #endif 26 | 27 | 28 | void 29 | LtLocale::tryToSetLocale() 30 | { 31 | try { 32 | std::locale::global(std::locale(std::locale::classic(), "", std::locale::ctype)); 33 | } 34 | catch (...) { 35 | // Nothing 36 | } 37 | 38 | UErrorCode status = U_ZERO_ERROR; 39 | uloc_setDefault("en_US_POSIX", &status); 40 | ucnv_setDefaultName("UTF-8"); 41 | 42 | #if !defined(__CYGWIN__) && !defined (__MINGW32__) 43 | if(setlocale(LC_CTYPE, "") != NULL) 44 | { 45 | return; 46 | } 47 | 48 | std::cerr << "Warning: unsupported locale, fallback to \"C\"" << std::endl; 49 | 50 | setlocale(LC_ALL, "C"); 51 | #endif 52 | #ifdef __CYGWIN__ 53 | setlocale(LC_ALL, "C.UTF-8"); 54 | #endif 55 | #ifdef __MINGW32__ 56 | //SetConsoleInputCP(65001); 57 | SetConsoleOutputCP(65001); 58 | #endif 59 | } 60 | -------------------------------------------------------------------------------- /lttoolbox/lt_locale.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
/*
 * Namespace-like holder for the locale initialisation used by the
 * lttoolbox command-line tools; it has no instance state.
 */
class LtLocale
{
public:
  // Configure process-wide locale settings. Called first thing in each
  // tool's main().
  static void tryToSetLocale();
};
16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | int main(int argc, char *argv[]) 25 | { 26 | LtLocale::tryToSetLocale(); 27 | CLI cli("merge lexical units from the one tagged BEG until END", PACKAGE_VERSION); 28 | cli.add_file_arg("input_file"); 29 | cli.add_file_arg("output_file"); 30 | cli.add_bool_arg('u', "unmerge", "Undo the merge"); 31 | cli.add_bool_arg('z', "null-flush", "flush output on the null character"); 32 | cli.parse_args(argc, argv); 33 | 34 | auto strs = cli.get_strs(); 35 | bool unmerge = cli.get_bools()["unmerge"]; 36 | InputFile input; 37 | if (!cli.get_files()[1].empty()) { 38 | input.open_or_exit(cli.get_files()[0].c_str()); 39 | } 40 | UFILE* output = openOutTextFile(cli.get_files()[1]); 41 | 42 | FSTProcessor fstp; 43 | fstp.setNullFlush(true); // cf. description of cli["null-flush"] 44 | fstp.initBiltrans(); 45 | if(unmerge) { 46 | fstp.quoteUnmerge(input, output); 47 | } 48 | else { 49 | fstp.quoteMerge(input, output); 50 | } 51 | 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /lttoolbox/lt_print.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
/*
 * lt-print entry point: dumps a compiled transducer file as text.
 *
 * Command line: [-a|--alpha] [-H|--hfst] bin_file [output_file]
 * With --alpha only the alphabet is printed; otherwise every section is
 * printed in ATT format, with "--" lines between sections.
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();
  CLI cli("dump a transducer to text in ATT format", PACKAGE_VERSION);
  cli.add_bool_arg('a', "alpha", "print transducer alphabet");
  cli.add_bool_arg('H', "hfst", "use HFST-compatible character escapes");
  cli.add_bool_arg('h', "help", "print this message and exit");
  cli.add_file_arg("bin_file");
  cli.add_file_arg("output_file");
  cli.parse_args(argc, argv);

  bool alpha = cli.get_bools()["alpha"];
  bool hfst = cli.get_bools()["hfst"];

  FILE* input = openInBinFile(cli.get_files()[0]);
  UFILE* output = openOutTextFile(cli.get_files()[1]);

  // NOTE(review): the template arguments of std::set / std::map are not
  // visible in this copy of the file (lost in extraction) -- restore them
  // from the repository before compiling.
  Alphabet alphabet;
  std::set alphabetic_chars;
  std::map transducers;

  readTransducerSet(input, alphabetic_chars, alphabet, transducers);

  /////////////////////

  if (alpha) {
    // Alphabet mode: one alphabetic character per line, then one symbol
    // per line.
    for (auto& it : alphabetic_chars) {
      u_fprintf(output, "%C\n", it);
    }
    // Symbols are addressed by the negative codes -1 .. -alphabet.size().
    for (int i = 1; i <= alphabet.size(); i++) {
      alphabet.writeSymbol(-i, output);
      u_fprintf(output, "\n");
    }
  } else {
    // Transducer mode: print a "--" separator before every section except
    // the first.
    bool first = true;
    for (auto& it : transducers) {
      if (!first) {
        u_fprintf(output, "--\n");
      }
      it.second.joinFinals();
      it.second.show(alphabet, output, 0, hfst);
      first = false;
    }
  }

  fclose(input);
  u_fclose(output);

  return 0;
}
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | void get_symbol(const std::string& s, Alphabet& alpha, const char* prefix, 24 | sorted_vector& vec) 25 | { 26 | UString t; 27 | t += '<'; 28 | t += to_ustring(prefix); 29 | t += ':'; 30 | t += to_ustring(s.c_str()); 31 | t += '>'; 32 | if (alpha.isSymbolDefined(t)) { 33 | vec.insert(alpha(alpha(t), alpha(t))); 34 | } 35 | } 36 | 37 | int main(int argc, char* argv[]) 38 | { 39 | LtLocale::tryToSetLocale(); 40 | CLI cli("remove paths from a transducer", PACKAGE_VERSION); 41 | cli.add_bool_arg('m', "minimise", "minimise transducers after deleting paths"); 42 | cli.add_str_arg('v', "var", "set language variant", "VAR"); 43 | cli.add_str_arg('a', "alt", "set alternative (monodix)", "ALT"); 44 | cli.add_str_arg('l', "var-left", "set left language variant (bidix)", "VAR"); 45 | cli.add_str_arg('r', "var-right", "set right language variant (bidix)", "VAR"); 46 | cli.add_file_arg("lr | rl", false); 47 | cli.add_file_arg("input_file"); 48 | cli.add_file_arg("output_file"); 49 | cli.parse_args(argc, argv); 50 | 51 | std::string dir = cli.get_files()[0]; 52 | if (dir == "lr") dir = "LR"; 53 | else if (dir == "rl") dir = "RL"; 54 | FILE* input = openInBinFile(cli.get_files()[1]); 55 | FILE* output = openOutBinFile(cli.get_files()[2]); 56 | 57 | Alphabet alpha; 58 | std::set letters; 59 | std::map trans; 60 | readTransducerSet(input, letters, alpha, trans); 61 | 62 | sorted_vector keep; 63 | sorted_vector drop; 64 | bool has_var = false; 65 | get_symbol(dir, alpha, "r", keep); 66 | for (auto& it : 
cli.get_strs()["var"]) { 67 | get_symbol(it, alpha, "v", keep); 68 | has_var = true; 69 | } 70 | for (auto& it : cli.get_strs()["alt"]) { 71 | get_symbol(it, alpha, "alt", keep); 72 | } 73 | for (auto& it : cli.get_strs()["var-left"]) { 74 | get_symbol(it, alpha, "vl", keep); 75 | } 76 | for (auto& it : cli.get_strs()["var-right"]) { 77 | get_symbol(it, alpha, "vr", keep); 78 | } 79 | 80 | for (int32_t i = 1; i <= alpha.size(); i++) { 81 | UString t; 82 | alpha.getSymbol(t, -i); 83 | if (StringUtils::startswith(t, u". 16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | void endProgram(char *name) 27 | { 28 | if(name != NULL) 29 | { 30 | std::cout << basename(name) << " v" << PACKAGE_VERSION <<": build a letter transducer from a TMX translation memory" << std::endl; 31 | std::cout << "USAGE: " << basename(name) << " [OPTIONS] lang1-lang2 tmx_file output_file" << std::endl; 32 | std::cout << "Modes:" << std::endl; 33 | std::cout << " lang1: input language" << std::endl; 34 | std::cout << " lang2: output language" << std::endl; 35 | std::cout << "Options:" << std::endl; 36 | #if HAVE_GETOPT_LONG 37 | std::cout << " -o, --origin-code code the language code to be taken as lang1" << std::endl; 38 | std::cout << " -m, --meta-code code the language code to be taken as lang2" << std::endl; 39 | #else 40 | std::cout << " -o code the language code to be taken as lang1" << std::endl; 41 | std::cout << " -m code the language code to be taken as lang2" << std::endl; 42 | #endif 43 | } 44 | exit(EXIT_FAILURE); 45 | } 46 | 47 | 48 | int main(int argc, char *argv[]) 49 | { 50 | LtLocale::tryToSetLocale(); 51 | 52 | if(argc != 4) 53 | { 54 | endProgram(argv[0]); 55 | } 56 | 57 | TMXCompiler c; 58 | 59 | #if HAVE_GETOPT_LONG 60 | int option_index = 0; 61 | #endif 62 | while(true) 63 | { 64 | #if HAVE_GETOPT_LONG 65 | static struct option long_options[] = 66 | { 67 | {"origin-code", required_argument, 0, 'o'}, 68 | 
{"meta-code", required_argument, 0, 'm'}, 69 | {0, 0, 0, 0} 70 | }; 71 | 72 | int c_t = getopt_long(argc, argv, "o:m:", long_options, &option_index); 73 | #else 74 | int c_t = getopt(argc, argv, "o:m:"); 75 | #endif 76 | if(c_t == -1) 77 | { 78 | break; 79 | } 80 | 81 | switch(c_t) 82 | { 83 | case 'o': 84 | c.setOriginLanguageCode(to_ustring(optarg)); 85 | break; 86 | 87 | case 'm': 88 | c.setMetaLanguageCode(to_ustring(optarg)); 89 | break; 90 | 91 | default: 92 | endProgram(argv[0]); 93 | break; 94 | } 95 | } 96 | 97 | UString opc = to_ustring(argv[argc-3]); 98 | UString lo = opc.substr(0, opc.find('-')); 99 | UString lm = opc.substr(opc.find('-')+1); 100 | 101 | if(lo.empty() || lm.empty()) { 102 | endProgram(argv[0]); 103 | } 104 | 105 | c.parse(argv[argc-2], lo, lm); 106 | 107 | FILE *output = fopen(argv[argc-1], "wb"); 108 | if(!output) 109 | { 110 | std::cerr << "Error: Cannot open file '" << argv[2] << "'." << std::endl; 111 | exit(EXIT_FAILURE); 112 | } 113 | c.write(output); 114 | fclose(output); 115 | } 116 | -------------------------------------------------------------------------------- /lttoolbox/lt_tmxproc.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | int main(int argc, char *argv[]) 23 | { 24 | LtLocale::tryToSetLocale(); 25 | CLI cli("process a stream with a letter transducer"); 26 | cli.add_file_arg("fst_file", false); 27 | cli.add_file_arg("input_file"); 28 | cli.add_file_arg("output_file"); 29 | cli.add_bool_arg('s', "space", "allow a segment to match before space (as well as before punctuation)"); 30 | cli.add_bool_arg('z', "null-flush", "flush output on the null character (always on, this is a no-op for backwards compatibility)"); 31 | cli.parse_args(argc, argv); 32 | 33 | TranslationMemoryMode tm_mode = cli.get_bools()["space"] ? tm_space : tm_punct; 34 | 35 | FSTProcessor fstp; 36 | fstp.setNullFlush(true); // cf. description of cli["null-flush"] 37 | FILE* aux = openInBinFile(cli.get_files()[0]); 38 | fstp.load(aux); 39 | fclose(aux); 40 | fstp.initTMAnalysis(); 41 | if (!fstp.valid()) { 42 | return EXIT_FAILURE; 43 | } 44 | 45 | InputFile input; 46 | if (!cli.get_files()[1].empty()) { 47 | input.open_or_exit(cli.get_files()[1].c_str()); 48 | } 49 | UFILE* output = openOutTextFile(cli.get_files()[2].c_str()); 50 | 51 | fstp.tm_analysis(input, output, tm_mode); 52 | 53 | u_fclose(output); 54 | return EXIT_SUCCESS; 55 | } 56 | -------------------------------------------------------------------------------- /lttoolbox/match_exe.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | MatchExe::MatchExe() : 23 | initial_id(0) 24 | { 25 | } 26 | 27 | MatchExe::~MatchExe() 28 | { 29 | destroy(); 30 | } 31 | 32 | MatchExe::MatchExe(MatchExe const &te) 33 | { 34 | copy(te); 35 | } 36 | 37 | MatchExe::MatchExe(Transducer const &t, std::map const &final_type) 38 | { 39 | // memory allocation 40 | node_list.reserve(t.transitions.size()); 41 | 42 | for(auto it = t.transitions.begin(), 43 | limit = t.transitions.end(); it != limit; it++) 44 | { 45 | MatchNode mynode(it->second.size()); 46 | node_list.push_back(mynode); 47 | } 48 | 49 | // set up finals 50 | for(auto it = final_type.begin(), limit = final_type.end(); 51 | it != limit; it++) 52 | { 53 | finals[&node_list[it->first]] = it->second; 54 | } 55 | 56 | // set up initial node 57 | initial_id = t.initial; 58 | 59 | // set up the transitions 60 | for(auto it = t.transitions.begin(), 61 | limit = t.transitions.end(); it != limit; it++) 62 | { 63 | MatchNode &mynode = node_list[it->first]; 64 | int i = 0; 65 | for(auto it2 = it->second.begin(), 66 | limit2 = it->second.end(); it2 != limit2; it2++) 67 | { 68 | mynode.addTransition(it2->first, &node_list[it2->second.first], it2->second.second, i++); 69 | } 70 | } 71 | } 72 | 73 | MatchExe & 74 | MatchExe::operator =(MatchExe const &te) 75 | { 76 | if(this != &te) 77 | { 78 | destroy(); 79 | copy(te); 80 | } 81 | return *this; 82 | } 83 | 84 | void 85 | MatchExe::copy(MatchExe const &te) 86 | { 87 | initial_id = te.initial_id; 88 | node_list = te.node_list; 89 | finals = te.finals; 90 | } 91 | 92 | void 
93 | MatchExe::destroy() 94 | { 95 | } 96 | 97 | MatchNode * 98 | MatchExe::getInitial() 99 | { 100 | return &node_list[initial_id]; 101 | } 102 | 103 | std::map & 104 | MatchExe::getFinals() 105 | { 106 | return finals; 107 | } 108 | -------------------------------------------------------------------------------- /lttoolbox/match_exe.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | 18 | #ifndef _MATCHEXE_ 19 | #define _MATCHEXE_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | 31 | /** 32 | * Matcher class for execution of lexical recognizing algorithms 33 | */ 34 | class MatchExe 35 | { 36 | private: 37 | /** 38 | * Initial state 39 | */ 40 | int initial_id; 41 | 42 | /** 43 | * MatchNode list 44 | */ 45 | std::vector node_list; 46 | 47 | /** 48 | * Set of final nodes 49 | */ 50 | std::map finals; 51 | 52 | /** 53 | * Copy function 54 | * @param te the transducer to be copied 55 | */ 56 | void copy(MatchExe const &te); 57 | 58 | /** 59 | * Destroy function 60 | */ 61 | void destroy(); 62 | 63 | public: 64 | 65 | /** 66 | * Constructor 67 | */ 68 | MatchExe(); 69 | 70 | /** 71 | * From transducer constructor 72 | * @param t the transducer 73 | * @param final_type the final types 74 | */ 75 | MatchExe(Transducer const &t, std::map const &final_type); 76 | 77 | /** 78 | * Destructor 79 | */ 80 | ~MatchExe(); 81 | 82 | /** 83 | * Copy constructor 84 | * @param te the transducer to be copied 85 | */ 86 | MatchExe(MatchExe const &te); 87 | 88 | /** 89 | * Assignment operator 90 | * @param te the transducer to be assigned 91 | * @return the assigned object 92 | */ 93 | MatchExe & operator =(MatchExe const &te); 94 | 95 | /** 96 | * Gets the initial node of the transducer 97 | * @return the initial node 98 | */ 99 | MatchNode * getInitial(); 100 | 101 | /** 102 | * Gets the set of final nodes 103 | * @return the set of final nodes 104 | */ 105 | std::map & getFinals(); 106 | }; 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /lttoolbox/match_node.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms 
of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #include 18 | 19 | MatchNode::MatchNode(int const svsize) : 20 | transitions(svsize) 21 | { 22 | } 23 | 24 | MatchNode::~MatchNode() 25 | { 26 | destroy(); 27 | } 28 | 29 | MatchNode::MatchNode(MatchNode const &n) : 30 | transitions(1) 31 | { 32 | copy(n); 33 | } 34 | 35 | MatchNode & 36 | MatchNode::operator =(MatchNode const &n) 37 | { 38 | if(this != &n) 39 | { 40 | destroy(); 41 | copy(n); 42 | } 43 | return *this; 44 | } 45 | 46 | void 47 | MatchNode::copy(MatchNode const &n) 48 | { 49 | transitions = n.transitions; 50 | } 51 | 52 | void 53 | MatchNode::destroy() 54 | { 55 | } 56 | 57 | void 58 | MatchNode::addTransition(int const i, MatchNode * const d, double w, int pos) 59 | { 60 | // transitions[i].insert({d, w}); 61 | transitions.add(i, d, w, pos); 62 | } 63 | -------------------------------------------------------------------------------- /lttoolbox/match_node.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #ifndef _MATCHNODE_ 19 | #define _MATCHNODE_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | class MatchState; 27 | 28 | 29 | //class MatchNode; 30 | //typedef std::map > MNode; 31 | 32 | typedef SortedVector MNode; 33 | 34 | /** 35 | * Node class of TransExe. State is a friend class since the 36 | * algorithms are implemented in MatchState 37 | */ 38 | class MatchNode 39 | { 40 | private: 41 | friend class MatchState; 42 | 43 | /** 44 | * The outgoing transitions from this node. 45 | * Schema: (input symbol, destination, weight) 46 | */ 47 | MNode transitions; 48 | 49 | /** 50 | * Copy method 51 | * @param n the node to be copied 52 | */ 53 | void copy(MatchNode const &n); 54 | 55 | /** 56 | * Destroy method 57 | */ 58 | void destroy(); 59 | 60 | public: 61 | 62 | /** 63 | * Constructor 64 | */ 65 | MatchNode(int const svsize); 66 | 67 | /** 68 | * Destructor 69 | */ 70 | ~MatchNode(); 71 | 72 | /** 73 | * Copy constructor 74 | * @param n the node to be copied 75 | */ 76 | MatchNode(MatchNode const &n); 77 | 78 | /** 79 | * Assignment operator 80 | * @param n the node to be assigned 81 | * @return the assigned object 82 | */ 83 | MatchNode & operator=(MatchNode const &n); 84 | 85 | /** 86 | * Making a link between this node and another 87 | * @param i input symbol 88 | * @param d destination 89 | * @param w weight value 90 | */ 91 | void addTransition(int const i, MatchNode * const d, double const w, int pos); 92 | }; 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- 
/lttoolbox/match_state.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | int const MatchState::BUF_LIMIT = 1024; 24 | 25 | MatchState::MatchState() 26 | { 27 | first = last = 0; 28 | state = new MatchNode *[BUF_LIMIT]; 29 | } 30 | 31 | MatchState::~MatchState() 32 | { 33 | destroy(); 34 | } 35 | 36 | MatchState::MatchState(MatchState const &s) 37 | { 38 | copy(s); 39 | } 40 | 41 | MatchState & 42 | MatchState::operator =(MatchState const &s) 43 | { 44 | if(this != &s) 45 | { 46 | destroy(); 47 | copy(s); 48 | } 49 | 50 | return *this; 51 | } 52 | 53 | void 54 | MatchState::destroy() 55 | { 56 | delete[] state; 57 | state = 0; 58 | } 59 | 60 | void 61 | MatchState::copy(MatchState const &s) 62 | { 63 | if (state == 0) 64 | { 65 | state = new MatchNode *[BUF_LIMIT]; 66 | } 67 | 68 | for(int i = 0; i < BUF_LIMIT; i++) 69 | { 70 | state[i] = s.state[i]; 71 | } 72 | first = s.first; 73 | last = s.last; 74 | } 75 | 76 | int 77 | MatchState::size() const 78 | { 79 | return last >= first ? 
last - first: last + BUF_LIMIT -first; 80 | } 81 | 82 | void 83 | MatchState::init(MatchNode *initial) 84 | { 85 | first = 0; 86 | last = 1; 87 | state[0] = initial; 88 | } 89 | 90 | void 91 | MatchState::applySymbol(MatchNode *pnode, int const symbol) 92 | { 93 | MatchNode *aux = pnode->transitions.search(symbol); 94 | if(aux != NULL) 95 | { 96 | state[last] = aux; 97 | last = (last + 1)%BUF_LIMIT; 98 | } 99 | } 100 | 101 | void 102 | MatchState::step(int const input) 103 | { 104 | int mylast = last; 105 | for(int i = first; i != mylast; i=(i+1)%BUF_LIMIT) 106 | { 107 | applySymbol(state[i], input); 108 | } 109 | first = mylast; 110 | } 111 | 112 | void 113 | MatchState::step(int const input, int const alt) 114 | { 115 | int mylast = last; 116 | for(int i = first; i != mylast; i=(i+1)%BUF_LIMIT) 117 | { 118 | applySymbol(state[i], input); 119 | applySymbol(state[i], alt); 120 | } 121 | first = mylast; 122 | } 123 | 124 | int 125 | MatchState::classifyFinals(std::map const &final_class) const 126 | { 127 | std::set empty_set; 128 | return classifyFinals(final_class, empty_set); 129 | } 130 | 131 | int 132 | MatchState::classifyFinals(std::map const &final_class, std::set const &banned_rules) const 133 | { 134 | int result = INT_MAX; 135 | for (int i = first; i != last; i = (i+1)%BUF_LIMIT) 136 | { 137 | auto it2 = final_class.find(state[i]); 138 | if(it2 != final_class.end()) 139 | { 140 | if(it2->second < result && banned_rules.find(it2->second) == banned_rules.end()) 141 | { 142 | result = it2->second; 143 | } 144 | } 145 | } 146 | return (result < INT_MAX)? 
result : (-1); 147 | } 148 | 149 | void 150 | MatchState::clear() 151 | { 152 | first = last = 0; 153 | } 154 | -------------------------------------------------------------------------------- /lttoolbox/match_state.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #ifndef _MATCHSTATE_ 18 | #define _MATCHSTATE_ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | 28 | /** 29 | * Class to represent the current state of transducer processing 30 | */ 31 | class MatchState 32 | { 33 | private: 34 | static int const BUF_LIMIT; 35 | MatchNode **state; 36 | int first; 37 | int last; 38 | 39 | /** 40 | * The current state of transducer processing 41 | */ 42 | // slist state; 43 | 44 | /** 45 | * Copy function 46 | * @param s the state to be copied 47 | */ 48 | void copy(MatchState const &s); 49 | 50 | /** 51 | * Destroy function 52 | */ 53 | void destroy(); 54 | 55 | 56 | void applySymbol(MatchNode *pnode, int const symbol); 57 | public: 58 | /** 59 | * Constructor 60 | */ 61 | MatchState(); 62 | 63 | /** 64 | * Destructor 65 | */ 66 | ~MatchState(); 67 | 68 | /** 69 | * Copy constructor 70 | * @param s the state to be copied 71 | */ 72 | MatchState(MatchState const &s); 73 | 74 | /** 75 | * Assignment operator 76 | * @param s the state to be assigned 77 | * @return the object that results from the assignation 78 | */ 79 | MatchState & operator =(MatchState const &s); 80 | 81 | /** 82 | * Number of alive transductions 83 | * @return the size 84 | */ 85 | int size() const; 86 | 87 | /** 88 | * step = apply + epsilonClosure 89 | * @param input the input symbol 90 | */ 91 | void step(int const input); 92 | 93 | /** 94 | * step = apply + epsilonClosure 95 | * @param input the input symbol 96 | * @param alt the alternative input symbol 97 | */ 98 | void step(int const input, int const alt); 99 | 100 | /** 101 | * Init the state with the initial node and empty output 102 | * @param initial the initial node of the transducer 103 | */ 104 | void init(MatchNode *initial); 105 | 106 | int classifyFinals(std::map const &final_class, std::set const &banned_rules) const; 107 | 108 | int classifyFinals(std::map const &final_class) const; 109 | 110 | void debug(); 111 | 112 | void clear(); 113 | 114 
| }; 115 | 116 | #endif 117 | -------------------------------------------------------------------------------- /lttoolbox/my_stdio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | // cstdio wrapper for backwards compatibility 19 | 20 | #ifndef _IOHEADER_ 21 | #define _IOHEADER_ 22 | 23 | #include 24 | 25 | // Check individually all '_unlocked' functions because using 26 | // g++-2.95 fgetc_unlocked and fputc_unlocked compile perfectly but 27 | // fputs_unlocked is not defined in their C headers. 
28 | 29 | #if !HAVE_DECL_FPUTS_UNLOCKED 30 | #define fputs_unlocked fputs 31 | #endif 32 | 33 | #if !HAVE_DECL_FGETC_UNLOCKED 34 | #define fgetc_unlocked fgetc 35 | #endif 36 | 37 | #if !HAVE_DECL_FPUTC_UNLOCKED 38 | #define fputc_unlocked fputc 39 | #endif 40 | 41 | #if !HAVE_DECL_FWRITE_UNLOCKED 42 | #define fwrite_unlocked fwrite 43 | #endif 44 | 45 | #if !HAVE_DECL_FREAD_UNLOCKED 46 | #define fread_unlocked fread 47 | #endif 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /lttoolbox/node.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #include 18 | 19 | Node::Node() 20 | { 21 | } 22 | 23 | Node::~Node() 24 | { 25 | destroy(); 26 | } 27 | 28 | Node::Node(Node const &n) 29 | { 30 | copy(n); 31 | } 32 | 33 | Node & 34 | Node::operator =(Node const &n) 35 | { 36 | if(this != &n) 37 | { 38 | destroy(); 39 | copy(n); 40 | } 41 | return *this; 42 | } 43 | 44 | void 45 | Node::copy(Node const &n) 46 | { 47 | transitions = n.transitions; 48 | } 49 | 50 | void 51 | Node::destroy() 52 | { 53 | } 54 | 55 | void 56 | Node::addTransition(int const i, int const o, Node * const d, double const wt) 57 | { 58 | Dest &aux = transitions[i]; 59 | aux.size++; 60 | int *out_tag = new int[aux.size]; 61 | Node **dest = new Node*[aux.size]; 62 | double *out_weight = new double[aux.size]; 63 | 64 | for(int i = 0; i 1) 72 | { 73 | delete[] aux.out_tag; 74 | delete[] aux.dest; 75 | delete[] aux.out_weight; 76 | } 77 | 78 | out_tag[aux.size-1] = o; 79 | dest[aux.size-1] = d; 80 | out_weight[aux.size-1] = wt; 81 | aux.out_tag = out_tag; 82 | aux.dest = dest; 83 | aux.out_weight = out_weight; 84 | } 85 | -------------------------------------------------------------------------------- /lttoolbox/node.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | 18 | #ifndef _NODE_ 19 | #define _NODE_ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | class State; 26 | class Node; 27 | 28 | 29 | class Dest 30 | { 31 | private: 32 | int size; 33 | int *out_tag; 34 | Node **dest; 35 | double *out_weight; 36 | 37 | friend class State; 38 | friend class Node; 39 | 40 | void copy(Dest const &d) 41 | { 42 | destroy(); 43 | size = d.size; 44 | out_tag = new int[size]; 45 | dest = new Node*[size]; 46 | out_weight = new double[size]; 47 | } 48 | 49 | void destroy() 50 | { 51 | if(size != 0) 52 | { 53 | size = 0; 54 | if(out_tag) 55 | { 56 | delete[] out_tag; 57 | } 58 | if(dest) 59 | { 60 | delete[] dest; 61 | } 62 | if(out_weight) 63 | { 64 | delete[] out_weight; 65 | } 66 | } 67 | } 68 | 69 | void init() 70 | { 71 | size = 0; 72 | out_tag = NULL; 73 | dest = NULL; 74 | out_weight = NULL; 75 | } 76 | 77 | public: 78 | Dest() 79 | { 80 | init(); 81 | } 82 | 83 | ~Dest() 84 | { 85 | destroy(); 86 | } 87 | 88 | Dest(Dest const &d) 89 | { 90 | init(); 91 | copy(d); 92 | } 93 | 94 | Dest & operator=(Dest const &d) 95 | { 96 | if(this != &d) 97 | { 98 | destroy(); 99 | copy(d); 100 | } 101 | return *this; 102 | } 103 | }; 104 | 105 | 106 | 107 | /** 108 | * Node class of TransExe. State is a friend class since the 109 | * algorithms are implemented in State 110 | */ 111 | class Node 112 | { 113 | private: 114 | friend class State; 115 | 116 | /** 117 | * The outgoing transitions of this node. 
118 | * Schema: (input symbol, (output symbol, destination, weight)) 119 | */ 120 | std::map transitions; 121 | 122 | /** 123 | * Copy method 124 | * @param n the node to be copied 125 | */ 126 | void copy(Node const &n); 127 | 128 | /** 129 | * Destroy method 130 | */ 131 | void destroy(); 132 | 133 | public: 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Node(); 139 | 140 | /** 141 | * Destructor 142 | */ 143 | ~Node(); 144 | 145 | /** 146 | * Copy constructor 147 | * @param n the node to be copied 148 | */ 149 | Node(Node const &n); 150 | 151 | /** 152 | * Assignment operator 153 | * @param n the node to be assigned 154 | * @return the assigned object 155 | */ 156 | Node & operator=(Node const &n); 157 | 158 | /** 159 | * Making a link between this node and another 160 | * @param i input symbol 161 | * @param o output symbol 162 | * @param d destination 163 | * @param w weight value 164 | */ 165 | void addTransition(int i, int o, Node * const d, double wt); 166 | }; 167 | 168 | #endif 169 | -------------------------------------------------------------------------------- /lttoolbox/pool.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apertium/lttoolbox/572bd0f70e5c3ecb64d212f796cdf5adfe4856c3/lttoolbox/pool.h -------------------------------------------------------------------------------- /lttoolbox/rcx.rng: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 1 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /lttoolbox/sorted_vector.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public 
License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #include 18 | #include 19 | 20 | 21 | void 22 | SortedVector::copy(SortedVector const &o) 23 | { 24 | sv = new SVNode[o.size]; 25 | size = o.size; 26 | 27 | for(int i = 0; i != size; i++) 28 | { 29 | sv[i].tag = o.sv[i].tag; 30 | sv[i].dest = o.sv[i].dest; 31 | sv[i].weight = o.sv[i].weight; 32 | } 33 | } 34 | 35 | void 36 | SortedVector::destroy() 37 | { 38 | delete[] sv; 39 | } 40 | 41 | SortedVector::SortedVector(int const fixed_size) 42 | { 43 | sv = new SVNode[fixed_size]; 44 | size = fixed_size; 45 | } 46 | 47 | SortedVector::~SortedVector() 48 | { 49 | destroy(); 50 | } 51 | 52 | SortedVector::SortedVector(SortedVector const &o) 53 | { 54 | copy(o); 55 | } 56 | 57 | SortedVector & 58 | SortedVector::operator =(SortedVector const &o) 59 | { 60 | if(this != &o) 61 | { 62 | destroy(); 63 | copy(o); 64 | } 65 | return *this; 66 | } 67 | 68 | void 69 | SortedVector::add(int tag, MatchNode *dest, double weight, int pos) 70 | { 71 | sv[pos].tag = tag; 72 | sv[pos].dest = dest; 73 | sv[pos].weight = weight; 74 | } 75 | 76 | MatchNode * 77 | SortedVector::search(int tag) 78 | { 79 | int left = 0, right = size-1; 80 | while(left <= right) 81 | { 82 | int mid = (left+right)/2; 83 | if(sv[mid].tag == tag) 84 | { 85 | return sv[mid].dest; 86 | } 87 | if(sv[mid].tag > tag) 88 | { 89 | right = mid - 1; 90 | } 91 | else 92 | { 93 | left = mid + 1; 94 | } 95 | } 96 | 97 | return NULL; 98 | } 99 | 
-------------------------------------------------------------------------------- /lttoolbox/sorted_vector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | #ifndef _SORTEDVECTOR_ 18 | #define _SORTEDVECTOR_ 19 | 20 | class MatchNode; 21 | 22 | /** 23 | * Class representing the sorted vector of destinations for a given 24 | * MatchNode. Destinations are also MatchNode pointers. 
25 | */
class SortedVector
{
private:

  /**
   * Triplet tag-destination-weight
   */
  struct SVNode
  {
    int tag;
    MatchNode *dest;
    double weight;
  };

  /**
   * Array of sorted SVNodes
   */
  SVNode *sv;

  /**
   * Size of the array
   */
  int size;

  /**
   * Copy method: allocates a new array and copies o's entries into it
   * @param o the item to be copied
   */
  void copy(SortedVector const &o);

  /**
   * Destroy method: releases the array held in sv
   */
  void destroy();
public:
  /**
   * Constructor
   * @param fixed_size size of the SortedVector
   */
  SortedVector(int const fixed_size);

  /**
   * Destructor
   */
  ~SortedVector();

  /**
   * Copy constructor
   * @param o the item to be copied
   */
  SortedVector(SortedVector const &o);

  /**
   * Assignment operator
   * @param o the item to be assigned
   * @return the assigned object
   */
  SortedVector & operator =(SortedVector const &o);

  /**
   * Method for adding an item at a specified position in the array.
   * The method itself does not sort; the caller picks the position.
   * @param tag the tag of the item
   * @param dest the destination MatchNode of the item
   * @param weight the weight value for the transition
   * @param pos the position to do the insertion
   */
  void add(int tag, MatchNode *dest, double weight, int pos);

  /**
   * Searching method (classic binary search)
   * @param tag to search
   * @returns the destination MatchNode pointer, or NULL if the tag is
   *          not found
   */
  MatchNode * search(int tag);
};

#endif
-------------------------------------------------------------------------------- /lttoolbox/stream_reader.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2024 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version.
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #include 19 | 20 | StreamReader::StreamReader(InputFile* i) : in(i) {} 21 | 22 | StreamReader::~StreamReader() {} 23 | 24 | void StreamReader::next() { 25 | blank.clear(); 26 | wblank.clear(); 27 | readings.clear(); 28 | chunk.clear(); 29 | 30 | if (at_eof) return; 31 | 32 | at_null = false; 33 | 34 | blank = in->readBlank(false); 35 | 36 | UChar32 c = in->get(); 37 | 38 | if (c == '[') { 39 | in->get(); 40 | wblank = in->finishWBlank(); 41 | if (in->peek() != '^') { 42 | UString temp = blank + wblank; 43 | next(); 44 | blank = temp + blank; 45 | return; 46 | } 47 | c = in->get(); 48 | } 49 | 50 | if (c == '\0') { 51 | at_null = true; 52 | return; 53 | } 54 | 55 | if (in->eof()) { 56 | at_eof = true; 57 | return; 58 | } 59 | 60 | while (c != '$' && c != '\0' && !in->eof()) { 61 | readings.resize(readings.size()+1); 62 | auto& cur = readings.back(); 63 | c = in->peek(); 64 | if (c == '*' || c == '@' || c == '#' || c == '=' || c == '%') { 65 | in->get(); 66 | cur.mark = c; 67 | } 68 | c = in->get(); 69 | while (c != '/' && c != '$' && c != '\0' && !in->eof()) { 70 | if (c == '<') { 71 | UString tag = in->readBlock('<', '>'); 72 | cur.content += tag; 73 | if (alpha) { 74 | if (add_unknowns) alpha->includeSymbol(tag); 75 | cur.symbols.push_back((*alpha)(tag)); 76 | } 77 | } 78 | else if (c == '{') { 79 | chunk += in->readBlock('{', '}'); 80 | } 81 | else { 82 | cur.content += c; 83 | if (c == '\\') { 84 | UChar32 c2 = in->get(); 85 | if (alpha) cur.symbols.push_back(static_cast(c2)); 86 | cur.content += c2; 87 | } 88 | else if (alpha) { 89 | 
cur.symbols.push_back(static_cast(c)); 90 | } 91 | } 92 | c = in->get(); 93 | } 94 | } 95 | 96 | if (c == '\0') at_null = true; 97 | else if (c == U_EOF || in->eof()) at_eof = true; 98 | } 99 | -------------------------------------------------------------------------------- /lttoolbox/stream_reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2024 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | 18 | #ifndef __LT_STREAM_READER_H__ 19 | #define __LT_STREAM_READER_H__ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | class StreamReader { 26 | private: 27 | InputFile* in; 28 | public: 29 | struct Reading { 30 | UChar32 mark = '\0'; 31 | UString content; 32 | std::vector symbols; 33 | }; 34 | bool at_null = false; 35 | bool at_eof = false; 36 | UString blank; 37 | UString wblank; 38 | std::vector readings; 39 | UString chunk; 40 | 41 | Alphabet* alpha = nullptr; 42 | bool add_unknowns = false; 43 | 44 | StreamReader(InputFile* i); 45 | ~StreamReader(); 46 | void next(); 47 | }; 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /lttoolbox/string_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __LT_STRING_UTILS_H__ 2 | #define __LT_STRING_UTILS_H__ 3 | 4 | #include 5 | #include 6 | 7 | class StringUtils { 8 | public: 9 | // delete leading and trailing whitespace 10 | static UStringView trim(UStringView str); 11 | 12 | // split string on delimiter 13 | static std::vector split(UStringView str, UStringView delim=u" "); 14 | 15 | // split but respect \ escapes 16 | static std::vector split_escaped(UStringView str, UChar delim); 17 | 18 | // inverse of split 19 | static UString join(const std::vector& vec, UStringView delim); 20 | 21 | // replace each occurrence of olds with news 22 | static UString substitute(UStringView str, UStringView olds, UStringView news); 23 | 24 | static UString itoa(int n); 25 | static std::string itoa_string(int n); 26 | static UString ftoa(double f); 27 | // these throw std::invalid_argument if parsing fails 28 | static int stoi(const UString& str); 29 | static double stod(const UString& str); 30 | 31 | static UString tolower(UStringView str); 32 | static UString toupper(UStringView str); 33 | static UString totitle(UStringView str); 34 | 35 | static UString getcase(UStringView str); 36 | static UString 
copycase(UStringView source, UStringView target); 37 | 38 | static bool caseequal(UStringView a, UStringView b); 39 | 40 | static bool startswith(UStringView str, UStringView prefix); 41 | static bool endswith(UStringView str, UStringView suffix); 42 | 43 | static UString merge_wblanks(UStringView w1, UStringView w2); 44 | }; 45 | 46 | #endif // __LT_STRING_UTILS_H__ 47 | -------------------------------------------------------------------------------- /lttoolbox/symbol_iter.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | symbol_iter::iterator::iterator(UStringView s) : str(s) 5 | { 6 | ++*this; 7 | } 8 | 9 | symbol_iter::iterator::iterator(const symbol_iter::iterator& other) 10 | : str(other.str), sloc(other.sloc), eloc(other.eloc) {} 11 | 12 | symbol_iter::iterator::~iterator() {} 13 | 14 | UStringView symbol_iter::iterator::operator*() const { 15 | return str.substr(sloc, eloc-sloc); 16 | } 17 | 18 | symbol_iter::iterator& symbol_iter::iterator::operator++() 19 | { 20 | if (sloc < str.size()) { 21 | sloc = eloc; 22 | UChar32 c; 23 | U16_NEXT(str.data(), eloc, str.size(), c); 24 | if (c == '\\') { 25 | sloc++; 26 | U16_NEXT(str.data(), eloc, str.size(), c); 27 | } else if (c == '<') { 28 | auto i = eloc; 29 | while (c != '>' && i < str.size()) U16_NEXT(str.data(), i, str.size(), c); 30 | if (c == '>') eloc = i; 31 | } 32 | if (eloc > str.size()) eloc = str.size(); 33 | } 34 | return *this; 35 | } 36 | 37 | bool symbol_iter::iterator::operator!=(const symbol_iter::iterator& o) const 38 | { 39 | return str != o.str || sloc != o.sloc || eloc != o.eloc; 40 | } 41 | 42 | bool symbol_iter::iterator::operator==(const symbol_iter::iterator& o) const 43 | { 44 | return str == o.str && sloc == o.sloc && eloc == o.eloc; 45 | } 46 | 47 | symbol_iter::iterator symbol_iter::begin() const 48 | { 49 | return symbol_iter::iterator(str); 50 | } 51 | 52 | symbol_iter::iterator symbol_iter::end() const 53 | { 54 | 
symbol_iter::iterator ret(str); 55 | ret.sloc = str.size(); 56 | ret.eloc = str.size(); 57 | return ret; 58 | } 59 | -------------------------------------------------------------------------------- /lttoolbox/symbol_iter.h: -------------------------------------------------------------------------------- 1 | #ifndef __LT_SYMBOL_ITER_H__ 2 | #define __LT_SYMBOL_ITER_H__ 3 | 4 | #include 5 | 6 | class symbol_iter 7 | { 8 | private: 9 | UStringView str; 10 | public: 11 | symbol_iter(UStringView s) : str(s) {} 12 | ~symbol_iter() {} 13 | class iterator 14 | { 15 | friend symbol_iter; 16 | private: 17 | UStringView str; 18 | UStringView::size_type sloc = 0; 19 | UStringView::size_type eloc = 0; 20 | public: 21 | iterator(UStringView s); 22 | iterator(const iterator& other); 23 | ~iterator(); 24 | UStringView operator*() const; 25 | iterator& operator++(); 26 | bool operator!=(const symbol_iter::iterator& other) const; 27 | bool operator==(const symbol_iter::iterator& other) const; 28 | }; 29 | iterator begin() const; 30 | iterator end() const; 31 | }; 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /lttoolbox/trans_exe.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #ifndef _TRANSEXE_ 19 | #define _TRANSEXE_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | 31 | /** 32 | * Transducer class for execution of lexical processing algorithms 33 | */ 34 | class TransExe 35 | { 36 | private: 37 | /** 38 | * Initial state 39 | */ 40 | int initial_id; 41 | 42 | /** 43 | * Default value of weight 44 | */ 45 | double default_weight; 46 | 47 | /** 48 | * Node list 49 | */ 50 | std::vector node_list; 51 | 52 | /** 53 | * Final node set mapped to its weight walues 54 | */ 55 | std::map finals; 56 | 57 | /** 58 | * Copy function 59 | * @param te the transducer to be copied 60 | */ 61 | void copy(TransExe const &te); 62 | 63 | /** 64 | * Destroy function 65 | */ 66 | void destroy(); 67 | 68 | public: 69 | 70 | /** 71 | * Constructor 72 | */ 73 | TransExe(); 74 | 75 | /** 76 | * Destructor 77 | */ 78 | ~TransExe(); 79 | 80 | /** 81 | * Copy constructor 82 | * @param te the transducer to be copied 83 | */ 84 | TransExe(TransExe const &te); 85 | 86 | /** 87 | * Assignment operator 88 | * @param te the transducer to be assigned 89 | * @return the assigned object 90 | */ 91 | TransExe & operator =(TransExe const &te); 92 | 93 | /** 94 | * Read method with an encoding base 95 | * @param input the stream 96 | * @param alphabet the alphabet object to decode the symbols 97 | */ 98 | void read(FILE *input, Alphabet const &alphabet); 99 | 100 | /** 101 | * Reduces all the final states to one 102 | */ 103 | void unifyFinals(); 104 | 105 | /** 106 | * Gets the initial node of the transducer 107 | * @return the initial node 108 | */ 109 | Node * getInitial(); 110 | 111 | /** 112 | * Gets the set of final nodes 113 | * @return the set of final nodes 114 | */ 115 | std::map & getFinals(); 116 | }; 117 | 118 | #endif 119 | 
-------------------------------------------------------------------------------- /lttoolbox/ustring.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #include "ustring.h" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace icu; 26 |

// NOTE(review): the cast and vector template arguments below were lost when
// this file was extracted; they are restored from the function signatures
// and from the int32_t element pushed in ustring_to_vec32 -- confirm
// against the upstream file.

/**
 * Write str to output without appending a newline.
 * @param str the UTF-16 text to write
 * @param output the ICU stream to write to
 */
void
write(UStringView str, UFILE* output)
{
  // u_fputs() inserts a newline
  // The '*' precision is read as an int, so size() (a size_t) must be
  // narrowed explicitly rather than passed through the varargs as-is.
  u_fprintf(output, "%.*S", static_cast<int>(str.size()), str.data());
}

/**
 * Convert a NUL-terminated UTF-8 C string to a UString.
 * @param s the UTF-8 bytes
 * @return the UTF-16 equivalent
 */
UString
to_ustring(const char* s)
{
  return to_ustring(reinterpret_cast<const uint8_t*>(s));
}

/**
 * Convert a NUL-terminated UTF-8 byte string to a UString.
 * @param s the UTF-8 bytes
 * @return the UTF-16 equivalent
 */
UString
to_ustring(const uint8_t* s)
{
  auto sz = strlen(reinterpret_cast<const char*>(s));
  UString ret;
  // the UTF-16 form never has more code units than the UTF-8 form has bytes
  ret.reserve(sz);
  utf8::utf8to16(s, s+sz, std::back_inserter(ret));
  return ret;
}

/**
 * Append the code points of a UTF-16 string to a vector of 32-bit symbols.
 * @param str the UTF-16 text to decode
 * @param vec the vector the code points are appended to
 */
void
ustring_to_vec32(UStringView str, std::vector<int32_t>& vec)
{
  if (str.empty()) {
    return;
  }

  size_t i = 0;
  size_t len = str.size();
  vec.reserve(vec.size() + len);
  int32_t c;
  while (i < len) {
    U16_NEXT(str, i, len, c);
    vec.push_back(c);
  }
}
-------------------------------------------------------------------------------- /lttoolbox/ustring.h:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Apertium 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 16 | */ 17 | 18 | #ifndef _LT_USTRING_H_ 19 | #define _LT_USTRING_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | typedef std::basic_string UString; 31 | typedef std::basic_string_view UStringView; 32 | 33 | void write(UStringView str, UFILE* output); 34 | 35 | UString to_ustring(const char* str); 36 | UString to_ustring(const uint8_t* str); 37 | 38 | // append UTF-16 string to UTF-32 vector of symbols 39 | void ustring_to_vec32(UStringView str, std::vector& vec); 40 | 41 | inline std::ostream& 42 | operator<<(std::ostream& ostr, char16_t c) 43 | { 44 | utf8::utf16to8(&c, &c+1, std::ostream_iterator(ostr)); 45 | return ostr; 46 | } 47 | 48 | inline std::ostream& 49 | operator<<(std::ostream& ostr, UStringView str) 50 | { 51 | utf8::utf16to8(str.begin(), str.end(), std::ostream_iterator(ostr)); 52 | return ostr; 53 | } 54 | 55 | inline UString operator "" _u(const char* str, std::size_t len) { 56 | UString us(len, 0); 57 | for (size_t i = 0; i < len; ++i) { 58 | us[i] = str[i]; 59 | } 60 | return us; 61 | } 62 | 63 | inline UString operator "" _u(const char16_t* str, std::size_t len) { 64 | UString us(len, 0); 65 | 
for (size_t i = 0; i < len; ++i) { 66 | us[i] = str[i]; 67 | } 68 | return us; 69 | } 70 | 71 | inline UStringView operator "" _uv(const char16_t* str, std::size_t len) { 72 | return UStringView(str, len); 73 | } 74 | 75 | inline UString US(UStringView usv) { 76 | return UString(usv); 77 | } 78 | 79 | inline void operator+=(UString& str, UChar32 c) 80 | { 81 | if (c <= 0xFFFF) { 82 | str += static_cast(c); 83 | } else { 84 | str += static_cast(0xD800 + ((c - 0x10000) >> 10)); 85 | str += static_cast(0xDC00 + (c & 0x3FF)); 86 | } 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /lttoolbox/win32/libgen.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "libgen.h" 4 | 5 | // https://www.opengroup.org/onlinepubs/007908775/xsh/basename.html 6 |

/*
 * Minimal basename() for Windows builds.
 *
 * Returns a pointer to the final component of path, where components are
 * separated by backslashes:
 *   - NULL or "" yields ".", as the specification linked above requires;
 *   - a path without any backslash is returned unchanged (this is probably
 *     non-conformant);
 *   - a path ending in a backslash yields a pointer to that backslash;
 *   - otherwise the result points just past the last backslash.
 * The result points either into path itself or at a static buffer, so the
 * caller must not free it. Forward slashes are not treated as separators.
 */
char* basename(char *path) {
  /* writable storage: the return type is a non-const char* */
  static char dot[] = ".";
  char* pos;

  if (path == NULL || *path == '\0') {
    return dot;
  }

  /* Find the last position of the backslash in the path name */
  pos = strrchr(path, '\\');
  if (pos == NULL) {
    return path;
  }

  /*
   * Compare the character after the backslash, not the pointer: the old
   * test `pos + 1 != NULL` was always true, so a trailing backslash was
   * never detected.
   */
  if (pos[1] != '\0') {
    return pos + 1;
  }
  return pos;
}
-------------------------------------------------------------------------------- /lttoolbox/win32/libgen.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBGEN_H 2 | #define LIBGEN_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | char *basename(char *); 9 | 10 | #ifdef __cplusplus 11 | } 12 | #endif 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /lttoolbox/win32/unistd.h: -------------------------------------------------------------------------------- 1 | // This should really be defined elsewhere 2 | #define YY_INPUT(buf,result,max_size) \ 3 | if ( (result = fread( (char *) buf, 1, max_size, yyin )) < 0 ) \ 4 | YY_FATAL_ERROR( "input in flex scanner failed" ); 5 | 6 | #define fileno _fileno 7 | 8 | #define isatty(x) 0 9 | 10 | #define unlink _unlink 11 | -------------------------------------------------------------------------------- /lttoolbox/xml_parse_util.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see .
16 | */ 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | xmlTextReaderPtr 24 | XMLParseUtil::open_or_exit(const char* fname) 25 | { 26 | xmlTextReaderPtr reader = xmlReaderForFile(fname, NULL, 0); 27 | if (reader == NULL) { 28 | std::cerr << "Error: cannot open '" << fname << "' for reading." << std::endl; 29 | exit(EXIT_FAILURE); 30 | } 31 | return reader; 32 | } 33 | 34 | UString 35 | XMLParseUtil::attrib(xmlTextReaderPtr reader, UStringView name, UStringView fallback) 36 | { 37 | std::string temp; 38 | temp.reserve(name.size()); 39 | utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); 40 | auto attrname = reinterpret_cast(temp.c_str()); 41 | auto myattr = xmlTextReaderGetAttribute(reader, attrname); 42 | if(myattr == NULL) { 43 | xmlFree(myattr); 44 | return US(fallback); 45 | } else { 46 | auto result = to_ustring(reinterpret_cast(myattr)); 47 | xmlFree(myattr); 48 | return result; 49 | } 50 | } 51 | 52 | std::string 53 | XMLParseUtil::attrib_str(xmlTextReaderPtr reader, UStringView name) 54 | { 55 | std::string temp; 56 | temp.reserve(name.size()); 57 | utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); 58 | auto attrname = reinterpret_cast(temp.c_str()); 59 | auto myattr = xmlTextReaderGetAttribute(reader, attrname); 60 | if(myattr == NULL) { 61 | xmlFree(myattr); 62 | return ""; 63 | } else { 64 | std::string result = reinterpret_cast(myattr); 65 | xmlFree(myattr); 66 | return result; 67 | } 68 | } 69 | 70 | UString 71 | XMLParseUtil::readName(xmlTextReaderPtr reader) 72 | { 73 | const xmlChar* name = xmlTextReaderConstName(reader); 74 | if (name == NULL) return ""_u; 75 | return to_ustring(reinterpret_cast(name)); 76 | } 77 | 78 | UString 79 | XMLParseUtil::readValue(xmlTextReaderPtr reader) 80 | { 81 | const xmlChar* val = xmlTextReaderConstValue(reader); 82 | if (val == NULL) return ""_u; 83 | return to_ustring(reinterpret_cast(val)); 84 | } 85 | 86 | void 87 | 
XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, std::vector& vec) 88 | { 89 | const xmlChar* val = xmlTextReaderConstValue(reader); 90 | if (val == NULL) return; 91 | auto sz = xmlStrlen(val); 92 | vec.reserve(vec.size() + sz); 93 | utf8::utf8to32(val, val+sz, std::back_inserter(vec)); 94 | } 95 | 96 | bool 97 | XMLParseUtil::allBlanks(xmlTextReaderPtr reader) 98 | { 99 | for (auto& c : readValue(reader)) { 100 | if (!u_isspace(c)) return false; 101 | } 102 | return true; 103 | } 104 | 105 | void 106 | XMLParseUtil::error_and_die(xmlTextReaderPtr reader, const char* fmt, ...) 107 | { 108 | UFILE* err_out = u_finit(stderr, NULL, NULL); 109 | u_fprintf(err_out, "Error (%d): ", xmlTextReaderGetParserLineNumber(reader)); 110 | va_list argptr; 111 | va_start(argptr, fmt); 112 | u_vfprintf(err_out, fmt, argptr); 113 | va_end(argptr); 114 | u_fputc('\n', err_out); 115 | exit(EXIT_FAILURE); 116 | } 117 | -------------------------------------------------------------------------------- /lttoolbox/xml_parse_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, see . 
16 | */ 17 | #ifndef _XMLPARSEUTIL_ 18 | #define _XMLPARSEUTIL_ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | 28 | class XMLParseUtil 29 | { 30 | public: 31 | 32 | static xmlTextReaderPtr open_or_exit(const char* fname); 33 | 34 | /* If attrib does not exist (or other error), returns fallback: */ 35 | static UString attrib(xmlTextReaderPtr reader, UStringView name, UStringView fallback=u""); 36 | 37 | static std::string attrib_str(xmlTextReaderPtr reader, UStringView name); 38 | 39 | static UString readName(xmlTextReaderPtr reader); 40 | static UString readValue(xmlTextReaderPtr reader); 41 | static void readValueInto32(xmlTextReaderPtr reader, std::vector& vec); 42 | 43 | static bool allBlanks(xmlTextReaderPtr reader); 44 | 45 | static void error_and_die(xmlTextReaderPtr reader, const char* fmt, ...); 46 | }; 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /lttoolbox/xml_walk_util.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | children::children(xmlNode* node_) 5 | : node(node_), cur(node->children) 6 | { 7 | while (cur && cur->type != XML_ELEMENT_NODE) { 8 | cur = cur->next; 9 | } 10 | } 11 | 12 | children::children(const children& it) 13 | : node(it.node), cur(it.cur) 14 | {} 15 | 16 | children::~children() 17 | {} // we don't own the pointers, so we don't delete them 18 | 19 | children& 20 | children::operator++() 21 | { 22 | if (node && cur) { 23 | cur = cur->next; 24 | while (cur && cur->type != XML_ELEMENT_NODE) { 25 | cur = cur->next; 26 | } 27 | } 28 | return *this; 29 | } 30 | 31 | children 32 | children::begin() 33 | { 34 | return children(node); 35 | } 36 | 37 | children 38 | children::end() 39 | { 40 | children ret(node); 41 | ret.cur = nullptr; 42 | return ret; 43 | } 44 | 45 | bool 46 | children::operator!=(const children& other) const 47 | { 48 | return node != other.node 
|| cur != other.cur; 49 | } 50 | 51 | bool 52 | children::operator==(const children& other) const 53 | { 54 | return node == other.node && cur == other.cur; 55 | } 56 | 57 | xmlNode* 58 | load_xml(const char* fname) 59 | { 60 | xmlDoc* doc = xmlReadFile(fname, NULL, 0); 61 | if (doc == nullptr) { 62 | std::cerr << "Error: Could not parse file '" << fname << "'." << std::endl; 63 | exit(EXIT_FAILURE); 64 | } 65 | return xmlDocGetRootElement(doc); 66 | } 67 | 68 | void 69 | error_and_die(xmlNode* node, const char* fmt, ...) 70 | { 71 | UFILE* err_out = u_finit(stderr, NULL, NULL); 72 | u_fprintf(err_out, "Error in %S on line %d: ", 73 | to_ustring((char*) node->doc->URL).c_str(), node->line); 74 | va_list argptr; 75 | va_start(argptr, fmt); 76 | u_vfprintf(err_out, fmt, argptr); 77 | va_end(argptr); 78 | u_fputc('\n', err_out); 79 | exit(EXIT_FAILURE); 80 | } 81 | 82 | UString 83 | getattr(xmlNode* node, const char* attr) 84 | { 85 | for (xmlAttr* i = node->properties; i != NULL; i = i->next) { 86 | if (!xmlStrcmp(i->name, (const xmlChar*) attr)) { 87 | return to_ustring((const char*) i->children->content); 88 | } 89 | } 90 | return ""_u; 91 | } 92 | 93 | UString 94 | getattr(xmlNode* node, UStringView attr, UStringView fallback) 95 | { 96 | for (xmlAttr* i = node->properties; i != NULL; i = i->next) { 97 | if (to_ustring((const char*) i->name) == attr) { 98 | return to_ustring((const char*) i->children->content); 99 | } 100 | } 101 | return US(fallback); 102 | } 103 | -------------------------------------------------------------------------------- /lttoolbox/xml_walk_util.h: -------------------------------------------------------------------------------- 1 | #ifndef _XML_WALK_UTIL_ 2 | #define _XML_WALK_UTIL_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | class children 10 | { 11 | private: 12 | xmlNode* node; 13 | xmlNode* cur; 14 | public: 15 | children(xmlNode* node); 16 | children(const children& it); 17 | ~children(); 18 | 19 | children& operator++(); 
20 | children begin(); 21 | children end(); 22 | inline xmlNode* operator*() const { return cur; } 23 | bool operator!=(const children& other) const; 24 | bool operator==(const children& other) const; 25 | }; 26 | 27 | xmlNode* load_xml(const char* fname); 28 | void error_and_die(xmlNode* node, const char* fmt, ...); 29 | 30 | UString getattr(xmlNode* node, const char* attr); 31 | UString getattr(xmlNode* node, UStringView attr, UStringView fallback = u""); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /lttoolbox/xsd/acx.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(SWIG 3.0 REQUIRED) 2 | find_package(Python 3.8 REQUIRED) 3 | set(PYTHON_EXECUTABLE ${Python_EXECUTABLE}) 4 | 5 | get_directory_property(_defs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS) 6 | string(REPLACE ";" " -D" defs "-D${_defs}") 7 | 8 | set(PYTHON_FILE "lttoolbox.py") 9 | set(CPP_WRAP_FILE "lttoolbox_wrap.cpp") 10 | set(top_srcdir ${CMAKE_SOURCE_DIR}) 11 | set(BUILD_LIBDIR $) 12 | set(CXXFLAGS "${CMAKE_CXX_FLAGS} ${defs}") 13 | set(PACKAGE ${PROJECT_NAME}) 14 | set(PACKAGE_NAME ${PROJECT_NAME}) 15 | set(PACKAGE_VERSION ${PROJECT_VERSION}) 16 | set(LIBXML_CFLAGS ${LIBXML2_INCLUDE_DIR}) 17 | set(ICU_CFLAGS ${ICU_INCLUDE_DIR}) 18 | 19 | configure_file(lttoolbox.i.in lttoolbox.i @ONLY) 20 | configure_file(setup.py.in setup.py @ONLY) 21 | file(GENERATE OUTPUT setup.py INPUT ${CMAKE_CURRENT_BINARY_DIR}/setup.py) # In CMake 3.19, add: TARGET lttoolbox 22 | 23 | add_custom_command(OUTPUT ${CPP_WRAP_FILE} ${PYTHON_FILE} 24 | 
COMMAND ${PYTHON_EXECUTABLE} setup.py build 25 | COMMENT "Building ${PYTHON_FILE}" 26 | DEPENDS lttoolbox 27 | ) 28 | 29 | add_custom_target(wrapper ALL 30 | DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} 31 | VERBATIM 32 | ) 33 | 34 | if(NOT PYTHON_INSTALL_PARAMS) 35 | set(PYTHON_INSTALL_PARAMS "--prefix=${CMAKE_INSTALL_PREFIX} --root=\$ENV{DESTDIR}/") 36 | endif() 37 | 38 | set(INSTALL_WRAPPER "${PYTHON_EXECUTABLE} setup.py install ${PYTHON_INSTALL_PARAMS}") 39 | install(CODE "execute_process(COMMAND ${INSTALL_WRAPPER} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})") 40 | -------------------------------------------------------------------------------- /python/lttoolbox.i.in: -------------------------------------------------------------------------------- 1 | %module lttoolbox 2 | 3 | %include 4 | %include 5 | 6 | 7 | %typemap(in) (int argc, char **argv) { 8 | if (PyTuple_Check($input)) { 9 | int i = 0; 10 | $1 = PyTuple_Size($input); 11 | $2 = (char **) malloc(($1 + 1)*sizeof(char *)); 12 | for (i = 0; i < $1; i++) { 13 | PyObject *py_obj = PyTuple_GetItem($input, i); 14 | if (PyUnicode_Check(py_obj)) { 15 | $2[i] = strdup(PyUnicode_AsUTF8(py_obj)); 16 | } 17 | else { 18 | PyErr_SetString(PyExc_TypeError, "tuple must contain strings"); 19 | free($2); 20 | return NULL; 21 | } 22 | } 23 | $2[i] = 0; 24 | } else { 25 | PyErr_SetString(PyExc_TypeError, "not a tuple"); 26 | return NULL; 27 | } 28 | } 29 | 30 | %typemap(freearg) (int argc, char **argv) { 31 | free((char *) $2); 32 | } 33 | 34 | %inline%{ 35 | #define SWIG_FILE_WITH_INIT 36 | #include 37 | #include 38 | #include 39 | 40 | #include 41 | 42 | #include 43 | 44 | class FST: public FSTProcessor 45 | { 46 | public: 47 | /** 48 | * Imitates functionality of lt-proc using file path 49 | */ 50 | FST(char *dictionary_path) 51 | { 52 | FILE *dictionary = fopen(dictionary_path, "rb"); 53 | load(dictionary); 54 | fclose(dictionary); 55 | } 56 | 57 | void lt_proc(int argc, char **argv, char *input_path, char *output_path) 
58 | { 59 | InputFile input; 60 | input.open(input_path); 61 | UFILE* output = u_fopen(output_path, "w", NULL, NULL); 62 | int cmd = 0; 63 | int c = 0; 64 | optind = 1; 65 | GenerationMode bilmode = gm_unknown; 66 | while (true) 67 | { 68 | c = getopt(argc, argv, "abcdeglmnoptwxzCIW"); 69 | if (c == -1) 70 | { 71 | break; 72 | } 73 | switch(c) 74 | { 75 | case 'a': 76 | case 'b': 77 | case 'e': 78 | case 'g': 79 | case 'p': 80 | case 't': 81 | case 'x': 82 | if(cmd == 0) { 83 | cmd = c; 84 | } else if (cmd == 'g' && c == 'b') { 85 | cmd = c; 86 | } 87 | break; 88 | case 'c': 89 | setCaseSensitiveMode(true); 90 | break; 91 | case 'd': 92 | if (cmd == 0) cmd = 'g'; 93 | bilmode = gm_all; 94 | break; 95 | case 'l': 96 | if (cmd == 0) cmd = 'g'; 97 | bilmode = gm_tagged; 98 | break; 99 | case 'm': 100 | if (cmd == 0) cmd = 'g'; 101 | bilmode = gm_tagged_nm; 102 | break; 103 | case 'n': 104 | if (cmd == 0) cmd = 'g'; 105 | bilmode = gm_clean; 106 | break; 107 | case 'o': 108 | if (cmd == 0) cmd = 'b'; 109 | setBiltransSurfaceForms(true); 110 | break; 111 | case 'w': 112 | setDictionaryCaseMode(true); 113 | break; 114 | case 'z': 115 | setNullFlush(true); 116 | break; 117 | case 'C': 118 | if (cmd == 0) cmd = 'g'; 119 | bilmode = gm_carefulcase; 120 | break; 121 | case 'I': 122 | setUseDefaultIgnoredChars(false); 123 | break; 124 | case 'W': 125 | setDisplayWeightsMode(true); 126 | break; 127 | default: 128 | break; 129 | } 130 | } 131 | 132 | switch(cmd) 133 | { 134 | case 'b': 135 | initBiltrans(); 136 | bilingual(input, output, bilmode); 137 | break; 138 | 139 | case 'e': 140 | initDecomposition(); 141 | analysis(input, output); 142 | break; 143 | 144 | case 'g': 145 | initGeneration(); 146 | generation(input, output, bilmode); 147 | break; 148 | 149 | case 'p': 150 | initPostgeneration(); 151 | postgeneration(input, output); 152 | break; 153 | 154 | case 't': 155 | initPostgeneration(); 156 | transliteration(input, output); 157 | break; 158 | 159 | case 'x': 160 | 
initPostgeneration(); 161 | intergeneration(input, output); 162 | break; 163 | 164 | case 'a': 165 | default: 166 | initAnalysis(); 167 | analysis(input, output); 168 | break; 169 | } 170 | 171 | u_fclose(output); 172 | } 173 | }; 174 | 175 | %} 176 | -------------------------------------------------------------------------------- /python/setup.py.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Setup for SWIG Python bindings for lttoolbox 5 | """ 6 | from distutils.core import Extension, setup 7 | from sys import platform 8 | 9 | compile_args = '@CXXFLAGS@'.split() + '@CPPFLAGS@'.split() + '@ICU_CFLAGS@'.split() 10 | link_args = [] 11 | if platform == 'darwin': 12 | compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] 13 | link_args.append('-mmacosx-version-min=10.7') 14 | 15 | lttoolbox_module = Extension( 16 | name='_lttoolbox', 17 | language='c++', 18 | sources=['lttoolbox.i'], 19 | swig_opts = ["-c++", '-I..', "-I@top_srcdir@", "-Wall"], 20 | include_dirs=['@top_srcdir@', '@top_srcdir@/lttoolbox'] + '@LIBXML_CFLAGS@'.replace('-I', '').split() + '@ICU_CFLAGS@'.replace('-I', '').split(), 21 | library_dirs=['@BUILD_LIBDIR@'], 22 | libraries=['lttoolbox', 'xml2', 'icuio', 'icui18n', 'icuuc', 'icudata'], 23 | extra_compile_args=compile_args, 24 | extra_link_args=link_args, 25 | ) 26 | 27 | setup( 28 | name='@PACKAGE@', 29 | version='@PACKAGE_VERSION@', 30 | description='SWIG interface to @PACKAGE_NAME@', 31 | long_description="SWIG interface to @PACKAGE_NAME@ for use in apertium-python", 32 | # TODO: author, maintainer, url 33 | author_email='@PACKAGE_BUGREPORT@', 34 | license='GPL-3.0+', 35 | maintainer_email='@PACKAGE_BUGREPORT@', 36 | ext_modules=[lttoolbox_module], 37 | py_modules=['lttoolbox'], 38 | data_files=[], 39 | ) 40 | -------------------------------------------------------------------------------- /tests/README: 
-------------------------------------------------------------------------------- 1 | Tests require python3, run like 2 | 3 | python3 tests/run_tests.py 4 | 5 | You may have to do "(sudo) make install" once before running the tests. 6 | 7 | They should all pass. 8 | -------------------------------------------------------------------------------- /tests/data/a2b.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

ab

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/alphabet.att: -------------------------------------------------------------------------------- 1 | 0 1 a a 2 | 1 2 b b 3 | 2 3 c c 4 | 0 3 . . 5 | 4 3 6 | 0 4 c c 7 | 4 3 @_SPACE_@ @_SPACE_@ 8 | 0 3 ? ? 9 | 3 10 | -------------------------------------------------------------------------------- /tests/data/alphabetic-after-group-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |

13 |

TAIL

14 |
15 |
16 | 17 |
18 |

a te

19 |

include meh

20 |
21 |
22 | 23 | -------------------------------------------------------------------------------- /tests/data/alphabetic-after-group-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÖØÙÚÛÜàáâäåæçèéêëìíîïñòóôöøùúûüŠš 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

as a

15 |

include include

16 |
17 | 18 |
19 | -------------------------------------------------------------------------------- /tests/data/apostrophe.att: -------------------------------------------------------------------------------- 1 | 0 1 a a 2 | 0 1 á á 3 | 0 1 ã ã 4 | 0 1 b b 5 | 0 1 c c 6 | 0 1 d d 7 | 0 1 e e 8 | 0 1 é é 9 | 0 1 ê ê 10 | 0 1 ë ë 11 | 0 1 ẽ ẽ 12 | 0 1 f f 13 | 0 1 g g 14 | 0 1 h h 15 | 0 1 i i 16 | 0 1 í í 17 | 0 1 ï ï 18 | 0 1 ĩ ĩ 19 | 0 1 j j 20 | 0 1 k k 21 | 0 1 l l 22 | 0 1 m m 23 | 0 1 n n 24 | 0 1 ñ ñ 25 | 0 1 o o 26 | 0 1 ó ó 27 | 0 1 ô ô 28 | 0 1 õ õ 29 | 0 1 p p 30 | 0 1 q q 31 | 0 1 r r 32 | 0 1 s s 33 | 0 1 t t 34 | 0 1 u u 35 | 0 1 ú ú 36 | 0 1 ü ü 37 | 0 1 ũ ũ 38 | 0 1 v v 39 | 0 1 x x 40 | 0 1 y y 41 | 0 1 ý ý 42 | 0 1 ỹ ỹ 43 | 0 1 z z 44 | 1 2 ' ʼ 45 | 1 2 ’ ʼ 46 | 2 3 a a 47 | 2 3 á á 48 | 2 3 ã ã 49 | 2 3 b b 50 | 2 3 c c 51 | 2 3 d d 52 | 2 3 e e 53 | 2 3 é é 54 | 2 3 ê ê 55 | 2 3 ë ë 56 | 2 3 ẽ ẽ 57 | 2 3 f f 58 | 2 3 g g 59 | 2 3 h h 60 | 2 3 i i 61 | 2 3 í í 62 | 2 3 ï ï 63 | 2 3 ĩ ĩ 64 | 2 3 j j 65 | 2 3 k k 66 | 2 3 l l 67 | 2 3 m m 68 | 2 3 n n 69 | 2 3 ñ ñ 70 | 2 3 o o 71 | 2 3 ó ó 72 | 2 3 ô ô 73 | 2 3 õ õ 74 | 2 3 p p 75 | 2 3 q q 76 | 2 3 r r 77 | 2 3 s s 78 | 2 3 t t 79 | 2 3 u u 80 | 2 3 ú ú 81 | 2 3 ü ü 82 | 2 3 ũ ũ 83 | 2 3 v v 84 | 2 3 x x 85 | 2 3 y y 86 | 2 3 ý ý 87 | 2 3 ỹ ỹ 88 | 2 3 z z 89 | 3 -------------------------------------------------------------------------------- /tests/data/append1.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | a 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

aa

12 |
13 | 14 |
15 | -------------------------------------------------------------------------------- /tests/data/append2.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | b 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

bb

12 |
13 | 14 |
15 | -------------------------------------------------------------------------------- /tests/data/arabic-punct.att: -------------------------------------------------------------------------------- 1 | 0 1 ، ، 0.000 2 | 0 1 ؛ ؛ 0.000 3 | 0 1 ؟ ؟ 0.000 4 | 0 2 a a 0.000 5 | 0 2 b b 0.000 6 | 1 3 @0@ 0.000 7 | 2 4 @0@ 0.000 8 | 3 0.000 9 | 4 0.000 10 | -------------------------------------------------------------------------------- /tests/data/baregroup-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | fri 16 | fråkvarandre 17 |
18 | 19 |
20 | -------------------------------------------------------------------------------- /tests/data/basic.acx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/data/basic.lsx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

12 | prpers 13 | 14 |

15 |
16 |
17 | -------------------------------------------------------------------------------- /tests/data/bidix-epsilons-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |

14 |
15 |
16 | 17 |
18 |

aaaa

19 |

aaaa

20 |

baba

21 |
22 |
23 | -------------------------------------------------------------------------------- /tests/data/bidix-epsilons-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 | 8 | 9 | 10 |

b b

11 |
12 | 13 |

a a

14 |
15 |
16 | 17 |
18 | a 19 | b 20 |
21 |
22 | -------------------------------------------------------------------------------- /tests/data/bidixpardef-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |

10 |
11 |
12 |
13 |

abc

14 |
15 |
16 | -------------------------------------------------------------------------------- /tests/data/bidixpardef-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

c c

12 |
13 |
14 | -------------------------------------------------------------------------------- /tests/data/big-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |

hjerterytmeovervåkningenhjerterytmeovervåkning

17 |

hjerteklaffhjerteklaff

18 |

overvåkningenovervåkning

19 | [A-ZÆØÅ]+[a-zæøåA-ZÆØÅ]+!

20 | 21 |

vasvass

22 |

sengaseng

23 |
24 | 25 | 26 |
27 | -------------------------------------------------------------------------------- /tests/data/biproc-skips-tags-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

vihkivihki

12 |

vihkivihki

13 |
14 |
15 | -------------------------------------------------------------------------------- /tests/data/cat-epsilon-loop.att: -------------------------------------------------------------------------------- 1 | 0 1 @0@ @0@ 2 | 1 2 c c 3 | 2 3 a a 4 | 3 4 t t 5 | 1 1 @0@ @0@ 6 | 4 7 | -------------------------------------------------------------------------------- /tests/data/cat-epsilon-to-final.att: -------------------------------------------------------------------------------- 1 | 0 1 @0@ @0@ 2 | 1 2 c c 3 | 2 3 a a 4 | 3 4 t t 5 | 1 4 @0@ @0@ 6 | 4 7 | -------------------------------------------------------------------------------- /tests/data/cat-multiple-fst.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 2 | 1 2 a a 3 | 2 3 t t 4 | 3 4 @0@ + 5 | 4 5 @0@ n 6 | 5 7 | 4 5 @0@ v 8 | -- 9 | 0 1 c c 10 | 1 2 a a 11 | 2 3 t t 12 | 3 4 @0@ + 13 | 4 5 @0@ n 14 | 5 6 @0@ + 15 | 6 7 s 16 | 7 -------------------------------------------------------------------------------- /tests/data/cat-weight-final.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 0 2 | 1 2 a a 0 3 | 2 3 t t 0 4 | 3 2.5 5 | -------------------------------------------------------------------------------- /tests/data/cat-weight-heavy.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 2 | 1 2 a a 3 | 2 3 t t 4 | 3 65536 5 | -------------------------------------------------------------------------------- /tests/data/cat-weight-initial.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 2.5 2 | 1 2 a a 0 3 | 2 3 t t 0 4 | 3 0 5 | -------------------------------------------------------------------------------- /tests/data/cat-weight-middle.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 2 | 1 2 a a 2.5 3 | 2 3 t t 4 | 3 5 | 
-------------------------------------------------------------------------------- /tests/data/cat-weight-negative.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 4.567895 2 | 1 2 a a 0.989532 3 | 2 3 t t 2.796193 4 | 3 4 @0@ + -0.824564 5 | 4 5 @0@ n 1.824564 6 | 5 -0.525487 7 | 4 5 @0@ v 2.856296 8 | -------------------------------------------------------------------------------- /tests/data/cat-weight.att: -------------------------------------------------------------------------------- 1 | 0 1 c c 4.567895 2 | 1 2 a a 0.989532 3 | 2 3 t t 2.796193 4 | 3 4 @0@ + 0.824564 5 | 4 5 @0@ n 1.824564 6 | 5 0.525487 7 | 4 5 @0@ v 2.856296 8 | -------------------------------------------------------------------------------- /tests/data/cmp-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |

aa

16 |

bb

17 |

cd

18 |

dd

19 |
20 | 21 |
22 | -------------------------------------------------------------------------------- /tests/data/cmp-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |

aa

16 |

bb

17 | 18 |

cd

19 |

dd

20 | 21 |

aa

22 |

bb

23 | 24 |

cd

25 |

dd

26 |
27 | 28 |
29 | -------------------------------------------------------------------------------- /tests/data/compose1.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐýÝñÑüÜíÍıİËë-0123456789̇ 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |

¤

12 |
13 |
14 | 15 |
16 | 17 | 18 | 19 |

opp¤ opp

20 | 21 | 22 |

app app

23 | 24 | py

25 | 26 | 27 | upp

28 | 29 | upp

30 | 31 | tupp

32 | 33 |
34 |
35 | -------------------------------------------------------------------------------- /tests/data/diverging-paths-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 |

x x

11 |

ya ya

12 |
13 | 14 |
15 | -------------------------------------------------------------------------------- /tests/data/diverging-paths-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 | 8 |
9 |

xaxa

10 |

yaya

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/double-clitics-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 |

adA

12 |

bB

13 |

cC

14 |

xX

15 |
16 |
17 | -------------------------------------------------------------------------------- /tests/data/double-clitics-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |

-b-c bc

15 |

16 |
17 |
18 | 19 |
20 | a

dd

21 | x 22 | y 23 |
24 |
25 | -------------------------------------------------------------------------------- /tests/data/empty-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
10 |

bb

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/empty-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

aa

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/entirely-empty.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |
11 |
12 | -------------------------------------------------------------------------------- /tests/data/entry-weights.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |

27 |

a

28 |

ej

29 |

a

30 |

om

31 |

je

32 |

o

33 | 34 |

aj

35 |

ow

36 |

omaj

37 |

ow

38 |

omaj

39 |

omaj

40 |

aj

41 | 42 |

ojo

43 |

ow

44 |

am

45 |

ow

46 |

ami

47 |

ach

48 |

ojo

49 |
50 |
51 | 52 |
53 | nan 54 |
55 | 56 |
57 | -------------------------------------------------------------------------------- /tests/data/expand-re.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 |

abcab

18 |

abab

19 |

yy

20 |

nn

21 | xyz:abc[qxj]+

22 |
23 | 24 |
25 | -------------------------------------------------------------------------------- /tests/data/final-epsilons-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |

ea x

10 |
11 |
12 | -------------------------------------------------------------------------------- /tests/data/final-epsilons-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠ 4 | 5 | 6 | 7 | 8 |
9 |

e e

10 |

e e

aa

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/gardenpath-mwe.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |

legge legge

17 |

leggeopptil leggeopptil

18 |

opp opp

19 | 20 |

leggesegopptil leggesegopptil

21 |

seg seg

22 | 23 |

St.Petersburg St.Petersburg

24 |

Xy Xy

25 |

F F

26 |

G G

27 |
28 | 29 |
30 |

. .

31 |
32 | 33 |
34 | -------------------------------------------------------------------------------- /tests/data/group-after-join-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |

hasGaa

10 |

notG a

11 |

jya

12 |
13 |
14 | -------------------------------------------------------------------------------- /tests/data/group-after-join-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ·ÀÁÂÄÇÈÉÊËÌÍÎÏÑÒÓÔÖÙÚÛÜàáâäçèéêëìíîïñòóôöùúûüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzœ 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |

-jy jy

12 |

13 |
14 |
15 |
16 | hasG

aa

17 | hasG 18 | notG

aa

19 | notG 20 |
21 |
22 | -------------------------------------------------------------------------------- /tests/data/group-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 |

abc feh

14 |

pq feh

15 |

pqr feh

16 | 17 |

jl feh

18 |

k feh

19 | 20 |

defverylongword

21 | 22 | 23 |

pqn feh

24 |

xy feh

25 |
26 | 27 |
28 | -------------------------------------------------------------------------------- /tests/data/group-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 |

abc abc

17 |

pq pq

18 |

pqr pqr

19 | 20 |

jkl jkl

21 | 22 |

defdef

23 | 24 | 25 |

pqs pqs

26 |

xyz xyz

27 | 28 |

jkm jkm

29 |

jnl jnl

30 |
31 | 32 |
33 | -------------------------------------------------------------------------------- /tests/data/intergen.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 |

dónadona

9 |
10 |
11 | -------------------------------------------------------------------------------- /tests/data/left-unbalanced-epsilons-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 |

aba

9 |
10 |
11 | -------------------------------------------------------------------------------- /tests/data/left-unbalanced-epsilons-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 |
8 |

aa

9 |
10 |
11 | -------------------------------------------------------------------------------- /tests/data/lemma-entry-weights.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | abcdefghijklmnopqrstuvwxyz 4 | 5 | 6 | 7 | 8 |
9 |

walkwalk

10 |

walkwalk

11 |
12 |
-------------------------------------------------------------------------------- /tests/data/lhs-empty-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

a

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/lhs-ws-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

aa

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/longleft-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |

herdeherde

13 |
14 | 15 |
16 | -------------------------------------------------------------------------------- /tests/data/longleft-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |

herdende herde

13 |
14 | 15 |
16 | -------------------------------------------------------------------------------- /tests/data/merging-paths-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |

einen

13 |

einen

14 |
15 |
16 | -------------------------------------------------------------------------------- /tests/data/merging-paths-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |

n n

14 |

i n

15 |
16 |
17 |
18 | e 19 |
20 |
21 | -------------------------------------------------------------------------------- /tests/data/minimal-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

abxy

15 |

yz

16 |
17 | 18 |
19 |

jj

20 |

gg

21 |
22 | 23 |
24 | -------------------------------------------------------------------------------- /tests/data/minimal-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 |

abcab

18 |

abab

19 |

yy

20 |

nn

21 |
22 | 23 |
24 |

jgjg

25 |

jhjh

26 |

kgkg

27 |
28 | 29 |
30 | -------------------------------------------------------------------------------- /tests/data/morpheme-boundaries.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |

ff

9 |

vesf

10 |
11 | 12 |

13 |

es

14 |
15 | 16 |

17 |

s

18 |
19 | 20 |

isis

21 |

esis

22 |
23 |
24 |
25 | cat 26 | wol 27 | church 28 | bat 29 | rat 30 | ax 31 |
32 |
33 | -------------------------------------------------------------------------------- /tests/data/multichar.att: -------------------------------------------------------------------------------- 1 | 0 1 א אַן 2 | 1 2 @0@ 3 | 2 4 | -------------------------------------------------------------------------------- /tests/data/non-bmp.att: -------------------------------------------------------------------------------- 1 | 0 1 𐅀 𐅀 0.000 2 | 0 1 𐅁 𐅁 0.000 3 | 0 1 𐅂 𐅂 0.000 4 | 0 1 𐅃 𐅃 0.000 5 | 0 1 𐅄 𐅄 0.000 6 | 0 1 𐅅 𐅅 0.000 7 | 0 1 𐅆 𐅆 0.000 8 | 0 1 𐅇 𐅇 0.000 9 | 0 1 𐅈 𐅈 0.000 10 | 0 1 𐅉 𐅉 0.000 11 | 0 1 𐅊 𐅊 0.000 12 | 0 1 𐅋 𐅋 0.000 13 | 0 1 𐅌 𐅌 0.000 14 | 0 1 𐅍 𐅍 0.000 15 | 0 1 𐅎 𐅎 0.000 16 | 0 1 𐅏 𐅏 0.000 17 | 1 1 𐅀 𐅀 0.000 18 | 1 1 𐅁 𐅁 0.000 19 | 1 1 𐅂 𐅂 0.000 20 | 1 1 𐅃 𐅃 0.000 21 | 1 1 𐅄 𐅄 0.000 22 | 1 1 𐅅 𐅅 0.000 23 | 1 1 𐅆 𐅆 0.000 24 | 1 1 𐅇 𐅇 0.000 25 | 1 1 𐅈 𐅈 0.000 26 | 1 1 𐅉 𐅉 0.000 27 | 1 1 𐅊 𐅊 0.000 28 | 1 1 𐅋 𐅋 0.000 29 | 1 1 𐅌 𐅌 0.000 30 | 1 1 𐅍 𐅍 0.000 31 | 1 1 𐅎 𐅎 0.000 32 | 1 1 𐅏 𐅏 0.000 33 | 1 2 @0@ 0.000 34 | 2 0.000 35 | -------------------------------------------------------------------------------- /tests/data/non-bmp.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 𐅀𐅁𐅂𐅃𐅄𐅅𐅆𐅇𐅈𐅉𐅊𐅋𐅌𐅍𐅎𐅏 4 | 5 | 6 | 7 | 8 | 9 |

10 |
11 |
12 |
13 | [𐅀𐅁𐅂𐅃𐅄𐅅𐅆𐅇𐅈𐅉𐅊𐅋𐅌𐅍𐅎𐅏]+ 14 |
15 |
16 | -------------------------------------------------------------------------------- /tests/data/numbers.tmx: -------------------------------------------------------------------------------- 1 | 2 | 3 |
12 | 13 | 14 | 15 | kake 1 16 | 17 | 18 | kake 1 19 | 20 | 21 | 22 | 23 | kaffe 1 og 2 24 | 25 | 26 | kaffi 1 og 2 27 | 28 | 29 | 30 | 31 | ost 99 eller 78 kjeks 32 | 33 | 34 | ost 99 eller 78 kjeks 35 | 36 | 37 | 38 | 39 | 3 på halv 40 | 41 | 42 | 3 på halv 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /tests/data/oci-pgen.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | [aeiouàáéèíóòúü] 7 | 8 | 9 | 10 | 11 |

12 | detlo 13 | del 14 |

15 | 16 | 17 |

18 | detlo 19 | del' 20 |

21 | 22 | 23 | 24 |

25 | detla 26 | del' 27 |

28 | 29 | 30 | 31 |

32 | detla 33 | dela 34 |

35 | 36 | 37 |

38 | detlos 39 | dels 40 |

41 | 42 | 43 |

44 | detlas 45 | delas 46 |

47 | 48 | 49 |

50 | lo 51 | del' 52 |

53 | 54 | 55 | 56 |

57 | la 58 | del' 59 |

60 | 61 | 62 | 63 | 64 |

65 | detla 66 | del' 67 |

68 | 69 | 70 | 71 | 72 | 73 | 74 |
75 | 76 |

77 | de 78 | 79 |

80 | 81 | 82 | 83 | 84 |

85 | ade 86 | a 87 |

88 | 89 | 90 | 91 | 92 |

93 | adetlo 94 | al 95 |

96 | 97 | 98 |

99 | adetlos 100 | als 101 |

102 | 103 | 104 |

105 | adetlo 106 | al' 107 |

108 | 109 | 110 | 111 |

112 | detla 113 | l' 114 |

115 | 116 | 117 | 118 |

119 | detlo 120 | l' 121 |

122 | 123 | 124 | 125 |

126 | lo 127 | l' 128 |

129 | 130 | 131 | 132 |

133 | la 134 | l' 135 |

136 | 137 | 138 | 139 |

140 | detla 141 | la 142 |

143 | 144 | 145 |

146 | detlo 147 | lo 148 |

149 | 150 | 151 |

152 | detlas 153 | las 154 |

155 | 156 | 157 |

158 | detlos 159 | los 160 |

161 | 162 | 163 |
164 | 165 | 166 | -------------------------------------------------------------------------------- /tests/data/pass-through.lsx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | foo 11 | 12 | 13 | 14 |
15 | 16 | 17 | 18 | 19 |
20 |
21 | -------------------------------------------------------------------------------- /tests/data/plus-lemma-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 |

I+Dqqq

15 |

abxy

16 |

yz

17 |
18 | 19 |
20 |

jj

21 |

gg

22 |
23 | 24 |
25 | -------------------------------------------------------------------------------- /tests/data/plus-lemma-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |

I+DI+D

21 |

abcab

22 |

abab

23 |

yy

24 |

nn

25 |

jgjg

26 |
27 | 28 |
29 | -------------------------------------------------------------------------------- /tests/data/postgen-overlap.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | " 9 | 10 | 11 | 12 |
13 | 14 | 15 |

16 | abc 17 | xbc 18 |

19 |
20 | 21 | 22 |

23 | bc 24 | yc 25 |

26 |
27 | 28 | 29 |

30 | c 31 | z 32 |

33 |
34 | 35 |
36 | 37 |
38 | -------------------------------------------------------------------------------- /tests/data/postgen-short.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |

12 | la 13 | sela 14 |

15 |
16 |
17 | 18 | 19 |
20 | 21 |
22 | 23 | 24 |

25 | ea 26 | a 27 |

28 | 29 | 30 |
31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/data/postgen.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |

14 | el 15 | l 16 |

17 |
18 | 19 |

20 | el 21 | l 22 |

23 |
24 |
25 | 26 | 27 | 28 |

29 | la 30 | sela 31 |

32 |
33 | 34 |

35 | las 36 | selas 37 |

38 |
39 | 40 |

41 | lo 42 | selo 43 |

44 |
45 | 46 |

47 | los 48 | selos 49 |

50 |
51 |
52 | 53 | 54 |
55 | 56 |
57 | 58 | 59 |

60 | de 61 | de 62 |

63 | 64 | 65 | 66 | 67 |

68 | oho 69 | uho 70 |

71 | 72 | 73 | 74 |

75 | le 76 | 77 |

78 | 79 | 80 | 81 | 82 |

83 | les 84 | lepetest 85 |

86 | 87 | 88 | 89 |

90 | lespes 91 | lespestest 92 |

93 | 94 | 95 |

sssss

96 | 97 |
98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /tests/data/pp2p.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

pp¤p

11 |

pp¤pp

12 |
13 |
14 | -------------------------------------------------------------------------------- /tests/data/rhs-empty-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

a

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/rhs-ws-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

aa

11 |
12 |
13 | -------------------------------------------------------------------------------- /tests/data/sectiondupes.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

aa

11 |
12 |
13 |

aa

14 |
15 |
16 | -------------------------------------------------------------------------------- /tests/data/sections.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |

X X

13 |
14 | 15 |
16 |

. .

17 |
18 | 19 |
20 | -------------------------------------------------------------------------------- /tests/data/simple.tmx: -------------------------------------------------------------------------------- 1 | 2 | 3 |
12 | 13 | 14 | 15 | Ikke så merkelig 16 | 17 | 18 | Ikkje så merkeleg 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/data/slash-tags.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 | 9 |

10 | *lobwana1.1 11 | *lopwana1.1 12 |

13 |
14 |
15 |
-------------------------------------------------------------------------------- /tests/data/space-eof-incond.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |

foobar

11 |
12 |
13 | 14 |
15 | .y

16 | .xx

17 | .

18 |
19 | 20 |
21 | -------------------------------------------------------------------------------- /tests/data/spcmp.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 |

a1- a1

19 |

b b

20 |

c c

21 | 22 | 23 |

a1-ca1-c

24 |

a1-da1-d

25 | 26 |

w w

27 |

wy wy

28 |

wyx wyx

29 | 30 |

a a

31 | 32 | 33 |

pf pf

34 |

p p

35 | 36 | 37 | vese

38 | ve

39 | set

40 | u

41 | 42 |
43 |
44 |

1 1

45 |

- -

46 |
47 | 48 |
49 | -------------------------------------------------------------------------------- /tests/data/unbalanced-epsilons-bi.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 |

abcdefghijklmnopqrstuvwxyzre

14 |

15 |
16 |
17 |
18 | -------------------------------------------------------------------------------- /tests/data/unbalanced-epsilons-mono.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÆØÅabcdefghijklmnopqrstuvwxyzæøåcqwxzCQWXZéèêóòâôÉÊÈÓÔÒÂáàÁÀäÄöÖšŠčČðđÐ 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | r

e e

15 | r

er e

16 | r

es e

17 | r

et e

18 |
19 |
20 | -------------------------------------------------------------------------------- /tests/data/underscore.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 |

_n_n

13 |

||

14 |
15 | 16 | 17 | 18 |
19 | -------------------------------------------------------------------------------- /tests/data/upp2up.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 |
10 |

upp¤up

11 |

upp¤upp

12 |

pypy

13 |
14 |
15 | -------------------------------------------------------------------------------- /tests/data/variants.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
17 |

abcab

18 |

abab

19 |

ababbb

20 |

yy

21 |

nn

22 |
23 | 24 |
25 |

jgjg

26 |

jhjh

27 |

kgkg

28 |
29 | 30 |
31 | -------------------------------------------------------------------------------- /tests/data/walk-weight.att: -------------------------------------------------------------------------------- 1 | 0 1 w w 2.000000 2 | 1 2 a a 0.000000 3 | 2 3 l l 0.000000 4 | 3 4 k k 0.000000 5 | 4 5 @0@ 0.500000 6 | 4 6 s 1.000000 7 | 4 7 @0@ 0.000000 8 | 4 8 s 0.000000 9 | 5 9 @0@ 0.000000 10 | 6 9 @0@ 0.000000 11 | 7 9 @0@ 0.000000 12 | 7 9 @0@ 0.000000 13 | 8 10 @0@ 0.000000 14 | 9 0.000000 15 | 10 5 @0@ 0.000000 16 | -------------------------------------------------------------------------------- /tests/data/wordbound-blank.dix: -------------------------------------------------------------------------------- 1 | 2 | 3 | ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |

legge legge

17 |

leggeopptil leggeopptil

18 |

opp opp

19 | 20 |

leggesegopptil leggesegopptil

21 |

seg seg

22 | 23 |

St.Petersburg St.Petersburg

24 |

Xy Xy

25 |

F F

26 |

G G

27 |
28 | 29 |
30 |

. .

31 |
32 | 33 |
34 | -------------------------------------------------------------------------------- /tests/lt_append/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from basictest import ProcTest 3 | import unittest 4 | 5 | class AppendProcTest(unittest.TestCase, ProcTest): 6 | dix1 = "data/append1.dix" 7 | dix2 = "data/append2.dix" 8 | dir1 = "lr" 9 | dir2 = "lr" 10 | procflags = ["-z"] 11 | 12 | def compileTest(self, tmpd): 13 | self.compileDix(self.dir1, self.dix1, binName=tmpd+'/dix1.bin') 14 | self.compileDix(self.dir2, self.dix2, binName=tmpd+'/dix2.bin') 15 | self.callProc('lt-append', [tmpd+"/dix1.bin", 16 | tmpd+"/dix2.bin", 17 | tmpd+"/compiled.bin"]) 18 | return True 19 | 20 | class SimpleAppend(AppendProcTest): 21 | inputs = ["a", "b"] 22 | expectedOutputs = ["^a/a$", 23 | "^b/b$"] 24 | -------------------------------------------------------------------------------- /tests/lt_apply_acx/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from basictest import ProcTest 3 | 4 | class AcxTest(unittest.TestCase, ProcTest): 5 | dix = 'data/minimal-mono.dix' 6 | acx = 'data/basic.acx' 7 | procdir = 'lr' 8 | inputs = ['abc', 'ábc', 'äbc'] 9 | expectedOutputs = ['^abc/ab$', 10 | '^ábc/ab$', 11 | '^äbc/ab$'] 12 | 13 | def compileTest(self, tmpd): 14 | ret = self.compileDix(self.procdir, self.dix, 15 | binName=tmpd+'/plain.bin') 16 | if not ret: return ret 17 | self.callProc('lt-apply-acx', 18 | [tmpd+'/plain.bin', self.acx, tmpd+'/compiled.bin']) 19 | return True 20 | -------------------------------------------------------------------------------- /tests/lt_comp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from basictest import ProcTest, PrintTest 4 | import unittest 5 | 6 | class CompNormalAndJoin(unittest.TestCase, ProcTest): 7 | inputs = ["abc", 
"ab", "y", "n", "jg", "jh", "kg"] 8 | expectedOutputs = ["^abc/ab$", "^ab/ab$", "^y/y$", "^n/n$", "^jg/j+g$", "^jh/j+h$", "^kg/k+g$"] 9 | 10 | 11 | class EmptyDixOk(unittest.TestCase, ProcTest): 12 | procdix = "data/entirely-empty.dix" 13 | inputs = ["abc"] 14 | expectedOutputs = ["^abc/*abc$"] 15 | 16 | 17 | class CompEmptyLhsShouldError(unittest.TestCase, ProcTest): 18 | procdix = "data/lhs-empty-mono.dix" 19 | expectedCompRetCodeFail = True 20 | 21 | 22 | class CompEmptyRhsShouldError(unittest.TestCase, ProcTest): 23 | procdir = "rl" 24 | procdix = "data/rhs-empty-mono.dix" 25 | expectedCompRetCodeFail = True 26 | 27 | 28 | class CompLhsInitialSpaceShouldError(unittest.TestCase, ProcTest): 29 | procdix = "data/lhs-ws-mono.dix" 30 | expectedCompRetCodeFail = True 31 | 32 | 33 | class CompRhsInitialSpaceShouldError(unittest.TestCase, ProcTest): 34 | procdix = "data/rhs-ws-mono.dix" 35 | procdir = "rl" 36 | expectedCompRetCodeFail = True 37 | 38 | 39 | class CompAttEpsilonLoopShouldError(unittest.TestCase, ProcTest): 40 | procdix = "data/cat-epsilon-loop.att" 41 | expectedCompRetCodeFail = True 42 | 43 | class CompAttEpsilonToFinalShouldError(unittest.TestCase, ProcTest): 44 | procdix = "data/cat-epsilon-to-final.att" 45 | expectedCompRetCodeFail = True 46 | 47 | class CompSplitMultichar(unittest.TestCase, ProcTest): 48 | procdix = "data/multichar.att" 49 | inputs = ["א"] 50 | expectedOutputs = ["^א/אַן$"] 51 | 52 | class CompLSX(unittest.TestCase, PrintTest): 53 | printdix = "data/basic.lsx" 54 | expectedOutput = '''0 1 0.000000\t 55 | 1 1 0.000000\t 56 | 1 2 0.000000\t 57 | 2 3 0.000000\t 58 | 3 3 0.000000\t 59 | 3 4 <$> 0.000000\t 60 | 4 5 p <$> 0.000000\t 61 | 5 6 r ε 0.000000\t 62 | 6 7 p ε 0.000000\t 63 | 7 8 e ε 0.000000\t 64 | 8 9 r ε 0.000000\t 65 | 9 10 s ε 0.000000\t 66 | 10 11 ε 0.000000\t 67 | 11 12 <$> ε 0.000000\t 68 | 12 14 ε ε 0.000000\t 69 | 12 13 <$> <$> 0.000000\t 70 | 13 14 ε ε 0.000000\t 71 | 14 0.000000 72 | ''' 73 | 74 | 75 | class 
VariantNoTest(unittest.TestCase, ProcTest): 76 | procdix = 'data/variants.dix' 77 | procdir = 'lr' 78 | compflags = [] 79 | inputs = ['y'] 80 | expectedOutputs = ['^y/*y$'] 81 | 82 | 83 | class VariantHoTest(unittest.TestCase, ProcTest): 84 | procdix = 'data/variants.dix' 85 | procdir = 'lr' 86 | compflags = ['--var-right=ho'] 87 | inputs = ['y'] 88 | expectedOutputs = ['^y/y$'] 89 | 90 | 91 | class RestrictTest(unittest.TestCase, ProcTest): 92 | procdix = 'data/variants.dix' 93 | procdir = 'lr' 94 | restrictflags = [] 95 | inputs = ['abc', 'ab'] 96 | expectedOutputs = ['^abc/ab$', '^ab/*ab$'] 97 | 98 | def compileTest(self, tmpd): 99 | ret = self.compileDix('u', self.procdix, binName=tmpd+'/uni.bin') 100 | if not ret: return ret 101 | self.callProc('lt-restrict', 102 | [self.procdir, tmpd+'/uni.bin', tmpd+'/compiled.bin'], 103 | self.restrictflags) 104 | 105 | class RestrictRL1(RestrictTest): 106 | procdir = 'rl' 107 | restrictflags = ['-v', 'gascon'] 108 | inputs = ['abc', 'ab'] 109 | expectedOutputs = ['^abc/*abc$', '^ab/ab$'] 110 | 111 | class RestrictRL2(RestrictTest): 112 | procdir = 'rl' 113 | restrictflags = ['-v', 'oci'] 114 | inputs = ['abc', 'ab'] 115 | expectedOutputs = ['^abc/*abc$', '^ab/abbb$'] 116 | -------------------------------------------------------------------------------- /tests/lt_compose/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from basictest import ProcTest 4 | import unittest 5 | 6 | 7 | class ComposeProcTest(unittest.TestCase, ProcTest): 8 | monodix = "data/compose1.dix" 9 | monodir = "lr" 10 | bidix = "data/pp2p.dix" 11 | bidir = "lr" 12 | procflags = ["-z"] 13 | composeflags = ["--inverted", "--anywhere"] 14 | 15 | def compileTest(self, tmpd): 16 | self.compileDix(self.monodir, self.monodix, binName=tmpd+'/f.bin') 17 | self.compileDix(self.bidir, self.bidix, binName=tmpd+'/g.bin') 18 | self.callProc('lt-compose', 19 | self.composeflags + 
[tmpd+"/f.bin", 20 | tmpd+"/g.bin", 21 | tmpd+"/compiled.bin"]) 22 | # The above already asserts retcode, so if we got this far we know it 23 | # compiled fine: 24 | return True 25 | 26 | 27 | class ComposeSimpleCompound(ComposeProcTest): 28 | procflags = ["-e", "-z"] 29 | inputs = ["oppy", "appy", 30 | "py", 31 | "opp", "app"] 32 | expectedOutputs = ["^oppy/opp+py$", "^appy/app+py$", 33 | "^py/py$", 34 | "^opp/*opp$", "^app/*app$"] 35 | 36 | 37 | class ComposeNotEverywhere(ComposeProcTest): 38 | procflags = ["-e", "-z"] 39 | inputs = ["upp", "up", "uppy"] 40 | expectedOutputs = ["^upp/upp$", 41 | "^up/*up$", 42 | "^uppy/upp+py$"] 43 | 44 | 45 | class ComposeAnchored(ComposeProcTest): 46 | composeflags = ["--inverted"] 47 | bidix = "data/upp2up.dix" 48 | procflags = ["-e", "-z"] 49 | inputs = ["upp", "up", 50 | "tuppy", "tupp", 51 | "uppy", "upppy", "py", 52 | "opp", "oppy", 53 | "app", "appy"] 54 | expectedOutputs = ["^upp/*upp$", "^up/*up$", 55 | "^tuppy/*tuppy$", "^tupp/*tupp$", 56 | "^uppy/upp+py$", "^upppy/upp+py$", "^py/py$", 57 | "^opp/*opp$", "^oppy/*oppy$", 58 | "^app/*app$", "^appy/*appy$" 59 | ] 60 | -------------------------------------------------------------------------------- /tests/lt_expand/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from basictest import BasicTest 3 | 4 | class ExpandTest(unittest.TestCase, BasicTest): 5 | expanddix = 'data/minimal-mono.dix' 6 | expanddir = 'lr' 7 | expectedOutput = '''abc:ab 8 | ab:ab 9 | y:y 10 | n:n 11 | jg:j+g 12 | jh:j+h 13 | kg:k+g 14 | ''' 15 | expandflags = [] 16 | 17 | def runTest(self): 18 | pp = self.openPipe('lt-expand', self.expandflags + [self.expanddix]) 19 | self.assertEqual(self.communicateFlush(None, pp), 20 | self.expectedOutput) 21 | self.closePipe(pp, False) 22 | 23 | class ExpandRegex(ExpandTest): 24 | expanddix = 'data/expand-re.dix' 25 | expectedOutput = '''abc:ab 26 | ab:ab 27 | y:y 28 | n:n 29 | 
__REGEXP__xyz\\:abc[qxj]\\+:__REGEXP__xyz\\:abc[qxj]\\+ 30 | ''' 31 | -------------------------------------------------------------------------------- /tests/lt_merge/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | from basictest import ProcTest 4 | import unittest 5 | 6 | class MergeTest(unittest.TestCase, ProcTest): 7 | inputs = ['^nochange$'] 8 | expectedOutputs = ['^nochange$'] 9 | procflags = [] 10 | 11 | def compileTest(self, tmpd): 12 | return True # "pass" 13 | 14 | def openProc(self, tmpd): 15 | return self.openPipe('lt-merge', self.procflags+[]) 16 | 17 | 18 | class SimpleTest(MergeTest): 19 | inputs = ['^ikke/ikke$ ^«/«$^så/så$ ^veldig/v$^»/»$ ^bra/bra$' ] 20 | expectedOutputs = ['^ikke/ikke$ ^«så veldig»/«så veldig»$ ^bra/bra$'] 21 | 22 | 23 | class SingleTest(MergeTest): 24 | inputs = ['^not/very$' ] 25 | expectedOutputs = ['^not/not$'] 26 | 27 | 28 | class UnknownTest(MergeTest): 29 | inputs = ['^foo/*foo$' ] 30 | expectedOutputs = ['^foo/*foo$'] 31 | 32 | 33 | class EscapeTest(MergeTest): 34 | # Using r'' to avoid doubling escapes even more: 35 | inputs = [r'^ikke/ikke$ ^«/«$^så/så$ ^ve\[dig/v$^»/»$ ^bra/bra$'] 36 | expectedOutputs = [r'^ikke/ikke$ ^«så ve\\\[dig»/«så ve\\\[dig»$ ^bra/bra$'] 37 | 38 | 39 | class WordblankTest(MergeTest): 40 | # Using r'' to avoid doubling escapes even more: 41 | inputs = [r'^«/«$[[tf:i:a]]^ve\/ldig/v$^»/»$'] 42 | expectedOutputs = [r'^«\[\[tf:i:a\]\]\^\$ve\\\/ldig»/«\[\[tf:i:a\]\]\^\$ve\\\/ldig»$'] 43 | 44 | 45 | class SimpleUnmergeTest(MergeTest): 46 | procflags = ['--unmerge'] 47 | # Using r'' to avoid doubling escapes even more: 48 | inputs = [r'^ikkje/ikkje$ ^«Se og Hør»/«Se og Hør»$ ^då/då$'] 49 | expectedOutputs = [r'^ikkje/ikkje$ «Se og Hør» ^då/då$'] 50 | 51 | 52 | class EscapedUnmergeTest(MergeTest): 53 | procflags = ['--unmerge'] 54 | # Using r'' to avoid doubling escapes even more: 55 | inputs = 
[r'^ikkje/ikkje$ ^«\[\[tf:i:a\]\]\^\$s\\\^å»/«\[\[tf:i:a\]\]\^\$s\\\^å»$'] 56 | expectedOutputs = [r'^ikkje/ikkje$ «[[tf:i:a]]^$s\^å»'] 57 | -------------------------------------------------------------------------------- /tests/lt_paradigm/__init__.py: -------------------------------------------------------------------------------- 1 | from basictest import ProcTest 2 | import unittest 3 | 4 | class ParadigmTest(unittest.TestCase, ProcTest): 5 | inputs = ['ab<*>', 'y<*>', '*'] 6 | expectedOutputs = ['ab:abc\nab:ab', 7 | 'y:y', 8 | 'ab:abc'] 9 | procdix = 'data/minimal-mono.dix' 10 | procdir = 'rl' 11 | sortoutput = True 12 | 13 | def runTestFlush(self, tmpd): 14 | proc = self.openPipe('lt-paradigm', 15 | self.procflags+[tmpd+'/compiled.bin']) 16 | self.assertEqual(len(self.inputs), len(self.expectedOutputs)) 17 | for inp, exp in zip(self.inputs, self.expectedOutputs): 18 | out = self.communicateFlush(inp + '\n', proc).strip() 19 | if self.sortoutput: 20 | srt = '\n'.join(sorted(out.splitlines())) 21 | self.assertEqual(exp, srt) 22 | else: 23 | self.assertEqual(exp, out) 24 | self.closePipe(proc, expectFail=self.expectedRetCodeFail) 25 | 26 | class ParadigmAnalyzerTest(ParadigmTest): 27 | procdir = 'lr' 28 | procflags = ['-a'] 29 | 30 | class ExcludeTest(ParadigmTest): 31 | procflags = ['-e', ''] 32 | inputs = ['*<*>'] 33 | expectedOutputs = ['ab:abc'] 34 | 35 | class SortTest(ParadigmTest): 36 | procflags = ['-s'] 37 | inputs = ['*<*>'] 38 | expectedOutputs = ['ab:abc\nab:ab\nn:n\ny:y'] 39 | sortoutput = False 40 | 41 | class ExcludeSingleTest(ParadigmTest): 42 | procdix = 'data/unbalanced-epsilons-mono.dix' 43 | inputs = ['*<*>', '*<*-pres>', '*<*-inf-pret>'] 44 | expectedOutputs = [ 45 | 're:re\nre:rer\nre:res\nre:ret', 46 | 're:re\nre:ret', 47 | 're:rer\nre:res' 48 | ] 49 | 50 | class OrTagTest(ParadigmTest): 51 | procdix = 'data/unbalanced-epsilons-mono.dix' 52 | inputs = ['re<|pres|pret>', 're<|inf>', 're<|xqz>'] 53 | expectedOutputs = [ 54 | 
're:rer\nre:res\nre:ret', 55 | 're:re', 56 | '' 57 | ] 58 | 59 | class OrTagRepeatTest(ParadigmTest): 60 | procdix = 'data/unbalanced-epsilons-mono.dix' 61 | inputs = [ 62 | 're<*|vblex|pres|pret>', 63 | 're<*|inf|vblex>', 64 | 're<*|n|adj|vblex|inf>' 65 | ] 66 | expectedOutputs = [ 67 | 're:rer\nre:res\nre:ret', 68 | 're:re', 69 | 're:re', 70 | ] 71 | -------------------------------------------------------------------------------- /tests/lt_print/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | from basictest import PrintTest 4 | 5 | 6 | class NonWeightedFst(unittest.TestCase, PrintTest): 7 | printdix = "data/biproc-skips-tags-mono.dix" 8 | printdir = "lr" 9 | expectedOutput = "0\t1\tv\tv\t0.000000\t\n1\t2\ti\ti\t0.000000\t\n2\t3\th\th\t0.000000\t\n3\t4\tk\tk\t0.000000\t\n4\t5\ti\ti\t0.000000\t\n5\t6\t\t\t0.000000\t\n6\t10\t\u03b5\t\u03b5\t0.000000\t\n6\t7\t\t\t0.000000\t\n7\t8\t\t\t0.000000\t\n8\t9\t\t\t0.000000\t\n9\t10\t\u03b5\t\u03b5\t0.000000\t\n10\t0.000000\n" 10 | 11 | 12 | class WeightedFst(unittest.TestCase, PrintTest): 13 | printdix = "data/cat-weight.att" 14 | printdir = "lr" 15 | expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t0.525487\n" 16 | 17 | 18 | class NegativeWeightedFst(unittest.TestCase, PrintTest): 19 | printdix = "data/cat-weight-negative.att" 20 | printdir = "lr" 21 | expectedOutput = "0\t1\tc\tc\t4.567895\t\n1\t2\ta\ta\t0.989532\t\n2\t3\tt\tt\t2.796193\t\n3\t4\tε\t+\t-0.824564\t\n4\t5\tε\tn\t1.824564\t\n4\t5\tε\tv\t2.856296\t\n5\t-0.525487\n" 22 | 23 | 24 | class MulticharCompFst(unittest.TestCase, PrintTest): 25 | printdix = "data/multichar.att" 26 | printdir = "lr" 27 | expectedOutput = "0\t1\tא\tא\t0.000000\t\n1\t2\tε\tַ\t0.000000\t\n2\t3\tε\tן\t0.000000\t\n3\t4\tε\t\t0.000000\t\n4\t0.000000\n" 28 | 29 | 30 | class 
SectionsFst(unittest.TestCase, PrintTest): 31 | printdix = "data/sections.dix" 32 | printdir = "lr" 33 | expectedOutput = """0\t1\t.\t.\t0.000000\t 34 | 1\t2\tε\t\t0.000000\t 35 | 2\t0.000000 36 | -- 37 | 0\t1\tX\tX\t0.000000\t 38 | 1\t2\tε\t\t0.000000\t 39 | 2\t0.000000 40 | """ 41 | 42 | 43 | class Alphabet(unittest.TestCase, PrintTest): 44 | printdix = "data/alphabet.att" 45 | printdir = "lr" 46 | printflags = ["-a"] 47 | expectedOutput = """A 48 | B 49 | C 50 | a 51 | b 52 | c 53 | 54 | """ 55 | -------------------------------------------------------------------------------- /tests/lt_proc/null_flush_invalid_stream_format.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import sys 5 | import unittest 6 | from proctest import ProcTest 7 | 8 | # These tests are for invalid Apertium Stream format; lt-proc's output 9 | # for these seems system-dependent, so we can't use them as regression 10 | # tests (until that's fixed, if it's worth fixing). 
11 | 12 | class NoSuperblankBeforeNUL(unittest.TestCase, ProcTest): 13 | inputs = ["The dog gladly eats homework.", 14 | "If wé swim fast enough,", 15 | "we should reach shallow waters.", 16 | "before;", 17 | "the sharks;", 18 | "come."] 19 | 20 | expectedOutputs = ["^The/The$ ^dog/dog$ ^gladly/gladly$ ^eats/eat$ ^homework/homework$", 21 | "^If/If$ ^wé/*wé$ ^swim/swim/swim$ ^fast/fast/fast$ ^enough/enough/enough$", 22 | "^we/prpers$ ^should/should$ ^reach/reach/reach$ ^shallow/shallow$ ^waters/water$", 23 | "^before/before/before/before$", 24 | "^the/the$ ^sharks/shark$", 25 | "^come/come/come/come$"] 26 | 27 | class WronglyEscapedLetter(unittest.TestCase, ProcTest): 28 | inputs = ["before you g\\o to bed.[][\n]"] 29 | expectedOutputs = ["^before/before/before/before$ ^you/prpers/prpers$ "] 30 | expectedRetCodeFail = True 31 | 32 | 33 | class UnescapedAngleBracket(unittest.TestCase, ProcTest): 34 | inputs = ["Simon prefers dark chocolate>.[][\n]"] 35 | expectedOutputs = ["^Simon/Simon$ ^prefers/prefer$ ^dark/dark/dark$ "] 36 | expectedRetCodeFail = True 37 | 38 | class UnclosedSuperblank(unittest.TestCase, ProcTest): 39 | inputs = ["you should always[ eat"] 40 | #expectedOutputs = ["^you/prpers/prpers$ ^should/should$ "] 41 | expectedOutputs = [""] 42 | expectedRetCodeFail = True 43 | -------------------------------------------------------------------------------- /tests/lt_tmxproc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from basictest import ProcTest as _ProcTest 3 | import unittest 4 | 5 | class TmxProcTest(unittest.TestCase, _ProcTest): 6 | procdix = 'data/simple.tmx' 7 | procflags = [] 8 | procdir = 'nob-nno' 9 | 10 | def compileDix(self, dir, dix, flags=None, binName='compiled.bin', 11 | expectFail=False): 12 | return self.callProc('lt-tmxcomp', 13 | [dir, dix, binName], 14 | flags, 15 | expectFail) 16 | 17 | def compileTest(self, tmpd): 18 | return 
self.compileDix(self.procdir, self.procdix, 19 | flags=self.compflags, 20 | binName=tmpd+'/compiled.bin', 21 | expectFail=self.expectedCompRetCodeFail) 22 | 23 | def openProc(self, tmpd): 24 | return self.openPipe('lt-tmxproc', self.procflags+[tmpd+'/compiled.bin']) 25 | 26 | class Simple(TmxProcTest): 27 | inputs = ['Ikke så merkelig.\nJa, ja.',] 28 | expectedOutputs = ['[Ikkje så merkeleg].\nJa, ja.'] 29 | 30 | 31 | class SimpleSpaceSep(TmxProcTest): 32 | procflags = ['-s'] 33 | inputs = ['Ikke så merkelig at det skjer.',] 34 | expectedOutputs = ['[Ikkje så merkeleg] at det skjer.'] 35 | 36 | 37 | class Numbers(TmxProcTest): 38 | procdix = 'data/numbers.tmx' 39 | procflags = ['-s'] 40 | inputs = [ 41 | 'kake 1 og kjeks', 42 | '3 og kake 8 og kjeks', 43 | '3 og kaffe 9 og kjeks 2', 44 | '3 og kaffe 9 og 7 kjeks 2', 45 | '3 og kaffe 9 og 2 ost ', 46 | '1 3 eller ost 88 eller 89 kjeks', 47 | '3 på halv fire', 48 | '1 og 3 på halv fire', 49 | ] 50 | expectedOutputs = [ 51 | '[kake 1] og kjeks', 52 | '3 og [kake 8] og kjeks', 53 | '3 og kaffe 9 og kjeks 2', 54 | '3 og [kaffi 9 og 7] kjeks 2', 55 | '3 og [kaffi 9 og 2] ost ', 56 | '1 3 eller [ost 88 eller 89 kjeks]', 57 | '[3 på halv] fire', 58 | '1 og [3 på halv] fire', 59 | ] 60 | 61 | @unittest.expectedFailure 62 | class NumbersTwice(TmxProcTest): 63 | procdix = 'data/numbers.tmx' 64 | procflags = ['-s'] 65 | inputs = [ 66 | '3 kake 8 og kjeks 2', 67 | '3 kaffe 9 og kjeks 2', 68 | '1 3 ost 99 eller ost 88 eller 89 kjeks', 69 | '1 3 på halv fire', 70 | ] 71 | expectedOutputs = [ 72 | '3 [kake 8] og kjeks 2', 73 | '3 kaffe 9 og kjeks 2', 74 | '1 3 ost 99 eller [ost 88 eller 89 kjeks]', 75 | '1 [3 på halv] fire', 76 | ] 77 | 78 | -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | sys.path.append(os.path.realpath(".")) 6 | 
import unittest

# Value exported to the test modules through the LTTOOLBOX_PATH environment
# variable; the first command-line argument, when given, replaces the
# default relative path.
os.environ['LTTOOLBOX_PATH'] = '../lttoolbox'
if len(sys.argv) > 1:
    os.environ['LTTOOLBOX_PATH'] = sys.argv[1]

# Test modules, each loaded by name and run as its own suite.
modules = ['lt_proc', 'lt_trim', 'lt_print', 'lt_comp', 'lt_append',
           'lt_paradigm', 'lt_expand', 'lt_apply_acx', 'lt_compose',
           'lt_tmxproc', 'lt_merge']


def count_problems(res):
    """Return the number of tests in *res* that did not end successfully.

    Counts assertion failures, errors (a test, or the loading of its
    module, raising an exception) and unexpected successes -- the same
    three categories that make ``TestResult.wasSuccessful()`` false.
    Counting only ``res.failures`` let a run whose tests all crashed
    exit with status 0.
    """
    return (len(res.failures)
            + len(res.errors)
            + len(res.unexpectedSuccesses))


if __name__ == "__main__":
    # abspath() first: when the script is started by bare file name,
    # dirname(__file__) can be '' and os.chdir('') raises.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    failures = 0
    for module in modules:
        suite = unittest.TestLoader().loadTestsFromName(module)
        res = unittest.TextTestRunner(verbosity=2).run(suite)
        failures += count_problems(res)
    # Cap the status at 255 so a large count still yields a non-zero exit.
    sys.exit(min(failures, 255))
# --------------------------------------------------------------------------------