├── macports-foma ├── PortIndex.quick ├── PortIndex └── devel │ └── foma │ └── Portfile ├── foma ├── python │ ├── .gitignore │ ├── tox.ini │ ├── test_foma.py │ ├── README.md │ ├── phonrule.py │ ├── foma2js.py │ └── attapply.py ├── tests │ ├── test-error-rendering.foma │ ├── test-segfault-empty-fst.foma │ ├── test-segfault-eliminate.foma │ ├── test-segfault-long-name │ ├── test-leaky-redefine.foma │ ├── test-leaky-test.foma │ └── run.sh ├── libfoma.pc.in ├── lexc.h ├── README.cmatrix ├── README.symbols ├── README ├── demo.html ├── reverse.c ├── contrib │ ├── foma_apply_down.js │ ├── lexc.plist │ ├── foma2js.perl │ └── foma.plist ├── mem.c ├── int_stack.c ├── cmatrix.l ├── extract.c ├── stringhash.c ├── define.c ├── trie.c ├── coaccessible.c ├── docs │ └── examples │ │ └── hixkaryana-ot-verification.foma ├── foma.h ├── stack.c ├── topsort.c ├── lexc.l ├── CMakeLists.txt ├── utf8.c ├── fomalibconf.h ├── CHANGELOG ├── foma.c ├── cgflookup.c ├── COPYING ├── flookup.c └── sigma.c ├── .gitignore ├── .github └── workflows │ ├── test.yml │ └── build.yml └── README.md /macports-foma/PortIndex.quick: -------------------------------------------------------------------------------- 1 | foma 0 2 | -------------------------------------------------------------------------------- /foma/python/.gitignore: -------------------------------------------------------------------------------- 1 | .pytest_cache 2 | .tox 3 | -------------------------------------------------------------------------------- /foma/tests/test-error-rendering.foma: -------------------------------------------------------------------------------- 1 | def blah 123; 2 | regex ||; 3 | -------------------------------------------------------------------------------- /foma/tests/test-segfault-empty-fst.foma: -------------------------------------------------------------------------------- 1 | regex a:b; 2 | save defined /tmp/a 3 | -------------------------------------------------------------------------------- 
/foma/tests/test-segfault-eliminate.foma: -------------------------------------------------------------------------------- 1 | regex 0 - 0 ; !Creates xducer with 0 paths 2 | eliminate flags !seg fault 3 | -------------------------------------------------------------------------------- /foma/tests/test-segfault-long-name: -------------------------------------------------------------------------------- 1 | define ForeignGuessIntermediateToSurfaceGrammar [a-z]; 2 | save defined /tmp/out -------------------------------------------------------------------------------- /foma/tests/test-leaky-redefine.foma: -------------------------------------------------------------------------------- 1 | # fail if memory is not released when variable is redefined 2 | define foo a*; 3 | define foo a*; 4 | -------------------------------------------------------------------------------- /foma/python/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27,py36 3 | # Do not require setup.py 4 | skipsdist = True 5 | [testenv] 6 | deps=pytest 7 | commands=pytest 8 | -------------------------------------------------------------------------------- /foma/tests/test-leaky-test.foma: -------------------------------------------------------------------------------- 1 | regex a:b; 2 | test null 3 | test identity 4 | test sequential 5 | test unambiguous 6 | test lower-universal 7 | test upper-universal 8 | print shortest-string -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | lex.*.[co] 2 | .*.sw[pon] 3 | *.o 4 | *.a 5 | *.so.* 6 | foma/foma 7 | flookup 8 | cgflookup 9 | libfoma.js 10 | libfoma.wasm 11 | regex.* 12 | !regex.l 13 | *.dylib 14 | **/CMakeFiles/ 15 | Makefile 16 | *.so 17 | *.pc 18 | *.cmake 19 | CMake* 20 | -------------------------------------------------------------------------------- 
/foma/libfoma.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 3 | libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ 4 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ 5 | 6 | Name: foma 7 | Description: The foma library 8 | Version: @PROJECT_VERSION@ 9 | Cflags: -I${includedir}/ 10 | Libs: -L${libdir} -lfoma 11 | -------------------------------------------------------------------------------- /foma/lexc.h: -------------------------------------------------------------------------------- 1 | void lexc_init(); 2 | void lexc_add_mc(char *symbol); 3 | int lexc_find_mc(char *symbol); 4 | struct states *lexc_find_lex_state(char *name); 5 | void lexc_add_word(); 6 | struct fsm *lexc_to_fsm(void); 7 | void lexc_set_current_lexicon(char *name, int which); 8 | void lexc_set_current_word(char *name); 9 | void lexc_clear_current_word(); 10 | void lexc_set_network(struct fsm *net); 11 | void lexc_trim(char *s); 12 | -------------------------------------------------------------------------------- /foma/tests/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -ex 3 | 4 | foma -q -f test-leaky-redefine.foma || exit 1; 5 | foma -q -f test-segfault-eliminate.foma || exit 1; 6 | if !
foma -q -f test-error-rendering.foma 2>&1 | grep -q 'syntax error' 7 | then 8 | exit 1 9 | fi 10 | foma -q -f test-segfault-long-name > /dev/null || exit 1 11 | foma -q -f test-segfault-empty-fst.foma > /dev/null || exit 1; 12 | foma -q -f test-leaky-test.foma > /dev/null || exit 1; 13 | -------------------------------------------------------------------------------- /macports-foma/PortIndex: -------------------------------------------------------------------------------- 1 | foma 533 2 | variants universal depends_build {port:bison port:flex port:libtool} portdir devel/foma description {xfst-compatible C++ finite-state transducer library} homepage https://code.google.com/p/foma/ epoch 0 platforms darwin name foma depends_lib port:zlib long_description {Foma is designed to be a complete replacement for the closed-source Xerox tool xfst. Everything that compiles with xfst should compile with Foma. If not it is a bug.} maintainers mans.hulden@gmail.com license GPL-2 categories devel version 0.9.16alpha revision 0 3 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test foma 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | 11 | - name: dependencies 12 | run: | 13 | sudo apt-get -qy update 14 | sudo apt-get -qfy install --no-install-recommends cmake 15 | 16 | - name: build with sanitizer 17 | run: cd foma && export CFLAGS="-g3 -Wall -fsanitize=address" && cmake . 
&& make -j 18 | 19 | - name: run tests 20 | run: cd foma/tests && PATH=../:$PATH ./run.sh 21 | -------------------------------------------------------------------------------- /foma/python/test_foma.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: UTF-8 -*- 3 | 4 | """ 5 | Test cases for Foma Python bindings. 6 | """ 7 | 8 | import pytest 9 | from foma import FST 10 | 11 | 12 | def test_load_fst(): 13 | fst = FST.load('ate.fsm') 14 | assert isinstance(fst, FST) 15 | 16 | 17 | def test_apply_fst(eat_fst): 18 | result, = eat_fst.apply_up('ate') 19 | assert result == 'eat+V+Past' 20 | 21 | 22 | def test_apply_down(eat_fst): 23 | result, = eat_fst.apply_down('eat+V+3P+Sg') 24 | assert result == 'eats' 25 | 26 | 27 | @pytest.fixture 28 | def eat_fst(): 29 | return FST.load('ate.fsm') 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # foma 2 | 3 | Finite-state transducer technology. 4 | 5 | ## Compilation Instructions 6 | 7 | 1. Ensure GNU readline library is installed. 8 | 2. `cd foma` 9 | 3. `cmake CMakeLists.txt` 10 | 4. `make` 11 | 5. `make install` (if you want to install it) 12 | 13 | ## Compiling to Web Assembly (wasm) using Emscripten 14 | 15 | The `readline` and `zlib` dependencies are not needed for the wasm build. 16 | 17 | ``` 18 | # Install and activate Emscripten SDK 19 | # (You may want to choose a different place in your filesystem to put emsdk) 20 | git clone https://github.com/emscripten-core/emsdk.git # only needed the first time 21 | cd emsdk 22 | git pull 23 | ./emsdk install latest 24 | ./emsdk activate latest 25 | # Follow printed instructions 26 | cd .. 
27 | 28 | # Build foma to WebAssembly using Emscripten 29 | cd foma 30 | emcmake cmake 31 | emmake make 32 | 33 | # Start a local web server: 34 | python3 -m http.server 8000 35 | ``` 36 | 37 | Open a web browser and navigate to http://localhost:8000/demo.html. 38 | The demo page allows you to test Foma regular expressions directly in your browser. 39 | -------------------------------------------------------------------------------- /foma/README.cmatrix: -------------------------------------------------------------------------------- 1 | Since version 0.9.8alpha, foma allows attaching a confusion matrix specification to a network. The command "read cmatrix FILENAME" reads a confusion matrix and attaches it to the top network on the stack. Subsequent "apply med" commands will use this matrix in determining the minimum cost approximate match to a word. If no confusion matrix is specified, Levenshtein distance is used (insert = substitute = delete = 1). 2 | 3 | The command "print cmatrix" prints out the confusion matrix attached to the top network in tabular format. 4 | 5 | The format of the confusion matrix should be clear from the following example confusion matrix file: 6 | 7 | --CUT HERE-- 8 | Insert 1 9 | Substitute 2 10 | Delete 1 11 | Cost 1 12 | a:b c:d 13 | Cost 3 14 | :x x: x:y 15 | --CUT HERE-- 16 | 17 | The above snippet specifies a matrix where the default insertion cost is 1 unit, the default substitution cost is 2 units, and the default deletion cost is 1 unit. Also, substituting an "a" with a "b" costs 1 unit, as does substituting a "c" with a "d". Inserting an "x" costs 3 units, as do deleting an "x" and substituting an "x" with a "y". 18 | 19 | All costs must be positive integer values. Cost specifications that involve symbols not found in the alphabet of the top network are not included in the matrix, and a warning is issued for each of them.
20 | -------------------------------------------------------------------------------- /macports-foma/devel/foma/Portfile: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4 2 | # $Id$ 3 | 4 | PortSystem 1.0 5 | 6 | name foma 7 | version 0.9.16alpha 8 | categories devel 9 | platforms darwin 10 | license GPL-2 11 | maintainers mans.hulden@gmail.com 12 | 13 | description xfst-compatible C++ finite-state transducer library 14 | 15 | long_description Foma is designed to be a complete replacement for the \ 16 | closed-source Xerox tool xfst. Everything that compiles \ 17 | with xfst should compile with Foma. If not it is a bug. 18 | 19 | homepage https://code.google.com/p/foma/ 20 | master_sites googlecode 21 | 22 | checksums rmd160 17944b8fc164014262996cd643a0923d56869692 \ 23 | sha1 fcc984252931d337578b0344cdcaa77efd0ff092 \ 24 | sha256 dc019dcdf6dc8f7e881394f3d76821006e7f0cf6e3fe1c12ac2887fa48d81208 25 | 26 | depends_lib port:zlib 27 | depends_build port:bison \ 28 | port:flex \ 29 | port:libtool 30 | 31 | # The following deletes the configure phase, which doesn't work with the present build system: 32 | configure {} 33 | 34 | #configure.args --enable-lexc 35 | 36 | test.run no 37 | test.target check 38 | -------------------------------------------------------------------------------- /foma/python/README.md: -------------------------------------------------------------------------------- 1 | # Python Extras 2 | 3 | ## foma.py 4 | 5 | This is a foma interface implemented in Python. Requires libfoma installed. 6 | 7 | ## attapply.py 8 | 9 | This is a stand-alone Python utility for reading AT\&T files and applying transductions. Useful for minimizing dependencies. Also supports weighted transducers, in which case `apply()` returns output strings in least-cost order. 
10 | 11 | ## phonrule.py 12 | 13 | This is a simple helper tool for debugging foma-scripts that are sequences of phonological rules meant to apply in a certain order. 14 | It assumes a grammar is written as a sequence of define-statements and ordered rewrite-rules combined with a chain-statement (simulating composition of the rules). It then passes words from STDIN through the sequence of transducers and prints a decorated output to STDOUT where rules that fire are shown between brackets. 15 | 16 | Example: 17 | 18 | ``` 19 | # myscript.foma 20 | def ARule a -> b || c _ d; # Rule one 21 | def BRule b -> c || _ d; # Rule two 22 | chain ARule, BRule 23 | ``` 24 | 25 | We can now run the following, passing the word `cad` through the two transducers and tracing the rule actions: 26 | 27 | ``` 28 | $echo "cad" | python phonrule.py myscript.foma 29 | ``` 30 | 31 | and the output is 32 | 33 | ``` 34 | cad[ARule|Rule one]cbd[BRule|Rule two]ccd 35 | ``` 36 | 37 | ## foma2js.py 38 | 39 | This is a port of `foma/contrib/foma2js.perl` to Python 3. Get the help using `foma2js.py -h`. Everything else like in the original program. 40 | -------------------------------------------------------------------------------- /foma/README.symbols: -------------------------------------------------------------------------------- 1 | Foma accepts multiple variants of some operators for compatibility reasons. All symbols in the ASCII range except [1-9A-Za-z'=] are reserved and need to be escaped with % or " " (naturally non-escaped ? is the wildcard symbol and 0 the epsilon symbol). 2 | 3 | In addition the following Unicode symbols are reserved and need to be escaped as well, unless used as their operator meaning: 4 | 5 | Octal bytes Character name Hex code point ASCII equivalent 6 | 7 | ¬ 302 254 NOT SIGN U+00AC ~ 8 | × 303 227 MULTIPLICATION SIGN U+00D7 .x. : 9 | Σ 316 243 GREEK CAPITAL LETTER SIGMA U+03A3 ? 
10 | ε 316 265 GREEK SMALL LETTER EPSILON U+03B5 [] 0 11 | → 342 206 222 RIGHTWARDS ARROW U+2192 12 | ↔ 342 206 224 LEFT RIGHT ARROW U+2194 13 | ∀ 342 210 200 FOR ALL U+2200 14 | ∃ 342 210 203 THERE EXISTS U+2203 15 | ∅ 342 210 205 EMPTY SET U+2205 \? 16 | ∈ 342 210 210 ELEMENT OF U+2208 17 | ∘ 342 210 230 RING OPERATOR U+2218 .o. 18 | ∥ 342 210 245 PARALLEL TO U+2225 <> 19 | ∧ 342 210 247 LOGICAL AND U+2227 & 20 | ∨ 342 210 250 LOGICAL OR U+2228 | 21 | ∩ 342 210 251 INTERSECTION U+2229 & 22 | ∪ 342 210 252 UNION U+222A | 23 | ≤ 342 211 244 LESS-THAN OR EQUAL TO U+2264 24 | ≥ 342 211 245 GREATER-THAN OR EQUAL TO U+2265 25 | ≺ 342 211 272 PRECEDES U+227A < 26 | ≻ 342 211 273 SUCCEEDS U+227B > 27 | -------------------------------------------------------------------------------- /foma/README: -------------------------------------------------------------------------------- 1 | Foma 2 | ==== 3 | 4 | version 0.10.0 5 | 2021/06/01 6 | 7 | Author: Mans Hulden 8 | Email: mans.hulden@gmail.com 9 | WWW: http://fomafst.github.io 10 | 11 | 12 | What is foma? 13 | ------------- 14 | 15 | Foma is a multi-purpose finite-state toolkit designed for applications ranging from natural language processing to research in automata theory. It should be upwardly compatible with Xerox xfst and lexc, with the exception of binary file reading and writing. 16 | 17 | 18 | Distribution 19 | ------------ 20 | 21 | Foma is licensed under the Apache License, version 2. You should have received a copy of the licence with the source code. 22 | 23 | 24 | Compatibility 25 | ------------- 26 | 27 | Foma is developed and tested on a Linux system. 28 | 29 | It has also been compiled on win32, Mac OSX and Sun Solaris systems. The source code should be reasonably portable. It relies on the GNU readline library. Foma also needs GNU bison (developed using 2.3) and flex (>2.5.31). The last two are only necessary if you are making changes to the parser (.y) or lexer (.l) files.
30 | 31 | 32 | Compiling/installing foma 33 | ------------------------- 34 | 35 | A generic Makefile that has been used to compile the Linux, win32, and Mac OSX versions is included. It assumes you have the header files and the necessary libraries mentioned above. Some pre-built binaries are available on http://foma.googlecode.com. Before endeavoring a compile, these are recommended as building for e.g. win32 may be both frustrating and time-consuming. 36 | 37 | To compile foma (and flookup) as well as the foma static and dynamic library, "make; make install" should work on most UNIX systems. The default installation target /usr/local can be changed in the Makefile. 38 | 39 | 40 | Bugs 41 | ---- 42 | 43 | Many. The current release is 0.10.0, and bug reports will be gratefully received at mans.hulden@gmail.com. 44 | 45 | -------------------------------------------------------------------------------- /foma/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 |
6 |
7 | 8 | 9 |
10 | 11 |
12 | 13 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /foma/reverse.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. */ 17 | 18 | #include 19 | #include "foma.h" 20 | 21 | struct fsm *fsm_reverse(struct fsm *net) { 22 | struct fsm *revnet; 23 | struct fsm_construct_handle *revh; 24 | struct fsm_read_handle *inh; 25 | int i; 26 | 27 | inh = fsm_read_init(net); 28 | revh = fsm_construct_init(net->name); 29 | fsm_construct_copy_sigma(revh, net->sigma); 30 | 31 | while (fsm_get_next_arc(inh)) { 32 | fsm_construct_add_arc_nums(revh, fsm_get_arc_target(inh)+1, fsm_get_arc_source(inh)+1, fsm_get_arc_num_in(inh), fsm_get_arc_num_out(inh)); 33 | } 34 | 35 | while ((i = fsm_get_next_final(inh)) != -1) { 36 | fsm_construct_add_arc_nums(revh, 0, i+1, EPSILON, EPSILON); 37 | } 38 | while ((i = fsm_get_next_initial(inh)) != -1) { 39 | fsm_construct_set_final(revh, i+1); 40 | } 41 | fsm_construct_set_initial(revh, 0); 42 | fsm_read_done(inh); 43 | revnet = fsm_construct_done(revh); 44 | revnet->is_deterministic = 0; 45 | revnet->is_epsilon_free = 0; 46 | fsm_destroy(net); 47 | return(revnet); 48 | } 49 | 
// Basic recursive apply down function for Javascript runtime.
// Caveat: does not support flag diacritics and will recurse infinitely
// on input-side epsilon-loops.
// Use the foma2js.perl script to convert foma binaries to a Javascript array which
// is needed as the first argument of foma_apply_down.

/**
 * Apply the network downwards to inString and collect every output string.
 *
 * @param {Object} Net       converted network; this code reads Net.f, Net.t,
 *                           Net.s and Net.maxlen
 * @param {string} inString  input to transduce
 * @returns {Array}          all output strings found (empty if none)
 */
function foma_apply_down(Net, inString) {
    // Kept local on purpose: assigning to an undeclared name would create (or
    // clobber) a global and is a ReferenceError in strict mode.
    var reply = { answer: [], results: 0 };
    foma_apply_dn(Net, inString, 0, 0, '', reply);
    return reply.answer;
}

/**
 * Recursive worker for foma_apply_down.
 *
 * @param {Object} Net        the network (see foma_apply_down)
 * @param {string} inString   full input string
 * @param {number} Position   index of the next unconsumed input character
 * @param {*}      State      current state (used as a key into Net.f / Net.t)
 * @param {string} outString  output accumulated on the current path
 * @param {Object} Reply      accumulator with `answer` (array) and `results` (count)
 */
function foma_apply_dn(Net, inString, Position, State, outString, Reply) {
    var len, key, key2, targetState, outputSymbol;
    var matched = false;

    // Accept when the state is marked final and all input has been consumed.
    if (Net.f[State] === 1 && Position === inString.length) {
        Reply.answer.push(outString);
        Reply.results++;
    }

    // Try every input chunk from length 0 up to Net.maxlen characters.
    for (len = 0; len <= Net.maxlen && len <= inString.length - Position; len++) {
        key = State + '|' + inString.slice(Position, Position + len);
        for (key2 in Net.t[key]) {
            for (targetState in Net.t[key][key2]) {
                outputSymbol = Net.t[key][key2][targetState];
                matched = true;
                if (outputSymbol === '@UN@') { outputSymbol = '?'; }
                foma_apply_dn(Net, inString, Position + len, targetState, outString + outputSymbol, Reply);
            }
        }
    }

    // Nothing matched and the next character is absent from Net.s:
    // fall back to the '@ID@' transitions, consuming exactly one character.
    if (!matched && Net.s[inString.slice(Position, Position + 1)] == null && inString.length > Position) {
        key = State + '|' + '@ID@';
        for (key2 in Net.t[key]) {
            for (targetState in Net.t[key][key2]) {
                outputSymbol = Net.t[key][key2][targetState];
                if (outputSymbol === '@UN@') { outputSymbol = '?'; }
                if (outputSymbol === '@ID@') { outputSymbol = inString.slice(Position, Position + 1); }
                foma_apply_dn(Net, inString, Position + 1, targetState, outString + outputSymbol, Reply);
            }
        }
    }
}
/* Global variables */
int g_show_flags = 0;
int g_obey_flags = 1;
int g_flag_is_epsilon = 0;
int g_print_space = 0;
int g_print_pairs = 0;
int g_minimal = 1;
int g_name_nets = 0;
int g_print_sigma = 1;
int g_quit_on_fail = 1;
int g_quote_special = 0;
int g_recursive_define = 0;
int g_sort_arcs = 1;
int g_verbose = 1;
int g_minimize_hopcroft = 1;
int g_compose_tristate = 0;
int g_list_limit = 100;
int g_list_random_limit = 15;
int g_med_limit = 3;
int g_med_cutoff = 15;
int g_lexc_align = 0;
char *g_att_epsilon = "@0@";

/*
 * Duplicate at most n bytes of s into a freshly malloc'ed, NUL-terminated
 * string.  Copying stops early at the first NUL in s.
 *
 * Never reads past s[n-1], so s may be an n-byte buffer that is not
 * NUL-terminated.  Returns NULL if the allocation fails; the caller owns the
 * result and must free() it.
 */
char *xxstrndup(const char *s, size_t n) {
    char *r;
    size_t len = 0;

    /* Test the bound before dereferencing so s[n] is never touched. */
    while (len < n && s[len] != '\0') {
        len++;
    }
    r = (char *) malloc(len + 1);
    if (r != NULL) {
        memcpy(r, s, len);
        r[len] = '\0';
    }
    return r;
}

/*
 * Return the smallest power of two strictly greater than v
 * (0 -> 1, 1 -> 2, 3 -> 4, 4 -> 8).  Any v <= 0 yields 1.
 *
 * The result must fit in an int: with a 32-bit int, v has to be below 2^30,
 * otherwise the final shift overflows.
 */
int next_power_of_two(int v) {
    int i;
    /* i ends up as the number of significant bits in v. */
    for (i = 0; v > 0; i++)
        v = v >> 1;
    return (1 << i);
}

/*
 * Round v up to the nearest power of two; a value that already is a power of
 * two is returned unchanged (3 -> 4, 4 -> 4, 5 -> 8).
 *
 * The bit-smearing below covers shifts up to 16, i.e. it is written for a
 * 32-bit unsigned int.  v == 0 yields 0, as does any v above 2^31 on a
 * 32-bit unsigned (the final increment wraps around).
 */
unsigned int round_up_to_power_of_two(unsigned int v) {
    v--;
    /* Propagate the highest set bit into every lower position. */
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return(v);
}
42 | 43 | 45 | BBLMColorsSyntax 46 | 47 | 48 | BBLMSupportsTextCompletion 49 | 50 | 51 | 52 | 54 | BBLMKeywordList 55 | 56 | Multichar_Symbols 57 | Definitions 58 | LEXICON 59 | 60 | 61 | 63 | Language Features 64 | 65 | Identifier and Keyword Character Class 66 | A-Za-z0-9_\?!. 67 | 68 | Comment Pattern 69 | (!).+$ 70 | 71 | String Pattern 72 | " ( 75 | [^"\r\\] (?# match anything potential ending quote, new line, or start of escape sequence) 76 | | \\. (?# match any escape sequence pair) 77 | )*? 78 | " 79 | ) 80 | ) 81 | ]]> 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build foma 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - name: dependencies 19 | run: | 20 | sudo apt-get -qy update 21 | sudo apt-get -qfy install --no-install-recommends cmake 22 | 23 | - name: build 24 | run: cd foma && cmake . && make -j 25 | 26 | - name: Archive production artifacts 27 | uses: actions/upload-artifact@v2 28 | with: 29 | name: dist-linux 30 | path: | 31 | foma/foma 32 | foma/flookup 33 | foma/cgflookup 34 | foma/libfoma.so.* 35 | foma/README* 36 | foma/CHANGELOG 37 | foma/COPYING 38 | 39 | build-windows: 40 | 41 | runs-on: windows-latest 42 | 43 | defaults: 44 | run: 45 | shell: msys2 {0} 46 | 47 | steps: 48 | - uses: actions/checkout@v3 49 | 50 | - uses: msys2/setup-msys2@v2 51 | with: 52 | msystem: MINGW64 53 | update: true 54 | install: cmake gcc bison flex make zlib-devel libreadline-devel ncurses-devel 55 | 56 | - name: Compile Foma 57 | run: cd foma && cmake . 
&& make -j 58 | 59 | - name: Copy MINGW64 DLLs to build directory 60 | run: cp /usr/bin/{msys-2.0.dll,msys-ncursesw6.dll,msys-readline8.dll,msys-z.dll} foma 61 | 62 | - name: Archive production artifacts 63 | uses: actions/upload-artifact@v2 64 | with: 65 | name: dist-windows 66 | path: | 67 | foma/foma.exe 68 | foma/flookup.exe 69 | foma/cgflookup.exe 70 | foma/*.dll 71 | foma/README* 72 | foma/CHANGELOG 73 | foma/COPYING 74 | 75 | build-macos: 76 | 77 | runs-on: macos-latest 78 | 79 | steps: 80 | - uses: actions/checkout@v3 81 | 82 | - name: Compile Foma 83 | run: brew install bison cmake && export PATH="$(brew --prefix bison)/bin:$PATH" && cd foma && cmake . && make -j 84 | 85 | - name: Archive production artifacts 86 | uses: actions/upload-artifact@v2 87 | with: 88 | name: dist-macos 89 | path: | 90 | foma/foma 91 | foma/cgflookup 92 | foma/flookup 93 | foma/libfoma.a 94 | foma/libfoma.*.dylib 95 | foma/README* 96 | foma/CHANGELOG 97 | foma/COPYING 98 | -------------------------------------------------------------------------------- /foma/int_stack.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
#define MAX_STACK 2097152
#define MAX_PTR_STACK 2097152

/* Fixed-capacity LIFO of ints; `top` is the index of the newest entry, -1 when empty. */
static int a[MAX_STACK];
static int top = -1;

/* Fixed-capacity LIFO of untyped pointers, same layout as the int stack. */
static void *ptr_stack[MAX_PTR_STACK];
static int ptr_stack_top = -1;

/* Returns non-zero when the pointer stack holds no entries. */
int ptr_stack_isempty(void) {
    return ptr_stack_top == -1;
}

/* Returns non-zero when the pointer stack cannot take another entry. */
int ptr_stack_isfull(void) {
    return (ptr_stack_top == (MAX_PTR_STACK - 1));
}

/* Discards every entry of the pointer stack. */
void ptr_stack_clear(void) {
    ptr_stack_top = -1;
}

/* Removes and returns the newest pointer; terminates the process if the stack is empty. */
void *ptr_stack_pop(void) {
    if (ptr_stack_isempty()) {
        /* Without this guard the read would hit index -1 and leave the top index at -2. */
        fprintf(stderr, "Pointer stack empty!\n");
        exit(1);
    }
    return ptr_stack[ptr_stack_top--];
}

/* Pushes ptr; terminates the process if the stack is already full. */
void ptr_stack_push(void *ptr) {
    if (ptr_stack_isfull()) {
        fprintf(stderr, "Pointer stack full!\n");
        exit(1);
    }
    ptr_stack[++ptr_stack_top] = ptr;
}


/* Returns non-zero when the int stack holds no entries. */
int int_stack_isempty(void) {
    return top == -1;
}

/* Returns non-zero when the int stack cannot take another entry. */
int int_stack_isfull(void) {
    return (top == (MAX_STACK - 1));
}

/* Discards every entry of the int stack. */
void int_stack_clear(void) {
    top = -1;
}

/* Returns 1 if entry is currently somewhere on the int stack, 0 otherwise (linear scan). */
int int_stack_find (int entry) {
    int i;
    for (i = 0; i <= top; i++) {
        if (entry == a[i]) {
            return 1;
        }
    }
    return 0;
}

/* Returns the number of entries currently on the int stack. */
int int_stack_size (void) {
    return (top + 1);
}

/* Pushes c; terminates the process if the stack is already full. */
void int_stack_push(int c) {
    if (int_stack_isfull()) {
        fprintf(stderr, "Stack full!\n");
        exit(1);
    }
    a[++top] = c;
}


/* Removes and returns the newest int; terminates the process if the stack is empty. */
int int_stack_pop(void) {
    if (int_stack_isempty()) {
        /* Without this guard the read would hit index -1 and leave the top index at -2. */
        fprintf(stderr, "Stack empty!\n");
        exit(1);
    }
    return a[top--];
}
%option noyywrap
%option nounput
%option noinput
%{

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "foma.h"

extern int cmatrixlex();
static struct fsm *mnet;      /* network whose confusion matrix is being filled in */
static int currcost;          /* cost applied to the pairs that follow a "Cost N" line */
static char *instring;        /* input side of an "in:out" pair, held until the output side is scanned */

/*
 * Scan the confusion-matrix specification in my_string and record it on net.
 * The matrix is initialised with cmatrix_init() first; the current cost
 * starts at 1 until a "Cost N" directive changes it.
 */
void my_cmatrixparse(struct fsm *net, char *my_string) {

   YY_BUFFER_STATE my_string_buffer;

   currcost = 1;
   my_string_buffer = cmatrix_scan_string(my_string);
   mnet = net;
   cmatrix_init(mnet);
   cmatrixlex();
   cmatrix_delete_buffer(my_string_buffer);
}

%}

ANYUTF [\001-\177]|[\300-\337].|[\340-\357]..|[\360-\367]...
NOCOLON ([\001-\177]{-}[\011\040\012\014\072])|[\300-\337].|[\340-\357]..|[\360-\367]...
SP [\040]|[\011]|[\012]|[\014]

%x SUB DEL INS COST OUTSTRING

%%

Substitute{SP}+/[0-9]+ { /* default substitution cost follows */ BEGIN(SUB); }
Delete{SP}+/[0-9]+ { /* default deletion cost follows */ BEGIN(DEL); }
Insert{SP}+/[0-9]+ { /* default insertion cost follows */ BEGIN(INS); }
Cost{SP}+/[0-9]+ { /* cost for the pairs that follow */ BEGIN(COST); }
^#.* { /* comment line */ }

:{NOCOLON}+ {
    /* ":x" - no input side; skip the leading colon */
    cmatrix_set_cost(mnet, NULL, cmatrixtext+1, currcost);
}

{NOCOLON}+: {
    /* "x:" - no output side; cut off the trailing colon in place */
    *(cmatrixtext+strlen(cmatrixtext)-1) = '\0';
    cmatrix_set_cost(mnet, cmatrixtext, NULL, currcost);
}

{NOCOLON}+:/{NOCOLON}+ {
    /* "x:y" - keep a copy of the input side (without the colon), then scan the output side */
    instring = xxstrndup(cmatrixtext, strlen(cmatrixtext)-1);
    BEGIN(OUTSTRING);

}
<OUTSTRING>{NOCOLON}+ {
    /* NOTE(review): instring is heap-allocated above and never freed in this
       file - confirm whether cmatrix_set_cost() takes ownership of it. */
    cmatrix_set_cost(mnet, instring, cmatrixtext, currcost);
    BEGIN(INITIAL);
}

<SUB>[0-9]+ {
    cmatrix_default_substitute(mnet, atoi(cmatrixtext));
    BEGIN(INITIAL);
}
<DEL>[0-9]+ {
    cmatrix_default_delete(mnet, atoi(cmatrixtext));
    BEGIN(INITIAL);
}
<INS>[0-9]+ {
    cmatrix_default_insert(mnet, atoi(cmatrixtext));
    BEGIN(INITIAL);
}

<COST>[0-9]+ {
    currcost = atoi(cmatrixtext);
    BEGIN(INITIAL);
}

<*>[\012|\040] { }
*/ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. */ 17 | 18 | #include 19 | #include "foma.h" 20 | 21 | struct fsm *fsm_lower(struct fsm *net) { 22 | struct fsm_state *fsm; 23 | int i, prevstate, out; 24 | fsm = net->states; 25 | fsm_state_init(sigma_max(net->sigma)); 26 | prevstate = -1; 27 | for (i = 0; (fsm+i)->state_no != - 1; prevstate = (fsm+i)->state_no, i++) { 28 | if (prevstate != -1 && prevstate != (fsm+i)->state_no) { 29 | fsm_state_end_state(); 30 | } 31 | if (prevstate != (fsm+i)->state_no) { 32 | fsm_state_set_current_state((fsm+i)->state_no, (fsm+i)->final_state, (fsm+i)->start_state); 33 | } 34 | if ((fsm+i)->target != -1) { 35 | out = ((fsm+i)->out == UNKNOWN) ? IDENTITY : (fsm+i)->out; 36 | fsm_state_add_arc((fsm+i)->state_no, out, out, (fsm+i)->target, (fsm+i)->final_state, (fsm+i)->start_state); 37 | } 38 | } 39 | fsm_state_end_state(); 40 | free(net->states); 41 | fsm_state_close(net); 42 | fsm_update_flags(net,NO,NO,NO,UNK,UNK,UNK); 43 | sigma_cleanup(net,0); 44 | return(net); 45 | } 46 | 47 | struct fsm *fsm_upper(struct fsm *net) { 48 | struct fsm_state *fsm; 49 | int i, prevstate, in; 50 | fsm = net->states; 51 | fsm_state_init(sigma_max(net->sigma)); 52 | prevstate = -1; 53 | for (i = 0; (fsm+i)->state_no != - 1; prevstate = (fsm+i)->state_no, i++) { 54 | if (prevstate != -1 && prevstate != (fsm+i)->state_no) { 55 | fsm_state_end_state(); 56 | } 57 | if (prevstate != (fsm+i)->state_no) { 58 | fsm_state_set_current_state((fsm+i)->state_no, (fsm+i)->final_state, (fsm+i)->start_state); 59 | } 60 | if ((fsm+i)->target != -1) { 61 | in = ((fsm+i)->in == UNKNOWN) ? 
IDENTITY : (fsm+i)->in; 62 | fsm_state_add_arc((fsm+i)->state_no, in, in, (fsm+i)->target, (fsm+i)->final_state, (fsm+i)->start_state); 63 | } 64 | } 65 | fsm_state_end_state(); 66 | free(net->states); 67 | fsm_state_close(net); 68 | fsm_update_flags(net,NO,NO,NO,UNK,UNK,UNK); 69 | sigma_cleanup(net,0); 70 | return(net); 71 | } 72 | -------------------------------------------------------------------------------- /foma/stringhash.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include "fomalib.h" 19 | #include 20 | 21 | #define STRING_HASH_SIZE 8191 22 | 23 | unsigned int sh_hashf(char *string); 24 | 25 | struct sh_handle *sh_init() { 26 | struct sh_handle *sh; 27 | sh = malloc(sizeof(struct sh_handle)); 28 | sh->hash = calloc(STRING_HASH_SIZE, sizeof(struct sh_hashtable)); 29 | return(sh); 30 | } 31 | 32 | void sh_done(struct sh_handle *sh) { 33 | int i; 34 | struct sh_hashtable *hash, *hashp; 35 | for (i=0; i < STRING_HASH_SIZE; i++) { 36 | hash = sh->hash + i; 37 | if (hash->string != NULL) 38 | free(hash->string); 39 | for (hash=hash->next ; hash != NULL ; hash = hashp) { 40 | hashp = hash->next; 41 | if (hash->string != NULL) 42 | free(hash->string); 43 | free(hash); 44 | } 45 | } 46 | free(sh->hash); 47 | free(sh); 48 | } 49 | 50 | int sh_get_value(struct sh_handle *sh) { 51 | return(sh->lastvalue); 52 | } 53 | 54 | char *sh_find_string(struct sh_handle *sh, char *string) { 55 | struct sh_hashtable *hash; 56 | for (hash = sh->hash + sh_hashf(string) ; hash != NULL; hash = hash->next) { 57 | if (hash->string == NULL) 58 | return NULL; 59 | if (strcmp(hash->string, string) == 0) { 60 | sh->lastvalue = hash->value; 61 | return(hash->string); 62 | } 63 | } 64 | return NULL; 65 | } 66 | 67 | char *sh_find_add_string(struct sh_handle *sh, char *string, int value) { 68 | char *s; 69 | s = sh_find_string(sh, string); 70 | if (s == NULL) 71 | return (sh_add_string(sh, string, value)); 72 | else 73 | return(s); 74 | } 75 | 76 | char *sh_add_string(struct sh_handle *sh, char *string, int value) { 77 | struct sh_hashtable *hash, *newhash; 78 | 79 | hash = sh->hash + sh_hashf(string); 80 | if (hash->string == NULL) { 81 | hash->string = strdup(string); 82 | hash->value = value; 83 | return(hash->string); 84 | } else { 85 | newhash = malloc(sizeof(struct sh_hashtable)); 86 | newhash->string = strdup(string); 87 | newhash->value = value; 88 | newhash->next = hash->next; 89 | hash->next = newhash; 90 | return(newhash->string); 91 | } 
92 | } 93 | 94 | unsigned int sh_hashf(char *string) { 95 | register unsigned int hash; 96 | hash = 0; 97 | 98 | while (*string != '\0') { 99 | hash = hash * 101 + *string++; 100 | } 101 | return (hash % STRING_HASH_SIZE); 102 | } 103 | -------------------------------------------------------------------------------- /foma/python/phonrule.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple helper tool for debugging foma-scripts 3 | that are sequences of phonological rules meant to apply 4 | in a certain order. It assumes a grammar is written as a 5 | sequence of define-statements and ordered rewrite-rules 6 | combined with a chain-statement (simulating composition 7 | of the rules). It then passes words from STDIN through 8 | the sequence of transducers and prints a decorated output 9 | to STDOUT where rules that fire are shown between brackets. 10 | 11 | Example: 12 | 13 | ----myscript.foma---- 14 | def ARule a -> b || c _ d; # Rule one 15 | def BRule b -> c || _ d; # Rule two 16 | chain ARule, BRule 17 | ---------------------- 18 | 19 | Now, 20 | 21 | $echo "cad" | python phonrule.py myscript.foma 22 | 23 | produces the output 24 | 25 | cad[ARule|Rule one]cbd[BRule|Rule two]ccd 26 | 27 | Author: Mans Hulden 28 | License: Apache (Version 2.0) 29 | Last Update: 11/07/2016 30 | """ 31 | 32 | import sys 33 | import re 34 | from foma import * 35 | 36 | import codecs 37 | 38 | class Ruleset: 39 | 40 | def __init__(self): 41 | self.rules = {} 42 | self.comments = {} 43 | self.rc = [] 44 | self.zerosymbols = ['"\u00b7"'] # Special symbols that behave like 0 in rules 45 | 46 | def rule_add(self, rulename, rule, commentline): 47 | """Compiles a rule, adds it to defined FSMs and stores a comment line 48 | and a rule name.""" 49 | 50 | if '->' in rule: 51 | zeroes = '|'.join(self.zerosymbols) 52 | rule = '[~$[' + zeroes + '] .o. 
[' + rule + ']]/[' + zeroes + ']' 53 | 54 | FST.define(rule, rulename) 55 | myrule = FST(rule) 56 | self.rules[rulename] = myrule 57 | self.comments[rulename] = commentline 58 | 59 | def readrules(self, fomalines): 60 | """Reads foma rules either as define statements, or chain statements. 61 | The input lines may also consist of comments following a hash, e.g. 62 | define Crule C -> 0 || _ C C # Delete first of three C's 63 | define Vrule V -> 0 || _ V V # Delete first of three V's 64 | chain Crule, Vrule 65 | """ 66 | for lineno, l in enumerate(fomalines): 67 | if 'define' in l or 'def ' in l: 68 | rulecom = l.split(' #') 69 | r = re.findall("(defi?n?e?)\s+(\S+)\s+([^;]+)", rulecom[0]) 70 | if len(r[0]) != 3: 71 | print "Syntax error on line %i" % lineno 72 | (_, rulename, rule) = r[0] 73 | if len(rulecom) > 1: 74 | commentline = rulecom[1].strip() 75 | else: 76 | commentline = '' 77 | self.rule_add(rulename, rule, commentline) 78 | if 'chain' in l: 79 | l = l.replace(';','') 80 | chain = re.findall('chain\s+(.*)', l) 81 | rc = chain[0].replace(' ','').split(',') 82 | self.rc = rc 83 | 84 | def applyrules(self, word, printall = True): 85 | """Apply a list of rules simulating composition. 86 | returns a string representation of the derivation.""" 87 | output = [word] 88 | s = output[0] 89 | for rulename in self.rc: 90 | try: 91 | transducer = self.rules[rulename] 92 | except KeyError, e: 93 | key = rulename.decode("utf-8").encode("utf-8") 94 | print 'KeyError: Rule "%s" not found!' 
% key 95 | raise KeyError('Key not found: '+key); 96 | comment = self.comments[rulename] 97 | newoutput = transducer[output[0]] 98 | if newoutput[0] != output[0] or printall == True: 99 | if comment != '': 100 | s += "[" + rulename + "|" + comment + "]" 101 | else: 102 | s += "[" + rulename + "]" 103 | s += newoutput[0] 104 | output = newoutput 105 | return s 106 | 107 | def main(argv): 108 | if len(sys.argv) < 2: 109 | print "Usage: phonrule.py [foma script]\n" 110 | sys.exit(1) 111 | fomafile = sys.argv[1] 112 | fomalines = [line.rstrip() for line in open(fomafile)] 113 | r = Ruleset() 114 | r.readrules(fomalines) 115 | if len(sys.argv) > 2: 116 | print r.applyrules(sys.argv[2], printall = False) 117 | else: 118 | for w in iter(sys.stdin.readline, ''): 119 | print r.applyrules(w.rstrip(), printall = False) 120 | sys.exit(1) 121 | 122 | if __name__ == "__main__": 123 | main(sys.argv[1:]) 124 | -------------------------------------------------------------------------------- /foma/define.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "foma.h" 5 | 6 | extern int g_verbose; 7 | struct defined_networks *g_defines; 8 | struct defined_functions *g_defines_f; 9 | 10 | /* Find a defined symbol from the symbol table */ 11 | /* Return the corresponding FSM */ 12 | struct fsm *find_defined(struct defined_networks *def, char *string) { 13 | struct defined_networks *d; 14 | for (d = def; d != NULL; d = d->next) { 15 | if (d->name != NULL && strcmp(string, d->name) == 0) { 16 | return(d->net); 17 | } 18 | } 19 | return NULL; 20 | } 21 | 22 | struct defined_networks *defined_networks_init(void) { 23 | struct defined_networks *def; 24 | def = calloc(1, sizeof(struct defined_networks)); /* Dummy first entry, so we can maintain the ptr */ 25 | return def; 26 | } 27 | 28 | struct defined_functions *defined_functions_init(void) { 29 | struct defined_functions *deff; 30 | deff = calloc(1, 
sizeof(struct defined_functions)); /* Dummy first entry */ 31 | return deff; 32 | } 33 | 34 | /* Removes a defined network from the list */ 35 | /* Returns 0 on success, 1 if the definition did not exist */ 36 | /* Undefines all if NULL is passed as the string argument */ 37 | 38 | int remove_defined(struct defined_networks *def, char *string) { 39 | struct defined_networks *d, *d_prev, *d_next; 40 | int exists = 0; 41 | /* Undefine all */ 42 | if (string == NULL) { 43 | for (d = def; d != NULL; d = d_next) { 44 | d_next = d->next; 45 | if (d->net != NULL) 46 | fsm_destroy(d->net); 47 | if (d->name != NULL) 48 | free(d->name); 49 | } 50 | return 0; 51 | } 52 | d_prev = NULL; 53 | for (d = def; d != NULL; d_prev = d, d = d->next) { 54 | if (d->name != NULL && strcmp(d->name, string) == 0) { 55 | exists = 1; 56 | break; 57 | } 58 | } 59 | if (exists == 0) { 60 | return 1; 61 | } 62 | if (d == def) { 63 | if (d->next != NULL) { 64 | fsm_destroy(d->net); 65 | free(d->name); 66 | d->name = d->next->name; 67 | d->net = d->next->net; 68 | d_next = d->next->next; 69 | free(d->next); 70 | d->next = d_next; 71 | } else { 72 | fsm_destroy(d->net); 73 | free(d->name); 74 | d->next = NULL; 75 | d->name = NULL; 76 | d->net = NULL; 77 | } 78 | } else { 79 | fsm_destroy(d->net); 80 | free(d->name); 81 | d_prev->next = d->next; 82 | free(d); 83 | } 84 | return 0; 85 | } 86 | 87 | /* Finds defined regex "function" based on name, numargs */ 88 | /* Returns the corresponding regex */ 89 | char *find_defined_function(struct defined_functions *deff, char *name, int numargs) { 90 | struct defined_functions *d; 91 | for (d = deff ; d != NULL; d = d->next) { 92 | if (d->name != NULL && strcmp(d->name, name) == 0 && d->numargs == numargs) { 93 | return(d->regex); 94 | } 95 | } 96 | return NULL; 97 | } 98 | 99 | /* Add a function to list of defined functions */ 100 | int add_defined_function(struct defined_functions *deff, char *name, char *regex, int numargs) { 101 | struct 
defined_functions *d; 102 | for (d = deff; d != NULL; d = d->next) { 103 | if (d->name != NULL && strcmp(d->name, name) == 0 && d->numargs == numargs) { 104 | free(d->regex); 105 | d->regex = strdup(regex); 106 | if (g_verbose) 107 | { 108 | fprintf(stderr,"redefined %s@%i)\n", name, numargs); 109 | fflush(stderr); 110 | } 111 | return 1; 112 | } 113 | } 114 | if (deff->name == NULL) { 115 | d = deff; 116 | } else { 117 | d = malloc(sizeof(struct defined_functions)); 118 | d->next = deff->next; 119 | deff->next = d; 120 | } 121 | d->name = strdup(name); 122 | d->regex = strdup(regex); 123 | d->numargs = numargs; 124 | return 0; 125 | } 126 | 127 | /* Add a network to list of defined networks */ 128 | /* Returns 0 on success or 1 on redefinition or -1 if name is too long */ 129 | /* Always maintain head of list at same ptr */ 130 | 131 | int add_defined(struct defined_networks *def, struct fsm *net, char *string) { 132 | struct defined_networks *d; 133 | if (net == NULL) 134 | return 0; 135 | if (strlen(string) > FSM_NAME_LEN) { 136 | return(-1); 137 | } 138 | 139 | fsm_count(net); 140 | for (d = def; d != NULL; d = d->next) { 141 | if (d->name != NULL && strcmp(d->name, string) == 0) { 142 | fsm_destroy(d->net); 143 | free(d->name); 144 | d->net = net; 145 | d->name = strdup(string); 146 | return 1; 147 | } 148 | } 149 | if (def->name == NULL) { 150 | d = def; 151 | } else { 152 | d = malloc(sizeof(struct defined_networks)); 153 | d->next = def->next; 154 | def->next = d; 155 | } 156 | d->name = strdup(string); 157 | d->net = net; 158 | return 0; 159 | } 160 | -------------------------------------------------------------------------------- /foma/contrib/foma2js.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ################################################################### 4 | # Converts foma file to js array for use with Javascript runtime # 5 | # Outputs a js array of all the transitions, 
indexed in the # 6 | # input direction. This array can be passed to the js function # 7 | # foma_apply_down() in foma_apply_down.js for stand-alone # 8 | # transducer application. # 9 | # # 10 | # Usage: foma2js [-n array variable name] [file] # 11 | # MH 20120127 # 12 | ################################################################### 13 | 14 | use Switch; 15 | use utf8; 16 | use Compress::Zlib; 17 | 18 | my $buffer ; my $filein ; my $file; my $jsnetname = 'myNet'; 19 | 20 | die "Usage: fomatojs [-n name] filename" if $#ARGV < 0; 21 | foreach (my $argnum = 0 ; $argnum <= $#ARGV; $argnum++) { 22 | if ($ARGV[$argnum] =~ '^-h$') { 23 | print "Usage: foma2js [-n array variable name] [file]\n"; 24 | exit; 25 | } 26 | if ($ARGV[$argnum] =~ '^-n$') { 27 | if ($argnum+1 <= $#ARGV) { 28 | $jsnetname = $ARGV[$argnum+1]; 29 | } else { 30 | die; 31 | } 32 | } else { 33 | $file = $ARGV[$argnum]; 34 | } 35 | } 36 | 37 | my $gz = gzopen($file, "rb") 38 | or die "Cannot open $file: $gzerrno\n" ; 39 | while ($gz->gzread($buffer) > 0) { 40 | $filein .= $buffer; 41 | } 42 | 43 | die "Error reading from $file: $gzerrno" . ($gzerrno+0) . 
"\n" 44 | if $gzerrno != Z_STREAM_END ; 45 | $gz->gzclose(); 46 | 47 | my @lines = split '\n', $filein; 48 | 49 | my $mode = "none"; 50 | my $version; my $numnets = 0; my %pr; 51 | my $line = 0; 52 | my @sigma; 53 | my $longestsymbollength = 0; 54 | foreach (@lines) { 55 | chomp; 56 | if ($_ =~ /##foma-net ([0-9]+\.[0-9]+)##/) { 57 | $version = $1; 58 | $numnets++; 59 | if ($numnets > 1) { 60 | die "Only one network per file supported" 61 | } 62 | next; 63 | } 64 | if ($_ =~ /##props##/) { 65 | $mode = "props"; 66 | next; 67 | } 68 | if ($_ =~ /##sigma##/) { 69 | $mode = "sigma"; 70 | next; 71 | } 72 | if ($_ =~ /##states##/) { 73 | $mode = "states"; 74 | next; 75 | } 76 | if ($_ =~ /##end##/) { 77 | $mode = "none"; 78 | next; 79 | } 80 | switch($mode) { 81 | case "props" { 82 | ($pr{"arity"}, $pr{"arccount"},$pr{"statecount"},$pr{"linecount"}, $pr{"finalcount"},$pr{"pathcount"},$pr{"is_deterministic"},$pr{"is_pruned"},$pr{"is_minimized"},$pr{"is_epsilon_free"},$pr{"is_loop_free"},$pr{"extras"},$pr{"name"}) = split ' '; 83 | } 84 | case "states" { 85 | #state in out target final 86 | @transitions = split ' '; 87 | if ($transitions[0] == -1) { next; } 88 | if ($transitions[1] == -1 && $#transitions == 3) { 89 | $arrstate = $transitions[0]; 90 | $arrfinal = $transitions[3]; 91 | if ($arrfinal == 1) { 92 | $finals[$arrstate] = 1; 93 | } 94 | next; 95 | } 96 | if ($#transitions == 4) { 97 | $arrstate = $transitions[0]; 98 | $arrin = $transitions[1]; 99 | $arrout = $transitions[2]; 100 | $arrtarget = $transitions[3]; 101 | $arrfinal = $transitions[4]; 102 | if ($arrfinal == 1) { 103 | $finals[$arrstate] = 1; 104 | } 105 | } 106 | elsif ($#transitions == 3) { 107 | $arrstate = $transitions[0]; 108 | $arrin = $transitions[1]; 109 | $arrtarget = $transitions[2]; 110 | $arrfinal = $transitions[3]; 111 | $arrout = $arrin; 112 | if ($arrfinal == 1) { 113 | $finals[$arrstate] = 1; 114 | } 115 | } 116 | elsif ($#transitions == 2) { 117 | $arrin = $transitions[0]; 118 | $arrout 
= $transitions[1]; 119 | $arrtarget = $transitions[2]; 120 | } 121 | elsif ($#transitions == 1) { 122 | $arrin = $transitions[0]; 123 | $arrtarget = $transitions[1]; 124 | $arrout = $arrin; 125 | } 126 | push(@{$trans{$arrstate ."|" .$sigma[$arrin]}}, "\{$arrtarget:\'$sigma[$arrout]\'\}"); 127 | } 128 | case "sigma" { 129 | (my $number, my $symbol) = split ' '; 130 | $symbol =~ s/^\@_EPSILON_SYMBOL_\@$//g; 131 | $symbol =~ s/^\@_IDENTITY_SYMBOL_\@$/\@ID\@/g; 132 | $symbol =~ s/^\@_UNKNOWN_SYMBOL_\@$/\@UN\@/g; 133 | $symbol =~ s/'/\\'/g; 134 | $sigma[$number] = $symbol; 135 | if ($number > 2) { 136 | utf8::decode($symbol); 137 | if (length($symbol) > $longestsymbollength) { 138 | $longestsymbollength = length($symbol); 139 | } 140 | 141 | } 142 | } 143 | case "none" { 144 | die "Format error"; 145 | } 146 | } 147 | } 148 | 149 | print "var $jsnetname = new Object;\n"; 150 | print "$jsnetname.t = Array;\n"; 151 | print "$jsnetname.f = Array;\n"; 152 | print "$jsnetname.s = Array;\n\n"; 153 | 154 | foreach $k (keys %trans) { 155 | ($state, $in) = split /\|/, $k; 156 | $in =~ s/^\@UN\@$/\@ID\@/; 157 | print "$jsnetname.t\[$state + '|' + \'$in\'\] = \["; 158 | print join (',', @{$trans{$k}}) ."\];\n"; 159 | } 160 | 161 | for ($i = 0; $i <= $pr{'statecount'}; $i++) { 162 | if (defined $finals[$i] and $finals[$i]) { 163 | print "$jsnetname.f\[$i\] = 1;\n"; 164 | } 165 | } 166 | 167 | for ($i = 3 ; $i <= $#sigma; $i++) { 168 | if (defined $sigma[$i]) { 169 | print "$jsnetname.s\['$sigma[$i]'\] = $i;\n"; 170 | } 171 | } 172 | 173 | print "$jsnetname.maxlen = $longestsymbollength ;\n"; 174 | -------------------------------------------------------------------------------- /foma/python/foma2js.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Converts foma file to js array for use with Javascript runtime 3 | Outputs a js array of all the transitions, indexed in the 4 | input direction. 
This array can be passed to the js function 5 | foma_apply_down() in foma_apply_down.js for stand-alone 6 | transducer application.""" 7 | 8 | import sys 9 | import re 10 | import argparse 11 | import gzip 12 | from enum import Enum 13 | from collections import defaultdict 14 | 15 | 16 | class Mode(Enum): 17 | NONE = 0 18 | PROPS = 1 19 | SIGMA = 2 20 | STATES = 4 21 | 22 | 23 | def main(): 24 | argparser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__) 25 | argparser.add_argument('-n', '--name', help='Array variable name', default='myNet') 26 | argparser.add_argument('file', help='Foma file') 27 | args = argparser.parse_args() 28 | with gzip.open(args.file) as gzfile: 29 | lines = [line.decode('utf-8').strip() for line in gzfile.readlines()] 30 | mode = Mode.NONE 31 | num_nets = 0 32 | version = 0.0 33 | longest_symbol_length = 0 34 | props = {} 35 | sigma = {} 36 | trans = defaultdict(list) 37 | finals = {} 38 | RE_FOMA_NET = re.compile(r'.*##foma-net ([0-9]+\.[0-9]+)##') 39 | RE_PROPS = re.compile(r'##props##') 40 | RE_SIGMA = re.compile(r'##sigma##') 41 | RE_STATES = re.compile(r'##states##') 42 | RE_END = re.compile(r'##end##') 43 | for line in lines: 44 | match = RE_FOMA_NET.match(line) 45 | if match: 46 | version = float(match.group(1)) 47 | num_nets += 1 48 | if num_nets > 1: 49 | raise ValueError('Only one network per file supported') 50 | continue 51 | match = RE_PROPS.match(line) 52 | if match: 53 | mode = Mode.PROPS 54 | continue 55 | match = RE_SIGMA.match(line) 56 | if match: 57 | mode = Mode.SIGMA 58 | continue 59 | match = RE_STATES.match(line) 60 | if match: 61 | mode = Mode.STATES 62 | continue 63 | match = RE_END.match(line) 64 | if match: 65 | mode = Mode.NONE 66 | continue 67 | if mode is Mode.PROPS: 68 | prop_names = ['arity', 'arccount', 'statecount', 'linecount', 'finalcount', 'pathcount', 'is_deterministic', 'is_pruned', 'is_minimized', 'is_epsilon_free', 'is_loop_free', 'extras', 'name'] 69 | props = {k: v for k, v in 
zip(prop_names, line.split(' '))} 70 | elif mode is Mode.STATES: 71 | #state in out target final 72 | transitions = tuple(map(int, line.split(' '))) 73 | if transitions[0] == -1: 74 | continue 75 | if transitions[1] == -1 and len(transitions) == 4: 76 | arr_state, _, _, arr_final = transitions 77 | if arr_final == 1: 78 | finals[arr_state] = 1 79 | continue 80 | if len(transitions) == 5: 81 | arr_state, arr_in, arr_out, arr_target, arr_final = transitions 82 | if arr_final == 1: 83 | finals[arr_state] = 1 84 | elif len(transitions) == 4: 85 | arr_state, arr_in, arr_target, arr_final = transitions 86 | arr_out = arr_in 87 | if arr_final == 1: 88 | finals[arr_state] = 1 89 | elif len(transitions) == 3: 90 | arr_in, arr_out, arr_target = transitions 91 | elif len(transitions) == 2: 92 | arr_in, arr_target = transitions 93 | arr_out = arr_in 94 | trans_key = '{}|{}'.format(arr_state, sigma[arr_in]) 95 | trans_value = '{{{}:\'{}\'}}'.format(arr_target, sigma[arr_out]) 96 | trans[trans_key].append(trans_value) 97 | elif mode is Mode.SIGMA: 98 | tokens = line.split(' ') 99 | number = int(tokens[0]) 100 | symbol = tokens[1] if len(tokens) > 1 else '' 101 | if symbol == '@_EPSILON_SYMBOL_@': 102 | symbol = '' 103 | if symbol == '@_IDENTITY_SYMBOL_@': 104 | symbol = '@ID@' 105 | if symbol == '@_UNKNOWN_SYMBOL_@': 106 | symbol = '@UN@' 107 | if '\'' in symbol: 108 | symbol = symbol.replace('\'', '\\\'') 109 | sigma[number] = symbol 110 | if number > 2 and len(symbol) > longest_symbol_length: 111 | longest_symbol_length = len(symbol) 112 | elif mode is Mode.NONE: 113 | raise ValueError('Format error') 114 | 115 | print('var {} = new Object;'.format(args.name)) 116 | print('{}.t = Array;'.format(args.name)) 117 | print('{}.f = Array;'.format(args.name)) 118 | print('{}.s = Array;'.format(args.name)) 119 | print() 120 | 121 | for key in trans: 122 | state, inp = key.split('|', maxsplit=1) 123 | if inp == '@UN@': 124 | inp = '@ID@' 125 | print('{}.t[{} + \'|\' + \'{}\'] = 
[{}];'.format(args.name, state, inp, ','.join(trans[key]))) 126 | 127 | for i in range(0, int(props['statecount'])+1): 128 | if i in finals: 129 | print('{}.f[{}] = 1;'.format(args.name, i)) 130 | 131 | for i in range(3, len(sigma)): 132 | if i in sigma: 133 | print('{}.s[\'{}\'] = {};'.format(args.name, sigma[i], i)) 134 | 135 | print('{}.maxlen = {} ;'.format(args.name, longest_symbol_length)) 136 | 137 | 138 | if __name__ == "__main__": 139 | main() 140 | -------------------------------------------------------------------------------- /foma/trie.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include "fomalib.h" 19 | #include 20 | 21 | #define THASH_TABLESIZE 1048573 22 | #define TRIE_STATESIZE 32768 23 | 24 | unsigned int trie_hashf(unsigned int source, char *insym, char *outsym); 25 | 26 | struct fsm_trie_handle *fsm_trie_init() { 27 | struct fsm_trie_handle *th; 28 | 29 | th = calloc(1,sizeof(struct fsm_trie_handle)); 30 | th->trie_hash = calloc(THASH_TABLESIZE, sizeof(struct trie_hash)); 31 | th->trie_states = calloc(TRIE_STATESIZE, sizeof(struct trie_states)); 32 | th->statesize = TRIE_STATESIZE; 33 | th->trie_cursor = 0; 34 | th->sh_hash = sh_init(); 35 | return(th); 36 | } 37 | 38 | struct fsm *fsm_trie_done(struct fsm_trie_handle *th) { 39 | struct trie_hash *thash, *thashp; 40 | struct fsm *newnet; 41 | struct fsm_construct_handle *newh; 42 | unsigned int i; 43 | 44 | newh = fsm_construct_init("name"); 45 | for (i = 0; i < THASH_TABLESIZE; i++) { 46 | thash = (th->trie_hash)+i; 47 | for ( ; thash != NULL; thash = thash->next) { 48 | if (thash->insym != NULL) { 49 | fsm_construct_add_arc(newh, thash->sourcestate, thash->targetstate, thash->insym, thash->outsym); 50 | } else { 51 | break; 52 | } 53 | } 54 | } 55 | for (i = 0; i <= th->used_states; i++) { 56 | if ((th->trie_states+i)->is_final == 1) { 57 | fsm_construct_set_final(newh, i); 58 | } 59 | } 60 | fsm_construct_set_initial(newh, 0); 61 | newnet = fsm_construct_done(newh); 62 | /* Free all mem */ 63 | for (i=0; i < THASH_TABLESIZE; i++) { 64 | for (thash=((th->trie_hash)+i)->next; thash != NULL; thash = thashp) { 65 | thashp = thash->next; 66 | free(thash); 67 | } 68 | } 69 | sh_done(th->sh_hash); 70 | free(th->trie_states); 71 | free(th->trie_hash); 72 | free(th); 73 | return(newnet); 74 | } 75 | 76 | void fsm_trie_add_word(struct fsm_trie_handle *th, char *word) { 77 | int i, len; 78 | char *wcopy; 79 | wcopy = strdup(word); 80 | len = strlen(wcopy); 81 | for (i=0 ; *word != '\0' && i < len; word = word + utf8skip(word)+1, i++) { 82 | strncpy(wcopy, word, 
utf8skip(word)+1); 83 | *(wcopy+utf8skip(word)+1) = '\0'; 84 | fsm_trie_symbol(th, wcopy, wcopy); 85 | } 86 | free(wcopy); 87 | fsm_trie_end_word(th); 88 | } 89 | 90 | void fsm_trie_end_word(struct fsm_trie_handle *th) { 91 | (th->trie_states+th->trie_cursor)->is_final = 1; 92 | th->trie_cursor = 0; 93 | } 94 | 95 | void fsm_trie_symbol(struct fsm_trie_handle *th, char *insym, char *outsym) { 96 | unsigned int h; 97 | struct trie_hash *thash, *newthash; 98 | 99 | h = trie_hashf(th->trie_cursor, insym, outsym); 100 | if ((th->trie_hash+h)->insym != NULL) { 101 | for (thash = th->trie_hash+h; thash != NULL; thash = thash->next) { 102 | if (strcmp(thash->insym, insym) == 0 && strcmp(thash->outsym, outsym) == 0 && thash->sourcestate == th->trie_cursor) { 103 | /* Exists, move cursor */ 104 | th->trie_cursor = thash->targetstate; 105 | return; 106 | } 107 | } 108 | } 109 | /* Doesn't exist */ 110 | 111 | /* Insert trans, move counter and cursor */ 112 | th->used_states++; 113 | thash = th->trie_hash+h; 114 | if (thash->insym == NULL) { 115 | thash->insym = sh_find_add_string(th->sh_hash, insym,1); 116 | thash->outsym = sh_find_add_string(th->sh_hash, outsym,1); 117 | thash->sourcestate = th->trie_cursor; 118 | thash->targetstate = th->used_states; 119 | } else { 120 | newthash = calloc(1, sizeof(struct trie_hash)); 121 | newthash->next = thash->next; 122 | newthash->insym = sh_find_add_string(th->sh_hash, insym,1); 123 | newthash->outsym = sh_find_add_string(th->sh_hash, outsym,1); 124 | newthash->sourcestate = th->trie_cursor; 125 | newthash->targetstate = th->used_states; 126 | thash->next = newthash; 127 | } 128 | th->trie_cursor = th->used_states; 129 | 130 | /* Realloc */ 131 | if (th->used_states >= th->statesize) { 132 | th->statesize = next_power_of_two(th->statesize); 133 | th->trie_states = realloc(th->trie_states, th->statesize * sizeof(struct trie_states)); 134 | } 135 | (th->trie_states+th->used_states)->is_final = 0; 136 | } 137 | 138 | unsigned int 
trie_hashf(unsigned int source, char *insym, char *outsym) { 139 | 140 | /* Hash based on insym, outsym, and sourcestate */ 141 | register unsigned int hash; 142 | hash = 0; 143 | 144 | while (*insym != '\0') { 145 | hash = hash * 101 + *insym++; 146 | } 147 | while (*outsym != '\0') { 148 | hash = hash * 101 + *outsym++; 149 | } 150 | hash = hash * 101 + source; 151 | return (hash % THASH_TABLESIZE); 152 | } 153 | -------------------------------------------------------------------------------- /foma/coaccessible.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
/* One predecessor record in the inverse-arc table built by
   fsm_coaccessible(). The head record for a state t is stored directly in
   the inverses[] array; a head whose state is -1 means that no predecessor
   has been recorded for t yet. Further predecessors are chained via next. */
struct invtable {
    int state;
    struct invtable *next;
};

/* Removes from net every state that cannot reach a final state, together
   with all arcs leading into such states, and renumbers the states that
   remain. The line table net->states is compacted in place; net->linecount,
   net->arccount and net->statecount are updated when anything was removed.
   If no state is final at all, the line table and the alphabet are replaced
   by those of the empty language. net->is_pruned is always set to YES.
   Returns net. */
struct fsm *fsm_coaccessible(struct fsm *net) {

    struct invtable *inverses, *temp_i, *temp_i_prev, *current_ptr;
    int i, j, s, t, *coacc, current_state, markcount, *mapping, terminate, new_linecount, new_arccount, *added, old_statecount;


    struct fsm_state *fsm;

    fsm = net->states;
    new_arccount = 0;
    /* printf("statecount %i\n",net->statecount); */
    /* Remembered because net->statecount is overwritten below, while the
       inverse table still has to be freed with its original size. */
    old_statecount = net->statecount;
    inverses = calloc(net->statecount, sizeof(struct invtable));
    coacc = malloc(sizeof(int)*(net->statecount));
    mapping = malloc(sizeof(int)*(net->statecount));
    added = malloc(sizeof(int)*(net->statecount));

    /* coacc[s]: s is known to reach a final state.
       added[s]: at least one output line has been emitted for s. */
    for (i=0; i < (net->statecount); i++) {
        (inverses+i)->state = -1;
        *(coacc+i) = 0;
        *(added+i) = 0;
    }

    /* Build the inverse-arc table: for every arc s -> t (self-loops are
       ignored) record s as a predecessor of t. The first predecessor goes
       into the array slot itself, later ones are linked in behind it. */
    for (i=0; (fsm+i)->state_no != -1; i++) {
        s = (fsm+i)->state_no;
        t = (fsm+i)->target;
        if (t != -1 && s != t) {

            if (((inverses+t)->state) == -1) {
                (inverses+t)->state = s;
            } else {
                temp_i = malloc(sizeof(struct invtable));
                temp_i->next = (inverses+t)->next;
                (inverses+t)->next = temp_i;
                temp_i->state = s;
            }
        }
    }

    /* Push & mark finals */

    markcount = 0;
    for (i=0; (fsm+i)->state_no != -1; i++) {
        if ((fsm+i)->final_state && (!*(coacc+((fsm+i)->state_no)))) {
            int_stack_push((fsm+i)->state_no);
            *(coacc+(fsm+i)->state_no) = 1;
            markcount++;
        }
    }

    /* Backward search from the final states along the inverse arcs; every
       state reached this way is coaccessible. */
    terminate = 0;
    while(!int_stack_isempty()) {
        current_state = int_stack_pop();
        current_ptr = inverses+current_state;
        while(current_ptr != NULL && current_ptr->state != -1) {
            if (!*(coacc+(current_ptr->state))) {
                *(coacc+(current_ptr->state)) = 1;
                int_stack_push(current_ptr->state);
                markcount++;
            }
            current_ptr = current_ptr->next;
        }
        /* Every state is marked: nothing to remove, so skip the rewrite. */
        if (markcount >= net->statecount) {
            /* printf("Already coacc\n"); */
            terminate = 1;
            int_stack_clear();
            break;
        }
    }


    if (terminate == 0) {
        /* New numbering: state 0 keeps number 0, the remaining coaccessible
           states are numbered consecutively in their original order.
           mapping[] is only filled in for state 0 and coaccessible states. */
        *mapping = 0; /* state 0 always exists */
        new_linecount = 0;
        for (i=1,j=0; i < (net->statecount);i++) {
            if (*(coacc+i) == 1) {
                j++;
                *(mapping+i) = j;
            }
        }

        /* Compact the line table in place: i reads, j writes. The order of
           the statements below matters because line j may be the very line
           that is being read. */
        for (i=0,j=0; (fsm+i)->state_no != -1; i++) {
            /* On leaving the lines of a final state for which nothing was
               emitted (all of its arcs were dropped), emit an arc-less line
               so that the state and its finality are kept. */
            if (i > 0 && (fsm+i)->state_no != (fsm+i-1)->state_no && (fsm+i-1)->final_state && !*(added+((fsm+i-1)->state_no))) {
                add_fsm_arc(fsm, j++, *(mapping+((fsm+i-1)->state_no)), -1, -1, -1, 1, (fsm+i-1)->start_state);
                new_linecount++;
                *(added+((fsm+i-1)->state_no)) = 1;
                /* printf("addf ad %i\n",i); */
            }
            /* Keep a line when its source is coaccessible and it either has
               no arc or its target is coaccessible as well. */
            if (*(coacc+((fsm+i)->state_no)) && (((fsm+i)->target == -1) || *(coacc+((fsm+i)->target)))) {
                (fsm+j)->state_no = *(mapping+((fsm+i)->state_no));
                if ((fsm+i)->target == -1) {
                    (fsm+j)->target = -1;
                } else {
                    (fsm+j)->target = *(mapping+((fsm+i)->target));
                }
                (fsm+j)->final_state = (fsm+i)->final_state;
                (fsm+j)->start_state = (fsm+i)->start_state;
                (fsm+j)->in = (fsm+i)->in;
                (fsm+j)->out = (fsm+i)->out;
                j++;
                new_linecount++;
                *(added+(fsm+i)->state_no) = 1;
                if ((fsm+i)->target != -1) {
                    new_arccount++;
                }
            }
        }

        /* Same rescue as inside the loop, for the state of the last line. */
        if ((i > 1) && ((fsm+i-1)->final_state) && *(added+((fsm+i-1)->state_no)) == 0) {
            /* printf("addf\n"); */
            add_fsm_arc(fsm, j++, *(mapping+((fsm+i-1)->state_no)), -1, -1, -1, 1, (fsm+i-1)->start_state);
            new_linecount++;
        }

        /* Nothing survived: leave a single arc-less line for state 0. */
        if (new_linecount == 0) {
            add_fsm_arc(fsm, j++, 0, -1, -1, -1, -1, -1);
        }

        /* Terminating sentinel line. */
        add_fsm_arc(fsm, j, -1, -1, -1, -1, -1, -1);
        if (markcount == 0) {
            /* We're dealing with the empty language */
            free(fsm);
            net->states = fsm_empty();
            fsm_sigma_destroy(net->sigma);
            net->sigma = sigma_create();
        }
        net->linecount = new_linecount;
        net->arccount = new_arccount;
        net->statecount = markcount;
    }

    /* printf("Markccount %i \n",markcount); */

    /* Free the chained predecessor records; the head records are part of
       the inverses[] array and are released with it. */
    for (i = 0; i < old_statecount ; i++) {
        for (temp_i = inverses+i; temp_i != NULL ; ) {
            temp_i_prev = temp_i;
            temp_i = temp_i->next;
            if (temp_i_prev != inverses+i)
                free(temp_i_prev);
        }
    }
    free(inverses);

    free(coacc);
    free(added);
    free(mapping);
    net->is_pruned = YES;
    return(net);
}
26 | # Step 2: Mark the final syllable of each word as extrametrical. 27 | # Step 3: Assign iambs {(L[H]),(L[L]),([H])} iteratively from left to right. [] is stressed 28 | # Step 4: When the entire metrical domain is a single light syllable, assign ([L]) to it. 29 | # Step 5: Lengthen the vowel of each strong open syllable. 30 | 31 | def V [a|e|i|o|u]; # Vowels 32 | def SV [á|é|í|ó|ú]; # Stressed Vowels 33 | def LSV SV ":"; # Long stressed vowels 34 | def LV [SV|V] ":"; # Long vowels 35 | 36 | def C [b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z|tʃ]; # Consonants 37 | 38 | def E .#. | "." ; # Edges 39 | def Light C* [V|SV] ; # Light syllables 40 | def Heavy C* [[V|SV] C+|C* [LSV|LV] C*] ; # Heavy syllables 41 | 42 | def Syllable Light | Heavy; 43 | 44 | def Syllabify C* V+ C* @> ... "." || _ (C) V ; # A leftmost-shortest rule 45 | # provides classical MaxOnset behavior 46 | def MarkFinal \E* -> %< ... %> || E _ .#. ; # Mark final syllable as extrametrical 47 | def Stress [a:á|e:é|i:í|o:ó|u:ú]; # Stressed counterparts of vowels 48 | def Iambs ( Light %. ) Heavy | Light %. Light @-> %( ... %) || E _ E .o. Stress -> || _ \V* %) ; 49 | def OnlyL 0:%( C* Stress 0:%) -> || .#. _ [%. %< | .#.] ; # Single light syllable stress 50 | 51 | def Lengthen [..] -> {:} || [á|é|í|ó|ú] _ %) ; # Vowel lengthening 52 | 53 | regex [C|V]* .o. Syllabify .o. MarkFinal .o. Iambs .o. OnlyL .o. Lengthen .o. %<|%> -> 0; 54 | 55 | echo Testing words with rule-based account 56 | 57 | down atʃowowo 58 | down kaja 59 | down tohkurehona 60 | down torono 61 | down kananihno 62 | 63 | def Rule; 64 | 65 | # What follows is the OT analysis given in Kager (pp. 150-160), with the UnevenIamb 66 | # constraint fixed to only treat syllables with long vowels as heavy, 67 | # which yields precisely the same transducer as the above rule-based account. 68 | 69 | def S \"."*; 70 | def Scan S ["." S]* (->) %( ... %) || E _ E; 71 | 72 | def Length [V|SV] (->) ... 
{:}; 73 | # We rig the stress to be assigned only on rightmost syllables in feet 74 | def Stress a (->) á, e (->) é, i (->) í , o (->) ó , u (->) ú || _ C* %) ; 75 | # The syllabification is given to our Gen: 76 | def Gen [C|V]* .o. Syllabify .o. Scan .o. Stress .o. Length; 77 | 78 | # General helper macros 79 | def AddViol [?* 0:"*" ?*]+; 80 | def Worsen [Gen.i .o. Gen]/{*} .o. AddViol; 81 | def Markup %(|%)|"."|":"; 82 | def Str a (->) á, e (->) é, i (->) í , o (->) ó , u (->) ú; 83 | # Undo Gen, add at least one violation, then redo Gen arbitrarily 84 | def Worsen Markup -> 0 .o. Str.i .o. [?* 0:{*} ?*]+ .o. 0 -> Markup .o. Str; 85 | 86 | # Eval macro 87 | def Eval(X) X .o. ~[[X .o. Worsen].l] .o. {*} -> 0; 88 | 89 | # CONSTRAINTS 90 | # "A grammatical word must be a prosodic word" (Kager, p. 152) 91 | def GrWd=PrWd [..] -> {*} || .#. _ \"("* .#. ; 92 | 93 | # "Feet are binary under moraic or syllabic analysis" (Kager, p. 156) 94 | def FtBin [..] -> {*} || %( [\["."|")"]* "."]^>1 \["."|")"]* %) _ , %( Light %) _; 95 | 96 | # "No foot is final in PrWd" (Kager, p. 151) 97 | def NonFinality [..] -> {*} || %) _ .#.; 98 | 99 | # "Heavy syllables are stressed" (Kager, p. 155) 100 | def WSP [Heavy & ~$SV] -> ... {*}; 101 | 102 | # "(LH) > (LL,H)" (Kager, p. 151) 103 | def UnevenIamb [%( Light "." Light %) | %( [Heavy-[(C) [SV|V] C+]] %) ] -> ... {*} ; 104 | 105 | # Uncomment the below line to get original OT behavior. The above line is the fix 106 | # which says a syllable is H only if it has a long vowel in it (i.e. CVC doesn't count). 107 | # This fix produces equivalence with the rule-based account. 108 | # def UnevenIamb [%( Light "." Light %) | %( Heavy %) ] -> ... {*} ; 109 | 110 | # "Syllables are parsed by feet" (Kager, p. 153) 111 | def ParseSyl Syllable -> ... {*} || E _ E; 112 | 113 | # Worsener helper for All-Feet-Left 114 | def WorsenFeet %)|%.|":" -> 0 .o. [?+ 0:%( [?|0:%(]*] | [?* [%(:0 ?+ 0:%(] ?*]+ .o. 115 | 0 -> %)|%.|":" .o. Str.i .o. 
Str ; 116 | 117 | # A macro to handle the All-Feet-Left worsening elegantly. 118 | def EvalAllFeetLeft(X) X .o. ~[X .o. WorsenFeet].l ; 119 | 120 | # "Output moras have input correspondents" (Kager, p. 156) 121 | def DepMuIO ":" -> ... {*}; 122 | 123 | # This is the complete ranking: 124 | # NonFinality >> GrWd=PrWd >> FtBin >> WSP >> UnevenIamb >> ParseSyl >> AllFeetLeft >> DepMuIO 125 | 126 | regex Eval(EvalAllFeetLeft(Eval(Eval(Eval(Eval(Eval(Eval(Gen .o. NonFinality) .o. GrWd=PrWd) .o. FtBin) .o. WSP) .o. UnevenIamb) .o. ParseSyl)) .o. DepMuIO); 127 | 128 | echo Testing words with OT-based account 129 | 130 | down atʃowowo 131 | down kaja 132 | down tohkurehona 133 | down torono 134 | down kananihno 135 | 136 | def OT; 137 | 138 | echo Testing (structural) equivalence of OT and rule-based account with fixed Uneven-Iamb constraint 139 | 140 | regex OT; 141 | regex Rule; 142 | test equivalent 143 | -------------------------------------------------------------------------------- /foma/contrib/foma.plist: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 15 | 16 | 17 | 18 | BBEditDocumentType 19 | CodelessLanguageModule 20 | 21 | 22 | BBLMLanguageDisplayName 23 | Foma 24 | 25 | BBLMLanguageCode 26 | foma 27 | 28 | BBLMPreferredFilenameExtension 29 | foma 30 | 31 | BBLMSuffixMap 32 | 33 | 34 | BBLMLanguageSuffix 35 | .foma 36 | 37 | 38 | 39 | 40 | BBLMCommentLineDefault 41 | ! 
42 | 43 | 45 | BBLMColorsSyntax 46 | 47 | 48 | BBLMSupportsTextCompletion 49 | 50 | 51 | 52 | 54 | BBLMKeywordList 55 | 56 | OFF 57 | ON 58 | ambiguous 59 | apply 60 | apropos 61 | att 62 | att-epsilon 63 | clear 64 | close 65 | cmatrix 66 | compact 67 | complete 68 | compose 69 | compose-tristate 70 | concatenate 71 | crossproduct 72 | def 73 | define 74 | defined 75 | determinize 76 | dot 77 | down 78 | echo 79 | eliminate 80 | equivalent 81 | export 82 | extract 83 | flag 84 | flag-diacritics 85 | flags 86 | for 87 | functional 88 | help 89 | hopcroft-min 90 | identity 91 | ignore 92 | in 93 | intersect 94 | invert 95 | label 96 | letter 97 | lexc 98 | lexc-align 99 | license 100 | load 101 | lower-side 102 | lower-universal 103 | lower-words 104 | machine 105 | med 106 | med-cutoff 107 | med-limit 108 | minimal 109 | minimize 110 | name 111 | negate 112 | net 113 | non-null 114 | null 115 | obey-flags 116 | one-plus 117 | out 118 | pairs 119 | pop 120 | print 121 | print-pairs 122 | print-sigma 123 | print-space 124 | prolog 125 | prune 126 | push 127 | quit 128 | quit-on-fail 129 | random-lower 130 | random-pairs 131 | random-upper 132 | random-words 133 | read 134 | recursive-define 135 | re 136 | regex 137 | reverse 138 | rotate 139 | save 140 | sequential 141 | set 142 | shortest-string 143 | shortest-string-size 144 | show 145 | show-flags 146 | shuffle 147 | sigma 148 | size 149 | sort 150 | source 151 | spaced-text 152 | stack 153 | substitute 154 | symbol 155 | system 156 | test 157 | text 158 | turn 159 | twosided 160 | unambiguous 161 | undefine 162 | union 163 | up 164 | upper 165 | upper-side 166 | upper-universal 167 | upper-words 168 | variable 169 | variables 170 | verbose 171 | view 172 | warranty 173 | words 174 | write 175 | zero-plus 176 | .#. 177 | 178 | 179 | 180 | 181 | 183 | Language Features 184 | 185 | Identifier and Keyword Character Class 186 | A-Za-z0-9_\?!. 
187 | 188 | Comment Pattern 189 | (^#|[^.]#|#[^.]|!).+$ 190 | 191 | String Pattern 192 | " ( 195 | [^"\r\\] (?# match anything potential ending quote, new line, or start of escape sequence) 196 | | \\. (?# match any escape sequence pair) 197 | )*? 198 | " 199 | ) 200 | ) 201 | ]]> 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /foma/foma.h: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include "fomalib.h" 19 | 20 | #define AP_D 1 /* Apply down */ 21 | #define AP_U 2 /* Apply up */ 22 | #define AP_M 3 /* Apply minimum edit distance */ 23 | 24 | #define PROMPT_MAIN 0 /* Regular prompt */ 25 | #define PROMPT_A 1 /* Apply prompt */ 26 | 27 | /** Runtime options */ 28 | struct _fsm_options { 29 | _Bool skip_word_boundary_marker; 30 | }; 31 | extern struct _fsm_options fsm_options; 32 | 33 | extern struct defined_networks *g_defines; 34 | extern struct defined_functions *g_defines_f; 35 | 36 | /** User stack */ 37 | struct stack_entry { 38 | int number; 39 | struct apply_handle *ah; 40 | struct apply_med_handle *amedh; 41 | struct fsm *fsm; 42 | struct stack_entry *next; 43 | struct stack_entry *previous; 44 | }; 45 | 46 | /* Quantifier & Logic-related */ 47 | char *find_quantifier(char *string); 48 | void add_quantifier (char *string); 49 | void purge_quantifier (char *string); 50 | struct fsm *union_quantifiers(); 51 | int count_quantifiers(); 52 | void clear_quantifiers(); 53 | 54 | /* Main Stack functions */ 55 | int stack_add(struct fsm *fsm); 56 | int stack_size(); 57 | int stack_init(); 58 | struct fsm *stack_pop(); 59 | int stack_isempty(); 60 | int stack_turn(); 61 | struct stack_entry *stack_find_top(); 62 | struct stack_entry *stack_find_second(); 63 | struct stack_entry *stack_find_bottom(); 64 | int stack_clear(); 65 | int stack_rotate(); 66 | int stack_print(); 67 | struct apply_handle *stack_get_ah(); 68 | struct apply_med_handle *stack_get_med_ah(); 69 | 70 | /* Iface */ 71 | void iface_ambiguous_upper(void); 72 | void iface_apply_down(char *word); 73 | int iface_apply_file(char *infilename, char *outfilename, int direction); 74 | void iface_apply_med(char *word); 75 | void iface_apply_set_params(struct apply_handle *h); 76 | void iface_apply_up(char *word); 77 | void iface_apropos(char *s); 78 | void iface_close(void); 79 | void iface_compact(void); 80 | void iface_complete(void); 81 | void iface_compose(void); 82 | void 
iface_conc(void); 83 | void iface_crossproduct(void); 84 | void iface_determinize(void); 85 | void iface_eliminate_flags(void); 86 | void iface_eliminate_flag(char *name); 87 | int iface_extract_number(char *s); 88 | void iface_extract_ambiguous(void); 89 | void iface_extract_unambiguous(void); 90 | void iface_factorize(void); 91 | void iface_help_search(char *s); 92 | void iface_help(void); 93 | void iface_ignore(void); 94 | void iface_intersect(void); 95 | void iface_invert(void); 96 | void iface_load_defined(char *filename); 97 | void iface_load_stack(char *filename); 98 | void iface_lower_side(void); 99 | void iface_minimize(void); 100 | void iface_one_plus(void); 101 | void iface_pop(void); 102 | void iface_label_net(void); 103 | void iface_letter_machine(void); 104 | void iface_lower_words(int limit); 105 | void iface_name_net(char *name); 106 | void iface_negate(void); 107 | void iface_print_cmatrix(void); 108 | void iface_print_cmatrix_att(char *filename); 109 | void iface_print_net(char *netname, char *filename); 110 | void iface_print_defined(void); 111 | void iface_print_dot(char *filename); 112 | void iface_print_shortest_string(); 113 | void iface_print_shortest_string_size(); 114 | void iface_print_name(void); 115 | void iface_quit(void); 116 | void iface_apply_random(char *(*applyer)(struct apply_handle *h), int limit); 117 | void iface_random_lower(int limit); 118 | void iface_random_upper(int limit); 119 | void iface_random_words(int limit); 120 | void iface_pairs(int limit); 121 | void iface_pairs_file(char *filename); 122 | void iface_random_pairs(int limit); 123 | void iface_print_sigma(void); 124 | void iface_print_stats(void); 125 | void iface_shuffle(void); 126 | void iface_sort(void); 127 | void iface_sort_input(void); 128 | void iface_sort_output(void); 129 | int iface_stack_check(int size); 130 | void iface_upper_words(int limit); 131 | void iface_prune(void); 132 | int iface_read_att(char *filename); 133 | int iface_read_prolog(char 
*filename); 134 | int iface_read_spaced_text(char *filename); 135 | int iface_read_text(char *filename); 136 | void iface_reverse(void); 137 | void iface_rotate(void); 138 | void iface_save_defined(char *filename); 139 | void iface_save_stack(char *filename); 140 | void iface_sequentialize(void); 141 | void iface_set_variable(char *name, char *value); 142 | void iface_show_variables(void); 143 | void iface_show_variable(char *name); 144 | void iface_sigma_net(); 145 | void iface_substitute_defined (char *original, char *substitute); 146 | void iface_substitute_symbol (char *original, char *substitute); 147 | void iface_test_equivalent(void); 148 | void iface_test_functional(void); 149 | void iface_test_identity(void); 150 | void iface_test_lower_universal(void); 151 | void iface_test_sequential(void); 152 | void iface_test_unambiguous(void); 153 | void iface_test_upper_universal(void); 154 | void iface_test_nonnull(void); 155 | void iface_test_null(void); 156 | void iface_turn(void); 157 | void iface_twosided_flags(void); 158 | void iface_union(void); 159 | void iface_upper_side(void); 160 | void iface_view(void); 161 | void iface_warranty(void); 162 | void iface_words(int limit); 163 | void iface_words_file(char *filename, int type); 164 | int iface_write_att(char *filename); 165 | void iface_write_prolog(char *filename); 166 | void iface_zero_plus(void); 167 | int print_stats(struct fsm *net); 168 | -------------------------------------------------------------------------------- /foma/stack.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. 
*/ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include "foma.h" 22 | 23 | extern int g_verbose; 24 | 25 | struct stack_entry *main_stack; 26 | 27 | int stack_size() { 28 | int i; 29 | struct stack_entry *stack_ptr; 30 | for (i=0, stack_ptr = main_stack; stack_ptr->next != NULL; i++) 31 | stack_ptr = stack_ptr->next; 32 | return i; 33 | } 34 | 35 | int stack_init() { 36 | main_stack = malloc(sizeof(struct stack_entry)); 37 | main_stack->number = -1; 38 | main_stack->fsm = NULL; 39 | main_stack->next = NULL; 40 | main_stack->previous = NULL; 41 | return 1; 42 | } 43 | 44 | int stack_add(struct fsm *fsm) { 45 | int i; 46 | struct stack_entry *stack_ptr, *stack_ptr_previous; 47 | stack_ptr_previous = NULL; 48 | 49 | fsm_count(fsm); 50 | if (strcmp(fsm->name,"") == 0) 51 | sprintf(fsm->name, "%X",rand()); 52 | for (i=0, stack_ptr = main_stack; stack_ptr->number != -1; i++) { 53 | stack_ptr_previous = stack_ptr; 54 | stack_ptr = stack_ptr->next; 55 | } 56 | stack_ptr->next = malloc(sizeof(struct stack_entry)); 57 | stack_ptr->fsm = fsm; 58 | stack_ptr->ah = NULL; 59 | stack_ptr->amedh = NULL; 60 | stack_ptr->number = i; 61 | stack_ptr->previous = stack_ptr_previous; 62 | (stack_ptr->next)->number = -1; 63 | (stack_ptr->next)->fsm = NULL; 64 | (stack_ptr->next)->next = NULL; 65 | (stack_ptr->next)->previous = stack_ptr; 66 | if (g_verbose) 67 | { 68 | print_stats(fsm); 69 | } 70 | return(stack_ptr->number); 71 | } 72 | 73 | struct apply_med_handle *stack_get_med_ah() { 74 | struct stack_entry *se; 
75 | se = stack_find_top(); 76 | if (se == NULL) { 77 | return(NULL); 78 | } 79 | if (se->amedh == NULL) { 80 | se->amedh = apply_med_init(se->fsm); 81 | apply_med_set_align_symbol(se->amedh, "-"); 82 | } 83 | return(se->amedh); 84 | } 85 | 86 | struct apply_handle *stack_get_ah() { 87 | struct stack_entry *se; 88 | se = stack_find_top(); 89 | if (se == NULL) { 90 | return(NULL); 91 | } 92 | if (se->ah == NULL) { 93 | se->ah = apply_init(se->fsm); 94 | } 95 | return(se->ah); 96 | } 97 | 98 | struct fsm *stack_pop(void) { 99 | int i; 100 | struct fsm *fsm; 101 | struct stack_entry *stack_ptr; 102 | if (stack_size() == 1) { 103 | fsm = main_stack->fsm; 104 | main_stack->fsm = NULL; 105 | stack_clear(); 106 | return(fsm); 107 | } 108 | for (i=0, stack_ptr = main_stack; (stack_ptr->next)->number != -1; stack_ptr = stack_ptr->next, i++); 109 | (stack_ptr->previous)->next = stack_ptr->next; 110 | (stack_ptr->next)->previous = stack_ptr->previous; 111 | fsm = stack_ptr->fsm; 112 | if (stack_ptr->ah != NULL) { 113 | apply_clear(stack_ptr->ah); 114 | stack_ptr->ah = NULL; 115 | } 116 | if (stack_ptr->amedh != NULL) { 117 | apply_med_clear(stack_ptr->amedh); 118 | stack_ptr->amedh = NULL; 119 | } 120 | stack_ptr->fsm = NULL; 121 | free(stack_ptr); 122 | return(fsm); 123 | } 124 | 125 | int stack_isempty () { 126 | if (main_stack->next == NULL) { 127 | return 1; 128 | } else { 129 | return 0; 130 | } 131 | } 132 | 133 | int stack_turn () { 134 | struct stack_entry *stack_ptr; 135 | if (stack_isempty()) { 136 | printf("Stack is empty.\n"); 137 | return 0; 138 | } 139 | if (stack_size() == 1) { 140 | return 1; 141 | } 142 | 143 | stack_ptr = stack_find_top(); 144 | main_stack->next = stack_ptr->next; 145 | (stack_ptr->next)->previous = main_stack; 146 | main_stack = stack_ptr; 147 | 148 | while (stack_ptr->previous != NULL) { 149 | stack_ptr->next = stack_ptr->previous; 150 | stack_ptr = stack_ptr->next; 151 | } 152 | for (stack_ptr = main_stack; stack_ptr->number != -1;) { 153 
| (stack_ptr->next)->previous = stack_ptr; 154 | } 155 | return 1; 156 | } 157 | 158 | struct stack_entry *stack_find_top () { 159 | struct stack_entry *stack_ptr; 160 | if (main_stack->number == -1) { 161 | return NULL; 162 | } 163 | for (stack_ptr = main_stack; (stack_ptr->next)->number != -1; stack_ptr = stack_ptr->next); 164 | return(stack_ptr); 165 | } 166 | 167 | struct stack_entry *stack_find_bottom () { 168 | if (main_stack->number == -1) { 169 | return NULL; 170 | } 171 | return(main_stack); 172 | } 173 | 174 | struct stack_entry *stack_find_second () { 175 | struct stack_entry *stack_ptr; 176 | /* 177 | if (main_stack->number == -1) { 178 | return NULL; 179 | } 180 | */ 181 | for (stack_ptr = main_stack; (stack_ptr->next)->number != -1; stack_ptr = stack_ptr->next); 182 | return(stack_ptr->previous); 183 | } 184 | 185 | int stack_clear(void) { 186 | struct stack_entry *stack_ptr; 187 | for (stack_ptr = main_stack ; stack_ptr->next != NULL; stack_ptr = main_stack) { 188 | if (stack_ptr->ah != NULL) 189 | apply_clear(stack_ptr->ah); 190 | if (stack_ptr->amedh != NULL) 191 | apply_med_clear(stack_ptr->amedh); 192 | 193 | main_stack = stack_ptr->next; 194 | fsm_destroy(stack_ptr->fsm); 195 | free(stack_ptr); 196 | } 197 | free(stack_ptr); 198 | return(stack_init()); 199 | } 200 | 201 | int stack_rotate () { 202 | 203 | /* Top element of stack to bottom */ 204 | struct stack_entry *stack_ptr; 205 | struct fsm *temp_fsm; 206 | 207 | if (stack_isempty()) { 208 | printf("Stack is empty.\n"); 209 | return -1; 210 | } 211 | if (stack_size() == 1) { 212 | return 1; 213 | } 214 | stack_ptr = stack_find_top(); 215 | temp_fsm = main_stack->fsm; 216 | main_stack->fsm = stack_ptr->fsm; 217 | stack_ptr->fsm = temp_fsm; 218 | return 1; 219 | } 220 | 221 | int stack_print () { 222 | return 1; 223 | } 224 | -------------------------------------------------------------------------------- /foma/topsort.c: 
-------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include "foma.h" 22 | 23 | struct fsm *fsm_topsort (struct fsm *net) { 24 | 25 | /* We topologically sort the network by looking for a state */ 26 | /* with inverse count 0. We then examine all the arcs from that */ 27 | /* state, and decrease the target invcounts. If we find a new */ 28 | /* state with invcount 0, we push that on the stack to be treated */ 29 | /* If the graph is cyclic, one of two things will happen: */ 30 | 31 | /* (1) We fail to find a state with invcount 0 before we've treated */ 32 | /* all states */ 33 | /* (2) A state under treatment has an arc to a state already treated */ 34 | /* or itself (we mark a state as treated as soon as we start */ 35 | /* working on it). */ 36 | /* Of course we also count the number of paths in the network. 
*/ 37 | 38 | int i, j, curr_state, *statemap, treatcount, *order, lc, *newnum, newtarget, newstate; 39 | unsigned short int *invcount; 40 | unsigned char *treated, overflow; 41 | long long grand_pathcount, *pathcount; 42 | struct fsm_state *fsm, *curr_fsm, *new_fsm; 43 | 44 | if (net == NULL) { return NULL; } 45 | 46 | fsm_count(net); 47 | 48 | fsm = net->states; 49 | 50 | statemap = malloc(sizeof(int)*net->statecount); 51 | order = malloc(sizeof(int)*net->statecount); 52 | pathcount = malloc(sizeof(long long)*net->statecount); 53 | newnum = malloc(sizeof(int)*net->statecount); 54 | invcount = malloc(sizeof(unsigned short int)*net->statecount); 55 | treated = malloc(sizeof(unsigned char)*net->statecount); 56 | 57 | for (i=0; i < net->statecount; i++) { 58 | *(statemap+i) = -1; 59 | *(invcount+i) = 0; 60 | *(treated+i) = 0; 61 | *(order+i) = 0; 62 | *(pathcount+i) = 0; 63 | } 64 | 65 | for (i=0, lc=0; (fsm+i)->state_no != -1; i++) { 66 | lc++; 67 | if ((fsm+i)->target != -1) { 68 | (*(invcount+(fsm+i)->target))++; 69 | /* Do a fast check here to see if we have a selfloop */ 70 | if ((fsm+i)->state_no == (fsm+i)->target) { 71 | net->pathcount = PATHCOUNT_CYCLIC; 72 | net->is_loop_free = 0; 73 | goto cyclic; 74 | } 75 | } 76 | if (*(statemap+(fsm+i)->state_no) == -1) { 77 | *(statemap+(fsm+i)->state_no) = i; 78 | } 79 | } 80 | 81 | treatcount = net->statecount; 82 | int_stack_clear(); 83 | int_stack_push(0); 84 | grand_pathcount = 0; 85 | 86 | *(pathcount+0) = 1; 87 | 88 | overflow = 0; 89 | for (i=0 ; !int_stack_isempty(); i++) { 90 | /* Treat a state */ 91 | curr_state = int_stack_pop(); 92 | *(treated+curr_state) = 1; 93 | *(order+i) = curr_state; 94 | *(newnum+curr_state) = i; 95 | 96 | treatcount--; 97 | curr_fsm = fsm+*(statemap+curr_state); 98 | while (curr_fsm->state_no == curr_state) { 99 | if (curr_fsm->target != -1 ) { 100 | (*(invcount+(curr_fsm->target)))--; 101 | 102 | /* Check if we overflow the path counter */ 103 | 104 | if (!overflow) { 105 | 
*(pathcount+(curr_fsm->target)) += *(pathcount+curr_state); 106 | if ((*(pathcount+(curr_fsm->target)) < 0)) { 107 | overflow = 1; 108 | } 109 | } 110 | 111 | /* Case (1) for cyclic */ 112 | if (*(treated+(curr_fsm)->target) == 1) { 113 | net->pathcount = PATHCOUNT_CYCLIC; 114 | net->is_loop_free = 0; 115 | goto cyclic; 116 | } 117 | if ( *(invcount+(curr_fsm->target)) == 0) { 118 | int_stack_push(curr_fsm->target); 119 | } 120 | } 121 | curr_fsm++; 122 | } 123 | } 124 | 125 | /* Case (2) */ 126 | if (treatcount > 0) { 127 | net->pathcount = PATHCOUNT_CYCLIC; 128 | net->is_loop_free = 0; 129 | goto cyclic; 130 | } 131 | 132 | new_fsm = malloc(sizeof(struct fsm_state) * (lc+1)); 133 | for (i=0, j=0 ; i < net->statecount; i++) { 134 | 135 | curr_state = *(order+i); 136 | curr_fsm = fsm+*(statemap+curr_state); 137 | 138 | if (curr_fsm->final_state == 1 && !overflow) { 139 | grand_pathcount += *(pathcount + curr_state); 140 | if (grand_pathcount < 0) 141 | overflow = 1; 142 | } 143 | 144 | for (; curr_fsm->state_no == curr_state; curr_fsm++) { 145 | 146 | newstate = curr_fsm->state_no == -1 ? -1 : *(newnum+(curr_fsm->state_no)); 147 | newtarget = curr_fsm->target == -1 ? 
-1 : *(newnum+(curr_fsm->target)); 148 | add_fsm_arc(new_fsm, j, newstate, curr_fsm->in, curr_fsm->out, newtarget, curr_fsm->final_state, curr_fsm->start_state); 149 | j++; 150 | } 151 | } 152 | 153 | add_fsm_arc(new_fsm, j, -1, -1, -1, -1, -1, -1); 154 | net->states = new_fsm; 155 | net->pathcount = grand_pathcount; 156 | net->is_loop_free = 1; 157 | if (overflow == 1) { 158 | net->pathcount = PATHCOUNT_OVERFLOW; 159 | } 160 | free(fsm); 161 | 162 | cyclic: 163 | 164 | free(statemap); 165 | free(order); 166 | free(pathcount); 167 | free(newnum); 168 | free(invcount); 169 | free(treated); 170 | int_stack_clear(); 171 | return(net); 172 | } 173 | -------------------------------------------------------------------------------- /foma/python/attapply.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Utility for reading .att format automata and transducers and doing apply up or down, 5 | written in Python. Also supports weighted FSMs. In the weighted case the apply routines 6 | return the strings in order cheapest first. The apply functions work as a generator, 7 | yielding output strings as long as there is a valid transduction. Input files are 8 | standard AT&T format, with the last column being optionally a weight. The files can be 9 | compressed with gzip. The apply generator produces tuples of (output, weight). For 10 | unweighted automata/transducers, the weight is always 0.0. 
class State:
    """One state of the automaton together with its outgoing transitions.

    Transitions are indexed twice so that they can be looked up by either
    side: ``transitionsin`` maps an input symbol to a list of
    ``(output, target, weight)`` tuples and ``transitionsout`` maps an
    output symbol to a list of ``(input, target, weight)`` tuples.
    """

    def __init__(self):
        self.final = False
        self.finalweight = 0.0
        self.transitionsin = {}
        self.transitionsout = {}

    def add_transition(self, target, input, output, weight):
        """Register the transition input:output to ``target`` under both indexes."""
        self.transitionsin.setdefault(input, []).append((output, target, weight))
        self.transitionsout.setdefault(output, []).append((input, target, weight))

    def get_transitions(self, symbol, dir = 'down'):
        """Return the transitions that consume ``symbol``.

        With ``dir == 'down'`` the symbol is matched against the input side,
        with any other value against the output side. The result is a list
        of ``(other_side_symbol, target, weight)`` tuples, empty when there
        is no match.
        """
        if dir == 'down':
            s = self.transitionsin
        else:
            s = self.transitionsout
        if symbol in s:
            return s[symbol]
        return []

    def set_final(self, finalweight):
        """Mark the state as final with the given final weight."""
        self.final = True
        self.finalweight = finalweight

class ATTFST:

    def __init__(self, attfile, epsilon_symbol = u'@0@', identity_symbol = u'@_IDENTITY_SYMBOL_@', unknown_symbol = '@_UNKNOWN_SYMBOL_@'):

        """Reads an AT&T file (possibly gzipped) and inits data structures
        so that apply() can be called. If the AT&T file contains the special
        symbols epsilon, identity (repeat unknown) or unknown (one-sided
        unknown), these can be specified. The defaults are what foma produces
        with write att.

        Lines with four or more tab-separated fields are read as arcs
        (source, target, input, output and an optional weight); lines with
        one or two fields as final states (state and an optional weight).
        Blank lines are skipped. Lines with exactly three fields are ignored.
        A field that should be a number but is not raises ValueError."""

        self.epsilon_symbol = epsilon_symbol
        self.identity_symbol = identity_symbol
        self.unknown_symbol = unknown_symbol
        self.states = {}
        self.alphabet = set()
        for l in self._read_lines(attfile):
            # A blank line carries no information; without this check it
            # would be taken for a final-state line and int('') would raise.
            if not l.strip():
                continue
            fields = l.split('\t')
            if len(fields) > 3:
                source = int(fields[0])
                target = int(fields[1])
                insym = self._map_syms(fields[2])
                outsym = self._map_syms(fields[3])
                if len(fields) > 4:
                    weight = float(fields[4])
                else:
                    weight = 0.0
                self.alphabet.add(insym)
                self.alphabet.add(outsym)
                if source not in self.states:
                    self.states[source] = State()
                if target not in self.states:
                    self.states[target] = State()
                self.states[source].add_transition(target, insym, outsym, weight)
            elif len(fields) < 3:
                final = int(fields[0])
                finalweight = 0.0
                if len(fields) > 1:
                    finalweight = float(fields[1])
                if final not in self.states:
                    self.states[final] = State()
                self.states[final].set_final(finalweight)

    @staticmethod
    def _read_lines(attfile):
        """Return the lines of ``attfile`` without their trailing newline.

        The file is first read as gzip (undecodable bytes are replaced); if
        that fails with an I/O error, typically because the file is not
        gzipped, it is read as plain UTF-8 text. Both files are closed again
        before returning. Other exceptions, e.g. KeyboardInterrupt, are no
        longer swallowed."""
        try:
            with gzip.open(attfile) as f:
                reader = codecs.getreader('utf-8')(f, errors='replace')
                return [line.rstrip('\n') for line in reader]
        except (IOError, OSError):
            with codecs.open(attfile, "r", encoding="utf-8") as f:
                return [line.rstrip('\n') for line in f]

    def _map_syms(self, s):
        """Map the file's epsilon symbol to the empty string; keep other symbols."""
        if s == self.epsilon_symbol:
            return u''
        return s

    def tokenize(self, word):
        """Split ``word`` into symbols of the alphabet, longest match first.

        At every position the longest prefix of the remaining string that is
        in the alphabet is taken; if there is none, the single character at
        that position becomes a token of its own."""
        tokens = []
        start = 0
        while start < len(word):
            t = word[start]
            for length in range(1, len(word) - start + 1):
                if word[start:start+length] in self.alphabet:
                    t = word[start:start+length]
            tokens.append(t)
            start += len(t)
        return tokens
128 | If no tokenizer is given, the alphabet of the FSM is used for 129 | tokenization of the input string, longest-match (as in foma). 130 | By default, the dir = 'down'. The output is by default joined, 131 | but a list of tokens can also be produced if return_joined is 132 | False.""" 133 | 134 | if tokenizer == None: 135 | w = self.tokenize(word) 136 | else: 137 | w = tokenizer(word) 138 | heap = [] 139 | heappush(heap, (0.0, 0, [], 0, False)) # (cost, -pos, output, state, final_included), use negpos to serve as tiebreaker 140 | while len(heap) > 0: 141 | cost, negpos, output, state, final_included = heappop(heap) 142 | if final_included == True: 143 | if return_joined == True: 144 | yield(''.join(output), cost) 145 | else: 146 | yield(output, cost) 147 | else: 148 | if -negpos == len(w): 149 | # only match epsilon or finaladd 150 | if final_included == False: 151 | for outsym, target, weight in self.states[state].get_transitions('', dir = dir): 152 | heappush(heap, (cost + weight, negpos, output + [outsym], target, False)) 153 | if self.states[state].final == True: 154 | heappush(heap, (cost + self.states[state].finalweight, negpos, output, state, True)) 155 | else: # Match other symbols as well 156 | nextsym = [w[-negpos]] 157 | if w[-negpos] not in self.alphabet: 158 | nextsym = [self.unknown_symbol, self.identity_symbol] 159 | for ns in nextsym: 160 | for outsym, target, weight in self.states[state].get_transitions(ns, dir = dir): 161 | if outsym == self.identity_symbol: 162 | outsym = w[-negpos] 163 | elif outsym == self.unknown_symbol: 164 | outsym = u'?' 
165 | heappush(heap, (cost + weight, negpos - 1, output + [outsym], target, False)) 166 | for outsym, target, weight in self.states[state].get_transitions('', dir = dir): # Epsilons 167 | heappush(heap, (cost + weight, negpos, output + [outsym], target, False)) 168 | -------------------------------------------------------------------------------- /foma/lexc.l: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
/* Trim a lexc token in place.                                            */
/* First removes every trailing ';', '=', space and tab; then removes     */
/* every leading space, tab and newline by shifting the rest of the       */
/* string (including its terminator) to the front of the buffer.          */
/* A NULL pointer is ignored. An empty string, or one consisting only of  */
/* trimmable characters, becomes the empty string without ever touching   */
/* memory in front of the buffer.                                         */
void lexc_trim(char *s) {
    size_t len, start;

    if (s == NULL) {
        return;
    }

    /* Walk back from the end; len > 0 keeps the index inside the buffer */
    len = strlen(s);
    while (len > 0 && (s[len-1] == ';' || s[len-1] == '=' || s[len-1] == ' ' || s[len-1] == '\t')) {
        s[--len] = '\0';
    }

    /* The terminator at s[len] stops this scan, so start <= len */
    for (start = 0; s[start] == ' ' || s[start] == '\t' || s[start] == '\n'; start++) {
    }

    if (start > 0) {
        /* Regions overlap, hence memmove; +1 carries the terminator along */
        memmove(s, s + start, len - start + 1);
    }
}
or space */ 80 | 81 | NONRESERVED [\001-\177]{-}[\011\012\014\015\040\041\042\045\073\074\076]|[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]|[\360-\367][\200-\277][\200-\277][\200-\277]|[\045][\001-\177]|[\045][\300-\337][\200-\277]|[\045][\340-\357][\200-\277][\200-\277]|[\045][\360-\367][\200-\277][\200-\277][\200-\277] 82 | 83 | INFOSTRING [\001-\177]{-}[\042\012\015]|[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]|[\360-\367][\200-\277][\200-\277][\200-\277] 84 | 85 | INSIDEREGEX [\001-\177]{-}[\073\173\175\042\045\076]|[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]|[\360-\367][\200-\277][\200-\277][\200-\277]|(@>)|(>@)|(->)|(=>)|^> 86 | 87 | INSIDEDEFREGEX [\001-\177]{-}[\073\173\175\042\045]|[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]|[\360-\367][\200-\277][\200-\277][\200-\277] 88 | 89 | SPACE [\040]|[\011]|[\014] 90 | 91 | ANY [\001-\177]|[\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277]|[\360-\367][\200-\277][\200-\277][\200-\277] 92 | 93 | %x MCS LEXICON DEF LEXENTRIES INSIDEREGEX REGEX REGEXB REGEXQ DEFREGEX DEFREGEXB DEFREGEXQ EATUPINFO 94 | %% 95 | 96 | /* Files begin with one of these three identifiers */ 97 | <*>Multichar_Symbols { 98 | BEGIN(MCS); 99 | } 100 | 101 | <*>Definitions { 102 | BEGIN(DEF); 103 | } 104 | 105 | /* This line needs to be above the space glob */ 106 | /* otherwise spaces get eaten up in a regex */ 107 | ({INSIDEREGEX}|%{ANY})* { 108 | yymore(); 109 | } 110 | 111 | <*>{SPACE}+ { } 112 | <*>[\015]?\n { lexclineno++; lexccolumn = 1;} 113 | /* Multichar definitions */ 114 | 115 | /* A Multichar definition can contain anything except nonescaped space */ 116 | {NONRESERVED}+ { 117 | lexc_add_mc(lexctext); 118 | } 119 | 120 | <*>(LEXICON|Lexicon){SPACE}+{NONRESERVED}+ { 121 | lexc_trim(lexctext+8); 122 | if (lexentries != -1) { 123 | printf("%i, ",lexentries); 124 | } 125 | printf("%s...",lexctext+8); 126 | fflush(stdout); 127 | lexentries = 0; 128 | 
lexc_set_current_lexicon(lexctext+8, SOURCE_LEXICON); 129 | BEGIN(LEXENTRIES); 130 | } 131 | 132 | /* Grab info string */ 133 | [\042]{INFOSTRING}*[\042]{SPACE}*; { 134 | BEGIN(LEXENTRIES); 135 | } 136 | /* Target followed by info string */ 137 | {NONRESERVED}+{SPACE}+/[\042]{INFOSTRING}*[\042]{SPACE}*; { 138 | lexc_trim(lexctext); 139 | lexc_set_current_lexicon(lexctext, TARGET_LEXICON); 140 | lexc_add_word(); 141 | lexc_clear_current_word(); 142 | lexentries++; 143 | if (lexentries %10000 == 0) { 144 | printf("%i...",lexentries); 145 | fflush(stdout); 146 | } 147 | BEGIN(EATUPINFO); 148 | } 149 | 150 | 151 | /* Regular entries contain anything (not starting with <) and end in a nonescaped SPACE */ 152 | {NONRESERVED}+ { 153 | lexc_set_current_word(lexctext); 154 | } 155 | 156 | 157 | {NONRESERVED}+{SPACE}*; { 158 | //printf("[%s]\n", lexctext); 159 | lexc_trim(lexctext); 160 | lexc_set_current_lexicon(lexctext, TARGET_LEXICON); 161 | lexc_add_word(); 162 | lexc_clear_current_word(); 163 | lexentries++; 164 | if (lexentries %10000 == 0) { 165 | printf("%i...",lexentries); 166 | fflush(stdout); 167 | } 168 | } 169 | 170 | /* A REGEX entry begins and ends with a < , > */ 171 | [\074] { 172 | BEGIN(REGEX); 173 | } 174 | /* \076 = > */ 175 | [\076] { 176 | *(lexctext+lexcleng-1) = ';'; 177 | if (my_yyparse(lexctext, lexclineno, g_defines, NULL) == 0) { 178 | lexc_set_network(current_parse); 179 | } 180 | BEGIN(LEXENTRIES); 181 | } 182 | 183 | [{] { 184 | BEGIN(REGEXB); 185 | yymore(); 186 | } 187 | [^}] { 188 | yymore(); 189 | } 190 | [}] { 191 | BEGIN(REGEX); 192 | yymore(); 193 | } 194 | (["])* { 195 | BEGIN(REGEXQ); 196 | yymore(); 197 | } 198 | ([^"]*) { 199 | yymore(); 200 | } 201 | ([\042]) { 202 | BEGIN(REGEX); 203 | yymore(); 204 | } 205 | {NONRESERVED}+{SPACE}+={SPACE}+ { 206 | lexc_trim(lexctext); 207 | tempstr = strdup(lexctext); 208 | BEGIN(DEFREGEX); 209 | } 210 | /* \073 = ; */ 211 | [\073] { 212 | if (my_yyparse(lexctext, lexclineno, g_defines, NULL) == 
0) { 213 | add_defined(g_defines, fsm_topsort(fsm_minimize(current_parse)),tempstr); 214 | } 215 | free(tempstr); 216 | BEGIN(DEF); 217 | } 218 | ({INSIDEDEFREGEX}|%{ANY})* { 219 | yymore(); 220 | } 221 | [{] { 222 | BEGIN(DEFREGEXB); 223 | yymore(); 224 | } 225 | [^}] { 226 | yymore(); 227 | } 228 | [}] { 229 | BEGIN(DEFREGEX); 230 | yymore(); 231 | } 232 | (["])* { 233 | BEGIN(DEFREGEXQ); 234 | yymore(); 235 | } 236 | ([^"]*) { 237 | yymore(); 238 | } 239 | ([\042]) { 240 | BEGIN(DEFREGEX); 241 | yymore(); 242 | } 243 | <*>((!).*[\015]?(\n)) { 244 | /* printf ("Comment: [%s]\n",lexctext); */ 245 | lexclineno++; 246 | lexccolumn = 1; 247 | } 248 | 249 | <*>(.) { printf("\n***Syntax error on line %i column %i at '%s'\n",lexclineno,lexccolumn,lexctext); return 1;} 250 | -------------------------------------------------------------------------------- /foma/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}) 3 | 4 | file(READ fomalib.h _VERSION_FILE) 5 | string(REGEX REPLACE ".*MAJOR_VERSION ([0-9]+).*MINOR_VERSION ([0-9]+).*BUILD_VERSION ([0-9]+).*" "\\1.\\2.\\3" _VERSION ${_VERSION_FILE}) 6 | 7 | project(foma 8 | VERSION ${_VERSION} 9 | LANGUAGES C 10 | ) 11 | 12 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 13 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 14 | set(CMAKE_MACOSX_RPATH ON) 15 | 16 | include(GNUInstallDirs) 17 | 18 | if(MSVC) 19 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8 /std:c17 /permissive- /W4 /MP") 20 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /O2") 21 | set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG") 22 | 23 | add_definitions(-DYY_NO_UNISTD_H) # Prevent Flex generated code including unistd.h 24 | else() 25 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -fvisibility=hidden 
-fPIC") 26 | 27 | # Require latest possible C standard 28 | include(CheckCCompilerFlag) 29 | foreach(flag "-std=c2y" "-std=c23" "-std=c2x" "-std=c18" "-std=c17" "-std=c11" "-std=c1x" "-std=c99") 30 | string(REGEX REPLACE "[^a-z0-9]" "" _flag ${flag}) 31 | CHECK_C_COMPILER_FLAG(${flag} COMPILER_SUPPORTS_${_flag}) 32 | if(COMPILER_SUPPORTS_${_flag}) 33 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}") 34 | set(_ENABLED_C ${flag}) 35 | break() 36 | endif() 37 | endforeach() 38 | if(NOT _ENABLED_C) 39 | message(FATAL_ERROR "Could not enable at least C99 - upgrade your compiler") 40 | endif() 41 | 42 | add_definitions(-D_GNU_SOURCE) 43 | endif() 44 | 45 | if(WIN32) 46 | add_definitions(-D_SECURE_SCL=0 -D_ITERATOR_DEBUG_LEVEL=0 -D_CRT_SECURE_NO_DEPRECATE -DWIN32_LEAN_AND_MEAN -DVC_EXTRALEAN -DNOMINMAX) 47 | endif() 48 | 49 | find_package(BISON REQUIRED) 50 | find_package(FLEX REQUIRED) 51 | 52 | # getopt & readline 53 | find_path(GETOPT_INCLUDE getopt.h) 54 | include_directories(${GETOPT_INCLUDE}) 55 | if(VCPKG_TOOLCHAIN) 56 | find_library(GETOPT_LIB NAMES getopt) 57 | add_definitions(-DHAVE_GETOPT_LONG) 58 | else() 59 | set(GETOPT_LIB) 60 | 61 | find_package(PkgConfig REQUIRED) 62 | pkg_search_module(READLINE readline) 63 | endif() 64 | 65 | # Only look for readline if not building for WASM 66 | if(NOT EMSCRIPTEN) 67 | if(NOT READLINE_INCLUDE_DIRS) 68 | find_path(READLINE_INCLUDE_DIRS readline.h PATH_SUFFIXES readline REQUIRED) 69 | find_library(READLINE_LIBRARIES readline REQUIRED) 70 | endif() 71 | include_directories(${READLINE_INCLUDE_DIRS}) 72 | endif() 73 | 74 | # zlib handling 75 | if(EMSCRIPTEN) 76 | # Emscripten-specific zlib handling 77 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --use-port=zlib") 78 | set(ZLIB_LIBS "") 79 | else() 80 | find_path(ZLIB_INCLUDE zlib.h REQUIRED) 81 | if(VCPKG_TOOLCHAIN) 82 | find_package(ZLIB REQUIRED) 83 | find_library(ZLIB_LIBRARIES z REQUIRED) 84 | set(ZLIB_LIBS ZLIB::ZLIB) 85 | else() 86 | find_library(ZLIB_LIBRARIES z REQUIRED) 87 | 
set(ZLIB_LIBS ${ZLIB_LIBRARIES}) 88 | endif() 89 | include_directories(${ZLIB_INCLUDE}) 90 | endif() 91 | 92 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 93 | 94 | BISON_TARGET(Bregex regex.y "${CMAKE_CURRENT_BINARY_DIR}/regex.c" COMPILE_FLAGS "-v") 95 | FLEX_TARGET(Fregex regex.l "${CMAKE_CURRENT_BINARY_DIR}/lex.yy.c" COMPILE_FLAGS "-8") 96 | FLEX_TARGET(Flexc lexc.l "${CMAKE_CURRENT_BINARY_DIR}/lex.lexc.c" COMPILE_FLAGS "-8 --prefix=lexc") 97 | FLEX_TARGET(Finterface interface.l "${CMAKE_CURRENT_BINARY_DIR}/lex.interface.c" COMPILE_FLAGS "-8 --prefix=interface") 98 | FLEX_TARGET(Fcmatrix cmatrix.l "${CMAKE_CURRENT_BINARY_DIR}/lex.cmatrix.c" COMPILE_FLAGS "-8 --prefix=cmatrix") 99 | 100 | set(SOURCES 101 | foma.h 102 | fomalib.h 103 | fomalibconf.h 104 | lexc.h 105 | apply.c 106 | coaccessible.c 107 | constructions.c 108 | define.c 109 | determinize.c 110 | dynarray.c 111 | extract.c 112 | flags.c 113 | int_stack.c 114 | io.c 115 | lexcread.c 116 | mem.c 117 | minimize.c 118 | reverse.c 119 | rewrite.c 120 | sigma.c 121 | spelling.c 122 | stringhash.c 123 | structures.c 124 | topsort.c 125 | trie.c 126 | utf8.c 127 | ${FLEX_Fregex_OUTPUTS} 128 | ${FLEX_Flexc_OUTPUTS} 129 | ${FLEX_Fcmatrix_OUTPUTS} 130 | ${BISON_Bregex_OUTPUTS} 131 | ) 132 | 133 | add_library(foma-static STATIC ${SOURCES}) 134 | target_link_libraries(foma-static PUBLIC ${ZLIB_LIBS}) 135 | set_target_properties(foma-static PROPERTIES ARCHIVE_OUTPUT_NAME foma) 136 | 137 | add_library(foma-shared SHARED ${SOURCES}) 138 | target_link_libraries(foma-shared PRIVATE ${ZLIB_LIBS}) 139 | set_target_properties(foma-shared PROPERTIES 140 | LIBRARY_OUTPUT_NAME foma RUNTIME_OUTPUT_NAME foma 141 | VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) 142 | if(NOT MSVC) 143 | set_target_properties(foma-shared PROPERTIES ARCHIVE_OUTPUT_NAME foma) 144 | endif() 145 | 146 | # WASM-specific target 147 | if(EMSCRIPTEN) 148 | # Create WASM library target 149 | add_executable(libfoma ${SOURCES}) 150 | 
target_link_libraries(libfoma PRIVATE ${ZLIB_LIBS}) 151 | 152 | # Emscripten-specific link flags - export all fomalib functions 153 | set(EXPORTED_FUNCTIONS 154 | "[\'_malloc\',\'_free\',\'_defined_networks_init\',\'_defined_functions_init\',\'_add_defined\',\'_add_defined_function\',\'_fsm_construct_init\',\'_fsm_construct_set_final\',\'_fsm_construct_set_initial\',\'_fsm_construct_add_arc\',\'_fsm_construct_add_arc_nums\',\'_fsm_construct_add_symbol\',\'_fsm_construct_check_symbol\',\'_fsm_construct_copy_sigma\',\'_fsm_construct_done\',\'_fsm_parse_regex\',\'_fsm_create\',\'_fsm_empty\',\'_fsm_empty_set\',\'_fsm_universal\',\'_fsm_identity\',\'_fsm_contains\',\'_fsm_contains_one\',\'_fsm_contains_opt_one\',\'_fsm_minimize\',\'_fsm_determinize\',\'_fsm_epsilon_remove\',\'_fsm_reverse\',\'_fsm_invert\',\'_fsm_lower\',\'_fsm_upper\',\'_fsm_kleene_star\',\'_fsm_kleene_plus\',\'_fsm_optionality\',\'_fsm_concat\',\'_fsm_union\',\'_fsm_intersect\',\'_fsm_compose\',\'_fsm_complement\',\'_fsm_minus\',\'_fsm_simple_replace\',\'_fsm_context_restrict\',\'_fsm_isempty\',\'_fsm_isfunctional\',\'_fsm_isidentity\',\'_fsm_destroy\',\'_fsm_copy\',\'_fsm_complete\',\'_apply_init\',\'_apply_clear\',\'_apply_up\',\'_apply_down\',\'_apply_words\',\'_apply_upper_words\',\'_apply_lower_words\',\'_apply_random_words\',\'_apply_random_upper\',\'_apply_random_lower\',\'_fsm_get_library_version_string\',\'_fsm_set_option\',\'_fsm_get_option\',\'_fsm_read_binary_file\',\'_fsm_write_binary_file\',\'_fsm_trie_init\',\'_fsm_trie_done\',\'_fsm_trie_add_word\',\'_fsm_trie_end_word\',\'_sigma_copy\',\'_fsm_sigma_destroy\',\'_fsm_merge_sigma\']") 155 | 156 | set_target_properties(libfoma PROPERTIES 157 | SUFFIX ".js" 158 | LINK_FLAGS "-s WASM=1 \ 159 | -s EXPORTED_RUNTIME_METHODS=[\'ccall\',\'cwrap\',\'stringToUTF8\',\'UTF8ToString\'] \ 160 | -s EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS} \ 161 | -s ENVIRONMENT=web" 162 | ) 163 | 164 | # Add custom target to clean WASM-generated files 165 | 
set(WASM_GENERATED_FILES 166 | ${CMAKE_CURRENT_BINARY_DIR}/libfoma.js 167 | ${CMAKE_CURRENT_BINARY_DIR}/libfoma.wasm 168 | ) 169 | 170 | set_directory_properties(PROPERTIES 171 | ADDITIONAL_CLEAN_FILES "${WASM_GENERATED_FILES}" 172 | ) 173 | 174 | install(TARGETS libfoma RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 175 | else() 176 | # Regular foma-bin target 177 | add_executable(foma-bin foma.c stack.c iface.c ${FLEX_Finterface_OUTPUTS}) 178 | target_link_libraries(foma-bin PRIVATE foma-static ${READLINE_LIBRARIES} ${GETOPT_LIB}) 179 | target_link_directories(foma-bin PRIVATE ${READLINE_LIBRARY_DIRS}) 180 | set_target_properties(foma-bin PROPERTIES RUNTIME_OUTPUT_NAME foma) 181 | 182 | if(MSYS OR NOT WIN32) 183 | add_executable(flookup flookup.c) 184 | target_link_libraries(flookup PRIVATE foma-static ${GETOPT_LIB}) 185 | install(TARGETS flookup RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 186 | endif() 187 | 188 | add_executable(cgflookup cgflookup.c) 189 | target_link_libraries(cgflookup PRIVATE foma-static ${GETOPT_LIB}) 190 | 191 | configure_file(libfoma.pc.in libfoma.pc @ONLY) 192 | 193 | # Install 194 | install(TARGETS foma-static foma-shared ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 195 | install(FILES fomalib.h fomalibconf.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) 196 | install(TARGETS foma-bin cgflookup RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 197 | install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libfoma.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") 198 | 199 | endif() 200 | -------------------------------------------------------------------------------- /foma/utf8.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. 
/* Removes trailing character c, as well as spaces and tabs, in place.   */
/* Scanning stops at the first character (from the end) that is none of  */
/* these. Returns s itself; a NULL argument is returned unchanged, the   */
/* same way trim() treats NULL.                                          */
char *remove_trailing(char *s, char c) {
    size_t len;

    if (s == NULL) {
        return(s);
    }
    /* len counts the characters still kept; size_t avoids truncating */
    /* the result of strlen() for very long strings                   */
    for (len = strlen(s); len > 0; len--) {
        char last = s[len-1];
        if (last != c && last != ' ' && last != '\t') {
            break;
        }
        s[len-1] = '\0';
    }
    return(s);
}
/* Replace equal length substrings in s, in place.                        */
/* Every non-overlapping occurrence of oldstring, found left to right, is */
/* overwritten with the first strlen(oldstring) bytes of newstring.       */
/* Returns s. Nothing is changed when any argument is NULL, when          */
/* oldstring is empty, or when newstring is shorter than oldstring (the   */
/* copy would otherwise read past the end of newstring).                  */
char *streqrep(char *s, char *oldstring, char *newstring) {
    char *ptr;
    size_t len;

    if (s == NULL || oldstring == NULL || newstring == NULL) {
        return(s);
    }
    len = strlen(oldstring);
    if (len == 0 || strlen(newstring) < len) {
        return(s);
    }

    /* Resume searching after each replacement: restarting from s would */
    /* never terminate when newstring itself contains oldstring         */
    for (ptr = s; (ptr = strstr(ptr, oldstring)) != NULL; ptr += len) {
        memcpy(ptr, newstring, len);
    }
    return(s);
}
/* Encodes a code point as a freshly allocated, NUL-terminated UTF-8     */
/* string of one to four bytes. The caller owns the result and must      */
/* free() it.                                                            */
/* Returns NULL when codepoint is negative or above 0x10FFFF, or when    */
/* the allocation fails. Values in the surrogate range are encoded as    */
/* ordinary three-byte sequences, as before.                             */
unsigned char *int2utf8str(int codepoint) {
    unsigned char *value;

    /* Reject out-of-range input before allocating so nothing can leak */
    if (codepoint < 0 || codepoint > 0x10ffff) {
        return(NULL);
    }
    /* Longest sequence is four bytes plus the terminator */
    value = malloc(sizeof(unsigned char)*5);
    if (value == NULL) {
        return(NULL);
    }

    if (codepoint < 0x80) {
        *(value) = (unsigned char)(codepoint);
        *(value+1) = 0;
    } else if (codepoint < 0x800) {
        *(value) = (0xc0 | (unsigned char)(codepoint >> 6));
        *(value+1) = (0x80 | (unsigned char)(codepoint & 0x3f));
        *(value+2) = 0;
    } else if (codepoint < 0x10000) {
        *(value) = (0xe0 | (unsigned char)(codepoint >> 12));
        *(value+1) = (0x80 | (unsigned char)((codepoint >> 6) & 0x3f));
        *(value+2) = (0x80 | (unsigned char)(codepoint & 0x3f));
        *(value+3) = 0;
    } else {
        *(value) = (0xf0 | (unsigned char)(codepoint >> 18));
        *(value+1) = (0x80 | (unsigned char)((codepoint >> 12) & 0x3f));
        *(value+2) = (0x80 | (unsigned char)((codepoint >> 6) & 0x3f));
        *(value+3) = (0x80 | (unsigned char)(codepoint & 0x3f));
        *(value+4) = 0;
    }
    return(value);
}
*/ 17 | 18 | struct state_array { 19 | struct fsm_state *transitions; 20 | }; 21 | 22 | struct fsm_trans_list { 23 | short int in; 24 | short int out; 25 | int target; 26 | struct fsm_trans_list *next; 27 | }; 28 | 29 | struct fsm_state_list { 30 | _Bool used; 31 | _Bool is_final; 32 | _Bool is_initial; 33 | short int num_trans; 34 | int state_number; 35 | struct fsm_trans_list *fsm_trans_list; 36 | }; 37 | 38 | struct fsm_sigma_list { 39 | char *symbol; 40 | }; 41 | 42 | struct fsm_sigma_hash { 43 | char *symbol; 44 | short int sym; 45 | struct fsm_sigma_hash *next; 46 | }; 47 | 48 | typedef void *fsm_read_binary_handle; 49 | 50 | struct fsm_construct_handle { 51 | struct fsm_state_list *fsm_state_list; 52 | int fsm_state_list_size; 53 | struct fsm_sigma_list *fsm_sigma_list; 54 | int fsm_sigma_list_size; 55 | struct fsm_sigma_hash *fsm_sigma_hash; 56 | int fsm_sigma_hash_size; 57 | int maxstate; 58 | int maxsigma; 59 | int numfinals; 60 | int hasinitial; 61 | char *name; 62 | }; 63 | 64 | struct apply_med_handle { 65 | struct astarnode { 66 | short int wordpos; 67 | int fsmstate; 68 | short int f; 69 | short int g; 70 | short int h; 71 | int in; 72 | int out; 73 | int parent; 74 | } *agenda; 75 | int bytes_per_letter_array; 76 | uint8_t *letterbits; 77 | uint8_t *nletterbits; 78 | int astarcount; 79 | int heapcount; 80 | int heap_size; 81 | int agenda_size; 82 | int maxdepth; 83 | int maxsigma; 84 | int wordlen; 85 | int utf8len; 86 | int cost; 87 | int nummatches; 88 | int curr_state; 89 | int curr_g; 90 | int curr_pos; 91 | int lines; 92 | int curr_agenda_offset; 93 | int curr_node_has_match; 94 | int med_limit; 95 | int med_cutoff; 96 | int med_max_heap_size; 97 | int nodes_expanded; 98 | int *cm; 99 | char *word; 100 | char *instring; 101 | int instring_length; 102 | char *outstring; 103 | int outstring_length; 104 | char *align_symbol; 105 | int *heap; 106 | int *intword; 107 | struct sh_handle *sigmahash; 108 | struct state_array *state_array; 109 | struct 
fsm *net; 110 | struct fsm_state *curr_ptr; 111 | _Bool hascm; 112 | }; 113 | 114 | struct apply_handle { 115 | 116 | int ptr; 117 | int curr_ptr; 118 | int ipos; 119 | int opos; 120 | int mode; 121 | int printcount; 122 | int *numlines; 123 | int *statemap; 124 | int *marks; 125 | 126 | struct sigma_trie { 127 | int signum; 128 | struct sigma_trie *next; 129 | } *sigma_trie; 130 | 131 | struct sigmatch_array { 132 | int signumber ; 133 | int consumes ; 134 | } *sigmatch_array; 135 | 136 | struct sigma_trie_arrays { 137 | struct sigma_trie *arr; 138 | struct sigma_trie_arrays *next; 139 | } *sigma_trie_arrays; 140 | 141 | int binsearch; 142 | int indexed; 143 | int state_has_index; 144 | int sigma_size; 145 | int sigmatch_array_size; 146 | int current_instring_length; 147 | int has_flags; 148 | int obey_flags; 149 | int show_flags; 150 | int print_space; 151 | char *space_symbol; 152 | char *separator; 153 | char *epsilon_symbol; 154 | int print_pairs; 155 | int apply_stack_ptr; 156 | int apply_stack_top; 157 | int oldflagneg; 158 | int outstringtop; 159 | int iterate_old; 160 | int iterator; 161 | uint8_t *flagstates; 162 | char *outstring; 163 | char *instring; 164 | struct sigs { 165 | char *symbol; 166 | int length; 167 | } *sigs; 168 | char *oldflagvalue; 169 | 170 | struct fsm *last_net; 171 | struct fsm_state *gstates; 172 | struct sigma *gsigma; 173 | struct apply_state_index { 174 | int fsmptr; 175 | struct apply_state_index *next; 176 | } **index_in, **index_out, *iptr; 177 | 178 | struct flag_list { 179 | char *name; 180 | char *value; 181 | short neg; 182 | struct flag_list *next; 183 | } *flag_list; 184 | 185 | struct flag_lookup { 186 | int type; 187 | char *name; 188 | char *value; 189 | } *flag_lookup ; 190 | 191 | struct searchstack { 192 | int offset; 193 | struct apply_state_index *iptr; 194 | int state_has_index; 195 | int opos; 196 | int ipos; 197 | int visitmark; 198 | char *flagname; 199 | char *flagvalue; 200 | int flagneg; 201 | } 
*searchstack ; 202 | }; 203 | 204 | 205 | /* Automaton functions operating on fsm_state */ 206 | int add_fsm_arc(struct fsm_state *fsm, int offset, int state_no, int in, int out, int target, int final_state, int start_state); 207 | struct fsm_state *fsm_state_copy(struct fsm_state *fsm_state, int linecount); 208 | 209 | /* Functions for constructing a FSM arc-by-arc */ 210 | /* At the end of the constructions, the flags are updated automatically */ 211 | 212 | /* Call fsm_state_init with the alphabet size to initialize the new machine */ 213 | struct fsm_state *fsm_state_init(int sigma_size); 214 | 215 | /* Call set current state before calling fsm_state_add_arc */ 216 | void fsm_state_set_current_state(int state_no, int final_state, int start_state); 217 | 218 | /* Add an arc */ 219 | void fsm_state_add_arc(int state_no, int in, int out, int target, int final_state, int start_state); 220 | 221 | /* Call fsm_state_close() when done with arcs to a state */ 222 | void fsm_state_close(struct fsm *net); 223 | 224 | /* Call this when done with entire FSM */ 225 | void fsm_state_end_state(); 226 | 227 | struct state_array *map_firstlines(struct fsm *net); 228 | 229 | FEXPORT void fsm_count(struct fsm *net); 230 | 231 | void fsm_sort_lines(struct fsm *net); 232 | void fsm_update_flags(struct fsm *net, int det, int pru, int min, int eps, int loop, int completed); 233 | 234 | int sort_cmp(const void *a, const void *b); 235 | 236 | int find_arccount(struct fsm_state *fsm); 237 | 238 | /* Internal int stack */ 239 | int int_stack_isempty(); 240 | int int_stack_isfull(); 241 | void int_stack_clear(); 242 | int int_stack_find (int entry); 243 | void int_stack_push(int c); 244 | int int_stack_pop(); 245 | int int_stack_status(); 246 | int int_stack_size(); 247 | 248 | /* Internal ptr stack */ 249 | int ptr_stack_isempty(); 250 | void ptr_stack_clear(); 251 | void *ptr_stack_pop(); 252 | int ptr_stack_isfull(); 253 | void ptr_stack_push(void *ptr); 254 | 255 | /* Sigma functions 
*/ 256 | FEXPORT int sigma_add (char *symbol, struct sigma *sigma); 257 | FEXPORT int sigma_add_number(struct sigma *sigma, char *symbol, int number); 258 | FEXPORT int sigma_add_special (int symbol, struct sigma *sigma); 259 | FEXPORT struct sigma *sigma_remove(char *symbol, struct sigma *sigma); 260 | FEXPORT struct sigma *sigma_remove_num(int num, struct sigma *sigma); 261 | 262 | int sigma_find (char *symbol, struct sigma *sigma); 263 | int sigma_find_number (int number, struct sigma *sigma); 264 | int sigma_substitute(char *orig, char *sub, struct sigma *sigma); 265 | FEXPORT char *sigma_string(int number, struct sigma *sigma); 266 | int sigma_sort (struct fsm *net); 267 | void sigma_cleanup (struct fsm *net, int force); 268 | FEXPORT struct sigma *sigma_create (); 269 | int sigma_size(struct sigma *sigma); 270 | FEXPORT int sigma_max(struct sigma *sigma); 271 | struct fsm_sigma_list *sigma_to_list(struct sigma *sigma); 272 | 273 | /* Debug */ 274 | void xprintf(char *string); 275 | 276 | /* UTF8 */ 277 | unsigned char *utf8code16tostr(char *str); 278 | int utf8iscombining(unsigned char *s); 279 | int utf8skip(char *str); 280 | int utf8strlen(char *str); 281 | int ishexstr(char *str); 282 | void decode_quoted(char *s); 283 | void dequote_string(char *s); 284 | char *remove_trailing(char *s, char c); 285 | char *escape_string(char *string, char chr); 286 | char *xstrrev(char *str); 287 | 288 | /* Flag-related */ 289 | int flag_check(char *sm); 290 | char *flag_get_name(char *string); 291 | char *flag_get_value(char *string); 292 | int flag_get_type(char *string); 293 | 294 | /* Misc */ 295 | char *trim(char *string); 296 | void strip_newline(char *s); 297 | char *streqrep(char *s, char *oldstring, char *newstring); 298 | char *xxstrndup(const char *s, size_t n); 299 | int next_power_of_two(int v); 300 | unsigned int round_up_to_power_of_two(unsigned int v); 301 | -------------------------------------------------------------------------------- /foma/CHANGELOG: 
-------------------------------------------------------------------------------- 1 | 0.10.0 (20210601) 2 | 3 | - Add runtime options interface. 4 | - Fix numerous memory leaks and segfaults. 5 | - Allow to suppress verbose messages. 6 | - Drop leftover command "test star-free" 7 | - Render error message when input file contains UTF byte order mark. 8 | - Fix error rendering: make yyerror signature consistent with yyparse. 9 | - Skip flags elimination if there are no paths in network. 10 | - Makefile now creates libfoma.pc to allow pkg-config provide info about foma. 11 | - Fixed syntax error reporting line count in lexc. 12 | - Add Python 3 compatibility and Python 3 port of foma2js.perl 13 | - Sort alphabet after eliminating flags. 14 | - Make "lower" be "lower-words". 15 | - New rewrite algorithms. 16 | 17 | 0.9.18alpha (20150612) 18 | 19 | - Many bugfixes and speedups 20 | - Corrects handling of some rare unicode composing diacritics 21 | - Python bindings 22 | - Adds _closeu()-builtin function which closes sigma 23 | - Adds _sublabel(L1,symbol,L2)-builtin function: substitute all instances of symbol in L1 with L2 24 | - Adds separate pairs, pairs > file, random-pairs commands in interface 25 | - Adds possibility to automatically align lexc-entries 26 | 27 | 0.9.17alpha (20121117) 28 | 29 | - Many bugfixes in foma, flookup (apply code) 30 | - Adds possibility to redirect "print words" output to file by "print words > file" 31 | 32 | 0.9.16alpha (20111213) 33 | 34 | - New faster apply code, as well as optional indexing of arcs in flookup 35 | - adds rewrite rule formalism "transducers with backreferences" (e.g. T -> || L _ R, where T is a transducer) 36 | - flookup now has option to run as UDP server (-S) 37 | - Adds low-level functions _marktail(), _addfinalloop(), _addnonfinalloop(), _addsink(), leftrewr(), _flatten(). 
 38 | - Minor bugfixes in apply code 39 | - Some rare memory leak fixes throughout 40 | - Adds functions so that apply_med() can be used through the API 41 | - Adds functions to API fsm_get_next_state(), fsm_get_next_state_arc() 42 | 43 | 0.9.15alpha (20111110) 44 | 45 | - adds cgflookup utility to facilitate piping with Constraint Grammar parsers 46 | - flookup now applies in chain (a virtual compose) if there are several transducers in a file, except if the [-a] flag is given, in which case flookup looks for an output in each transducer until it finds one (simulating priority union .P.) 47 | - changes and bug fixes in application code/flag diacritic handling 48 | - lexc now accepts forced alignments with zeroes and info strings 49 | - \r (carriage return) awareness added 50 | - adds support for the construct regex @re"regexfile.txt" (assumes file has one regular expression on one line) 51 | - fixes argument order of interface commands compose net and concatenate net 52 | - adds functions in API to read in multiple nets from one file through an iterator fsm_read_binary_file_multiple_init(infilename)/fsm_read_binary_file_multiple(rh)) 53 | - improved error messages 54 | - adds "test sequential" ("tseq" for short) and assert stack commands 55 | - fixes bugs with loading/saving nets that have newlines in transitions 56 | 57 | 0.9.14alpha (20110203) 58 | 59 | - Changed tokenization in apply_up() and apply_down() to always choose the leftmost longest tokenization in case of multicharacter symbols sharing the same prefix. Previously, all possible tokenizations were given. For example, earlier regex [a a|aa] would match the input string "aa" in two ways, yielding two outputs, whereas only one tokenization is given now, the leftmost-longest one. This also speeds up the apply functions. 
 60 | - Fix minor bug in fsm_construct_set_final() and fsm_construct_set_initial(), affecting fsm_reverse() 61 | - Minimize automata read with "read text" and "read spaced-text" 62 | - Minor change in .dot output style 63 | - Fix bug in fsm_trie 64 | 65 | 0.9.13alpha (20101025) 66 | 67 | - Various memory-management improvements throughout 68 | - Adds the commands "read text" and "read spaced-text" for building automata/transducers from word lists. The functionality can be embedded in regexes with @txt"filename" and @stxt"filename". From the API, the functions fsm_read_text_file() and fsm_read_spaced_text_file() read files and build trie FSMs from the lists 69 | - Adds the external utility flookup, which reads words from stdin and applies them to a transducer given in a file, and prints output to stdout 70 | - Adds the command "substitute defined", and the corresponding fsm_substitute_label() in the API. 71 | - Improved prolog file reading and writing 72 | - Adds a function to convert a multicharacter machine to a letter machine (where each transition is exactly one unicode symbol long). It is called fsm_letter_machine() in the API, and _lm() in the built-in functions 73 | - Print random-lower/random-upper/random-words now provide a "more" random distribution. Duplicates are not printed, but prefixed by a count 74 | - Thread-safety changes in apply.c API 75 | - Bug fixes in order of stack operations, printing, reverse operation (.r), function definitions, lower-side priority union (.p.) 
76 | - New API calls fsm_construct_* and fsm_get_* for constructing and reading automata/transducers 77 | 78 | 0.9.12alpha (20091025) 79 | 80 | - OSX version "view" command launches native Graphviz (get it from http://www.pixelglow.com/graphviz/) 81 | - export cmatrix writes AT&T format weighted transducer 82 | - More API functions 83 | - Added support for reading/writing FSM files in AT&T format 84 | - Added "apply down/up < infile (> outfile)" 85 | - Separated the API functions into libfoma.h. Separate library builds. 86 | - Apply functions in API are now iterators 87 | - Minor bugfixes in regex parsing / lexc file parsing 88 | - Added the built-in _eq()-operator 89 | - Added extraction of attested symbol pairs and extraction of sigma as an FSM through "label net" and "sigma net" 90 | - Minor bugfix in fsm_cross_product() 91 | - Bugfix in "print dot" 92 | 93 | 0.9.11alpha (20090722) 94 | 95 | - Symbol handling efficiency modifications in fsm_minimize() and fsm_determinize() 96 | - Added the function fsm_epsilon_remove() (not used outside the API, however) 97 | - Minor bugfixes in fsm_rewrite() 98 | - A bug that was reintroduced in fsm_compose() in 0.9.10alpha is fixed 99 | - fsm_lower() fsm_upper() fsm_kleene_star() fsm_concatenate() have been rewritten 100 | - Added the functionality "ambiguous upper","extract ambiguous","extract unambiguous" and the corresponding built-in regex functions: _ambdom(), _ambpart(), _unambpart(). The first function extracts the input words that have multiple paths through a transducer. The other two split up a given transducer into an ambiguous or unambiguous one. 
101 | 102 | 0.9.10alpha (20090717) 103 | - Added support for loading and saving networks in a binary file format and the commands "save stack" "load stack" "save defined" "load defined" and the regular expression construct 'regex @"filename"' which loads the network in "filename" (uses libz) 104 | - foma is no longer compiled with libgc as default 105 | - Some major memory management changes in fsm_minimize() and fsm_determinize(). Efficiency/memory tweaks in fsm_determinize(). 106 | - Minor bugfixes in fsm_compose() 107 | 108 | 0.9.9alpha (20090621) 109 | - Added support for saving networks in prolog file format (write prolog > filename or wpl > filename). 110 | - Bugfixes in "read prolog"/"rpl" 111 | - Minor bugfix in "test unambiguous"/_isunambiguous() 112 | 113 | 0.9.8alpha (20090604) 114 | - Added option to load confusion matrices (load cmatrix filename) that specify costs for minimum edit distance matching and attaches to a network. The command "print cmatrix" prints the matrix associated with the network on top of the stack. These are used whenever "apply med" is called. If no matrix is specified, the default distance is Levenshtein. 115 | - Added the global variables med-cutoff and med-limit to control the med search 116 | - Minor bugfixes in fsm_compose() 117 | - Changes in dot file output (print dot, print dot > filename) and "view net" 118 | - Added tests for transducer ambiguity (test unambiguous/tunam), and the equivalent built-in function _isunambiguous() which returns a boolean automaton. 119 | 120 | 0.9.7alpha (20090519) 121 | - Fixed bug in print shortest-string "alias: pss" 122 | - Added tests for transducer functionality (test functional/tfu) and transducer identity (test identity/tid). 123 | - Added support for built-in functions. They use the same notation as user-defined ones, except all begin with _. 
Functions that are network property tests such as _isfunctional() and _isidentity() return boolean automata (the empty set for FALSE and the empty string for TRUE). 124 | 125 | 0.9.6alpha (20090506) 126 | - Fixed bugs in apply up 127 | - Fixed bugs in left and right quotient (\\\ and ///) 128 | - Changed fsm_minus(), i.e. A - B, so that it subtracts paths instead of being A & ~B. This means transducer paths can be subtracted. 129 | - Added algorithms for finding the minimum edit distance between a word and a fsm. "apply med" is the same as "apply down" except it finds the cheapest approximate matches. Still experimental. 130 | 131 | 0.9.5alpha (20090325) 132 | - Fixed bug in lexcread that affected lexicons with very long words 133 | - Fixed bug in context restrict with 0 as both contexts, e.g. a => 0 _ 0 134 | 135 | 0.9.4alpha (20090116) 136 | - Tweaked memory and efficiency in minimization and determinization algorithms. For large automata, minimization uses much less temporary memory. 137 | - Added 'flag-is-epsilon' variable behavior and composition algorithm now obeys it 138 | 139 | -- 140 | 141 | 0.9.3alpha (20090113) 142 | - Recursive script-files now work 143 | - Changed lexc format reading so Lexicon is accepted as well as LEXICON as a keyword 144 | - Minor bugfix for comment behavior in script files 145 | - Minor bugfix in "define" format 146 | 147 | -- 148 | 149 | 0.9.2alpha (20090111) 150 | - Determinization now uses much less temporary memory without sacrificing efficiency. 151 | The memory use caused severe problems for compiling very large lexicons or determinizing 152 | large automata with limited memory. 
153 | 154 | - Added a "-r" command line option for starting foma without readline (mainly for Win32 version) 155 | 156 | - Fixed a serious bug in the ternary `[A,B,C] substitution operator and added the interface command 157 | "substitute symbol B for A" 158 | 159 | - Added a slight optimization in directed replacements 160 | 161 | -- 162 | 163 | 0.9.1alpha (20081231) 164 | - First public release 165 | -------------------------------------------------------------------------------- /foma/foma.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include 19 | #include 20 | #include 21 | #ifndef _MSC_VER 22 | #include 23 | #endif 24 | #include 25 | #include 26 | #include 27 | #include "foma.h" 28 | 29 | /* Front-end behavior variables */ 30 | int pipe_mode = 0; 31 | extern int g_verbose; 32 | static int use_readline = 1; 33 | 34 | int promptmode = PROMPT_MAIN; 35 | int apply_direction; 36 | 37 | /* Variable to pass the position of rl completion to our completer */ 38 | static int smatch; 39 | 40 | char *usagestring = "Usage: foma [-e \"command\"] [-f run-once-script] [-l startupscript] [-p] [-q] [-s] [-v]\n"; 41 | 42 | static char** my_completion(const char*, int ,int); 43 | char *my_generator(const char* , int); 44 | char *cmd [] = {"ambiguous upper","apply down","apply med","apply up","apropos","assert-stack","clear stack","close sigma","compact sigma","complete net","compose net","concatenate net","crossproduct net","define","determinize net","echo","eliminate flags","eliminate flag","export cmatrix","extract ambiguous","extract unambiguous","factorize","help license","help warranty","ignore net","intersect net","invert net","label net","letter machine","load defined","lower-side net","minimize net","name net","negate net","one-plus net","pop stack","print defined","print dot","print lower-words","print cmatrix","print name","print net","print random-lower","print random-upper","print random-words","print sigma","print size","print shortest-string","print shortest-string-length","print words","print pairs","print random-pairs","print upper-words","prune net","push defined","quit","read att","read cmatrix","read prolog","read lexc","read regex","read spaced-text","read text","reverse net","rotate stack","save defined","save stack","sequentialize","set","show variables","show variable","shuffle net","sigma","sigma net","source","sort in","sort net","sort out","substitute defined","substitute symbol","system","test unambiguous","test equivalent","test functional","test identity","test 
lower-universal","test upper-universal","test non-null","test null","test sequential","turn stack","twosided flag-diacritics","undefine","union net","upper-side net","view net","write att","write prolog","zero-plus net",NULL}; 45 | 46 | char *abbrvcmd [] = {"ambiguous","close","down","up","med","size","loadd","lower-words","upper-words","net","random-lower","random-upper","words","random-words","regex","rpl","au revoir","bye","exit","saved","seq","ss","stack","tunam","tid","tfu","tlu","tuu","tnu","tnn","tseq","tsf","equ","pss","psz","ratt","tfd","hyvästi","watt","wpl","examb","exunamb","pairs","random-pairs",NULL}; 47 | 48 | /* #include "yy.tab.h" */ 49 | 50 | int view_net(struct fsm *net); 51 | 52 | extern int input_is_file; 53 | extern int add_history (const char *); 54 | extern int my_yyparse(char *my_string); 55 | void print_help(); 56 | void xprintf(char *string) { return ; printf("%s",string); } 57 | char disclaimer[] = "Foma, version 0.10.0\nCopyright © 2008-2021 Mans Hulden\nThis is free software; see the source code for copying conditions.\nThere is ABSOLUTELY NO WARRANTY; for details, type \"help license\"\n\nType \"help\" to list all commands available.\nType \"help \" or help \"\" for further help.\n\n"; 58 | 59 | /* A static variable for holding the line. */ 60 | 61 | static char *command = (char *)NULL; 62 | char *flex_command = NULL; 63 | static char *line_read = (char *)NULL; 64 | char no_readline_line[512]; 65 | 66 | /* Read a string, and return a pointer to it. 67 | Returns NULL on EOF. */ 68 | 69 | char *rl_gets(char *prompt) { 70 | 71 | /* If the buffer has already been allocated, 72 | return the memory to the free pool. 
*/ 73 | if (use_readline == 1) { 74 | if (line_read) { 75 | free(line_read); 76 | line_read = (char *)NULL; 77 | } 78 | } 79 | if (use_readline == 0) { 80 | printf("%s",prompt); 81 | line_read = fgets(no_readline_line, 511, stdin); 82 | if (line_read != NULL) { 83 | strip_newline(line_read); 84 | } 85 | } else { 86 | line_read = readline(prompt); 87 | } 88 | 89 | /* If the line has any text in it, 90 | save it on the history. */ 91 | if (use_readline == 1) { 92 | if (line_read && *line_read) 93 | add_history(line_read); 94 | } 95 | return (line_read); 96 | } 97 | 98 | int main(int argc, char *argv[]) { 99 | int opt; 100 | 101 | char *scriptfile, prompt[50]; 102 | extern void my_interfaceparse(char *my_string); 103 | /* YY_BUFFER_STATE flex_command; */ 104 | stack_init(); 105 | srand ((unsigned int)time(NULL)); 106 | /* Init defined_networks structures */ 107 | g_defines = defined_networks_init(); 108 | g_defines_f = defined_functions_init(); 109 | 110 | while ((opt = getopt(argc, argv, "e:f:hl:pqrsv")) != -1) { 111 | switch(opt) { 112 | case 'e': 113 | my_interfaceparse(optarg); 114 | break; 115 | case 'f': 116 | scriptfile = file_to_mem(optarg); 117 | if (scriptfile != NULL) { 118 | input_is_file = 1; 119 | my_interfaceparse(scriptfile); 120 | } 121 | exit(0); 122 | case 'h': 123 | print_help(); 124 | exit(0); 125 | case 'l': 126 | scriptfile = file_to_mem(optarg); 127 | if (scriptfile != NULL) { 128 | input_is_file = 1; 129 | my_interfaceparse(scriptfile); 130 | free(scriptfile); 131 | } 132 | break; 133 | case 'p': 134 | pipe_mode = 1; 135 | break; 136 | case 'q': 137 | g_verbose = 0; 138 | break; 139 | case 'r': 140 | use_readline = 0; 141 | break; 142 | case 's': 143 | exit(0); 144 | case 'v': 145 | printf("%s %i.%i.%i%s\n",argv[0],MAJOR_VERSION,MINOR_VERSION,BUILD_VERSION,STATUS_VERSION); 146 | exit(0); 147 | default: 148 | fprintf(stderr, "%s", usagestring); 149 | exit(EXIT_FAILURE); 150 | } 151 | } 152 | 153 | if (!pipe_mode && g_verbose) 154 | 
printf("%s",disclaimer); 155 | rl_basic_word_break_characters = " >"; 156 | 157 | rl_attempted_completion_function = my_completion; 158 | for(;;) { 159 | if (promptmode == PROMPT_MAIN) 160 | sprintf(prompt, "foma[%i]: ",stack_size()); 161 | if (promptmode == PROMPT_A && apply_direction == AP_D) 162 | sprintf(prompt, "apply down> "); 163 | if (promptmode == PROMPT_A && apply_direction == AP_U) 164 | sprintf(prompt, "apply up> "); 165 | if (promptmode == PROMPT_A && apply_direction == AP_M) 166 | sprintf(prompt, "apply med> "); 167 | if (pipe_mode || !g_verbose) 168 | prompt[0] = '\0'; 169 | 170 | fflush(stdout); 171 | 172 | command = rl_gets(prompt); 173 | 174 | if (command == NULL && promptmode == PROMPT_MAIN) { 175 | printf("\n"); 176 | exit(0); 177 | } 178 | if (command == NULL && promptmode == PROMPT_A) { 179 | /* apply_clear(); */ 180 | promptmode = PROMPT_MAIN; 181 | printf("\n"); 182 | continue; 183 | } 184 | input_is_file = 0; 185 | my_interfaceparse(command); 186 | } 187 | } 188 | 189 | void print_help() { 190 | printf("%s",usagestring); 191 | printf("Options:\n"); 192 | printf("-e \"command\"\texecute a command on startup (-e can be invoked several times)\n"); 193 | printf("-f scriptfile\tread commands from scriptfile on startup, and quit\n"); 194 | printf("-l scriptfile\tread commands from scriptfile on startup\n"); 195 | printf("-p\t\tpipe-mode\n"); 196 | printf("-q\t\tquiet mode (more quiet than pipe-mode)\n"); 197 | printf("-r\t\tdon't use readline library for input\n"); 198 | printf("-s\t\tstop execution and exit\n"); 199 | printf("-v\t\tprint version number\n"); 200 | } 201 | 202 | static char **my_completion(const char *text, int start, int end) { 203 | char **matches; 204 | 205 | matches = (char **)NULL; 206 | smatch = start; 207 | matches = rl_completion_matches ((char*)text, &my_generator); 208 | 209 | return (matches); 210 | } 211 | 212 | char *my_generator(const char *text, int state) { 213 | static int list_index, list_index2, len, nummatches; 
214 | char *name; 215 | text = rl_line_buffer; 216 | if (!state) { 217 | list_index = 0; 218 | list_index2 = 0; 219 | nummatches = 0; 220 | len = strlen(text); 221 | } 222 | 223 | while ((name = cmd[list_index])) { 224 | list_index++; 225 | 226 | if (strncmp (name, text, len) == 0) { 227 | nummatches++; 228 | /* Can't use strdup here */ 229 | return(strdup(name+smatch)); 230 | } 231 | } 232 | 233 | if (rl_point > 0) { 234 | while ((name = abbrvcmd[list_index2])) { 235 | list_index2++; 236 | 237 | /* Can't use strdup here */ 238 | if (strncmp (name, text, len) == 0) 239 | return(strdup(name+smatch)); 240 | } 241 | } 242 | 243 | /* If no names matched, then return NULL. */ 244 | return ((char *)NULL); 245 | } 246 | -------------------------------------------------------------------------------- /foma/cgflookup.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #ifndef _MSC_VER 24 | #include 25 | #endif 26 | #include 27 | #include 28 | #include "fomalib.h" 29 | 30 | #define LINE_LIMIT 262144 31 | #define UDP_MAX 65535 32 | #define FLOOKUP_PORT 6062 33 | 34 | static char *usagestring = "Usage: cgflookup [-h] [-a] [-i] [-s \"separator\"] [-w \"wordseparator\"] [-v] [-x] [-b] [-I <#|#k|#m|f>] \n"; 35 | 36 | static char *helpstring = 37 | "Applies words from stdin to a foma transducer/automaton read from a file and prints results to stdout.\n" 38 | 39 | "If the file contains several nets, inputs will be passed through all of them (simulating composition) or applied as alternates if the -a flag is specified (simulating priority union: the first net is tried first, if that fails to produce an output, then the second is tried, etc.).\n\n" 40 | "Options:\n\n" 41 | "-h\t\tprint help\n" 42 | "-a\t\ttry alternatives (in order of nets loaded, default is to pass words through each)\n" 43 | "-b\t\tunbuffered output (flushes output after each input word, for use in bidirectional piping)\n" 44 | "-i\t\tinverse application (apply down instead of up)\n" 45 | "-I indextype\tindex arcs with indextype (one of -I f -I #k -I #m or -I #)\n" 46 | "\t\t(usually slower than the default except for states > 1,000 arcs)\n" 47 | "\t\t -I # will index all states containing # arcs or more\n" 48 | "\t\t -I NUMk will index states from densest to sparsest until reaching mem limit of # kB\n" 49 | "\t\t -I NUMM will index states from densest to sparsest until reaching mem limit of # MB\n" 50 | "\t\t -I f will index flag-containing states only\n" 51 | "-q\t\tdon't sort arcs before applying (usually slower, except for really small, sparse automata)\n" 52 | "-s \"separator\"\tchange input/output separator symbol (default is TAB)\n" 53 | "-u \"separator\"\tmark uppercase words with <*>\n" 54 | "-w \"separator\"\tchange words separator symbol (default is LF)\n" 55 | "-v\t\tprint 
version number\n"; 56 | 57 | struct lookup_chain { 58 | struct fsm *net; 59 | struct apply_handle *ah; 60 | struct lookup_chain *next; 61 | struct lookup_chain *prev; 62 | }; 63 | 64 | #define DIR_DOWN 0 65 | #define DIR_UP 1 66 | 67 | static char buffer[2048]; 68 | static int apply_alternates = 0, numnets = 0, direction = DIR_UP, results, buffered_output = 1, index_arcs = 0, index_flag_states = 0, index_cutoff = 0, index_mem_limit = INT_MAX, mark_uppercase = 0; 69 | static char *separator = "\t", *wordseparator = "", *line, *indent = "\t"; 70 | static FILE *INFILE; 71 | static struct lookup_chain *chain_head, *chain_tail, *chain_new, *chain_pos; 72 | static fsm_read_binary_handle fsrh; 73 | 74 | static char *(*applyer)(struct apply_handle *h, char *word) = &apply_up; /* Default apply direction = up */ 75 | static void handle_line(char *s); 76 | static void app_print(char *result); 77 | static char *get_next_line(); 78 | 79 | void app_print(char *result) { 80 | wchar_t testuc[1]; // Temp storage to test uc of first letter in string 81 | if (result == NULL) { 82 | fprintf(stdout, "\"<%s>\"\n", line); 83 | } else { 84 | /* Or format string first */ 85 | if (mark_uppercase) { 86 | mbstowcs(testuc, line, 1); 87 | if (iswupper(*testuc)) { 88 | fprintf(stdout,"%s%s <*>\n",indent, result); 89 | } else { 90 | fprintf(stdout,"%s%s\n",indent, result); 91 | } 92 | } else { 93 | fprintf(stdout,"%s%s\n",indent, result); 94 | } 95 | } 96 | } 97 | 98 | int main(int argc, char *argv[]) { 99 | int opt, sortarcs = 1; 100 | char *infilename; 101 | struct fsm *net; 102 | 103 | setvbuf(stdout, buffer, _IOFBF, sizeof(buffer)); 104 | 105 | while ((opt = getopt(argc, argv, "abhHiI:qs:uw:vx")) != -1) { 106 | switch(opt) { 107 | case 'a': 108 | apply_alternates = 1; 109 | break; 110 | case 'b': 111 | buffered_output = 0; 112 | break; 113 | case 'h': 114 | printf("%s%s\n", usagestring,helpstring); 115 | exit(0); 116 | case 'i': 117 | direction = DIR_DOWN; 118 | applyer = &apply_down; 119 | 
break; 120 | case 'q': 121 | sortarcs = 0; 122 | break; 123 | case 'I': 124 | if (strcmp(optarg, "f") == 0) { 125 | index_flag_states = 1; 126 | index_arcs = 1; 127 | } else if (strstr(optarg, "k") != NULL && strstr(optarg,"K") != NULL) { 128 | /* k limit */ 129 | index_mem_limit = 1024*atoi(optarg); 130 | index_arcs = 1; 131 | } else if (strstr(optarg, "m") != NULL && strstr(optarg,"M") != NULL) { 132 | /* m limit */ 133 | index_mem_limit = 1024*1024*atoi(optarg); 134 | index_arcs = 1; 135 | } else if (isdigit(*optarg)) { 136 | index_arcs = 1; 137 | index_cutoff = atoi(optarg); 138 | } 139 | break; 140 | case 's': 141 | separator = strdup(optarg); 142 | break; 143 | case 'u': 144 | mark_uppercase = 1; 145 | if (!setlocale(LC_CTYPE, "")) { 146 | fprintf(stderr, "Check uppercase flag is on, but can't set locale!\n"); 147 | } 148 | break; 149 | case 'w': 150 | wordseparator = strdup(optarg); 151 | break; 152 | case 'v': 153 | printf("cgflookup 1.03 (foma library version %s)\n", fsm_get_library_version_string()); 154 | exit(0); 155 | default: 156 | fprintf(stderr, "%s", usagestring); 157 | exit(EXIT_FAILURE); 158 | } 159 | } 160 | if (optind == argc) { 161 | fprintf(stderr, "%s", usagestring); 162 | exit(EXIT_FAILURE); 163 | } 164 | 165 | infilename = argv[optind]; 166 | 167 | if ((fsrh = fsm_read_binary_file_multiple_init(infilename)) == NULL) { 168 | perror("File error"); 169 | exit(EXIT_FAILURE); 170 | } 171 | chain_head = chain_tail = NULL; 172 | 173 | while ((net = fsm_read_binary_file_multiple(fsrh)) != NULL) { 174 | numnets++; 175 | chain_new = malloc(sizeof(struct lookup_chain)); 176 | if (direction == DIR_DOWN && net->arcs_sorted_in != 1 && sortarcs) { 177 | fsm_sort_arcs(net, 1); 178 | } 179 | if (direction == DIR_UP && net->arcs_sorted_out != 1 && sortarcs) { 180 | fsm_sort_arcs(net, 2); 181 | } 182 | chain_new->net = net; 183 | chain_new->ah = apply_init(net); 184 | if (direction == DIR_DOWN && index_arcs) { 185 | apply_index(chain_new->ah, 
APPLY_INDEX_INPUT, index_cutoff, index_mem_limit, index_flag_states); 186 | } 187 | if (direction == DIR_UP && index_arcs) { 188 | apply_index(chain_new->ah, APPLY_INDEX_OUTPUT, index_cutoff, index_mem_limit, index_flag_states); 189 | } 190 | 191 | chain_new->next = NULL; 192 | chain_new->prev = NULL; 193 | if (chain_tail == NULL) { 194 | chain_tail = chain_head = chain_new; 195 | } else if (direction == DIR_DOWN || apply_alternates == 1) { 196 | chain_tail->next = chain_new; 197 | chain_new->prev = chain_tail; 198 | chain_tail = chain_new; 199 | } else { 200 | chain_new->next = chain_head; 201 | chain_head->prev = chain_new; 202 | chain_head = chain_new; 203 | } 204 | } 205 | 206 | if (numnets < 1) { 207 | fprintf(stderr, "%s: %s\n", "File error", infilename); 208 | exit(EXIT_FAILURE); 209 | } 210 | 211 | /* Standard read from stdin */ 212 | line = calloc(LINE_LIMIT, sizeof(char)); 213 | INFILE = stdin; 214 | while (get_next_line() != NULL) { 215 | results = 0; 216 | handle_line(line); 217 | if (results == 0) { 218 | app_print(NULL); 219 | } 220 | fprintf(stdout, "%s", wordseparator); 221 | if (!buffered_output) { 222 | fflush(stdout); 223 | } 224 | } 225 | /* Cleanup */ 226 | for (chain_pos = chain_head; chain_pos != NULL; chain_pos = chain_head) { 227 | chain_head = chain_pos->next; 228 | if (chain_pos->ah != NULL) { 229 | apply_clear(chain_pos->ah); 230 | } 231 | if (chain_pos->net != NULL) { 232 | fsm_destroy(chain_pos->net); 233 | } 234 | free(chain_pos); 235 | } 236 | if (line != NULL) 237 | free(line); 238 | exit(0); 239 | } 240 | 241 | char *get_next_line() { 242 | char *r; 243 | if ((r = fgets(line, LINE_LIMIT, INFILE)) != NULL) { 244 | line[strcspn(line, "\n\r")] = '\0'; 245 | } 246 | return r; 247 | } 248 | 249 | void handle_line(char *s) { 250 | char *result, *tempstr; 251 | /* Apply alternative */ 252 | results = 0; 253 | if (apply_alternates == 1) { 254 | for (chain_pos = chain_head, tempstr = s; ; chain_pos = chain_pos->next) { 255 | result = 
applyer(chain_pos->ah, tempstr); 256 | if (result != NULL) { 257 | results++; 258 | if (results == 1) { 259 | fprintf(stdout, "\"<%s>\"\n",line); 260 | } 261 | app_print(result); 262 | while ((result = applyer(chain_pos->ah, NULL)) != NULL) { 263 | results++; 264 | app_print(result); 265 | } 266 | break; 267 | } 268 | if (chain_pos == chain_tail) { 269 | break; 270 | } 271 | } 272 | } else { 273 | 274 | /* Get result from chain */ 275 | for (chain_pos = chain_head, tempstr = s; ; chain_pos = chain_pos->next) { 276 | result = applyer(chain_pos->ah, tempstr); 277 | if (result != NULL && chain_pos != chain_tail) { 278 | tempstr = result; 279 | continue; 280 | } 281 | if (result != NULL && chain_pos == chain_tail) { 282 | do { 283 | results++; 284 | if (results == 1) { 285 | fprintf(stdout, "\"<%s>\"\n",line); 286 | } 287 | app_print(result); 288 | } while ((result = applyer(chain_pos->ah, NULL)) != NULL); 289 | } 290 | if (result == NULL) { 291 | /* Move up */ 292 | for (chain_pos = chain_pos->prev; chain_pos != NULL; chain_pos = chain_pos->prev) { 293 | result = applyer(chain_pos->ah, NULL); 294 | if (result != NULL) { 295 | tempstr = result; 296 | break; 297 | } 298 | } 299 | } 300 | if (chain_pos == NULL) { 301 | break; 302 | } 303 | } 304 | } 305 | } 306 | -------------------------------------------------------------------------------- /foma/COPYING: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /foma/flookup.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "fomalib.h" 29 | 30 | #define LINE_LIMIT 262144 31 | #define UDP_MAX 65535 32 | #define FLOOKUP_PORT 6062 33 | 34 | static char *usagestring = "Usage: flookup [-h] [-a] [-i] [-s \"separator\"] [-w \"wordseparator\"] [-v] [-x] [-b] [-I <#|#k|#m|f>] [-S] [-P] [-A] \n"; 35 | 36 | static char *helpstring = 37 | "Applies words from stdin to a foma transducer/automaton read from a file and prints results to stdout.\n" 38 | 39 | "If the file contains several nets, inputs will be passed through all of them (simulating composition) or applied as alternates if the -a flag is specified (simulating priority union: the first net is tried first, if that fails to produce an output, then the second is tried, etc.).\n\n" 40 | "Options:\n\n" 41 | "-h\t\tprint help\n" 42 | "-a\t\ttry alternatives (in order of nets loaded, default is to pass words through each)\n" 43 | "-b\t\tunbuffered output (flushes output after each input word, for use in bidirectional piping)\n" 44 | "-i\t\tinverse application (apply down instead of up)\n" 45 | "-I indextype\tindex arcs with indextype (one of -I f -I #k -I #m or -I #)\n" 46 | "\t\t(usually slower than the default except for states > 1,000 arcs)\n" 47 | "\t\t -I # will index all states containing # arcs or more\n" 48 | "\t\t -I NUMk will index states from densest to sparsest until reaching mem limit of # kB\n" 49 | "\t\t -I NUMM will index states from densest to sparsest until reaching mem limit of # MB\n" 50 | "\t\t -I f will index flag-containing states only\n" 51 | "-q\t\tdon't sort arcs before applying (usually slower, except for really small, sparse automata)\n" 52 | "-S\t\trun flookup as UDP server (default addr INADDR_ANY port 6062)\n" 53 | "-A\t\t specify address of server\n" 54 | "-P\t\t specify port of server (default 6062)\n" 55 | "-s \"separator\"\tchange input/output 
separator symbol (default is TAB)\n" 56 | "-w \"separator\"\tchange words separator symbol (default is LF)\n" 57 | "-v\t\tprint version number\n" 58 | "-x\t\tdon't echo input string"; 59 | 60 | struct lookup_chain { 61 | struct fsm *net; 62 | struct apply_handle *ah; 63 | struct lookup_chain *next; 64 | struct lookup_chain *prev; 65 | }; 66 | 67 | #define DIR_DOWN 0 68 | #define DIR_UP 1 69 | 70 | static struct sockaddr_in serveraddr, clientaddr; 71 | static int listen_sd, numbytes; 72 | static socklen_t addrlen; 73 | 74 | static char buffer[2048]; 75 | static int echo = 1, apply_alternates = 0, numnets = 0, direction = DIR_UP, results, buffered_output = 1, index_arcs = 0, index_flag_states = 0, index_cutoff = 0, index_mem_limit = INT_MAX, mode_server = 0, port_number = FLOOKUP_PORT, udpsize; 76 | static char *separator = "\t", *wordseparator = "\n", *server_address = NULL, *line, *serverstring = NULL; 77 | static FILE *INFILE; 78 | static struct lookup_chain *chain_head, *chain_tail, *chain_new, *chain_pos; 79 | static fsm_read_binary_handle fsrh; 80 | 81 | static char *(*applyer)(struct apply_handle *h, char *word) = &apply_up; /* Default apply direction = up */ 82 | static void handle_line(char *s); 83 | static void app_print(char *result); 84 | static char *get_next_line(); 85 | static void server_init(); 86 | 87 | void app_print(char *result) { 88 | 89 | if (!mode_server) { 90 | if (echo == 1) { 91 | fprintf(stdout, "%s%s",line, separator); 92 | } 93 | if (result == NULL) { 94 | fprintf(stdout,"+?\n"); 95 | } else { 96 | fprintf(stdout, "%s\n", result); 97 | } 98 | } else { 99 | if (echo == 1) { 100 | strncat(serverstring+udpsize, line, UDP_MAX-udpsize); 101 | udpsize += strlen(line); 102 | strncat(serverstring+udpsize, separator, UDP_MAX-udpsize); 103 | udpsize += strlen(separator); 104 | } 105 | if (result == NULL) { 106 | strncat(serverstring+udpsize, "?+\n", UDP_MAX-udpsize); 107 | udpsize += 3; 108 | } else { 109 | strncat(serverstring+udpsize, result, 
UDP_MAX-udpsize); 110 | udpsize += strlen(result); 111 | strncat(serverstring+udpsize, "\n", UDP_MAX-udpsize); 112 | udpsize++; 113 | } 114 | } 115 | } 116 | 117 | int main(int argc, char *argv[]) { 118 | int opt, sortarcs = 1; 119 | char *infilename; 120 | struct fsm *net; 121 | 122 | setvbuf(stdout, buffer, _IOFBF, sizeof(buffer)); 123 | 124 | while ((opt = getopt(argc, argv, "abhHiI:qs:SA:P:w:vx")) != -1) { 125 | switch(opt) { 126 | case 'a': 127 | apply_alternates = 1; 128 | break; 129 | case 'b': 130 | buffered_output = 0; 131 | break; 132 | case 'h': 133 | printf("%s%s\n", usagestring,helpstring); 134 | exit(0); 135 | case 'i': 136 | direction = DIR_DOWN; 137 | applyer = &apply_down; 138 | break; 139 | case 'q': 140 | sortarcs = 0; 141 | break; 142 | case 'I': 143 | if (strcmp(optarg, "f") == 0) { 144 | index_flag_states = 1; 145 | index_arcs = 1; 146 | } else if (strstr(optarg, "k") != NULL && strstr(optarg,"K") != NULL) { 147 | /* k limit */ 148 | index_mem_limit = 1024*atoi(optarg); 149 | index_arcs = 1; 150 | } else if (strstr(optarg, "m") != NULL && strstr(optarg,"M") != NULL) { 151 | /* m limit */ 152 | index_mem_limit = 1024*1024*atoi(optarg); 153 | index_arcs = 1; 154 | } else if (isdigit(*optarg)) { 155 | index_arcs = 1; 156 | index_cutoff = atoi(optarg); 157 | } 158 | break; 159 | case 's': 160 | separator = strdup(optarg); 161 | break; 162 | case 'S': 163 | mode_server = 1; 164 | break; 165 | case 'A': 166 | server_address = strdup(optarg); 167 | break; 168 | case 'P': 169 | port_number = atoi(optarg); 170 | break; 171 | case 'w': 172 | wordseparator = strdup(optarg); 173 | break; 174 | case 'v': 175 | printf("flookup 1.03 (foma library version %s)\n", fsm_get_library_version_string()); 176 | exit(0); 177 | case 'x': 178 | echo = 0; 179 | break; 180 | default: 181 | fprintf(stderr, "%s", usagestring); 182 | exit(EXIT_FAILURE); 183 | } 184 | } 185 | if (optind == argc) { 186 | fprintf(stderr, "%s", usagestring); 187 | exit(EXIT_FAILURE); 188 | } 189 
| 190 | infilename = argv[optind]; 191 | 192 | if ((fsrh = fsm_read_binary_file_multiple_init(infilename)) == NULL) { 193 | perror("File error"); 194 | exit(EXIT_FAILURE); 195 | } 196 | chain_head = chain_tail = NULL; 197 | 198 | while ((net = fsm_read_binary_file_multiple(fsrh)) != NULL) { 199 | numnets++; 200 | chain_new = malloc(sizeof(struct lookup_chain)); 201 | if (direction == DIR_DOWN && net->arcs_sorted_in != 1 && sortarcs) { 202 | fsm_sort_arcs(net, 1); 203 | } 204 | if (direction == DIR_UP && net->arcs_sorted_out != 1 && sortarcs) { 205 | fsm_sort_arcs(net, 2); 206 | } 207 | chain_new->net = net; 208 | chain_new->ah = apply_init(net); 209 | if (direction == DIR_DOWN && index_arcs) { 210 | apply_index(chain_new->ah, APPLY_INDEX_INPUT, index_cutoff, index_mem_limit, index_flag_states); 211 | } 212 | if (direction == DIR_UP && index_arcs) { 213 | apply_index(chain_new->ah, APPLY_INDEX_OUTPUT, index_cutoff, index_mem_limit, index_flag_states); 214 | } 215 | 216 | chain_new->next = NULL; 217 | chain_new->prev = NULL; 218 | if (chain_tail == NULL) { 219 | chain_tail = chain_head = chain_new; 220 | } else if (direction == DIR_DOWN || apply_alternates == 1) { 221 | chain_tail->next = chain_new; 222 | chain_new->prev = chain_tail; 223 | chain_tail = chain_new; 224 | } else { 225 | chain_new->next = chain_head; 226 | chain_head->prev = chain_new; 227 | chain_head = chain_new; 228 | } 229 | } 230 | 231 | if (numnets < 1) { 232 | fprintf(stderr, "%s: %s\n", "File error", infilename); 233 | exit(EXIT_FAILURE); 234 | } 235 | 236 | if (mode_server) { 237 | server_init(); 238 | serverstring = calloc(UDP_MAX+1, sizeof(char)); 239 | line = calloc(UDP_MAX+1, sizeof(char)); 240 | addrlen = sizeof(clientaddr); 241 | for (;;) { 242 | numbytes = recvfrom(listen_sd, line, UDP_MAX, 0,(struct sockaddr *)&clientaddr, &addrlen); 243 | if (numbytes == -1) { 244 | perror("recvfrom() failed, aborting"); 245 | break; 246 | } 247 | line[numbytes] = '\0'; 248 | line[strcspn(line, 
"\n\r")] = '\0'; 249 | fflush(stdout); 250 | results = 0; 251 | udpsize = 0; 252 | serverstring[0] = '\0'; 253 | handle_line(line); 254 | if (results == 0) { 255 | app_print(NULL); 256 | } 257 | if (serverstring[0] != '\0') { 258 | numbytes = sendto(listen_sd, serverstring, strlen(serverstring), 0, (struct sockaddr *)&clientaddr, addrlen); 259 | if (numbytes < 0) { 260 | perror("sendto() failed"); fflush(stdout); 261 | } 262 | } 263 | } 264 | } else { 265 | /* Standard read from stdin */ 266 | line = calloc(LINE_LIMIT, sizeof(char)); 267 | INFILE = stdin; 268 | while (get_next_line() != NULL) { 269 | results = 0; 270 | handle_line(line); 271 | if (results == 0) { 272 | app_print(NULL); 273 | } 274 | fprintf(stdout, "%s", wordseparator); 275 | if (!buffered_output) { 276 | fflush(stdout); 277 | } 278 | } 279 | } 280 | /* Cleanup */ 281 | for (chain_pos = chain_head; chain_pos != NULL; chain_pos = chain_head) { 282 | chain_head = chain_pos->next; 283 | if (chain_pos->ah != NULL) { 284 | apply_clear(chain_pos->ah); 285 | } 286 | if (chain_pos->net != NULL) { 287 | fsm_destroy(chain_pos->net); 288 | } 289 | free(chain_pos); 290 | } 291 | if (serverstring != NULL) 292 | free(serverstring); 293 | if (line != NULL) 294 | free(line); 295 | exit(0); 296 | } 297 | 298 | char *get_next_line() { 299 | char *r; 300 | if ((r = fgets(line, LINE_LIMIT, INFILE)) != NULL) { 301 | line[strcspn(line, "\n\r")] = '\0'; 302 | } 303 | return r; 304 | } 305 | 306 | void handle_line(char *s) { 307 | char *result, *tempstr; 308 | /* Apply alternative */ 309 | if (apply_alternates == 1) { 310 | for (chain_pos = chain_head, tempstr = s; ; chain_pos = chain_pos->next) { 311 | result = applyer(chain_pos->ah, tempstr); 312 | if (result != NULL) { 313 | results++; 314 | app_print(result); 315 | while ((result = applyer(chain_pos->ah, NULL)) != NULL) { 316 | results++; 317 | app_print(result); 318 | } 319 | break; 320 | } 321 | if (chain_pos == chain_tail) { 322 | break; 323 | } 324 | } 325 | } 
else { 326 | 327 | /* Get result from chain */ 328 | for (chain_pos = chain_head, tempstr = s; ; chain_pos = chain_pos->next) { 329 | result = applyer(chain_pos->ah, tempstr); 330 | if (result != NULL && chain_pos != chain_tail) { 331 | tempstr = result; 332 | continue; 333 | } 334 | if (result != NULL && chain_pos == chain_tail) { 335 | do { 336 | results++; 337 | app_print(result); 338 | } while ((result = applyer(chain_pos->ah, NULL)) != NULL); 339 | } 340 | if (result == NULL) { 341 | /* Move up */ 342 | for (chain_pos = chain_pos->prev; chain_pos != NULL; chain_pos = chain_pos->prev) { 343 | result = applyer(chain_pos->ah, NULL); 344 | if (result != NULL) { 345 | tempstr = result; 346 | break; 347 | } 348 | } 349 | } 350 | if (chain_pos == NULL) { 351 | break; 352 | } 353 | } 354 | } 355 | } 356 | 357 | void server_init(void) { 358 | unsigned int rcvsize = 262144; 359 | int retval; 360 | char server_address_string[INET_ADDRSTRLEN]; 361 | 362 | if ((listen_sd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) == -1) { 363 | perror("socket() failed"); 364 | exit(1); 365 | } 366 | if (setsockopt(listen_sd, SOL_SOCKET, SO_RCVBUF, (char *) &rcvsize, sizeof(rcvsize)) < 0) { 367 | perror("setsockopt() failed"); 368 | exit(1); 369 | } 370 | if (setsockopt(listen_sd, SOL_SOCKET, SO_SNDBUF, (char *) &rcvsize, sizeof(rcvsize)) < 0) { 371 | perror("setsockopt() failed"); 372 | exit(1); 373 | } 374 | 375 | memset((char *) &serveraddr, 0, sizeof(serveraddr)); 376 | serveraddr.sin_family = AF_INET; 377 | serveraddr.sin_port = htons(port_number); 378 | if (server_address != NULL) { 379 | retval = inet_pton(AF_INET, server_address, &serveraddr.sin_addr.s_addr); 380 | if (retval != 1) { 381 | if (retval == 0) { 382 | printf("inet_pton() failed: string is not a valid address.\n"); 383 | exit(1); 384 | } 385 | perror("inet_pton() failed"); 386 | exit(1); 387 | } 388 | } else { 389 | serveraddr.sin_addr.s_addr = INADDR_ANY; 390 | } 391 | if (bind(listen_sd, (struct sockaddr *) 
&serveraddr, sizeof(serveraddr)) == -1) { 392 | perror("bind() failed"); 393 | exit(1); 394 | } 395 | if (inet_ntop(AF_INET, &serveraddr.sin_addr, server_address_string, INET_ADDRSTRLEN) == NULL) { 396 | perror("inet_ntop() failed"); 397 | exit(1); 398 | } 399 | printf("Started flookup server on %s port %i\n", server_address_string, port_number); fflush(stdout); 400 | } 401 | -------------------------------------------------------------------------------- /foma/sigma.c: -------------------------------------------------------------------------------- 1 | /* Foma: a finite-state toolkit and library. */ 2 | /* Copyright © 2008-2021 Mans Hulden */ 3 | 4 | /* This file is part of foma. */ 5 | 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ 7 | /* you may not use this file except in compliance with the License. */ 8 | /* You may obtain a copy of the License at */ 9 | 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ 11 | 12 | /* Unless required by applicable law or agreed to in writing, software */ 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ 15 | /* See the License for the specific language governing permissions and */ 16 | /* limitations under the License. 
*/ 17 | 18 | #include 19 | #include 20 | #include "foma.h" 21 | 22 | struct sigma *sigma_remove(char *symbol, struct sigma *sigma) { 23 | struct sigma *sigma_start, *sigma_prev = NULL; 24 | sigma_prev = NULL; 25 | sigma_start = sigma; 26 | for ( ; sigma != NULL && sigma->number != -1; sigma_prev = sigma, sigma=sigma->next) { 27 | if (strcmp(sigma->symbol,symbol) == 0) { 28 | if (sigma_prev == NULL) { 29 | sigma_start = sigma->next; 30 | free(sigma->symbol); 31 | free(sigma); 32 | } else { 33 | (sigma_prev)->next = sigma->next; 34 | free(sigma->symbol); 35 | free(sigma); 36 | } 37 | break; 38 | } 39 | } 40 | return(sigma_start); 41 | } 42 | 43 | struct sigma *sigma_remove_num(int num, struct sigma *sigma) { 44 | struct sigma *sigma_start, *sigma_prev = NULL; 45 | sigma_prev = NULL; 46 | sigma_start = sigma; 47 | for ( ; sigma != NULL && sigma->number != -1; sigma_prev = sigma, sigma=sigma->next) { 48 | if (sigma->number == num) { 49 | if (sigma_prev == NULL) { 50 | sigma_start = sigma->next; 51 | free(sigma->symbol); 52 | free(sigma); 53 | } else { 54 | (sigma_prev)->next = sigma->next; 55 | free(sigma->symbol); 56 | free(sigma); 57 | } 58 | break; 59 | } 60 | } 61 | return(sigma_start); 62 | } 63 | 64 | int sigma_add_special (int symbol, struct sigma *sigma) { 65 | struct sigma *sigma_previous = NULL, *sigma_splice = NULL; 66 | char *str = NULL; 67 | if (symbol == EPSILON) 68 | str = strdup("@_EPSILON_SYMBOL_@"); 69 | if (symbol == IDENTITY) 70 | str = strdup("@_IDENTITY_SYMBOL_@"); 71 | if (symbol == UNKNOWN) 72 | str = strdup("@_UNKNOWN_SYMBOL_@"); 73 | 74 | /* Insert special symbols pre-sorted */ 75 | if (sigma->number == -1) { 76 | sigma->number = symbol; 77 | } else { 78 | for (;(sigma != NULL) && (sigma->number < symbol) && (sigma->number!=-1); sigma_previous=sigma,sigma = sigma->next) { 79 | } 80 | sigma_splice = malloc(sizeof(struct sigma)); 81 | if (sigma_previous != NULL) { 82 | (sigma_previous)->next = sigma_splice; 83 | sigma_splice->number = symbol; 84 
| sigma_splice->symbol = str; 85 | (sigma_splice)->next = sigma; 86 | return(symbol); 87 | } else { 88 | sigma_splice->symbol = sigma->symbol; 89 | sigma_splice->number = sigma->number; 90 | sigma_splice->next = sigma->next; 91 | sigma->number = symbol; 92 | sigma->symbol = str; 93 | sigma->next = sigma_splice; 94 | return(symbol); 95 | } 96 | } 97 | sigma->next = NULL; 98 | sigma->symbol = str; 99 | return(symbol); 100 | } 101 | 102 | /* WARNING: this function will indeed add a symbol to sigma */ 103 | /* but it's up to the user to sort the sigma (affecting arc numbers in the network) */ 104 | /* before merge_sigma() is ever called */ 105 | 106 | int sigma_add (char *symbol, struct sigma *sigma) { 107 | int assert = -1; 108 | struct sigma *sigma_previous = NULL, *sigma_splice = NULL; 109 | 110 | /* Special characters */ 111 | if (strcmp(symbol, "@_EPSILON_SYMBOL_@") == 0) 112 | assert = EPSILON; 113 | if (strcmp(symbol,"@_IDENTITY_SYMBOL_@") == 0) 114 | assert = IDENTITY; 115 | if (strcmp(symbol,"@_UNKNOWN_SYMBOL_@") == 0) 116 | assert = UNKNOWN; 117 | 118 | /* Insert non-special in any order */ 119 | if (assert == -1) { 120 | if (sigma->number == -1) { 121 | sigma->number = 3; 122 | } else { 123 | for (; sigma->next != NULL; sigma = sigma->next) { 124 | } 125 | sigma->next = malloc(sizeof(struct sigma)); 126 | if ((sigma->number)+1 < 3) { 127 | (sigma->next)->number = 3; 128 | } else { 129 | (sigma->next)->number = (sigma->number)+1; 130 | } 131 | sigma = sigma->next; 132 | } 133 | sigma->next = NULL; 134 | sigma->symbol = strdup(symbol); 135 | return(sigma->number); 136 | } else { 137 | /* Insert special symbols pre-sorted */ 138 | if (sigma->number == -1) { 139 | sigma->number = assert; 140 | } else { 141 | for (;(sigma != NULL) && (sigma->number < assert) && (sigma->number!=-1); sigma_previous=sigma,sigma = sigma->next) { 142 | } 143 | sigma_splice = malloc(sizeof(struct sigma)); 144 | if (sigma_previous != NULL) { 145 | (sigma_previous)->next = sigma_splice; 
146 | sigma_splice->number = assert; 147 | sigma_splice->symbol = malloc(sizeof(char)*(strlen(symbol)+1)); 148 | strcpy(sigma_splice->symbol, symbol); 149 | (sigma_splice)->next = sigma; 150 | return(assert); 151 | } else { 152 | sigma_splice->symbol = sigma->symbol; 153 | sigma_splice->number = sigma->number; 154 | sigma_splice->next = sigma->next; 155 | sigma->number = assert; 156 | sigma->symbol = malloc(sizeof(char)*(strlen(symbol)+1)); 157 | strcpy(sigma->symbol, symbol); 158 | sigma->next = sigma_splice; 159 | return(assert); 160 | } 161 | } 162 | sigma->next = NULL; 163 | sigma->symbol = strdup(symbol); 164 | return(assert); 165 | } 166 | } 167 | 168 | /* Remove symbols that are never used from sigma and renumber */ 169 | /* The variable force controls whether to remove even though */ 170 | /* @ or ? is present */ 171 | /* If force == 1, unused symbols are always removed regardless */ 172 | 173 | void sigma_cleanup (struct fsm *net, int force) { 174 | int i,j,first,maxsigma,*attested; 175 | struct fsm_state *fsm; 176 | struct sigma *sig, *sig_prev, *sign; 177 | 178 | if (force == 0) { 179 | if (sigma_find_number(IDENTITY, net->sigma) != -1) 180 | return; 181 | if (sigma_find_number(UNKNOWN, net->sigma) != -1) 182 | return; 183 | } 184 | 185 | maxsigma = sigma_max(net->sigma); 186 | if (maxsigma < 0) { return; } 187 | attested = malloc(sizeof(int)*(maxsigma+1)); 188 | for (i=0; i<=maxsigma; i++) 189 | *(attested+i) = 0; 190 | fsm = net->states; 191 | for (i=0; (fsm+i)->state_no != -1; i++) { 192 | if ((fsm+i)->in >=0) 193 | *(attested+(fsm+i)->in) = 1; 194 | if ((fsm+i)->out >=0) 195 | *(attested+(fsm+i)->out) = 1; 196 | } 197 | for (i=3,j=3; i<=maxsigma;i++ ) { 198 | if (*(attested+i)) { 199 | *(attested+i) = j; 200 | j++; 201 | } 202 | } 203 | for (i=0; (fsm+i)->state_no != -1; i++) { 204 | if ((fsm+i)->in > 2) 205 | (fsm+i)->in = *(attested+(fsm+i)->in); 206 | if ((fsm+i)->out > 2) 207 | (fsm+i)->out = *(attested+(fsm+i)->out); 208 | } 209 | sig_prev = 
NULL; 210 | for (sig = net->sigma; sig != NULL && sig->number != -1; sig = sign) { 211 | first = 1; 212 | sign = sig->next; 213 | if (!*(attested+(sig->number))) { 214 | free(sig->symbol); 215 | free(sig); 216 | if (sig_prev != NULL) { 217 | sig_prev->next = sign; 218 | first = 0; 219 | } else { 220 | first = 0; 221 | net->sigma = sign; 222 | } 223 | } else { 224 | sig->number = sig->number >= 3 ? *(attested+(sig->number)) : sig->number; 225 | } 226 | if (first) 227 | sig_prev = sig; 228 | } 229 | free(attested); 230 | return; 231 | } 232 | 233 | int sigma_max(struct sigma *sigma) { 234 | int i; 235 | if (sigma == NULL) 236 | return -1; 237 | for (i=-1; sigma != NULL; sigma = sigma->next) 238 | i = sigma->number > i ? sigma->number : i; 239 | return(i); 240 | } 241 | 242 | int sigma_size(struct sigma *sigma) { 243 | int i; 244 | for(i=0; sigma != NULL; sigma = sigma->next) 245 | i++; 246 | return(i); 247 | } 248 | 249 | struct fsm_sigma_list *sigma_to_list(struct sigma *sigma) { 250 | struct fsm_sigma_list *sl; 251 | struct sigma *s; 252 | sl = calloc(sigma_max(sigma)+1,sizeof(struct fsm_sigma_list)); 253 | for (s = sigma; s != NULL && s->number != -1; s = s->next) { 254 | (sl+(s->number))->symbol = s->symbol; 255 | } 256 | return sl; 257 | } 258 | 259 | int sigma_add_number(struct sigma *sigma, char *symbol, int number) { 260 | struct sigma *newsigma, *prev_sigma; 261 | prev_sigma = NULL; 262 | if (sigma->number == -1) { 263 | sigma->symbol = strdup(symbol); 264 | sigma->number = number; 265 | sigma->next = NULL; 266 | return(1); 267 | } 268 | for (newsigma = sigma; newsigma != NULL; newsigma = newsigma->next) { 269 | prev_sigma = newsigma; 270 | } 271 | newsigma = malloc(sizeof(struct sigma)); 272 | newsigma->symbol = strdup(symbol); 273 | newsigma->number = number; 274 | newsigma->next = NULL; 275 | prev_sigma->next = newsigma; 276 | return(1); 277 | } 278 | 279 | int sigma_find_number(int number, struct sigma *sigma) { 280 | if (sigma == NULL) 281 | return -1; 
282 | if (sigma->number == -1) { 283 | return -1; 284 | } 285 | /* for (;(sigma != NULL) && (sigma->number <= number); sigma = sigma->next) { */ 286 | for (;(sigma != NULL) && (sigma->number != -1); sigma = sigma->next) { 287 | if (number == sigma->number) { 288 | return (sigma->number); 289 | } 290 | } 291 | return -1; 292 | } 293 | char *sigma_string(int number, struct sigma *sigma) { 294 | if (sigma == NULL) 295 | return NULL; 296 | if (sigma->number == -1) { 297 | return NULL; 298 | } 299 | for (;(sigma != NULL) && (sigma->number != -1); sigma = sigma->next) { 300 | if (number == sigma->number) { 301 | return (sigma->symbol); 302 | } 303 | } 304 | return NULL; 305 | } 306 | 307 | /* Substitutes string symbol for sub in sigma */ 308 | /* no check for duplicates */ 309 | int sigma_substitute(char *symbol, char *sub, struct sigma *sigma) { 310 | if (sigma->number == -1) { 311 | return -1; 312 | } 313 | for (; sigma != NULL && sigma->number != -1 ; sigma = sigma->next) { 314 | if (strcmp(sigma->symbol, symbol) == 0) { 315 | free(sigma->symbol); 316 | sigma->symbol = strdup(sub); 317 | return(sigma->number); 318 | } 319 | } 320 | return -1; 321 | } 322 | 323 | int sigma_find(char *symbol, struct sigma *sigma) { 324 | 325 | if (sigma == NULL || sigma->number == -1) { 326 | return -1; 327 | } 328 | for (; sigma != NULL && sigma->number != -1 ; sigma = sigma->next) { 329 | if (strcmp(sigma->symbol, symbol) == 0) { 330 | return (sigma->number); 331 | } 332 | } 333 | return -1; 334 | } 335 | 336 | struct ssort { 337 | char *symbol; 338 | int number; 339 | }; 340 | 341 | int ssortcmp(const void *_a, const void *_b) { 342 | const struct ssort *a = _a; 343 | const struct ssort *b = _b; 344 | return(strcmp(a->symbol, b->symbol)); 345 | } 346 | 347 | struct sigma *sigma_copy(struct sigma *sigma) { 348 | int f = 0; 349 | struct sigma *copy_sigma, *copy_sigma_s; 350 | 351 | if (sigma == NULL) { return NULL; } 352 | copy_sigma_s = malloc(sizeof(struct sigma)); 353 | 354 | for 
(copy_sigma = copy_sigma_s; sigma != NULL; sigma=sigma->next) { 355 | if (f == 1) { 356 | copy_sigma->next = malloc(sizeof(struct sigma)); 357 | copy_sigma = copy_sigma->next; 358 | } 359 | copy_sigma->number = sigma->number; 360 | if (sigma->symbol != NULL) 361 | copy_sigma->symbol = strdup(sigma->symbol); 362 | else 363 | copy_sigma->symbol = NULL; 364 | copy_sigma->next = NULL; 365 | f = 1; 366 | } 367 | return(copy_sigma_s); 368 | } 369 | 370 | /* Assigns a consecutive numbering to symbols in sigma > IDENTITY */ 371 | /* and sorts the sigma based on the symbol string contents */ 372 | 373 | int sigma_sort(struct fsm *net) { 374 | int(*comp)(const void*,const void*) = ssortcmp; 375 | int size, i, max, *replacearray; 376 | struct ssort *ssort; 377 | struct sigma *sigma; 378 | struct fsm_state *fsm_state; 379 | 380 | size = sigma_max(net->sigma); 381 | if (size < 0) { return 1; } 382 | ssort = malloc(sizeof(struct ssort)*size); 383 | 384 | for (i=0, sigma=net->sigma; sigma != NULL; sigma=sigma->next) { 385 | if (sigma->number > IDENTITY) { 386 | ssort[i].symbol = (char *)sigma->symbol; 387 | ssort[i].number = sigma->number; 388 | i++; 389 | } 390 | } 391 | max = i; 392 | qsort(ssort, max, sizeof(struct ssort), comp); 393 | replacearray = malloc(sizeof(int)*(size+3)); 394 | for (i=0; i<max; i++) 395 | replacearray[(ssort+i)->number] = i+3; 396 | 397 | /* Replace arcs */ 398 | for(i=0, fsm_state = net->states; (fsm_state+i)->state_no != -1; i++) { 399 | if ((fsm_state+i)->in > IDENTITY) 400 | (fsm_state+i)->in = replacearray[(fsm_state+i)->in]; 401 | if ((fsm_state+i)->out > IDENTITY) 402 | (fsm_state+i)->out = replacearray[(fsm_state+i)->out]; 403 | } 404 | /* Replace sigma */ 405 | for (i=0, sigma=net->sigma; sigma != NULL; sigma=sigma->next) { 406 | if (sigma->number > IDENTITY) { 407 | sigma->number = i+3; 408 | sigma->symbol = (ssort+i)->symbol; 409 | i++; 410 | } 411 | } 412 | free(replacearray); 413 | free(ssort); 414 | return(1); 415 | } 416 | 417 | struct sigma *sigma_create() { 418 | struct sigma
*sigma; 419 | sigma = malloc(sizeof(struct sigma)); 420 | sigma->number = -1; /*Empty sigma*/ 421 | sigma->next = NULL; 422 | sigma->symbol = NULL; 423 | return(sigma); 424 | } 425 | --------------------------------------------------------------------------------