├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── CMakeModules ├── AddSources.cmake ├── CTAGS.cmake ├── CodeCoverage.cmake ├── FindGLIB2.cmake ├── FindGfsm.cmake ├── FindICU.cmake └── cpplint.cmake ├── COPYING ├── COPYING.LESSER ├── Dockerfile ├── Doxyfile.in ├── GIT-VERSION-GEN ├── MinGW.toolchain ├── README.md ├── doc ├── PythonBindings.md ├── UserGuide.md ├── acrh-paper.pdf └── example │ ├── bible_lexicon.fsm │ ├── bible_lexicon.sym │ ├── example.RuleBased.rulesfile │ ├── example.cfg │ ├── example_chain.cfg │ ├── fnhd_sample.txt │ ├── fnhd_train.Mapper.mapfile │ ├── fnhd_train.RuleBased.rulesfile │ ├── fnhd_train.WLD.paramfile │ ├── fnhd_train.txt │ └── normalize.py └── src ├── CMakeLists.txt ├── config.h.in ├── cycle.cpp ├── cycle.h ├── defines.h.in ├── gfsm ├── CMakeLists.txt ├── acceptor.cpp ├── acceptor.h ├── alphabet.cpp ├── alphabet.h ├── automaton.cpp ├── automaton.h ├── cascade.cpp ├── cascade.h ├── implode_explode.cpp ├── implode_explode.h ├── labelvector.cpp ├── labelvector.h ├── path.h ├── semiring.h ├── string_acceptor.cpp ├── string_acceptor.h ├── string_cascade.cpp ├── string_cascade.h ├── string_transducer.cpp ├── string_transducer.h ├── transducer.cpp └── transducer.h ├── gfsm_wrapper.h ├── gfsmlibs.h ├── interface.h ├── interface ├── CMakeLists.txt ├── input.cpp ├── input.h ├── iobase.h ├── output.cpp └── output.h ├── lexicon ├── CMakeLists.txt ├── lexicon.cpp ├── lexicon.h ├── lexicon_interface.h └── lexicon_main.cpp ├── main.cpp ├── norma.h ├── normalizer ├── CMakeLists.txt ├── base.h ├── cacheable.cpp ├── cacheable.h ├── exceptions.h ├── external.h ├── external │ ├── CMakeLists.txt │ ├── external.cpp │ └── external.h ├── mapper.h ├── mapper │ ├── CMakeLists.txt │ ├── mapper.cpp │ └── mapper.h ├── result.cpp ├── result.h ├── rulebased.h ├── rulebased │ ├── CMakeLists.txt │ ├── candidate_finder.cpp │ ├── candidate_finder.h │ ├── rule.cpp │ ├── rule.h │ ├── rule_collection.cpp │ ├── rule_collection.h │ ├── rule_learn.cpp │ ├── rule_learn.h │ 
├── rulebased.cpp │ ├── rulebased.h │ └── symbols.h ├── wld.h └── wld │ ├── CMakeLists.txt │ ├── levenshtein_algorithm.cpp │ ├── levenshtein_algorithm.h │ ├── levenshtein_aligner.cpp │ ├── levenshtein_aligner.h │ ├── symbols.cpp │ ├── symbols.h │ ├── typedefs.h │ ├── weight_set.cpp │ ├── weight_set.h │ ├── wld.cpp │ └── wld.h ├── pluginsocket.cpp ├── pluginsocket.h ├── python ├── CMakeLists.txt ├── exception_wrapper.cpp ├── exception_wrapper.h ├── lexicon_wrapper.cpp ├── lexicon_wrapper.h ├── norma.cpp ├── norma │ ├── CMakeLists.txt │ ├── ChainNormalizer.py │ ├── LexiconWrapper.py │ ├── NormaCfgParser.py │ ├── NormalizerWrapper.py │ └── __init__.py ├── normalizer │ ├── mapper.cpp │ ├── normalizer_wrapper.h │ ├── rulebased.cpp │ └── wld.cpp ├── result_conv.cpp ├── result_conv.h ├── setup.py.in ├── string_impl_conv.cpp ├── string_impl_conv.h ├── training_conv.cpp └── training_conv.h ├── regex_impl.h ├── results_queue-inl.h ├── results_queue.h ├── string_impl.cpp ├── string_impl.h ├── tests ├── CMakeLists.txt ├── data │ ├── fileinput.txt │ ├── normalize.py │ ├── test-lexicon.gfsa │ ├── test-lexicon.gfsa.old │ ├── test-lexicon.lab │ ├── test-lexicon.lab.old │ ├── test-lexicon.txt │ ├── test-mapfile-malformed.txt │ ├── test-mapfile.txt │ ├── test-rulesfile-malformed.txt │ ├── test-rulesfile.txt │ ├── test-weights-malformed.txt │ └── test-weights.txt ├── gfsm_wrapper.cpp ├── interface_test.cpp ├── normalizer │ ├── CMakeLists.txt │ ├── external_test.cpp │ ├── lexicon_test.cpp │ ├── mapper.cpp │ ├── mock_lexicon.h │ ├── rulebased_test.cpp │ ├── wld_optimization.cpp │ ├── wld_optimization.h │ └── wld_test.cpp ├── python │ ├── CMakeLists.txt │ ├── py-norma.py.in │ ├── test_base.py │ ├── test_chain.py │ ├── test_lexicon.py │ ├── test_lexicon_icu.py │ ├── test_lexicon_std.py │ ├── test_mapper.py │ ├── test_result.py │ ├── test_rulebased.py │ └── test_wld.py ├── tests.h └── training_data.cpp ├── training_data-inl.h ├── training_data.cpp └── training_data.h /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *.log 4 | *.swp 5 | tags 6 | cscope.out 7 | make_build.sh 8 | old_python_stuff/ 9 | build/ 10 | .ycm_extra_conf.py 11 | 12 | /python/norma/#CMakeLists.txt# 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: required 3 | 4 | language: cpp 5 | 6 | matrix: 7 | include: 8 | - compiler: gcc 9 | addons: 10 | apt: 11 | sources: 12 | - ubuntu-toolchain-r-test 13 | packages: 14 | - g++-4.9 15 | - gcc-4.9 16 | env: TRAVIS_CXX=g++-4.9 TRAVIS_CC=gcc-4.9 STRINGS=STD COVERAGE=FALSE 17 | - compiler: gcc 18 | addons: 19 | apt: 20 | sources: 21 | - ubuntu-toolchain-r-test 22 | packages: 23 | - g++-4.9 24 | - gcc-4.9 25 | env: TRAVIS_CXX=g++-4.9 TRAVIS_CC=gcc-4.9 STRINGS=ICU COVERAGE=FALSE 26 | - compiler: gcc 27 | addons: 28 | apt: 29 | sources: 30 | - ubuntu-toolchain-r-test 31 | packages: 32 | - g++-5 33 | - gcc-5 34 | env: TRAVIS_CXX=g++-5 TRAVIS_CC=gcc-5 STRINGS=STD COVERAGE=TRUE 35 | - compiler: gcc 36 | addons: 37 | apt: 38 | sources: 39 | - ubuntu-toolchain-r-test 40 | packages: 41 | - g++-5 42 | - gcc-5 43 | env: TRAVIS_CXX=g++-5 TRAVIS_CC=gcc-5 STRINGS=ICU COVERAGE=TRUE 44 | 45 | before_install: 46 | - sudo apt-get update -qq 47 | - sudo apt-get install -y libboost-all-dev glib2.0-dev 48 | 49 | install: 50 | - DEPS_DIR="${TRAVIS_BUILD_DIR}/deps" 51 | - mkdir ${DEPS_DIR} && cd ${DEPS_DIR} 52 | - GFSM_SRC="http://kaskade.dwds.de/~moocow/mirror/projects/gfsm" 53 | - | 54 | for pkg in gfsm-0.0.16-1 gfsmxl-0.0.15; do 55 | curl -O ${GFSM_SRC}/${pkg}.tar.gz 56 | tar xzf ${pkg}.tar.gz 57 | pushd ${pkg} 58 | CC=${TRAVIS_CC} CXX=${TRAVIS_CXX} ./configure && make && sudo make install 59 | popd 60 | done 61 | - | 62 | if [[ ${COVERAGE} == TRUE ]]; then 63 | curl -O 
http://ftp.debian.org/debian/pool/main/l/lcov/lcov_1.13.orig.tar.gz 64 | tar xzf lcov_1.13.orig.tar.gz 65 | pushd lcov-1.13/ 66 | make && sudo make install 67 | popd 68 | fi 69 | - cd .. 70 | 71 | script: 72 | - mkdir build && cd build 73 | - cmake .. -DSTRING_IMPL=${STRINGS} -DWITH_PYTHON=TRUE -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=${TRAVIS_CXX} -DCMAKE_C_COMPILER=${TRAVIS_CC} -DWITH_COVERAGE=${COVERAGE} 74 | - make VERBOSE=1 && make buildtests && make check && sudo make install 75 | 76 | after_script: 77 | - | 78 | if [[ ${COVERAGE} == TRUE ]]; then 79 | make coverage VERBOSE=1 80 | bash <(curl -s https://codecov.io/bash) -X gcov -f ${TRAVIS_BUILD_DIR}/build/coverage.info 81 | fi 82 | -------------------------------------------------------------------------------- /CMakeModules/AddSources.cmake: -------------------------------------------------------------------------------- 1 | macro (add_sources) 2 | file(RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") 3 | foreach(_src ${ARGN}) 4 | if(_relPath) 5 | list (APPEND SRCS "${_relPath}/${_src}") 6 | else() 7 | list (APPEND SRCS "${_src}") 8 | endif() 9 | endforeach() 10 | if(_relPath) 11 | set(SRCS ${SRCS} PARENT_SCOPE) 12 | endif() 13 | endmacro() 14 | 15 | macro (install_headers) 16 | file(RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}") 17 | install(FILES ${ARGN} DESTINATION "${CMAKE_INSTALL_PREFIX}/include/${CMAKE_PROJECT_NAME}/${_relPath}") 18 | endmacro() 19 | 20 | -------------------------------------------------------------------------------- /CMakeModules/CTAGS.cmake: -------------------------------------------------------------------------------- 1 | # CTAGS/CSCOPE 2 | # 2013-2014 Florian Petran 3 | # add a tags target that generates ctags/cscope and is also triggered by 4 | # all target 5 | # currently assumes *nix system, and ctags/cscope are in path but failure 6 | # is non fatal 7 | find_program(CTAGS ctags) 8 | find_program(CSCOPE cscope) 
9 | if(NOT CTAGS STREQUAL "CTAGS-NOTFOUND") 10 | add_custom_target(ctags 11 | COMMAND ${CTAGS} --fields=+iaS --extra=+q -R ${CMAKE_SOURCE_DIR}/ 12 | COMMAND cp tags ${CMAKE_SOURCE_DIR}/ 13 | ) 14 | else() 15 | message(STATUS "ctags not found, skipping ctags generation.") 16 | set(CTAGS false) 17 | endif() 18 | if(NOT CSCOPE STREQUAL "CSCOPE-NOTFOUND") 19 | file(GLOB_RECURSE ALL_FILES ${CMAKE_SOURCE_DIR} *.cpp *.h) # source list handed to the cscope command below 20 | add_custom_target(cscope 21 | COMMAND ${CSCOPE} -bR ${ALL_FILES} 22 | COMMAND cp cscope.out ${CMAKE_SOURCE_DIR}/ 23 | ) 24 | else() 25 | message(STATUS "cscope not found, skipping cscope generation.") 26 | set(CSCOPE false) 27 | endif() 28 | if(CTAGS AND CSCOPE) 29 | add_custom_target(tags ALL 30 | COMMAND make ctags 31 | COMMAND make cscope 32 | COMMENT "Generating ctags..." 33 | ) 34 | endif() 35 | 36 | -------------------------------------------------------------------------------- /CMakeModules/FindGLIB2.cmake: -------------------------------------------------------------------------------- 1 | # FindGLIB2.cmake shipped with kdelibs-4.17.4 2 | # modified to remove the check for pkg-config since that is done 3 | # in the main CMakeLists.txt 4 | # 5 | # - Try to find the GLIB2 libraries 6 | # Once done this will define 7 | # 8 | # GLIB2_FOUND - system has glib2 9 | # GLIB2_INCLUDE_DIR - the glib2 include directory 10 | # GLIB2_LIBRARIES - glib2 library 11 | 12 | # Copyright (c) 2008 Laurent Montel, 13 | # 14 | # Redistribution and use is allowed according to the terms of the BSD license. 15 | # 16 | # Redistribution and use in source and binary forms, with or without 17 | # modification, are permitted provided that the following conditions 18 | # are met: 19 | # 20 | # 1. Redistributions of source code must retain the copyright 21 | # notice, this list of conditions and the following disclaimer. 22 | # 2. 
Redistributions in binary form must reproduce the copyright 23 | # notice, this list of conditions and the following disclaimer in the 24 | # documentation and/or other materials provided with the distribution. 25 | # 3. The name of the author may not be used to endorse or promote products 26 | # derived from this software without specific prior written permission. 27 | # 28 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 | # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 | # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 33 | # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 37 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 | 39 | if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) 40 | # Already in cache, be silent 41 | set(GLIB2_FIND_QUIETLY TRUE) 42 | endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) 43 | 44 | pkg_check_modules(PC_LibGLIB2 QUIET glib-2.0) 45 | 46 | find_path(GLIB2_MAIN_INCLUDE_DIR 47 | NAMES glib.h 48 | HINTS ${PC_LibGLIB2_INCLUDEDIR} 49 | PATH_SUFFIXES glib-2.0) 50 | 51 | find_library(GLIB2_LIBRARY 52 | NAMES glib-2.0 53 | HINTS ${PC_LibGLIB2_LIBDIR} 54 | ) 55 | 56 | set(GLIB2_LIBRARIES ${GLIB2_LIBRARY}) 57 | 58 | # search the glibconfig.h include dir under the same root where the library is found 59 | get_filename_component(glib2LibDir "${GLIB2_LIBRARIES}" PATH) 60 | 61 | find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h 62 | PATH_SUFFIXES glib-2.0/include 63 | HINTS ${PC_LibGLIB2_INCLUDEDIR} "${glib2LibDir}" ${CMAKE_SYSTEM_LIBRARY_PATH}) 64 | 65 | set(GLIB2_INCLUDE_DIR "${GLIB2_MAIN_INCLUDE_DIR}") 66 | 67 | # not sure if this include dir is optional or required 68 | # for now it is optional 69 | if(GLIB2_INTERNAL_INCLUDE_DIR) 70 | set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} "${GLIB2_INTERNAL_INCLUDE_DIR}") 71 | endif(GLIB2_INTERNAL_INCLUDE_DIR) 72 | 73 | include(FindPackageHandleStandardArgs) 74 | find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR) 75 | 76 | mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES) 77 | -------------------------------------------------------------------------------- /CMakeModules/FindGfsm.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Marcel Bollmann, 2014 3 | # 4 | # Find gfsm & gfsmxl libraries by Bryan Jurish 5 | # 6 | # 7 | # This CMake file was created mostly by copying from: 8 | # 9 | # 10 | # It is likely to contain errors/bugs. 
11 | # 12 | 13 | pkg_check_modules(PC_LIBGFSM REQUIRED gfsm>=0.0.11) 14 | pkg_check_modules(PC_LIBGFSMXL REQUIRED gfsmxl>=0.0.11) 15 | set(LIBGFSM_DEFINITIONS ${PC_LIBGFSM_CFLAGS_OTHER} ) 16 | 17 | find_path(LIBGFSM_INCLUDE_DIR gfsm.h 18 | HINTS ${PC_LIBGFSM_INCLUDEDIR} ${PC_LIBGFSM_INCLUDE_DIRS} 19 | PATH_SUFFIXES gfsm ) 20 | find_path(LIBGFSMXL_INCLUDE_DIR gfsmxl.h 21 | HINTS ${PC_LIBGFSMXL_INCLUDEDIR} ${PC_LIBGFSMXL_INCLUDE_DIRS} 22 | PATH_SUFFIXES gfsmxl ) 23 | 24 | find_library(LIBGFSM_LIBRARY NAMES gfsm 25 | HINTS ${PC_LIBGFSM_LIBDIR} ${PC_LIBGFSM_LIBRARY_DIRS} ) 26 | find_library(LIBGFSMXL_LIBRARY NAMES gfsmxl 27 | HINTS ${PC_LIBGFSMXL_LIBDIR} ${PC_LIBGFSMXL_LIBRARY_DIRS} ) 28 | # FindGLIB2.cmake exports GLIB2_INCLUDE_DIR (singular); GLIB2_INCLUDE_DIRS is kept as well in case the top-level project defines it 29 | set(LIBGFSM_LIBRARIES ${LIBGFSM_LIBRARY} ${LIBGFSMXL_LIBRARY} ${GLIB2_LIBRARIES} ) 30 | set(LIBGFSM_INCLUDE_DIRS ${LIBGFSM_INCLUDE_DIR} ${LIBGFSMXL_INCLUDE_DIR} ${GLIB2_INCLUDE_DIR} ${GLIB2_INCLUDE_DIRS} ) 31 | include(FindPackageHandleStandardArgs) 32 | find_package_handle_standard_args(libgfsm DEFAULT_MSG LIBGFSM_LIBRARY LIBGFSM_INCLUDE_DIR) 33 | find_package_handle_standard_args(libgfsmxl DEFAULT_MSG LIBGFSMXL_LIBRARY LIBGFSMXL_INCLUDE_DIR) 34 | mark_as_advanced(LIBGFSM_INCLUDE_DIR LIBGFSM_LIBRARY) 35 | mark_as_advanced(LIBGFSMXL_INCLUDE_DIR LIBGFSMXL_LIBRARY) 36 | -------------------------------------------------------------------------------- /CMakeModules/cpplint.cmake: -------------------------------------------------------------------------------- 1 | # cpplint module 2 | # 2013-2014 Florian Petran 3 | # adds a custom lint target for style checking 4 | # also tries to download cpplint.py if it is not found 5 | # dependencies: 6 | # - needs python (for running cpplint.py) 7 | # - needs curl (for download) 8 | # downloading also assumes *nix system (chmod) but failure is non fatal 9 | 10 | find_program(CPPLINT cpplint.py) 11 | find_program(PYTHON python) 12 | if(CPPLINT STREQUAL "CPPLINT-NOTFOUND") 13 | find_package(CURL) 14 | if(CURL_FOUND) 15 | exec_program(curl 16 | ARGS 
"https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py -O") 17 | exec_program(chmod 18 | ARGS "+x ${CMAKE_CURRENT_BINARY_DIR}/cpplint.py") 19 | set(CPPLINT "${CMAKE_CURRENT_BINARY_DIR}/cpplint.py") 20 | else() 21 | set(CPPLINT "CPPLINT-NOTFOUND") 22 | endif() 23 | endif() 24 | if(NOT CPPLINT STREQUAL "CPPLINT-NOTFOUND" AND NOT PYTHON STREQUAL "PYTHON-NOTFOUND") 25 | set(LINT_FILTER "--filter=-readability/streams,-build/header_guard,-build/c++11,-build/include") 26 | file(GLOB_RECURSE ALLFILES ${CMAKE_SOURCE_DIR} *.cpp *.h) 27 | add_custom_target(lint 28 | COMMENT "Checking C++ style conformity..." 29 | COMMAND ${PYTHON} ${CPPLINT} ${LINT_FILTER} ${ALLFILES} 30 | ) 31 | else() 32 | message(STATUS "C++ style checker not found, skipping lint target.") 33 | endif() 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:xenial 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | build-essential \ 5 | cmake \ 6 | libboost-filesystem-dev \ 7 | libboost-program-options-dev \ 8 | libboost-regex-dev \ 9 | libboost-system-dev \ 10 | libboost-test-dev \ 11 | libc6 \ 12 | libgcc1 \ 13 | libglib2.0-dev \ 14 | libicu-dev \ 15 | pkg-config \ 16 | wget \ 17 | zlibc \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | ################################################################################ 21 | # Download and build GFSM libraries 22 | # (required for Norma, and not available through repositories) 23 | 24 | WORKDIR /tmp/gfsm 25 | RUN wget http://kaskade.dwds.de/~moocow/mirror/projects/gfsm/gfsm-0.0.19-1.tar.gz 26 | RUN tar xzf gfsm-0.0.19-1.tar.gz 27 | WORKDIR /tmp/gfsm/gfsm-0.0.19-1 28 | RUN sh ./configure --prefix=/usr && make && make install 29 | 30 | WORKDIR /tmp/gfsm 31 | RUN wget http://kaskade.dwds.de/~moocow/mirror/projects/gfsm/gfsmxl-0.0.17.tar.gz 32 | RUN tar xzf gfsmxl-0.0.17.tar.gz 33 | WORKDIR 
/tmp/gfsm/gfsmxl-0.0.17 34 | RUN sh ./configure --prefix=/usr && make && make install 35 | 36 | RUN rm -rf /tmp/gfsm 37 | 38 | ################################################################################ 39 | 40 | ################################################################################ 41 | # Build Norma 42 | 43 | WORKDIR /tmp/norma 44 | COPY . . 45 | WORKDIR /tmp/norma/build_docker 46 | RUN cmake /tmp/norma \ 47 | -DSTRING_IMPL=ICU \ 48 | -DCMAKE_BUILD_TYPE=Release \ 49 | -DCMAKE_INSTALL_PREFIX=/usr \ 50 | && make \ 51 | && make test \ 52 | && make install 53 | 54 | RUN rm -rf /tmp/norma 55 | 56 | ################################################################################ 57 | 58 | WORKDIR /home 59 | ENV LANG=en_US.UTF-8 60 | # CMD only supplies the default arguments appended to ENTRYPOINT; it must not repeat the binary 61 | ENTRYPOINT ["/usr/bin/normalize"] 62 | CMD ["--help"] 63 | -------------------------------------------------------------------------------- /Doxyfile.in: -------------------------------------------------------------------------------- 1 | INPUT = @CMAKE_CURRENT_SOURCE_DIR@/ 2 | USE_MDFILE_AS_MAINPAGE = README.md 3 | PROJECT_NAME = @CMAKE_PROJECT_NAME@ 4 | PROJECT_NUMBER = @PROJECT_VERSION@ 5 | RECURSIVE = YES 6 | EXCLUDE_PATTERNS = */tests/* */example/* */data/* */CMakeModules/* 7 | 8 | -------------------------------------------------------------------------------- /GIT-VERSION-GEN: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # based on GIT-VERSION-GEN from linux kernel 3 | 4 | DEF_VER=v0.2.0.GIT 5 | LF=' 6 | ' 7 | 8 | if [ -f version ]; then 9 | VN=$(/dev/null) && 12 | case "$VN" in 13 | *$LF*) (exit 1) ;; 14 | v[0-9]*) 15 | git update-index -q --refresh 16 | [ -z "$(git diff-index --name-only HEAD --)" ] || \ 17 | VN="$VN-dirty" ;; 18 | esac 19 | then 20 | VN=$(echo "$VN" | sed -e"s/-/./g"); 21 | else 22 | VN="$DEF_VER" 23 | fi 24 | 25 | echo $VN 26 | 27 | -------------------------------------------------------------------------------- 
/MinGW.toolchain: -------------------------------------------------------------------------------- 1 | # CMake toolchain file for cross-compilation on Windows 2 | #---- could not get this to work yet, but included here for future attempts 3 | set(CMAKE_SYSTEM_NAME Windows) 4 | set(TARGET_ARCH "64" CACHE STRING "Which architecture to target (32|64)") 5 | 6 | # Choosing the compiler (assuming MinGW) 7 | if(TARGET_ARCH STREQUAL "32") 8 | message(STATUS "Targeting 32-bit Windows using MinGW") 9 | set(COMPILER_PREFIX i686-w64-mingw32) 10 | elseif(TARGET_ARCH STREQUAL "64") 11 | message(STATUS "Targeting 64-bit Windows using MinGW") 12 | set(COMPILER_PREFIX x86_64-w64-mingw32) 13 | else() 14 | message(FATAL_ERROR "Target architecture ${TARGET_ARCH} not recognized!") 15 | endif() 16 | 17 | set(CMAKE_C_COMPILER "${COMPILER_PREFIX}-gcc") 18 | set(CMAKE_CXX_COMPILER "${COMPILER_PREFIX}-g++") 19 | set(CMAKE_RC_COMPILER "${COMPILER_PREFIX}-windres") 20 | 21 | # Target environment 22 | # (has to contain all the required dependencies) 23 | set(CMAKE_FIND_ROOT_PATH "/usr/${COMPILER_PREFIX}") 24 | 25 | # Search headers & libraries in the target environment, 26 | # search programs in the host environment 27 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 28 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 29 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Norma 2 | 3 | [![Build Status](https://travis-ci.org/comphist/norma.svg?branch=master)](https://travis-ci.org/comphist/norma) 4 | [![codecov](https://codecov.io/gh/comphist/norma/branch/master/graph/badge.svg)](https://codecov.io/gh/comphist/norma) 5 | 6 | Norma is a tool for **automatic spelling normalization** of non-standard language data. 
It 7 | uses a combination of different normalization techniques that typically require 8 | *training data* (= a list of manually normalized wordforms) and a *target 9 | dictionary* (= a list of valid wordforms in the target language). 10 | 11 | Besides this README, see also the [User Guide](doc/UserGuide.md) for more 12 | information on how to use Norma. 13 | 14 | If you have any questions, suggestions, or comments, please contact one of the authors: 15 | 16 | * Marcel Bollmann () 17 | * Florian Petran () 18 | 19 | #### License 20 | 21 | Norma is licensed under the 22 | [GNU Lesser General Public License (LGPL) v3](http://www.gnu.org/licenses/lgpl-3.0). 23 | 24 | #### Usage via Docker 25 | 26 | If you don't want to compile Norma and its dependencies from scratch, you can 27 | try to use it via a Docker image. To do so, install 28 | [Docker](https://www.docker.com/) (e.g. via your system's package manager) and 29 | run: 30 | 31 | docker run -v $(pwd):/home mbollmann/norma 32 | 33 | This should download the Docker image and display Norma's help output. You can 34 | now add command-line arguments the same way as with the `normalize` binary; for 35 | example, if you cloned this repository locally, you should be able to run: 36 | 37 | docker run -v $(pwd):/home mbollmann/norma -s -c doc/example/example.cfg -f doc/example/fnhd_sample.txt 38 | 39 | To use the `norma_lexicon` command with the docker image, run: 40 | 41 | docker run -v $(pwd):/home --entrypoint norma_lexicon mbollmann/norma 42 | 43 | Make sure that the files you specify exist within your current working directory. 
44 | 45 | #### Dependencies 46 | 47 | * Needed for compilation: 48 | * GCC >= 4.9 49 | * CMake >= 2.8.10 50 | * Boost >= 1.54 51 | * in particular these libraries: Filesystem, Program Options, Regex, System, Test 52 | * pkg-config 53 | * gfsm >= 0.0.16-1 and gfsmxl >= 0.0.15, 54 | available from http://kaskade.dwds.de/~moocow/mirror/projects/gfsm/ 55 | * GLib >= 2.0 56 | * Optionally: 57 | * ICU >= 1.49 58 | * Doxygen (for generating the documentation) 59 | * Python 2 >= 2.7 and Boost::Python (for Python bindings/embeddings) 60 | 61 | #### How to do an out-of-source build 62 | 63 | mkdir build 64 | cd build 65 | cmake .. 66 | make 67 | 68 | There is a test suite included that can be run with `make test` (or `make check`, which reveals more detailed error 69 | messages), as well as API documentation that is generated via `make doc` (requires Doxygen). 70 | 71 | #### Configuration options (for CMake) 72 | 73 | * String implementation (default: ICU if available): 74 | `-DSTRING_IMPL=(ICU|STD)` 75 | * ICU - use ICU Unicode strings 76 | * STD - use STL string - requires no additional library 77 | * Build type (default: Release): 78 | `-DCMAKE_BUILD_TYPE=(Debug|Release)` 79 | * Install prefix (default: /usr/local/): 80 | `-DCMAKE_INSTALL_PREFIX=<path>` 81 | * To make Python bindings/embeddings (default: disabled), set 82 | `-DWITH_PYTHON=TRUE` 83 | 84 | #### Other platforms 85 | 86 | * Norma was developed on Linux and will not work on non-Unix platforms. 87 | * Compilation with clang will probably work but is untested. 
88 | -------------------------------------------------------------------------------- /doc/acrh-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comphist/norma/2459c2bb29c9f1daad38a6d7d15e28930782e2a8/doc/acrh-paper.pdf -------------------------------------------------------------------------------- /doc/example/bible_lexicon.fsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comphist/norma/2459c2bb29c9f1daad38a6d7d15e28930782e2a8/doc/example/bible_lexicon.fsm -------------------------------------------------------------------------------- /doc/example/bible_lexicon.sym: -------------------------------------------------------------------------------- 1 | ! 6 2 | " 10 3 | ' 9 4 | ( 11 5 | ) 12 6 | - 4 7 | . 8 8 | : 5 9 | <#> 1 10 | 2 11 | 3 12 | ? 7 13 | a 13 14 | b 21 15 | c 26 16 | d 22 17 | e 19 18 | f 29 19 | g 23 20 | h 24 21 | i 17 22 | j 38 23 | k 34 24 | l 27 25 | m 25 26 | n 16 27 | o 15 28 | p 28 29 | q 42 30 | r 14 31 | s 20 32 | t 18 33 | u 33 34 | v 40 35 | w 36 36 | x 41 37 | y 39 38 | z 30 39 | ß 35 40 | ä 31 41 | ö 37 42 | ü 32 43 | -------------------------------------------------------------------------------- /doc/example/example.cfg: -------------------------------------------------------------------------------- 1 | normalizers=RuleBased 2 | saveonexit=False 3 | perfilemode=False 4 | 5 | [Lexicon] 6 | fsmfile=bible_lexicon.fsm 7 | symfile=bible_lexicon.sym 8 | 9 | [RuleBased] 10 | rulesfile=example.RuleBased.rulesfile 11 | -------------------------------------------------------------------------------- /doc/example/example_chain.cfg: -------------------------------------------------------------------------------- 1 | normalizers=Mapper,RuleBased,WLD 2 | saveonexit=False 3 | perfilemode=False 4 | 5 | [Lexicon] 6 | fsmfile=bible_lexicon.fsm 7 | symfile=bible_lexicon.sym 8 | 9 | [Mapper] 10 | 
mapfile=fnhd_train.Mapper.mapfile 11 | 12 | [RuleBased] 13 | rulesfile=fnhd_train.RuleBased.rulesfile 14 | 15 | [WLD] 16 | paramfile=fnhd_train.WLD.paramfile 17 | max_weight=2.5 18 | train_ngrams=3 19 | train_divisor=7 20 | -------------------------------------------------------------------------------- /doc/example/normalize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function, division, absolute_import, unicode_literals 5 | import argparse 6 | import sys 7 | from norma import NormaCfgParser 8 | from pprint import pprint 9 | 10 | class MainApplication(object): 11 | def __init__(self, args): 12 | self.normalizers = NormaCfgParser(args.config).instantiate_all() 13 | self.normalizer_names = map((lambda n: n.name), self.normalizers) 14 | self.say("Instantiated %i normalizers:\n" % len(self.normalizers)) 15 | self.say(" %s\n" % ', '.join(self.normalizer_names)) 16 | self.infile = args.infile 17 | self.encoding = args.encoding 18 | self.candidates = args.candidates 19 | 20 | def say(self, text, to=sys.stdout): 21 | try: 22 | to.write(text.encode("utf-8")) 23 | except UnicodeError: 24 | to.write(text) 25 | 26 | def run(self): 27 | def make_result(r): 28 | return (r.word, r.score, r.origin) 29 | 30 | results = [] 31 | for line in self.infile: 32 | orig = line.decode(self.encoding).strip() 33 | 34 | current = {'orig': orig} 35 | for normalizer in self.normalizers: 36 | resultset = normalizer(orig, self.candidates) 37 | current[normalizer.name] = [make_result(r) for r in resultset] 38 | results.append(current) 39 | pprint(current) 40 | 41 | self.say("Processed %i tokens.\n" % len(results), to=sys.stderr) 42 | self.say("Done!\n", to=sys.stderr) 43 | 44 | if __name__ == '__main__': 45 | description = "Generates normalizations for a test set." 
46 | epilog = "" 47 | parser = argparse.ArgumentParser(description=description, epilog=epilog) 48 | parser.add_argument('infile', 49 | metavar='INPUT', 50 | type=argparse.FileType('r'), 51 | help='Test set') 52 | parser.add_argument('-c', '--config', 53 | metavar='CONFIG', 54 | required=True, 55 | help='Norma configuration file') 56 | parser.add_argument('-n', '--candidates', 57 | metavar='N', 58 | type=int, 59 | default=3, 60 | help='Number of normalization candidates to generate (default: %(default)i)') 61 | parser.add_argument('-e', '--encoding', 62 | default='utf-8', 63 | help='Encoding of the input file (default: utf-8)') 64 | 65 | args = parser.parse_args() 66 | 67 | # launching application ... 68 | MainApplication(args).run() 69 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_sources(pluginsocket.cpp cycle.cpp string_impl.cpp training_data.cpp) 2 | install_headers(pluginsocket.h cycle.h gfsmlibs.h interface.h norma.h 3 | regex_impl.h string_impl.h training_data.h 4 | training_data-inl.h) 5 | -------------------------------------------------------------------------------- /src/config.h.in: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMA_CONFIG_H_ 19 | #define NORMA_CONFIG_H_ 20 | 21 | #define NORMA_NAME "@CMAKE_PROJECT_NAME@" 22 | #define NORMA_VERSION "@CMAKE_PROJECT_VERSION@" 23 | #define NORMA_DEFAULT_PLUGIN_BASE "@NORMA_DEFAULT_PLUGIN_BASE@" 24 | #define TEST_BASE_DIR "@NORMA_TEST_BASE_DIR@" 25 | 26 | #endif // NORMA_CONFIG_H_ 27 | 28 | -------------------------------------------------------------------------------- /src/cycle.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"cycle.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include"normalizer/result.h" 27 | #include"interface.h" 28 | #include"pluginsocket.h" 29 | #include"training_data.h" 30 | #include"results_queue.h" 31 | 32 | using std::map; 33 | using std::string; 34 | 35 | namespace Norma { 36 | /* Destroys the cycle: deletes the TrainingData and PluginSocket objects held in _data and _plugins (if non-null). The Input and Output pointers are not deleted here. */ Cycle::~Cycle() { 37 | if (_data != nullptr) 38 | delete _data; 39 | if (_plugins != nullptr) 40 | delete _plugins; 41 | } 42 | /* Stores the given input, output and parameter map, allocates a new TrainingData, calls initialize() on both interface objects (passing this cycle, the opposite interface and the training data) and enables threading only when both report thread_suitable(). NOTE(review): _data is overwritten without deleting any earlier object, so a second call to init() would presumably leak - confirm init() is only called once per Cycle. */ 43 | void Cycle::init(Input *input, Output* output, 44 | const std::map& params) { 45 | _in = input; 46 | _out = output; 47 | _params = params; 48 | _data = new TrainingData(); 49 | _in->initialize(this, _out, _data); 50 | _out->initialize(this, _in, _data); 51 | set_thread(_in->thread_suitable() && _out->thread_suitable()); 52 | } 53 | /* Creates the PluginSocket from the chain definition, the plugin base and the parameters stored by init(). Throws std::runtime_error when _in or _out is still null, which is the case when init() has not set them yet. */ 54 | void Cycle::init_chain(const std::string& chain_definition, 55 | const std::string& plugin_base) { 56 | if (_in == nullptr || _out == nullptr) 57 | throw std::runtime_error("Cycle was not intialized!"); 58 | _plugins = new PluginSocket(chain_definition, plugin_base, _params); 59 | } 60 | /* Main loop. After _in->begin(), reads lines until the input requests quit. Empty lines are skipped. If the train setting is on and the input requests training, the line is passed to training_pair() and nothing else is done with it. Otherwise, if the normalize setting is on, the line is queued on the ResultsQueue, whose producer calls _plugins->normalize() and whose consumer forwards each result to _out->put_line(); then, if the train setting is on and the output requests training, _plugins->train(_data) is called. Once the loop exits the queue is finished and _in->end() is called. */ 61 | void Cycle::start() { 62 | ResultsQueue res(policy); 63 | bool print_prob = settings["prob"]; 64 | Normalizer::LogLevel ll = _max_log_level; 65 | Output* o = _out; 66 | auto outputter = [print_prob, ll, o](Normalizer::Result r) { 67 | o->put_line(&r, print_prob, ll); 68 | }; 69 | auto producer = [this](string_impl line) { 70 | auto my_result = _plugins->normalize(line); 71 | return my_result; 72 | }; 73 | res.set_consumer(outputter); 74 | _in->begin(); 75 | while (!_in->request_quit()) { 76 | string_impl line = _in->get_line(); 77 | if (line.length() == 0) 78 | continue; 79 | if (settings["train"] && _in->request_train()) { 80 | training_pair(line); 81 | continue; 82 | } 83 | if (settings["normalize"]) 84 | res.add_producer(producer, line); 85 | if (settings["train"] && _out->request_train()) 86 | _plugins->train(_data); 87 | } 88 | res.finish(); 89 | 
_in->end(); 90 | } 91 | /* Treats line as a whitespace-separated training pair. The split position is the last whitespace character (index >= 1 and not the final character) that is followed by a non-whitespace character. If such a position exists, the two parts are extracted with extract(), added to _data as source and target, the plugins are trained on _data and true is returned; otherwise false is returned and no training happens. */ 92 | bool Cycle::training_pair(const string_impl& line) { 93 | string_size divpos = 0; 94 | for (string_size i = 1; i < line.length(); ++i) 95 | if (isspace(line[i]) 96 | && i != line.length() - 1 97 | && !isspace(line[i+1])) 98 | divpos = i; 99 | if (divpos != 0) { 100 | string_impl word, 101 | modern; 102 | extract(line, 0, divpos, &word); 103 | extract(line, divpos + 1, line.length(), &modern); 104 | _data->add_source(word); 105 | _data->add_target(modern); 106 | _plugins->train(_data); 107 | return true; 108 | } 109 | return false; 110 | } 111 | /* Forwards to save_params() of the PluginSocket created by init_chain(). */ 112 | void Cycle::save_params() { 113 | _plugins->save_params(); 114 | } 115 | } // namespace Norma 116 | 117 | -------------------------------------------------------------------------------- /src/cycle.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef CYCLE_H_ 19 | #define CYCLE_H_ 20 | #include 21 | #include 22 | #include"string_impl.h" 23 | #include"training_data.h" 24 | #include"normalizer/result.h" 25 | #include"results_queue.h" 26 | 27 | namespace Norma { 28 | class Input; 29 | class Output; 30 | class PluginSocket; 31 | 32 | /// the main application cycle 33 | /** This class contains the main loop for input and output. 34 | * It connects Input, Output, and PluginSocket classes. 35 | * 36 | * Usually, an application should need just one Cycle object, 37 | * but it's conceivable to have multiple ones, e.g. when 38 | * normalizing several files in parallel. 39 | **/ 40 | 41 | class Cycle { 42 | public: 43 | Cycle() = default; 44 | ~Cycle(); 45 | void init(Input* input, Output* output, 46 | const std::map& params); 47 | void init_chain(const std::string& chain_definition, 48 | const std::string& plugin_base); 49 | 50 | void start(); 51 | void save_params(); 52 | void set(const std::string setting, bool val) { 53 | settings[setting] = val; 54 | } 55 | bool get(const std::string setting) const { 56 | return settings.at(setting); 57 | } 58 | void set_thread(bool val) { 59 | policy = val ? 
std::launch::async : std::launch::deferred; 60 | } 61 | 62 | private: 63 | bool training_pair(const string_impl& line); 64 | 65 | std::map _params; 66 | Normalizer::LogLevel _max_log_level = Normalizer::LogLevel::WARN; 67 | std::map settings = { 68 | { "train", true }, 69 | { "normalize", true }, 70 | { "prob", true } }; 71 | TrainingData* _data = nullptr; 72 | PluginSocket* _plugins = nullptr; 73 | Input* _in = nullptr; 74 | Output* _out = nullptr; 75 | 76 | std::launch policy = std::launch::async|std::launch::deferred; 77 | }; 78 | } // namespace Norma 79 | #endif // CYCLE_H_ 80 | 81 | -------------------------------------------------------------------------------- /src/defines.h.in: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMA_DEFINES_H_ 19 | #define NORMA_DEFINES_H_ 20 | 21 | // compile time defines that need to be available at runtime 22 | #cmakedefine USE_ICU_STRING 23 | 24 | #endif // NORMA_DEFINES_H_ 25 | 26 | -------------------------------------------------------------------------------- /src/gfsm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_sources(alphabet.cpp labelvector.cpp implode_explode.cpp automaton.cpp 2 | acceptor.cpp string_acceptor.cpp transducer.cpp 3 | string_transducer.cpp cascade.cpp string_cascade.cpp) 4 | install_headers(acceptor.h alphabet.h automaton.h cascade.h 5 | implode_explode.h labelvector.h path.h semiring.h 6 | string_acceptor.h string_cascade.h string_transducer.h 7 | transducer.h) 8 | 9 | -------------------------------------------------------------------------------- /src/gfsm/acceptor.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | #include"acceptor.h" 19 | #include 20 | #include"gfsmlibs.h" 21 | #include"labelvector.h" 22 | #include"path.h" 23 | 24 | namespace Gfsm { 25 | 26 | Acceptor& Acceptor::operator=(Acceptor a) { 27 | Automaton::operator=(a); 28 | return *this; 29 | } 30 | 31 | bool Acceptor::accepts(const LabelVector& vec) const { 32 | gfsmAutomaton* result = gfsm_automaton_new(); 33 | gfsm_automaton_lookup(_fsm, vec._vec, result); 34 | bool is_accepted = (gfsm_automaton_n_final_states(result) > 0); 35 | gfsm_automaton_free(result); 36 | return is_accepted; 37 | } 38 | 39 | std::set Acceptor::accepted() const { 40 | std::set paths = accepted_paths(); 41 | std::set acc; 42 | for (const Path& p : paths) { 43 | acc.insert(p.get_input()); 44 | } 45 | return acc; 46 | } 47 | 48 | void Acceptor::add_path(const LabelVector& vec, bool set_all_final) { 49 | gfsmStateId from = root(); 50 | gfsmWeight one = _fsm->sr->one; 51 | for (gfsmLabelVal value : vec) { 52 | gfsmArcIter iter; 53 | gfsmStateId to = gfsmNoState; 54 | gfsm_arciter_open(&iter, _fsm, from); 55 | while (gfsm_arciter_ok(&iter)) { 56 | gfsmArc* arc = gfsm_arciter_arc(&iter); 57 | // several assumptions are made here: 58 | // Acceptor must be deterministic, epsilon-free, and unweighted 59 | if (arc->lower == value) { 60 | to = arc->target; 61 | break; 62 | } 63 | gfsm_arciter_next(&iter); 64 | } 65 | if (to == gfsmNoState) { // need to create a new node & arc 66 | to = gfsm_automaton_n_states(_fsm); 67 | gfsm_automaton_add_arc(_fsm, from, to, value, value, one); 68 | } 69 | // set_all_final set? 
70 | if (set_all_final && 71 | gfsm_automaton_state_is_final(_fsm, to) == FALSE) 72 | gfsm_automaton_set_final_state_full(_fsm, to, TRUE, one); 73 | // follow arc 74 | from = to; 75 | gfsm_arciter_close(&iter); 76 | } 77 | 78 | // accepting the path requires the last state to be final 79 | if (gfsm_automaton_state_is_final(_fsm, from) == FALSE) 80 | gfsm_automaton_set_final_state_full(_fsm, from, TRUE, one); 81 | } 82 | 83 | } // namespace Gfsm 84 | -------------------------------------------------------------------------------- /src/gfsm/acceptor.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_ACCEPTOR_H_ 19 | #define GFSM_ACCEPTOR_H_ 20 | #include 21 | #include 22 | #include"automaton.h" 23 | 24 | namespace Gfsm { 25 | class LabelVector; 26 | 27 | /// A finite-state acceptor. 28 | /** Implements functions specific to a finite-state acceptor, i.e., an 29 | Automaton where input labels always equal output labels. 
30 | */ 31 | class Acceptor : public Automaton { 32 | public: 33 | Acceptor() : Automaton() { 34 | _fsm->flags.is_transducer = FALSE; 35 | } 36 | Acceptor(const Acceptor& a) : Automaton(a) {} 37 | Acceptor(Acceptor&& a) : Automaton(std::move(a)) {} 38 | Acceptor& operator=(Acceptor a); 39 | ~Acceptor() = default; 40 | 41 | /// Test if automaton is in a final state after reading a given LabelVector. 42 | /** @param vec LabelVector to test. 43 | */ 44 | bool accepts(const LabelVector& vec) const; 45 | 46 | /// Find all LabelVectors accepted by this automaton. 47 | /** @see Automaton::accepted_paths() 48 | */ 49 | std::set accepted() const; 50 | 51 | /// Add a path to the automaton. 52 | /** @param vec The LabelVector that should be accepted by the 53 | automaton. 54 | @param set_all_final If true, all subvectors of vec starting at 55 | index 0 will also be accepted (e.g., if vec = {1, 3, 5}, 56 | the automaton will also accept {1} and {1, 3}). 57 | */ 58 | void add_path(const LabelVector& vec, bool set_all_final = false); 59 | }; 60 | 61 | } // namespace Gfsm 62 | 63 | #endif // GFSM_ACCEPTOR_H_ 64 | -------------------------------------------------------------------------------- /src/gfsm/automaton.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_AUTOMATON_H_ 19 | #define GFSM_AUTOMATON_H_ 20 | #include 21 | #include 22 | #include"gfsmlibs.h" 23 | #include"semiring.h" 24 | #include"path.h" 25 | 26 | namespace Gfsm { 27 | class LabelVector; 28 | class Cascade; 29 | 30 | /// A finite-state automaton. 31 | /** Implements functions that apply to all types of finite-state 32 | automata. When creating new automata, it is probably better to 33 | use the specialized Acceptor or Transducer classes. 34 | */ 35 | class Automaton { 36 | friend class Cascade; 37 | public: 38 | Automaton() : Automaton(SemiringType::TROPICAL) {} 39 | explicit Automaton(SemiringType sr); 40 | Automaton(const Automaton& a); 41 | Automaton(Automaton&& a); 42 | Automaton& operator=(Automaton a); 43 | ~Automaton() throw(); 44 | /// Load a Gfsm automaton from a binary file. 45 | void load_binfile(const std::string& filename); 46 | /// Save the Gfsm automaton to a binary file. 47 | void save_binfile(const std::string& filename); 48 | 49 | void set_semiring_type(SemiringType sr); 50 | SemiringType get_semiring_type() const; 51 | 52 | /// Sort all arcs in the automaton. 53 | void arcsort(); 54 | /// Collect weights on adjacent, otherwise identical arcs. 55 | /** Should only be called on an arc-sorted automaton. 56 | */ 57 | void arcuniq(); 58 | /// Multiply all zero weights with semiring-zero. 59 | void arith_sr_zero_to_zero(); 60 | /// Determinize the automaton. 61 | void determinize(); 62 | /// Minimize the automaton. 63 | /** @param remove_eps Whether to perform epsilon removal. 64 | */ 65 | void minimize(bool remove_eps = true); 66 | 67 | /// Find all paths that are accepted by this automaton. 68 | /** If the automaton is cyclic, an empty set is returned, as the 69 | set of accepted paths would be infinite in this case. 
70 | */ 71 | std::set accepted_paths() const; 72 | 73 | /// Make sure automaton has a root state 74 | void ensure_root() { root(); } 75 | 76 | protected: 77 | gfsmAutomaton* _fsm; /**< Pointer to automaton object. */ 78 | gfsmStateId _root = gfsmNoState; /**< ID of the root state. */ 79 | 80 | void set_gfsm_automaton(gfsmAutomaton* fsm); 81 | 82 | /// Return ID of the root state; creates a root state if none exists. 83 | gfsmStateId root(); 84 | }; 85 | 86 | } // namespace Gfsm 87 | 88 | #endif // GFSM_AUTOMATON_H_ 89 | 90 | -------------------------------------------------------------------------------- /src/gfsm/implode_explode.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"implode_explode.h" 19 | #include 20 | #include"string_impl.h" 21 | 22 | namespace Gfsm { 23 | 24 | std::vector explode(const string_impl& str, bool att) { 25 | std::vector vec; 26 | bool braces = false; 27 | string_impl sym = ""; 28 | for (string_size i = 0; i < str.length(); ++i) { 29 | if (braces) { 30 | if (str[i] == ']') { 31 | vec.push_back(sym); 32 | braces = false; 33 | } else { 34 | sym += from_char(str[i]); 35 | } 36 | } else if (att && str[i] == '[') { 37 | sym = ""; 38 | braces = true; 39 | } else if (str[i] == ' ') { 40 | vec.push_back(""); 41 | } else if (str[i] == '\t') { 42 | vec.push_back(""); 43 | } else { 44 | vec.push_back(from_char(str[i])); 45 | } 46 | } 47 | return vec; 48 | } 49 | 50 | string_impl implode(const std::vector& vec, bool att) { 51 | string_impl str = ""; 52 | for (const string_impl& c : vec) { 53 | if (att && c.length() > 1) { 54 | if (c == "") 55 | str += " "; 56 | else if (c == "") 57 | str += "\t"; 58 | else 59 | str += "[" + c + "]"; 60 | } else { 61 | str += c; 62 | } 63 | } 64 | return str; 65 | } 66 | 67 | } // namespace Gfsm 68 | -------------------------------------------------------------------------------- /src/gfsm/implode_explode.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_IMPLODE_EXPLODE_H_ 19 | #define GFSM_IMPLODE_EXPLODE_H_ 20 | #include 21 | #include"string_impl.h" 22 | 23 | namespace Gfsm { 24 | 25 | /// Convert symbol sequence from a string to a vector. 26 | /** @param str Symbol sequence to convert 27 | @param att If true, characters enclosed in square brackets will be treated 28 | as a single symbol; if false, each character will be converted 29 | to a separate symbol. 30 | */ 31 | std::vector explode(const string_impl& str, bool att = false); 32 | 33 | /// Convert symbol sequence from a vector to a string. 34 | /** @param vec Symbol sequence to convert 35 | @param att If true, multi-character symbols will be enclosed in square 36 | brackets in the returned string. 37 | */ 38 | string_impl implode(const std::vector& vec, bool att = false); 39 | 40 | } // namespace Gfsm 41 | 42 | #endif // GFSM_IMPLODE_EXPLODE_H_ 43 | -------------------------------------------------------------------------------- /src/gfsm/labelvector.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef GFSM_LABELVECTOR_H_ 19 | #define GFSM_LABELVECTOR_H_ 20 | #include 21 | #include"gfsmlibs.h" 22 | 23 | namespace Gfsm { 24 | 25 | const gfsmLabelVal EPSILON_LABEL = 0; 26 | 27 | class Acceptor; 28 | 29 | /// A vector of label values. 30 | class LabelVector { 31 | friend class Acceptor; 32 | friend class Transducer; 33 | friend class Cascade; 34 | public: 35 | /// Construct an empty label vector. 36 | LabelVector() : _vec(g_ptr_array_new()) {} 37 | /// Construct a label vector from a list of numeric labels. 38 | explicit LabelVector(std::initializer_list args); 39 | LabelVector(const LabelVector& v); 40 | LabelVector(LabelVector&& v); 41 | LabelVector& operator=(LabelVector v); 42 | ~LabelVector(); 43 | 44 | /// Get an element from the vector. 45 | gfsmLabelVal get(unsigned int n) const; 46 | /// Append an element to the end of the vector. 47 | void push_back(const gfsmLabelVal value); 48 | /// Clear the vector. 49 | void clear(); 50 | /// Get the size of the vector. 51 | unsigned int size() const {return _vec->len;} 52 | 53 | bool operator==(const LabelVector& that) const; 54 | inline bool operator!=(const LabelVector& that) const { 55 | return !(*this == that); 56 | } 57 | bool operator<(const LabelVector& that) const; 58 | 59 | /// Iterator class for LabelVector. 
60 | class const_iterator { 61 | friend class LabelVector; 62 | public: 63 | gfsmLabelVal operator*() const; 64 | const_iterator& operator++(); 65 | bool operator==(const const_iterator& that) const; 66 | bool operator!=(const const_iterator& that) const; 67 | protected: 68 | const_iterator(unsigned int pos, const LabelVector * const vec); 69 | private: 70 | unsigned int _pos; 71 | const LabelVector* _vec; 72 | }; 73 | const_iterator begin() const; 74 | const_iterator end() const; 75 | 76 | private: 77 | gfsmLabelVector *_vec; 78 | }; 79 | 80 | } // namespace Gfsm 81 | 82 | #endif // GFSM_LABELVECTOR_H_ 83 | -------------------------------------------------------------------------------- /src/gfsm/path.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | #ifndef GFSM_PATH_H_ 19 | #define GFSM_PATH_H_ 20 | #include 21 | #include 22 | #include"alphabet.h" 23 | #include"labelvector.h" 24 | #include"string_impl.h" 25 | 26 | namespace Gfsm { 27 | 28 | class StringPath; 29 | 30 | /// A weighted path in a finite-state automaton. 
31 | class Path { 32 | friend class StringPath; 33 | public: 34 | Path() = default; 35 | Path(LabelVector i, LabelVector o, double w) 36 | : input(i), output(o), weight(w) {} 37 | 38 | bool operator==(const Path& that) const { 39 | return (input == that.input && 40 | output == that.output && 41 | weight == that.weight); 42 | } 43 | inline bool operator!=(const Path& that) const { 44 | return !(*this == that); 45 | } 46 | bool operator<(const Path& that) const { 47 | return (std::make_tuple(input, output, weight) 48 | < std::make_tuple(that.input, that.output, that.weight)); 49 | } 50 | const LabelVector& get_output() const { return output; } 51 | const LabelVector& get_input() const { return input; } 52 | const double& get_weight() const { return weight; } 53 | 54 | protected: 55 | LabelVector input; /**< Input label sequence. */ 56 | LabelVector output; /**< Output label sequence. */ 57 | double weight; /**< Weight of the path. */ 58 | }; 59 | 60 | /// A weighted path in a string-based finite-state automaton. 61 | /** Also stores symbol sequences in addition to label sequences. 62 | */ 63 | class StringPath : public Path { 64 | public: 65 | StringPath(std::vector i, std::vector o, 66 | double w) 67 | : input(i), output(o) { weight = w; } 68 | 69 | static StringPath from(const Path& p, const Alphabet& alph_in, 70 | const Alphabet& alph_out) { 71 | return StringPath(alph_in.map_labels_to_vector(p.input), 72 | alph_out.map_labels_to_vector(p.output), 73 | p.weight); 74 | } 75 | const std::vector& get_output() const { 76 | return output; 77 | } 78 | const std::vector& get_input() const { 79 | return input; 80 | } 81 | private: 82 | std::vector input; /**< Input symbol sequence. */ 83 | std::vector output; /**< Output symbol sequence. 
*/ 84 | }; 85 | 86 | } // namespace Gfsm 87 | 88 | #endif // GFSM_PATH_H_ 89 | 90 | -------------------------------------------------------------------------------- /src/gfsm/semiring.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_SEMIRING_H_ 19 | #define GFSM_SEMIRING_H_ 20 | #include"gfsmlibs.h" 21 | 22 | namespace Gfsm { 23 | 24 | /// Identify a semiring by type. 25 | enum class SemiringType { 26 | UNKNOWN = gfsmSRTUnknown, 27 | BOOLEAN = gfsmSRTBoolean, 28 | LOG = gfsmSRTLog, 29 | REAL = gfsmSRTReal, 30 | TRIVIAL = gfsmSRTTrivial, 31 | TROPICAL = gfsmSRTTropical, 32 | PLOG = gfsmSRTPLog, 33 | ARCTIC = gfsmSRTArctic, 34 | FUZZY = gfsmSRTFuzzy, 35 | PROB = gfsmSRTProb 36 | }; 37 | 38 | } // namespace Gfsm 39 | 40 | #endif // GFSM_SEMIRING_H_ 41 | 42 | -------------------------------------------------------------------------------- /src/gfsm/string_acceptor.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 
4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"string_acceptor.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include"labelvector.h" 24 | #include"alphabet.h" 25 | 26 | namespace Gfsm { 27 | 28 | StringAcceptor::StringAcceptor(const StringAcceptor& a) 29 | : Acceptor(a), _alph(a._alph) {} 30 | 31 | StringAcceptor& StringAcceptor::operator=(StringAcceptor a) { 32 | Automaton::operator=(a); 33 | std::swap(_alph, a._alph); 34 | return *this; 35 | } 36 | 37 | void StringAcceptor::set_alphabet(const Alphabet& alph) { 38 | _alph = alph; 39 | } 40 | 41 | const Alphabet& StringAcceptor::get_alphabet() const { 42 | return _alph; 43 | } 44 | 45 | bool StringAcceptor::accepts(const string_impl& str) const { 46 | try { 47 | LabelVector v = _alph.map_symbols(str); 48 | return accepts(v); 49 | } 50 | catch (const std::out_of_range& err) { 51 | return false; 52 | } 53 | } 54 | 55 | bool StringAcceptor::accepts(const std::vector& str) const { 56 | try { 57 | LabelVector v = _alph.map_symbols(str); 58 | return accepts(v); 59 | } 60 | catch (const std::out_of_range& err) { 61 | return false; 62 | } 63 | } 64 | 65 | std::set StringAcceptor::accepted() const { 66 | std::set labels = Acceptor::accepted(); 67 | std::set acc; 68 | for (const LabelVector& vec : labels) { 69 | acc.insert(_alph.map_labels(vec)); 70 | } 71 | return acc; 72 | } 73 | 
74 | std::set> StringAcceptor::accepted_vectors() const { 75 | std::set labels = Acceptor::accepted(); 76 | std::set> acc; 77 | for (const LabelVector& vec : labels) { 78 | acc.insert(_alph.map_labels_to_vector(vec)); 79 | } 80 | return acc; 81 | } 82 | 83 | void StringAcceptor::add_word(const string_impl& str, bool partials) { 84 | LabelVector v = _alph.cover(str); 85 | add_path(v, partials); 86 | } 87 | 88 | void StringAcceptor::add_word(const std::vector& str, 89 | bool partials) { 90 | LabelVector v = _alph.cover(str); 91 | add_path(v, partials); 92 | } 93 | 94 | } // namespace Gfsm 95 | -------------------------------------------------------------------------------- /src/gfsm/string_cascade.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"string_cascade.h" 19 | #include 20 | #include 21 | #include 22 | #include"gfsmlibs.h" 23 | #include"alphabet.h" 24 | #include"labelvector.h" 25 | #include"string_acceptor.h" 26 | #include"string_transducer.h" 27 | #include"path.h" 28 | #include"lexicon/lexicon.h" 29 | 30 | using Norma::Normalizer::Lexicon; 31 | 32 | namespace Gfsm { 33 | 34 | StringCascade& StringCascade::operator=(StringCascade a) { 35 | Cascade::operator=(a); 36 | std::swap(_alph_in, a._alph_in); 37 | std::swap(_alph_out, a._alph_out); 38 | return *this; 39 | } 40 | 41 | void StringCascade::append(const StringAcceptor& a) { 42 | // TODO(bollmann): could check for compatibility of the alphabets? 43 | if (_size == 0) 44 | _alph_in = a.get_alphabet(); 45 | Cascade::append(&a); 46 | _alph_out = a.get_alphabet(); 47 | } 48 | 49 | void StringCascade::append(const StringTransducer& a) { 50 | // TODO(bollmann): could check for compatibility of the alphabets? 51 | if (_size == 0) 52 | _alph_in = a.get_input_alphabet(); 53 | Cascade::append(&a); 54 | _alph_out = a.get_output_alphabet(); 55 | } 56 | 57 | void StringCascade::append(Lexicon* lex) { 58 | StringAcceptor* acceptor = lex->get_acceptor(); 59 | append(*acceptor); 60 | } 61 | 62 | std::set StringCascade::lookup_nbest(const string_impl& str) const { 63 | return find_map_nbest(_alph_in.map_symbols(str)); 64 | } 65 | 66 | std::set 67 | StringCascade::lookup_nbest(const std::vector& str) const { 68 | return find_map_nbest(_alph_in.map_symbols(str)); 69 | } 70 | 71 | std::set 72 | StringCascade::find_map_nbest(const LabelVector& vec) const { 73 | std::set results; 74 | std::set paths = Cascade::lookup_nbest(vec); 75 | for (const Path& p : paths) { 76 | results.insert(StringPath::from(p, _alph_in, _alph_out)); 77 | } 78 | return results; 79 | } 80 | 81 | std::set 82 | StringCascade::lookup_nbest(const std::vector& str, 83 | unsigned int max_paths, double max_weight) { 84 | set_max_paths(max_paths); 85 | set_max_weight(max_weight); 86 
| return lookup_nbest(str); 87 | } 88 | 89 | std::set 90 | StringCascade::lookup_nbest(const string_impl& str, 91 | unsigned int max_paths, double max_weight) { 92 | set_max_paths(max_paths); 93 | set_max_weight(max_weight); 94 | return lookup_nbest(str); 95 | } 96 | } // namespace Gfsm 97 | -------------------------------------------------------------------------------- /src/gfsm/string_cascade.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_STRING_CASCADE_H_ 19 | #define GFSM_STRING_CASCADE_H_ 20 | #include 21 | #include 22 | #include"gfsmlibs.h" 23 | #include"alphabet.h" 24 | #include"string_acceptor.h" 25 | #include"string_transducer.h" 26 | #include"cascade.h" 27 | #include"path.h" 28 | #include"string_impl.h" 29 | 30 | namespace Norma { 31 | namespace Normalizer { 32 | class Lexicon; 33 | } // namespace Normalizer 34 | } // namespace Norma 35 | 36 | namespace Gfsm { 37 | 38 | /// A Cascade with an Alphabet. 
39 | class StringCascade : public Cascade { 40 | public: 41 | StringCascade(unsigned int depth = 2, 42 | SemiringType sr = SemiringType::TROPICAL) 43 | : Cascade(depth, sr) {} 44 | StringCascade(const StringCascade& a) 45 | : Cascade(a), _alph_in(a._alph_in), _alph_out(a._alph_out) {} 46 | StringCascade(StringCascade&& a) 47 | : Cascade(std::move(a)), _alph_in(std::move(a._alph_in)), 48 | _alph_out(std::move(a._alph_out)) {} 49 | StringCascade& operator=(StringCascade a); 50 | ~StringCascade() = default; 51 | 52 | /// Append an automaton to the cascade. 53 | /** @see Cascade::append() */ 54 | void append(const StringAcceptor& a); 55 | /// Append an automaton to the cascade. 56 | /** @see Cascade::append() */ 57 | void append(const StringTransducer& a); 58 | void append(Norma::Normalizer::Lexicon* lex); 59 | 60 | /// Finds the n-best paths for a given input sequence. 61 | /** @see Cascade::lookup_nbest(const LabelVector&) const */ 62 | std::set lookup_nbest(const string_impl& str) const; 63 | /// Finds the n-best paths for a given input sequence. 64 | /** @see Cascade::lookup_nbest(const LabelVector&) const */ 65 | std::set lookup_nbest(const std::vector& str) 66 | const; 67 | /// Finds the n-best paths for a given input sequence. 68 | /** @see Cascade::lookup_nbest(const LabelVector&, unsigned int, double) */ 69 | std::set lookup_nbest(const string_impl& str, 70 | unsigned int max_paths, 71 | double max_weight); 72 | /// Finds the n-best paths for a given input sequence. 
73 | /** @see Cascade::lookup_nbest(const LabelVector&, unsigned int, double) */ 74 | std::set lookup_nbest(const std::vector& str, 75 | unsigned int max_paths, 76 | double max_weight); 77 | 78 | protected: 79 | Alphabet _alph_in; 80 | Alphabet _alph_out; 81 | 82 | std::set find_map_nbest(const LabelVector& vec) const; 83 | }; 84 | 85 | } // namespace Gfsm 86 | 87 | #endif // GFSM_STRING_CASCADE_H_ 88 | 89 | -------------------------------------------------------------------------------- /src/gfsm/string_transducer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"string_transducer.h" 19 | #include 20 | #include 21 | #include 22 | #include"gfsmlibs.h" 23 | #include"labelvector.h" 24 | #include"path.h" 25 | #include"string_impl.h" 26 | 27 | namespace Gfsm { 28 | 29 | StringTransducer& StringTransducer::operator=(StringTransducer a) { 30 | Automaton::operator=(a); 31 | std::swap(_alph_in, a._alph_in); 32 | std::swap(_alph_out, a._alph_out); 33 | return *this; 34 | } 35 | 36 | void StringTransducer::set_input_alphabet(const Alphabet& alph) { 37 | _alph_in = alph; 38 | } 39 | 40 | void StringTransducer::set_output_alphabet(const Alphabet& alph) { 41 | _alph_out = alph; 42 | } 43 | 44 | const Alphabet& StringTransducer::get_input_alphabet() const { 45 | return _alph_in; 46 | } 47 | 48 | const Alphabet& StringTransducer::get_output_alphabet() const { 49 | return _alph_out; 50 | } 51 | 52 | std::set StringTransducer::transduce(const string_impl& str) const { 53 | try { 54 | LabelVector v = _alph_in.map_symbols(str); 55 | return transduce_vector_to_string(v); 56 | } 57 | catch (const std::out_of_range& err) { 58 | return std::set(); 59 | } 60 | } 61 | 62 | std::set 63 | StringTransducer::transduce(const std::vector& str) const { 64 | try { 65 | LabelVector v = _alph_in.map_symbols(str); 66 | return transduce_vector_to_string(v); 67 | } 68 | catch (const std::out_of_range& err) { 69 | return std::set(); 70 | } 71 | } 72 | 73 | std::set 74 | StringTransducer::transduce_vector_to_string(const LabelVector& vec) const { 75 | std::set x; 76 | auto results = Transducer::transduce(vec); 77 | for (const Path& p : results) { 78 | try { 79 | x.insert(StringPath::from(p, _alph_in, _alph_out)); 80 | } 81 | catch (const std::out_of_range& err) {} 82 | } 83 | return x; 84 | } 85 | 86 | void StringTransducer::add_path(const string_impl& str_in, 87 | const string_impl& str_out, 88 | double weight, bool cyclic, bool final) { 89 | Transducer::add_path(Path(_alph_in.map_symbols(str_in), 90 | _alph_out.map_symbols(str_out), 91 | 
weight), 92 | cyclic, final); 93 | } 94 | 95 | void StringTransducer::add_path(const StringPath& path, 96 | bool cyclic, bool final) { 97 | Transducer::add_path(Path(_alph_in.map_symbols(path.get_input()), 98 | _alph_out.map_symbols(path.get_output()), 99 | path.get_weight()), 100 | cyclic, final); 101 | } 102 | 103 | } // namespace Gfsm 104 | -------------------------------------------------------------------------------- /src/gfsm/transducer.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"transducer.h" 19 | #include 20 | #include 21 | #include 22 | #include"gfsmlibs.h" 23 | #include"labelvector.h" 24 | #include"path.h" 25 | #include"semiring.h" 26 | 27 | namespace Gfsm { 28 | 29 | Transducer& Transducer::operator=(Transducer a) { 30 | Automaton::operator=(a); 31 | return *this; 32 | } 33 | 34 | std::set Transducer::transduce(const LabelVector& input) const { 35 | std::set tr; 36 | gfsmAutomaton* result = gfsm_automaton_new(); 37 | gfsm_automaton_lookup(_fsm, input._vec, result); 38 | if (gfsm_automaton_n_final_states(result) > 0) { 39 | Transducer a; 40 | a.set_gfsm_automaton(result); 41 | tr = a.accepted_paths(); 42 | } else { 43 | gfsm_automaton_free(result); 44 | } 45 | return tr; 46 | } 47 | 48 | void Transducer::add_path(const Path& p, bool cyclic, bool final) { 49 | gfsmStateId from = root(); 50 | gfsmStateId to = gfsm_automaton_n_states(_fsm); 51 | gfsmWeight w = p.get_weight(); 52 | size_t input_size = p.get_input().size(), 53 | output_size = p.get_output().size(), 54 | max_size = std::max(input_size, output_size); 55 | for (size_t i = 0; i < max_size; ++i) { 56 | gfsmLabelVal from_val = (i < input_size) ? p.get_input().get(i) : 0; 57 | gfsmLabelVal to_val = (i < output_size) ? p.get_output().get(i) : 0; 58 | if (cyclic && (i == max_size - 1)) { 59 | to = root(); // cyclic: back to the root 60 | } 61 | gfsm_automaton_add_arc(_fsm, from, to, from_val, to_val, w); 62 | w = _fsm->sr->one; 63 | from = to; 64 | to++; 65 | } 66 | if (final && (gfsm_automaton_state_is_final(_fsm, from) == FALSE)) { 67 | gfsm_automaton_set_final_state_full(_fsm, from, TRUE, _fsm->sr->one); 68 | } 69 | } 70 | } // namespace Gfsm 71 | -------------------------------------------------------------------------------- /src/gfsm/transducer.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 
4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_TRANSDUCER_H_ 19 | #define GFSM_TRANSDUCER_H_ 20 | #include 21 | #include 22 | #include 23 | #include"automaton.h" 24 | #include"path.h" 25 | 26 | namespace Gfsm { 27 | class LabelVector; 28 | 29 | /// A finite-state transducer. 30 | /** Implements functions specific to a finite-state transducer, i.e., 31 | an Automaton which returns an output sequence for a given input 32 | sequence of labels. 33 | */ 34 | class Transducer : public Automaton { 35 | public: 36 | Transducer() : Automaton() { 37 | _fsm->flags.is_transducer = TRUE; 38 | } 39 | Transducer(const Transducer& a) : Automaton(a) {} 40 | Transducer(Transducer&& a) : Automaton(std::move(a)) {} 41 | Transducer& operator=(Transducer a); 42 | ~Transducer() = default; 43 | 44 | /// Transduce a given input sequence. 45 | /** @param input Input sequence as a LabelVector 46 | @return A set of Path objects accepted by this transducer given the 47 | input sequence. Only paths ending in a final state are 48 | returned. 49 | */ 50 | std::set transduce(const LabelVector& input) const; 51 | 52 | /// Add a new path to the transducer. 53 | /** @param p The Path to be added 54 | @param cyclic If true, the final transition will return to the root 55 | state of the automaton, making it cyclic. 
56 | @param final If true, the state after the final transition will be 57 | turned into a final state. 58 | */ 59 | void add_path(const Path& p, bool cyclic = false, bool final = true); 60 | 61 | /// Add a new cyclic path to the transducer. 62 | /** Convenience function for add_path(p, true, final). 63 | @see add_path() 64 | */ 65 | void add_cyclic_path(const Path& p, bool final = true) 66 | { add_path(p, true, final); } 67 | }; 68 | 69 | } // namespace Gfsm 70 | 71 | #endif // GFSM_TRANSDUCER_H_ 72 | -------------------------------------------------------------------------------- /src/gfsm_wrapper.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef GFSM_WRAPPER_H_ 19 | #define GFSM_WRAPPER_H_ 20 | #include"gfsm/automaton.h" 21 | #include"gfsm/acceptor.h" 22 | #include"gfsm/string_acceptor.h" 23 | #include"gfsm/transducer.h" 24 | #include"gfsm/string_transducer.h" 25 | #include"gfsm/cascade.h" 26 | #include"gfsm/string_cascade.h" 27 | #include"gfsm/semiring.h" 28 | #include"gfsm/alphabet.h" 29 | #include"gfsm/labelvector.h" 30 | #include"gfsm/path.h" 31 | #include"gfsm/implode_explode.h" 32 | #endif // GFSM_WRAPPER_H_ 33 | -------------------------------------------------------------------------------- /src/gfsmlibs.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef GFSM_LIBRARIES_H_ 19 | #define GFSM_LIBRARIES_H_ 20 | extern "C" { 21 | #include 22 | #include 23 | } 24 | #endif // GFSM_LIBRARIES_H_ 25 | -------------------------------------------------------------------------------- /src/interface.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma.
4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef INTERFACE_H_ 19 | #define INTERFACE_H_ 20 | #include"interface/iobase.h" 21 | #include"interface/input.h" 22 | #include"interface/output.h" 23 | #endif // INTERFACE_H_ 24 | 25 | -------------------------------------------------------------------------------- /src/interface/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_sources(input.cpp output.cpp) 2 | install_headers(iobase.h input.h output.h) 3 | 4 | -------------------------------------------------------------------------------- /src/interface/input.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"input.h" 19 | #include 20 | #include 21 | #include 22 | #include"string_impl.h" 23 | #include"cycle.h" 24 | 25 | using std::string; 26 | 27 | namespace Norma { 28 | ///////////////////////////// Input ////////////////////////////////////// 29 | string_impl Input::get_line() { 30 | std::string l; 31 | getline(*_input, l); 32 | return l; 33 | } 34 | 35 | //////////////////////////// FileInput /////////////////////////////////// 36 | FileInput::FileInput(const std::string& fname) 37 | : Input() { 38 | _file.open(fname); 39 | if (!_file.is_open()) 40 | throw std::runtime_error("Could not open input file!"); 41 | _input = &_file; 42 | _output = &std::cout; 43 | _error = &std::cerr; 44 | } 45 | 46 | FileInput::~FileInput() { 47 | try { 48 | _file.close(); 49 | } catch(...) { 50 | *_error << "Error while closing input file!" << std::endl; 51 | } 52 | } 53 | 54 | string_impl FileInput::get_line() { 55 | string_impl line = Input::get_line(); 56 | _request_train = (string_find(line, "\t") != string_npos); 57 | return line; 58 | } 59 | 60 | } // namespace Norma 61 | 62 | -------------------------------------------------------------------------------- /src/interface/input.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef INTERFACE_INPUT_H_ 19 | #define INTERFACE_INPUT_H_ 20 | #include 21 | #include 22 | #include 23 | #include"string_impl.h" 24 | #include"iobase.h" 25 | 26 | namespace Norma { 27 | class Cycle; 28 | 29 | /// Basic Input class, pure virtual 30 | /** Admittedly it doesn't make that much sense to have it as pure 31 | * virtual, but FileInput needs the filename as parameter, and 32 | * that shouldn't be in the common base class for Input, while 33 | * ShellInput needs a whole lot of other stuff. 34 | * 35 | * So essentially this doesn't serve any function other than 36 | * providing a common base for polymorphic access, so we can't 37 | * have our client code walking around with Input objects. 38 | * 39 | * Hence, request_quit is 0 even though it could easily return 40 | * _input->eof(). 
41 | **/ 42 | class Input : public IOBase { 43 | public: 44 | Input() = default; 45 | virtual ~Input() {} 46 | /// anything that should be done before the processing 47 | /// begins, such as greeting the user 48 | virtual void begin() {} 49 | /// anything happening after the processing itself starts 50 | virtual void end() {} 51 | /// the actual function that reads a line and passes it 52 | /// to Cycle 53 | virtual string_impl get_line(); 54 | /// method to check if the Input requested program termination 55 | virtual bool request_quit() = 0; 56 | protected: 57 | virtual void store_line(const string_impl& line) { 58 | _training->add_source(line); 59 | } 60 | std::istream *_input; 61 | }; 62 | 63 | /// Input from a file 64 | class FileInput : public Input { 65 | public: 66 | explicit FileInput(const std::string& fname); 67 | ~FileInput(); 68 | string_impl get_line(); 69 | inline bool request_quit() { 70 | return _input->eof(); 71 | } 72 | bool thread_suitable() { 73 | return true; 74 | } 75 | private: 76 | std::ifstream _file; 77 | std::ostream *_output, *_error; 78 | }; 79 | 80 | } // namespace Norma 81 | #endif // INTERFACE_INPUT_H_ 82 | 83 | -------------------------------------------------------------------------------- /src/interface/iobase.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef INTERFACE_IOBASE_H_ 19 | #define INTERFACE_IOBASE_H_ 20 | #include 21 | #include 22 | #include"string_impl.h" 23 | #include"training_data.h" 24 | 25 | namespace Norma { 26 | class Cycle; 27 | 28 | /// Base class for both Input and Output. 29 | /** Provides all methods that are common to both classes, 30 | * such as accessing the history. 31 | * 32 | * I'd like to have this pure virtual, but can't think 33 | * of anything to sensibly set to 0. 34 | **/ 35 | class IOBase { 36 | public: 37 | IOBase() = default; 38 | virtual ~IOBase() = default; 39 | virtual TrainingData& training_data() { 40 | return *_training; 41 | } 42 | void initialize(Cycle* c, IOBase* opp, TrainingData* data) { 43 | _cycle = c; 44 | _opposite = opp; 45 | _training = data; 46 | } 47 | void store_last() { 48 | store_line(_line); 49 | } 50 | virtual bool thread_suitable() { 51 | return false; 52 | } 53 | virtual bool request_train() { return _request_train; } 54 | 55 | protected: 56 | virtual void store_line(const string_impl& line) = 0; 57 | TrainingData* _training; 58 | Cycle* _cycle; 59 | IOBase* _opposite; 60 | string_impl _line; 61 | bool _request_train = false; 62 | }; 63 | } // namespace Norma 64 | #endif // INTERFACE_IOBASE_H_ 65 | 66 | -------------------------------------------------------------------------------- /src/interface/output.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 
9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"output.h" 19 | #include 20 | #include 21 | #include"normalizer/result.h" 22 | 23 | using std::string; 24 | 25 | namespace Norma { 26 | //////////////////////////// Output ////////////////////////////////////// 27 | 28 | Output::Output() { 29 | _output = &std::cout; 30 | } 31 | 32 | void Output::put_line(Normalizer::Result* result, 33 | bool print_prob, Normalizer::LogLevel max_level) { 34 | *_output << result->word; 35 | if (print_prob) 36 | *_output << "\t" << result->score; 37 | *_output << std::endl; 38 | log_messages(result, max_level); 39 | } 40 | 41 | void Output::log_messages(Normalizer::Result* result, 42 | Normalizer::LogLevel max_level) { 43 | while (!result->messages.empty()) { 44 | Normalizer::LogLevel level; 45 | std::string origin, message; 46 | std::tie(level, origin, message) = result->messages.front(); 47 | if (level >= max_level) 48 | *_output << "[" << Normalizer::level_string(level) << "]:" 49 | << message << " Origin: " << origin << std::endl; 50 | result->messages.pop(); 51 | } 52 | } 53 | 54 | } // namespace Norma 55 | 56 | -------------------------------------------------------------------------------- /src/interface/output.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 
4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef INTERFACE_OUTPUT_H_ 19 | #define INTERFACE_OUTPUT_H_ 20 | #include 21 | #include 22 | #include"iobase.h" 23 | #include"normalizer/result.h" 24 | 25 | namespace Norma { 26 | /// Basic Output class. Not pure virtual since it's essentially 27 | /// non-interactive output already. 28 | class Output : public IOBase { 29 | public: 30 | Output(); 31 | virtual ~Output() = default; 32 | /// put a line on the output device and record it in the history 33 | virtual void put_line(Normalizer::Result* result, 34 | bool print_prob, 35 | Normalizer::LogLevel max_level); 36 | bool thread_suitable() { 37 | return true; 38 | } 39 | 40 | protected: 41 | virtual void store_line(const string_impl& line) { 42 | _training->add_target(line); 43 | } 44 | virtual void log_messages(Normalizer::Result* result, 45 | Normalizer::LogLevel max_level); 46 | std::ostream *_output; 47 | }; 48 | 49 | } // namespace Norma 50 | #endif // INTERFACE_OUTPUT_H_ 51 | 52 | -------------------------------------------------------------------------------- /src/lexicon/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_sources(lexicon.cpp) 2 | install_headers(lexicon.h lexicon_interface.h) 3 | include_directories("${CMAKE_SOURCE_DIR}/src") 4 | add_executable(norma_lexicon lexicon_main.cpp) 
5 | target_link_libraries(norma_lexicon norma ${Boost_PROGRAM_OPTIONS_LIBRARY}) 6 | 7 | # should probably use $NORMA_BINDIR as install target, but this would require 8 | # reordering the main CMakeLists.txt (this file is included before NORMA_BINDIR 9 | # is defined) and probably making it a CACHE variable 10 | if (NOT CMAKE_INSTALL_BINDIR) 11 | set(CMAKE_INSTALL_BINDIR "bin") 12 | endif() 13 | install(TARGETS norma_lexicon DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") 14 | -------------------------------------------------------------------------------- /src/lexicon/lexicon.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMALIZER_LEXICON_H_ 19 | #define NORMALIZER_LEXICON_H_ 20 | #include 21 | #include 22 | #include 23 | #include // NOLINT[build/include_order] 24 | #include"gfsm_wrapper.h" 25 | #include"string_impl.h" 26 | #include"lexicon/lexicon_interface.h" 27 | 28 | namespace Norma { 29 | namespace Normalizer { 30 | 31 | class Lexicon : public LexiconInterface { 32 | friend class Gfsm::StringCascade; 33 | public: 34 | Lexicon() {} 35 | ~Lexicon() { 36 | if (_fsm != nullptr) 37 | delete _fsm; 38 | } 39 | 40 | void set_lexfile(const std::string& fn) { 41 | _lexfile = boost::filesystem::path(fn); 42 | } 43 | std::string get_lexfile() const { return _lexfile.string(); } 44 | void set_symfile(const std::string& fn) { 45 | _symfile = boost::filesystem::path(fn); 46 | } 47 | std::string get_symfile() const { return _symfile.string(); } 48 | 49 | /// perform (possibly time-intensive) FST optimizations 50 | void optimize(); 51 | 52 | static const string_impl SYMBOL_BOUNDARY; 53 | static const string_impl SYMBOL_ANY; 54 | static const string_impl SYMBOL_EPSILON; 55 | 56 | const Gfsm::Alphabet& get_alphabet() const; 57 | 58 | protected: 59 | Gfsm::StringAcceptor* get_acceptor() const { return _fsm; } 60 | 61 | private: 62 | boost::filesystem::path _lexfile; 63 | boost::filesystem::path _symfile; 64 | Gfsm::StringAcceptor* _fsm = nullptr; 65 | 66 | gfsmLabelVal _label_boundary; 67 | gfsmLabelVal _label_any; 68 | gfsmLabelVal _label_epsilon; 69 | Gfsm::Alphabet init_alphabet(); 70 | 71 | void do_init(); 72 | void do_clear(); 73 | void do_set_from_params(const std::map& 74 | params); 75 | void do_save_params(); 76 | bool check_contains(const string_impl& word) const; 77 | bool check_contains_partial(const string_impl& word) const; 78 | bool add_word(const string_impl& word); 79 | std::vector retrieve_all_entries() const; 80 | unsigned int get_size() const; 81 | }; 82 | 83 | } // namespace Normalizer 84 | } // namespace Norma 85 | 86 | #endif // NORMALIZER_LEXICON_H_ 87 
| -------------------------------------------------------------------------------- /src/lexicon/lexicon_interface.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMALIZER_LEXICON_INTERFACE_H_ 19 | #define NORMALIZER_LEXICON_INTERFACE_H_ 20 | #include 21 | #include 22 | #include 23 | #include"string_impl.h" 24 | 25 | namespace Norma { 26 | namespace Normalizer { 27 | 28 | class LexiconInterface { 29 | public: 30 | virtual ~LexiconInterface() {} 31 | 32 | // avoid public virtual functions 33 | // I'm blindly following here 34 | void init() { 35 | do_init(); 36 | } 37 | void init(const std::map& params) { 38 | do_set_from_params(params); 39 | do_init(); 40 | } 41 | void clear() { 42 | _entries_cache_initialized = false; 43 | _entries_cache.clear(); 44 | do_clear(); 45 | } 46 | void set_from_params(const std::map& params) { 47 | do_set_from_params(params); 48 | } 49 | void save_params() { 50 | do_save_params(); 51 | } 52 | bool contains(const string_impl& word) const { 53 | return check_contains(word); 54 | } 55 | bool contains_partial(const string_impl& word) const { 56 | return check_contains_partial(word); 57 | } 58 | void add(const string_impl& word) { 59 | 
bool added = add_word(word); 60 | if (added && _entries_cache_initialized) 61 | _entries_cache.push_back(word); 62 | } 63 | std::vector entries() const { 64 | if (!_entries_cache_initialized) { 65 | _entries_cache = retrieve_all_entries(); 66 | _entries_cache_initialized = true; 67 | } 68 | return _entries_cache; 69 | } 70 | unsigned int size() const { 71 | return get_size(); 72 | } 73 | 74 | protected: 75 | LexiconInterface() = default; 76 | 77 | private: 78 | virtual void do_init() = 0; 79 | virtual void do_clear() = 0; 80 | virtual void do_set_from_params(const std::map& 81 | params) = 0; 82 | virtual void do_save_params() = 0; 83 | virtual bool check_contains(const string_impl& word) const = 0; 84 | virtual bool check_contains_partial(const string_impl& word) const = 0; 85 | virtual bool add_word(const string_impl& word) = 0; 86 | virtual std::vector retrieve_all_entries() const = 0; 87 | virtual unsigned int get_size() const = 0; 88 | 89 | mutable std::vector _entries_cache; 90 | mutable bool _entries_cache_initialized = false; 91 | }; 92 | 93 | } // namespace Normalizer 94 | } // namespace Norma 95 | 96 | #endif // NORMALIZER_LEXICON_INTERFACE_H_ 97 | -------------------------------------------------------------------------------- /src/norma.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMA_NORMA_H_ 19 | #define NORMA_NORMA_H_ 20 | // convenience header for the entirety of norma 21 | #include"cycle.h" 22 | #include"interface.h" 23 | 24 | #endif // NORMA_NORMA_H_ 25 | 26 | -------------------------------------------------------------------------------- /src/normalizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # common headers 2 | install_headers(result.h exceptions.h base.h cacheable.h) 3 | # convenience headers 4 | install_headers(exceptions.h rulebased.h wld.h mapper.h) 5 | add_sources(result.cpp cacheable.cpp) 6 | 7 | add_subdirectory(mapper) 8 | add_subdirectory(wld) 9 | add_subdirectory(rulebased) 10 | 11 | if (WITH_PYTHON) 12 | add_subdirectory(external) 13 | install_headers(external.h) 14 | endif() 15 | 16 | set(NORMALIZER_LIBRARIES ${NORMALIZER_LIBRARIES} PARENT_SCOPE) 17 | 18 | -------------------------------------------------------------------------------- /src/normalizer/cacheable.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2016 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"cacheable.h" 19 | #include 20 | #include 21 | #include 22 | #include"normalizer/result.h" 23 | #include"string_impl.h" 24 | 25 | namespace Norma { 26 | namespace Normalizer { 27 | void Cacheable::set_caching(bool value) { 28 | _caching.store(value); 29 | if (!_caching) 30 | clear_cache(); 31 | } 32 | 33 | void Cacheable::clear_cache() const { 34 | std::unique_lock write_lock(cache_mutex); 35 | _cache.clear(); 36 | } 37 | 38 | Result Cacheable::query_cache(const string_impl& word) const { 39 | std::shared_lock read_lock(cache_mutex); 40 | // _cache has to be checked here, because if we checked from the 41 | // outside, it's possible another thread has changed our state between 42 | // the call of has_cached() and query_cache() 43 | if (_cache.count(word) > 0) 44 | return _cache.at(word); 45 | return Result(); 46 | } 47 | 48 | void Cacheable::cache(const string_impl& word, const Result& result) const { 49 | std::unique_lock write_lock(cache_mutex); 50 | _cache[word] = result; 51 | } 52 | } // namespace Normalizer 53 | } // namespace Norma 54 | 55 | -------------------------------------------------------------------------------- /src/normalizer/cacheable.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2016 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMALIZER_CACHEABLE_H_ 19 | #define NORMALIZER_CACHEABLE_H_ 20 | #include 21 | #include 22 | #include 23 | #include"normalizer/result.h" 24 | #include"string_impl.h" 25 | 26 | namespace Norma { 27 | namespace Normalizer { 28 | class Cacheable { 29 | public: 30 | void set_caching(bool value); 31 | bool is_caching() const { return _caching.load(); } 32 | void clear_cache() const; 33 | 34 | protected: 35 | Result query_cache(const string_impl& word) const; 36 | void cache(const string_impl& word, const Result& result) const; 37 | 38 | private: 39 | std::atomic_bool _caching {true}; 40 | mutable std::map _cache; 41 | mutable std::shared_timed_mutex cache_mutex; 42 | }; 43 | } // namespace Normalizer 44 | } // namespace Norma 45 | 46 | #endif // NORMALIZER_CACHEABLE_H_ 47 | 48 | -------------------------------------------------------------------------------- /src/normalizer/exceptions.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
namespace Norma {
namespace Normalizer {

/// Exception type that adds nothing to std::runtime_error except its own
/// name; it reuses the base class constructors unchanged.
/// (Presumably raised when initialization fails -- the throw sites are not
/// part of this header.)
class init_error : public std::runtime_error {
public:
    // Inherited constructors keep the accessibility they have in the base
    // class, so they are usable by callers wherever this declaration sits.
    using std::runtime_error::runtime_error;
};

}  // namespace Normalizer
}  // namespace Norma
17 | */ 18 | #ifndef NORMALIZER_EXTERNAL_H_ 19 | #define NORMALIZER_EXTERNAL_H_ 20 | #include"external/external.h" 21 | #endif // NORMALIZER_EXTERNAL_H_ 22 | -------------------------------------------------------------------------------- /src/normalizer/external/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories("${CMAKE_SOURCE_DIR}/src") 2 | add_library(External SHARED 3 | external.cpp) 4 | install(TARGETS External 5 | DESTINATION "${NORMA_DEFAULT_PLUGIN_BASE}") 6 | set(NORMALIZER_LIBRARIES ${NORMALIZER_LIBRARIES} External PARENT_SCOPE) 7 | install_headers(external.h) 8 | 9 | -------------------------------------------------------------------------------- /src/normalizer/mapper.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMALIZER_MAPPER_H_ 19 | #define NORMALIZER_MAPPER_H_ 20 | #include"mapper/mapper.h" 21 | #endif // NORMALIZER_MAPPER_H_ 22 | -------------------------------------------------------------------------------- /src/normalizer/mapper/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories("${CMAKE_SOURCE_DIR}/src") 2 | add_library(Mapper SHARED mapper.cpp) 3 | target_link_libraries(Mapper LINK_PUBLIC norma) 4 | install(TARGETS Mapper 5 | DESTINATION "${NORMA_DEFAULT_PLUGIN_BASE}") 6 | set(NORMALIZER_LIBRARIES ${NORMALIZER_LIBRARIES} Mapper PARENT_SCOPE) 7 | install_headers(mapper.h) 8 | 9 | -------------------------------------------------------------------------------- /src/normalizer/mapper/mapper.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMALIZER_MAPPER_MAPPER_H_ 19 | #define NORMALIZER_MAPPER_MAPPER_H_ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include"string_impl.h" 25 | #include"normalizer/base.h" 26 | #include"normalizer/result.h" 27 | 28 | namespace Norma { 29 | namespace Normalizer { 30 | namespace Mapper { 31 | class Mapper : public Base { 32 | public: 33 | void init(); 34 | using Base::init; 35 | void set_from_params(const std::map& params); 36 | void clear(); 37 | /// Get filename of the current mappings file 38 | const std::string& get_mapfile() const { return _mapfile; } 39 | /// Set filename for the mappings file 40 | Mapper& set_mapfile(const std::string& mapfile) { 41 | _mapfile = mapfile; 42 | return *this; 43 | } 44 | // this needs to be public because it is exposed to python bindings 45 | void do_train(const string_impl& word, const string_impl& modern, 46 | int count); 47 | 48 | protected: 49 | bool do_train(TrainingData* data); 50 | Result do_normalize(const string_impl& word) const; 51 | ResultSet do_normalize(const string_impl& word, unsigned int n) const; 52 | void do_save_params(); 53 | 54 | private: 55 | ResultSet make_all_results(const string_impl& word) const; 56 | bool write_mapfile(const std::string& fname); 57 | bool read_mapfile(const std::string& fname); 58 | 59 | std::map> _map; 60 | std::string _mapfile; 61 | }; 62 | } // namespace Mapper 63 | } // namespace Normalizer 64 | } // namespace Norma 65 | 66 | extern "C" Norma::Normalizer::Base* create_normalizer() { 67 | return new Norma::Normalizer::Mapper::Mapper; 68 | } 69 | extern "C" void destroy_normalizer(Norma::Normalizer::Base* n) { 70 | delete n; 71 | } 72 | 73 | #endif // NORMALIZER_MAPPER_H_ 74 | 75 | -------------------------------------------------------------------------------- /src/normalizer/result.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is 
part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include 19 | #include 20 | #include"result.h" 21 | 22 | namespace Norma { 23 | namespace Normalizer { 24 | LogMessage make_message(LogLevel loglevel, 25 | std::string origin, std::string message) { 26 | return std::make_tuple(loglevel, origin, message); 27 | } 28 | 29 | std::string level_string(LogLevel loglevel) { 30 | switch (loglevel) { 31 | case LogLevel::TRACE: 32 | return "TRACE"; 33 | case LogLevel::WARN: 34 | return "WARNING"; 35 | case LogLevel::ERROR: 36 | return "ERROR"; 37 | case LogLevel::SILENT: 38 | default: 39 | return "SILENT"; 40 | } 41 | } 42 | } // namespace Normalizer 43 | } // namespace Norma 44 | -------------------------------------------------------------------------------- /src/normalizer/result.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 
9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMALIZER_RESULT_H_ 19 | #define NORMALIZER_RESULT_H_ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include"string_impl.h" 26 | 27 | namespace Norma { 28 | namespace Normalizer { 29 | enum class LogLevel { 30 | TRACE = 0, 31 | WARN, 32 | ERROR, 33 | SILENT 34 | }; 35 | typedef std::tuple LogMessage; 36 | LogMessage make_message(LogLevel loglevel, 37 | std::string origin, std::string message); 38 | std::string level_string(LogLevel loglevel); 39 | 40 | struct Result { 41 | string_impl word = ""; 42 | double score = 0.0; 43 | std::string origin = ""; 44 | unsigned int priority = std::numeric_limits::max(); 45 | std::queue messages; 46 | /// this should only be set by the chooser, and is used to prevent 47 | /// subsequent normalizers from running in a best priority scenario 48 | bool is_final = false; 49 | 50 | Result() = default; 51 | Result(const string_impl& w, double s) : word(w), score(s) {} 52 | Result(const string_impl& w, double s, const std::string& c) 53 | : word(w), score(s), origin(c) {} 54 | 55 | bool operator<(const Result& that) const { 56 | return this->score < that.score; 57 | } 58 | bool operator>(const Result& that) const { 59 | return this->score > that.score; 60 | } 61 | bool operator==(const Result& that) const { 62 | return this->score == that.score 63 | && this->word == that.word; 64 | } 65 | bool operator!=(const Result& that) const { 66 | return !(*this == that); 67 | } 68 | }; 69 | 70 | typedef std::vector ResultSet; 71 | } // namespace Normalizer 72 | } // namespace Norma 73 | #endif // 
NORMALIZER_RESULT_H_ 74 | 75 | -------------------------------------------------------------------------------- /src/normalizer/rulebased.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMALIZER_RULEBASED_H_ 19 | #define NORMALIZER_RULEBASED_H_ 20 | #include"rulebased/rulebased.h" 21 | #endif // NORMALIZER_RULEBASED_H_ 22 | 23 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories("${CMAKE_SOURCE_DIR}/src") 2 | add_library(RuleBased SHARED 3 | rule.cpp rule_learn.cpp rule_collection.cpp 4 | candidate_finder.cpp rulebased.cpp) 5 | target_link_libraries(RuleBased LINK_PUBLIC norma) 6 | install(TARGETS RuleBased 7 | DESTINATION "${NORMA_DEFAULT_PLUGIN_BASE}") 8 | install_headers(candidate_finder.h rule.h rule_collection.h rule_learn.h 9 | rulebased.h symbols.h) 10 | set(NORMALIZER_LIBRARIES ${NORMALIZER_LIBRARIES} RuleBased PARENT_SCOPE) 11 | 12 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/rule_collection.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef NORMALIZER_RULEBASED_RULE_COLLECTION_H_ 19 | #define NORMALIZER_RULEBASED_RULE_COLLECTION_H_ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include"regex_impl.h" 25 | #include"rule.h" 26 | 27 | namespace Norma { 28 | namespace Normalizer { 29 | namespace Rulebased { 30 | 31 | struct RAState; 32 | 33 | /// A collection of rewrite rules as a parametrization for the 34 | /// rule-based normalizer. Encapsulates functions related to 35 | /// the retrieval, storage, and cost calculation of rewrite rules. 
36 | class RuleCollection { 37 | public: 38 | RuleCollection() { _rule_re = make_regex_impl(_rule_re_str); } 39 | void clear(); 40 | bool read_rulesfile(const std::string& fname); 41 | bool save_rulesfile(const std::string& fname); 42 | 43 | void learn_rule(Rule r, int count = 1); 44 | void learn_ruleset(const RuleSet& rs); 45 | 46 | std::vector find_applicable_rules(const string_impl& left, 47 | const string_impl& back, 48 | bool epsilon) const; 49 | 50 | int get_freq(const Rule& r) const; 51 | int get_highest_freq() const { return _highest_freq; } 52 | int get_type_count() const { return _rules.size(); } 53 | int get_instance_count() const { return _total_count; } 54 | int get_average_freq() const { 55 | if (_rules.empty()) return 0; 56 | return _total_count / _rules.size(); 57 | } 58 | 59 | private: 60 | // input/output of rules files 61 | const std::string _rule_re_str 62 | = "[^0-9]*([0-9]+)\\s+\\{(.+)->(.+)/(.)_(.)\\}.*$"; 63 | regex_impl _rule_re; 64 | std::tuple parse_line(const std::string& line); 65 | 66 | // data members 67 | std::unordered_map _rules; 68 | int _total_count = 0; // counts rule instances, not types 69 | int _highest_freq = 0; // max(#instances) 70 | }; 71 | 72 | } // namespace Rulebased 73 | } // namespace Normalizer 74 | } // namespace Norma 75 | 76 | #endif // NORMALIZER_RULEBASED_RULE_COLLECTION_H_ 77 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/rule_learn.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 
9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"rule_learn.h" 19 | #include 20 | #include 21 | #include 22 | 23 | namespace Norma { 24 | namespace Normalizer { 25 | namespace Rulebased { 26 | /////////////////////// local helper functions //////////////////////////////// 27 | namespace { 28 | 29 | void construct_initial_matrix(std::vector> *matrix, 30 | const string_impl& source, 31 | const string_impl& target) { 32 | int n = source.length(), 33 | m = target.length(); 34 | matrix->resize(n + 1); 35 | for (int i = 0; i <= n; ++i) 36 | (*matrix)[i].resize(m + 1); 37 | 38 | // top row 39 | RuleSet edits; 40 | for (int p = 0; p < m; ++p) { 41 | edits.add_rule(EditOp::ADD, source, 1, target, p + 1); 42 | (*matrix)[0][p + 1] = RuleSet(edits); 43 | } 44 | 45 | // left col 46 | edits = RuleSet(); 47 | for (int p = 0; p < n; ++p) { 48 | edits.add_rule(EditOp::DEL, source, p + 1, target, 1); 49 | (*matrix)[p + 1][0] = RuleSet(edits); 50 | } 51 | } 52 | } // namespace 53 | 54 | RuleSet learn_rules(const string_impl& source, const string_impl& target, 55 | bool do_merge = true, bool insert_epsilon = true) { 56 | int n = source.length(), 57 | m = target.length(); 58 | 59 | std::vector> _matrix; 60 | construct_initial_matrix(&_matrix, source, target); 61 | 62 | RuleSet edits; 63 | for (int i = 1; i <= n; ++i) 64 | for (int j = 1; j <= m; ++j) { 65 | int sc = (source[i - 1] == target[j - 1]) ? 
0 : 1, 66 | add_cost = _matrix[i][j-1].cost() + 1, 67 | del_cost = _matrix[i-1][j].cost() + 1, 68 | sub_cost = _matrix[i-1][j-1].cost() + sc; 69 | if (sub_cost <= add_cost && sub_cost <= del_cost) { 70 | edits = RuleSet(_matrix[i-1][j-1]); 71 | edits.add_rule(EditOp::SUB, source, i, target, j); 72 | } else if (del_cost <= sub_cost && del_cost <= add_cost) { 73 | edits = RuleSet(_matrix[i-1][j]); 74 | edits.add_rule(EditOp::DEL, source, i, target, j); 75 | } else if (add_cost <= sub_cost && add_cost <= del_cost) { 76 | edits = RuleSet(_matrix[i][j-1]); 77 | edits.add_rule(EditOp::ADD, source, i, target, j); 78 | } 79 | _matrix[i][j] = edits; 80 | } 81 | 82 | edits = _matrix.back().back(); 83 | 84 | if (do_merge) 85 | edits.merge_rules(); 86 | 87 | if (insert_epsilon) 88 | edits.insert_epsilon_identity(source, target); 89 | 90 | return edits; 91 | } 92 | 93 | } // namespace Rulebased 94 | } // namespace Normalizer 95 | } // namespace Norma 96 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/rule_learn.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMALIZER_RULEBASED_RULE_LEARN_H_ 19 | #define NORMALIZER_RULEBASED_RULE_LEARN_H_ 20 | #include"string_impl.h" 21 | #include"rule.h" 22 | 23 | namespace Norma { 24 | namespace Normalizer { 25 | namespace Rulebased { 26 | 27 | /// the main function of this library 28 | RuleSet learn_rules(const string_impl& source, const string_impl& target, 29 | bool do_merge, bool insert_epsilon); 30 | 31 | } // namespace Rulebased 32 | } // namespace Normalizer 33 | } // namespace Norma 34 | #endif // NORMALIZER_RULEBASED_RULE_LEARN_H_ 35 | 36 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/rulebased.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"rulebased.h" 19 | #include 20 | #include 21 | #include 22 | #include"normalizer/result.h" 23 | #include"normalizer/cacheable.h" 24 | #include"interface/iobase.h" 25 | #include"rule.h" 26 | #include"candidate_finder.h" 27 | 28 | namespace Norma { 29 | namespace Normalizer { 30 | namespace Rulebased { 31 | 32 | void Rulebased::set_from_params(const std::map& 33 | params) { 34 | if (params.count(_name + ".rulesfile") != 0) 35 | set_rulesfile(to_absolute(params.at(_name + ".rulesfile"), params)); 36 | else if (params.count("perfilemode.input") != 0) 37 | set_rulesfile(with_extension(params.at("perfilemode.input"), 38 | _name + ".rulesfile")); 39 | } 40 | 41 | void Rulebased::init() { 42 | clear(); 43 | if (!_rulesfile.empty()) 44 | _rules.read_rulesfile(_rulesfile); 45 | } 46 | 47 | void Rulebased::clear() { 48 | _rules.clear(); 49 | clear_cache(); 50 | } 51 | 52 | Result Rulebased::do_normalize(const string_impl& word) const { 53 | if (is_caching()) { 54 | Result res = query_cache(word); 55 | if (res != Result()) 56 | return res; 57 | } 58 | 59 | ResultSet resultset = do_normalize(word, 1); 60 | Result result; 61 | if (resultset.empty()) { 62 | result = Result(word, 0.0, name()); 63 | log_message(&result, LogLevel::TRACE, "no candidate found"); 64 | } else { 65 | result = resultset.front(); 66 | } 67 | if (is_caching()) 68 | cache(word, result); 69 | return result; 70 | } 71 | 72 | ResultSet Rulebased::do_normalize(const string_impl& word, 73 | unsigned int n) const { 74 | ResultSet resultset; 75 | Result unchanged_result = make_result(word, 0.0); 76 | CandidateFinder finder(word, _rules, *_lex, _name); 77 | for (unsigned int i = 0; i < n; ++i) { 78 | Result result = finder(); 79 | if (result == unchanged_result) 80 | break; 81 | resultset.push_back(result); 82 | } 83 | return resultset; 84 | } 85 | 86 | bool Rulebased::do_train(TrainingData* data) { 87 | for (auto pp = data->rbegin(); pp != data->rend(); ++pp) { 88 | if (pp->is_used()) 89 | 
break; 90 | _rules.learn_ruleset(learn_rules(pp->source(), 91 | pp->target(), 92 | true, true)); 93 | } 94 | clear_cache(); 95 | return true; 96 | } 97 | 98 | void Rulebased::do_save_params() { 99 | _rules.save_rulesfile(_rulesfile); 100 | } 101 | 102 | } // namespace Rulebased 103 | } // namespace Normalizer 104 | } // namespace Norma 105 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/rulebased.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #ifndef NORMALIZER_RULEBASED_RULEBASED_H_ 19 | #define NORMALIZER_RULEBASED_RULEBASED_H_ 20 | #include 21 | #include 22 | #include"string_impl.h" 23 | #include"normalizer/base.h" 24 | #include"normalizer/cacheable.h" 25 | #include"normalizer/result.h" 26 | #include"lexicon/lexicon.h" 27 | #include"rule_collection.h" 28 | #include"rule_learn.h" 29 | #include"candidate_finder.h" 30 | 31 | namespace Norma { 32 | namespace Normalizer { 33 | namespace Rulebased { 34 | 35 | class Rulebased : public Base, public Cacheable { 36 | public: 37 | void init(); 38 | using Base::init; 39 | void set_from_params(const std::map& params); 40 | void clear(); 41 | 42 | /// Get filename of the current rules file 43 | const std::string& get_rulesfile() const { return _rulesfile; } 44 | /// Set filename for the rules file 45 | Rulebased& set_rulesfile(const std::string& rulesfile) { 46 | _rulesfile = rulesfile; 47 | return *this; 48 | } 49 | 50 | using Cacheable::set_caching; 51 | using Cacheable::clear_cache; 52 | using Cacheable::is_caching; 53 | 54 | protected: 55 | bool do_train(TrainingData* data); 56 | Result do_normalize(const string_impl& word) const; 57 | ResultSet do_normalize(const string_impl& word, unsigned int n) const; 58 | void do_save_params(); 59 | 60 | private: 61 | std::string _rulesfile; 62 | RuleCollection _rules; 63 | }; 64 | } // namespace Rulebased 65 | } // namespace Normalizer 66 | } // namespace Norma 67 | 68 | extern "C" Norma::Normalizer::Base* create_normalizer() { 69 | return new Norma::Normalizer::Rulebased::Rulebased; 70 | } 71 | extern "C" void destroy_normalizer(Norma::Normalizer::Base* n) { 72 | delete n; 73 | } 74 | 75 | #endif // NORMALIZER_RULEBASED_RULEBASED_H_ 76 | 77 | -------------------------------------------------------------------------------- /src/normalizer/rulebased/symbols.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * 
namespace Norma {
namespace Normalizer {
namespace Rulebased {

/// Magic symbols for rule-based normalization
// --- maybe refactor this at some point
namespace Symbols {
    // Namespace-scope const objects have internal linkage, so every
    // translation unit including this header gets its own copy.
    // presumably the marker for an empty side of a rule -- TODO confirm
    const string_impl EPSILON = "E";
    // presumably the word-boundary marker -- TODO confirm
    const char_impl BOUNDARY = '#';
}

}  // namespace Rulebased
}  // namespace Normalizer
}  // namespace Norma
# Build script for the WLD normalizer library.

# Let sources include project headers relative to <source root>/src.
include_directories("${CMAKE_SOURCE_DIR}/src")

# Shared library target WLD, built from the sources in this directory.
add_library(WLD SHARED
  symbols.cpp weight_set.cpp
  levenshtein_algorithm.cpp levenshtein_aligner.cpp
  wld.cpp)

# Install the library into the directory named by NORMA_DEFAULT_PLUGIN_BASE.
install(TARGETS WLD
        DESTINATION "${NORMA_DEFAULT_PLUGIN_BASE}")

# Append WLD to the list of normalizer libraries kept in the parent scope.
set(NORMALIZER_LIBRARIES ${NORMALIZER_LIBRARIES} WLD PARENT_SCOPE)

# install_headers is not a built-in CMake command; presumably a macro
# defined elsewhere in this project -- confirm.
install_headers(levenshtein_algorithm.h levenshtein_aligner.h symbols.h
                typedefs.h weight_set.h wld.h)
namespace Norma {
namespace Normalizer {
namespace WLD {

// Declarations only; the definitions are not part of this header.
// NOTE(review): the element type of the std::vector parameters below is
// missing in this copy of the file -- presumably string_impl; restore it
// from upstream before compiling.

/// Compute an AlignmentSet for a source/target pair under the given weights.
/// The three overloads differ only in how the pair is passed: as two symbol
/// vectors, as two strings, or as a WordPair.
AlignmentSet align(const std::vector& source,
                   const std::vector& target,
                   const WeightSet& weights);
AlignmentSet align(const string_impl& from, const string_impl& to,
                   const WeightSet& weights);
AlignmentSet align(const WordPair& p, const WeightSet& weights);

/// Compute a distance value for a source/target pair under the given
/// weights (presumably the weighted Levenshtein distance, going by the
/// header name -- confirm against the implementation).  Overloads accept
/// two symbol vectors, two strings, a WordPair, or an EditPair.
double wld(const std::vector& source,
           const std::vector& target,
           const WeightSet& weights);
double wld(const string_impl& from, const string_impl& to,
           const WeightSet& weights);
double wld(const WordPair& p, const WeightSet& weights);
double wld(const EditPair& p, const WeightSet& weights);

}  // namespace WLD
}  // namespace Normalizer
}  // namespace Norma
namespace Norma {
namespace Normalizer {
namespace WLD {
// Forward declaration; weight_set.h is also included by this header.
class WeightSet;

/// Trains a WeightSet from a set of word pairs.
///
/// The aligner keeps its own copy of the WeightSet it is constructed with;
/// the training entry points below are declared here and defined elsewhere.
class LevenshteinAligner {
 public:
     /// @param ws  initial weights; copied into the aligner
     /// @param n   stored as _ngrams (default 3)
     /// @param d   stored as _divisor (default 7)
     LevenshteinAligner(const WeightSet& ws,
                        unsigned int n = 3, unsigned int d = 7)
         : _weights(ws), _ngrams(n), _divisor(d) {}

     // Performs a PMI training cycle, updating the WeightSet
     void perform_training_cycle(const TrainSet& pairs);
     /// Builds and returns a WeightSet from the given training pairs.
     WeightSet make_final_weight_set(const TrainSet& pairs);

     // Accessors.  The non-const overloads return references to the stored
     // members, so callers can modify the settings through them.
     WeightSet& weight_set() { return _weights; }
     const WeightSet& weight_set() const { return _weights; }
     double& learning_rate() { return _learning_rate; }
     const double& learning_rate() const { return _learning_rate; }
     bool& allow_pure_insertions() { return _allow_pure_insertions; }
     const bool& allow_pure_insertions() const { return _allow_pure_insertions; }
     bool& allow_identity() { return _allow_identity; }
     const bool& allow_identity() const { return _allow_identity; }
     /// Read-only access to _meandiff (there is no setter in this header).
     double meandiff() const { return _meandiff; }

 private:
     /// Per-rule statistics: a frequency count and a PMI value.
     struct RuleStats {
         int freq = 0;
         double pmi = 0.0;
     };
     // NOTE(review): the template arguments of the three map typedefs and
     // of std::tuple further down are missing in this copy of the file;
     // restore them from upstream before compiling.
     typedef std::map RuleStatsMap;
     typedef std::map, int> NgramFrequencyMap;
     typedef std::map,
                      std::set>> PairTypesMap;

     WeightSet _weights;                   ///< copy of the constructor's ws
     unsigned int _ngrams;                 ///< constructor argument n
     unsigned int _divisor;                ///< constructor argument d
     double _learning_rate = 0.2;
     double _meandiff = 0;
     bool _allow_pure_insertions = false;
     bool _allow_identity = false;

     // Internal helpers; results are written through the pointer
     // parameters.  Their definitions are not part of this header.
     void collect_unigram_frequencies(RuleStatsMap* fr,
                                      NgramFrequencyMap* fs,
                                      NgramFrequencyMap* ft,
                                      const AlignmentSet& as,
                                      int count = 1) const;
     void collect_frequencies(RuleStatsMap* fr,
                              NgramFrequencyMap* fs, PairTypesMap* pt,
                              const AlignmentSet& as, int count = 1) const;
     std::tuple calculate_pmi(RuleStatsMap* rules,
                              const NgramFrequencyMap& fs,
                              const NgramFrequencyMap& ft) const;
     double adjust_weights(const RuleStatsMap& rules,
                           std::tuple pmi);
};
}  // namespace WLD
}  // namespace Normalizer
}  // namespace Norma
17 | */ 18 | #include"symbols.h" 19 | 20 | namespace Norma { 21 | namespace Normalizer { 22 | namespace WLD { 23 | namespace Symbols { 24 | const string_impl map_to_any(const string_impl& str) { return ANY; } 25 | } // namespace Symbols 26 | } // namespace WLD 27 | } // namespace Normalizer 28 | } // namespace Norma 29 | 30 | -------------------------------------------------------------------------------- /src/normalizer/wld/symbols.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
namespace Norma {
namespace Normalizer {
namespace WLD {
/// Special symbols used by the WLD normalizer.
namespace Symbols {
// NOTE(review): both literals are empty strings in this copy of the file,
// which would make ANY and EPS indistinguishable.  Presumably each held a
// bracketed placeholder that was lost -- restore the values from upstream.
const string_impl ANY = "";
const string_impl EPS = "";
/// Returns ANY regardless of the argument (defined in symbols.cpp).
const string_impl map_to_any(const string_impl& str);
}  // namespace Symbols
}  // namespace WLD
}  // namespace Normalizer
}  // namespace Norma
namespace Norma {
namespace Normalizer {
namespace WLD {
// Shared type aliases of the WLD normalizer.
// NOTE(review): every template argument list below is missing in this copy
// of the file, so the aliases do not compile as shown and the element types
// cannot be read off here; restore them from upstream.

/// A pair describing one training word pair.
typedef std::pair WordPair;
/// Training data: a map (presumably WordPair -> count -- TODO confirm).
typedef std::map TrainSet;

/// One edit operation: a pair of two vectors.
typedef std::pair, std::vector> EditPair;
/// A sequence (presumably of EditPair -- TODO confirm).
typedef std::vector RuleSet;
/// A sequence (presumably of RuleSet -- TODO confirm).
typedef std::vector AlignmentSet;

/// A vector of vectors, used as a two-dimensional matrix.
typedef std::vector> DistanceMatrix;
}  // namespace WLD
}  // namespace Normalizer
}  // namespace Norma
namespace Norma {
namespace Normalizer {
namespace WLD {
/// A set of edit weights: four default costs plus a map of custom weights
/// and the set of input symbols in use.
// NOTE(review): the template arguments of std::set, std::vector and
// std::map are missing throughout this copy of the class; restore them
// from upstream before compiling.
class WeightSet {
 public:
     /// Reset the weight set (definition is not in this header).
     void clear();
     /// Load weights from / store weights to the named parameter file.
     /// The meaning of the bool result is not visible here; presumably
     /// true on success -- TODO confirm.
     bool read_paramfile(const std::string& fname);
     bool save_paramfile(const std::string& fname);

     /// Read-only view of the input symbol set.
     const std::set& input_symbols() const
         { return _input_symbols; }
     /// Returns a vector by value (unlike weight_map(), which returns a
     /// reference to the stored map).
     const std::vector weights() const;
     /// Read-only view of the custom weight map.
     const std::map& weight_map() const
         { return _weights; }
     /// Number of custom weights.
     size_t size() const { return _weights.size(); }
     /// True when no custom weights are stored.
     bool empty() const { return _weights.empty(); }

     // Default costs.  The non-const overloads return a reference to the
     // stored value, so they double as setters.
     double& default_identity_cost()
         { return _default_identity_cost; }
     double& default_replacement_cost()
         { return _default_replacement_cost; }
     double& default_insertion_cost()
         { return _default_insertion_cost; }
     double& default_deletion_cost()
         { return _default_deletion_cost; }
     const double& default_identity_cost() const
         { return _default_identity_cost; }
     const double& default_replacement_cost() const
         { return _default_replacement_cost; }
     const double& default_insertion_cost() const
         { return _default_insertion_cost; }
     const double& default_deletion_cost() const
         { return _default_deletion_cost; }

     /// Take over settings from another WeightSet (presumably the four
     /// default costs, going by the name -- TODO confirm).
     void copy_defaults(const WeightSet& ws);
     /// Register a weight for an edit, given either as two strings or as
     /// an EditPair.
     void add_weight(const string_impl& from, const string_impl& to,
                     double weight);
     void add_weight(const EditPair& edit, double weight);
     /// Look up the weight of an edit, given either as two strings or as
     /// an EditPair.
     double get_weight(const string_impl& from, const string_impl& to) const;
     double get_weight(const EditPair& pair) const;
     /// Divide weights by the given divisor (definition not visible here).
     void divide_all(double divisor);

 private:
     // Initial default costs: identity is free, every other edit costs 1.
     double _default_identity_cost = 0.0,
            _default_replacement_cost = 1.0,
            _default_insertion_cost = 1.0,
            _default_deletion_cost = 1.0;

     /// Custom weights
     std::map _weights;

     /// Set of all used input symbols
     std::set _input_symbols;

     // Private helpers; definitions are not part of this header.
     double calculate_wld(const EditPair& pair) const;
     static EditPair make_editpair(const string_impl& from,
                                   const string_impl& to);
};
}  // namespace WLD
}  // namespace Normalizer
}  // namespace Norma
###   --see (the reference link is missing in this copy of the file)
# Drop the "lib" prefix and name the module "norma" so that the file name
# matches the name used in BOOST_PYTHON_MODULE.
set_target_properties(norma-python PROPERTIES PREFIX ""
                                              OUTPUT_NAME "norma"
                                              LIBRARY_OUTPUT_DIRECTORY "norma/")
set(NORMA_PYTHON_LIBRARIES norma ${PYTHON_LIBRARIES} ${Boost_PYTHON_LIBRARY})
target_link_libraries(norma-python LINK_PUBLIC ${NORMA_PYTHON_LIBRARIES})

### let python tell us what the appropriate directory for site packages is
# print(...) with a single argument behaves the same under Python 2 and
# Python 3; the former `print x` statement is a syntax error on Python 3 and
# left PYTHON_SITE_PACKAGES empty there.
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(prefix='${DESTINATION}'))"
                OUTPUT_VARIABLE PYTHON_SITE_PACKAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
set(NORMA_PYTHON_INSTALL_DIR ${PYTHON_SITE_PACKAGES}/norma)
install(TARGETS norma-python DESTINATION ${NORMA_PYTHON_INSTALL_DIR})

### setup.py isn't currently used, but let's leave it in here in case we ever
### decide to do this via setuptools
#configure_file(setup.py.in setup.py)

# add_normalizer_bindings(<LibName>)
#   Builds the Python module for one normalizer from
#   normalizer/<libname>.cpp (lower-cased library name), links it against
#   the normalizer library and the common Python libraries, and installs it
#   next to the main norma module.
macro(add_normalizer_bindings _NORM_LIB_NAME)
  string(TOLOWER ${_NORM_LIB_NAME} _NORM_NAME)
  add_library("${_NORM_NAME}-python" MODULE normalizer/${_NORM_NAME}.cpp)
  set_target_properties("${_NORM_NAME}-python"
                        PROPERTIES PREFIX ""
                        OUTPUT_NAME ${_NORM_NAME}
                        INSTALL_RPATH ${NORMA_DEFAULT_PLUGIN_BASE}
                        LIBRARY_OUTPUT_DIRECTORY "norma/")
  target_link_libraries("${_NORM_NAME}-python" LINK_PUBLIC ${_NORM_LIB_NAME} ${NORMA_PYTHON_LIBRARIES})
  install(TARGETS "${_NORM_NAME}-python" DESTINATION ${NORMA_PYTHON_INSTALL_DIR})
endmacro(add_normalizer_bindings)

add_normalizer_bindings(RuleBased)
add_normalizer_bindings(WLD)
add_normalizer_bindings(Mapper)

### native Python files that complement the bindings
add_subdirectory(norma)
Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"exception_wrapper.h" 19 | #include 20 | #include //NOLINT[build/include_order] 21 | #include"normalizer/exceptions.h" 22 | 23 | namespace bp = boost::python; 24 | using Norma::Normalizer::init_error; 25 | 26 | namespace Norma { 27 | namespace Python { 28 | 29 | PyObject *PyExc_Norma_InitError = NULL; 30 | 31 | void translate_norma_exception(const init_error& e) { 32 | assert(PyExc_Norma_InitError != NULL); 33 | PyErr_SetString(PyExc_Norma_InitError, e.what()); 34 | } 35 | 36 | void register_exception_translators() { 37 | bp::class_ Norma_InitError 38 | ("NormaInitError", bp::init()); 39 | PyExc_Norma_InitError = Norma_InitError.ptr(); 40 | bp::register_exception_translator(&translate_norma_exception); 41 | } 42 | 43 | } // namespace Python 44 | } // namespace Norma 45 | -------------------------------------------------------------------------------- /src/python/exception_wrapper.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 
namespace Norma {
namespace Python {
/// Translate a C++ init_error into a Python exception: the definition in
/// exception_wrapper.cpp sets the Python error indicator with e.what() as
/// the message.
void translate_norma_exception(const Norma::Normalizer::init_error& e);

/// Expose init_error to Python under the name "NormaInitError" and register
/// translate_norma_exception() as its translator.  Called once while the
/// norma Python module is initialised.
void register_exception_translators();
}  // namespace Python
}  // namespace Norma
namespace Norma {
namespace Normalizer {
// Forward declaration: this header only passes Lexicon around by pointer.
class Lexicon;
}  // namespace Normalizer

namespace Python {
/// Static helpers used to expose Normalizer::Lexicon to Python.
/// All three members are defined outside this header.
struct lexicon_wrapper {
    /// Helper taking a lexicon plus a lexicon-file and a symbol-file name
    /// (presumably initialises the lexicon from those files -- TODO
    /// confirm against the definition).
    static void init(Norma::Normalizer::Lexicon* l,
                     std::string lexfile, std::string symfile);
    /// Counterpart of init() with the same parameters (presumably writes
    /// the lexicon to those files -- TODO confirm).
    static void save(Norma::Normalizer::Lexicon* l,
                     std::string lexfile, std::string symfile);
    /// Registers the Lexicon bindings; called from the norma module's
    /// BOOST_PYTHON_MODULE body.
    static void wrap();
};
}  // namespace Python
}  // namespace Norma
namespace bp = boost::python;

namespace Norma {
namespace Python {

/// Entry point of the Python extension module "norma".
///
/// Registers the exception translators and type converters first, then the
/// StringVector container class, and finally the Result and Lexicon
/// bindings.
BOOST_PYTHON_MODULE(norma) {
    register_exception_translators();
    register_string_impl_converters();
    register_result_converters();
    register_training_converters();

    // Without the "true" template parameter, conversion of string_impl
    // will fail, and nobody knows why except for an obscure mailing
    // list post:
    // NOTE(review): the link that followed the comment above and the
    // template arguments of bp::class_ / bp::vector_indexing_suite are
    // missing in this copy of the file; restore them from upstream before
    // compiling.
    bp::class_>("StringVector")
        .def(bp::vector_indexing_suite, true>());

    result_wrapper::wrap();   // Result
    lexicon_wrapper::wrap();  // Lexicon
}

}  // namespace Python
}  // namespace Norma
from norma import Lexicon as CppLexicon


# Thin Python layer on top of the C++ Lexicon binding: the helpers below are
# easier to write here than on the C++ side.
class Lexicon(CppLexicon):
    """Collection of wordforms backed by the C++ lexicon.

    Some normalizers consult a Lexicon to limit their normalization
    candidates to words that the lexicon contains.

    Only a small part of Python's container protocol is available:
      * len(lexicon)
      * membership tests (``word in lexicon``)
      * iteration (``for word in lexicon``)

    For anything beyond that, copy the entries into a regular Python
    container first, e.g. ``set(lexicon.entries)`` or
    ``[w for w in lexicon.entries]``.
    """

    def __init__(self, lexfile=None, symfile=None):
        """Create the lexicon and initialize it.

        Keyword arguments:
        lexfile -- Name of the lexicon file
        symfile -- Name of the symbols file

        A file name that is left as None is not assigned, so the
        corresponding attribute keeps whatever value the base class set.
        """
        super(Lexicon, self).__init__()
        requested = (("lexfile", lexfile), ("symfile", symfile))
        for attribute, filename in requested:
            if filename is not None:
                setattr(self, attribute, filename)
        self.init()

    def __iter__(self):
        """Iterate over the lexicon's entries."""
        return iter(self.entries)

    def add(self, *args):
        """Add every supplied argument to the lexicon as a new entry."""
        parent = super(Lexicon, self)
        for entry in args:
            parent.add(entry)

    def extend(self, args):
        """Add all entries of an iterable to the lexicon.

        extend(['foo', 'bar']) has the same effect as add('foo', 'bar').
        """
        self.add(*args)
19 | ################################################################################ 20 | 21 | from norma import NormaInitError, Result 22 | from NormaCfgParser import NormaCfgParser 23 | -------------------------------------------------------------------------------- /src/python/normalizer/mapper.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
namespace bp = boost::python;

namespace Norma {
namespace Python {
/// Python bindings for the Mapper normalizer, exposed to Python as the
/// class "MapperNormalizer".
// NOTE(review): template arguments are missing in this copy of the file
// (on normalizer_wrapper, base_wrapper::make_class and
// bp::return_value_policy); restore them from upstream before compiling.
struct mapper : normalizer_wrapper {
    /// Register the MapperNormalizer class with its methods and the
    /// "mapfile" property.
    static void wrap_mapper() {
        using Norma::Normalizer::Mapper::Mapper;
        // Pick the (string, string, int) overload of Mapper::do_train
        // explicitly so it can be bound as the Python method "train".
        void (Mapper::*mapper_train)(const string_impl&,
                                     const string_impl&,
                                     int) = &Mapper::do_train;
        // Docstring settings for the definitions below: user docstrings
        // and Python signatures on, C++ signatures off (argument order as
        // in Boost.Python's docstring_options).
        bp::docstring_options local_docstring_options(true, true, false);

        base_wrapper::make_class("MapperNormalizer")
            .def("init", &init_nolex,
                 "Initialize the normalizer.\n\n"
                 "Initializes the normalizer with data from the supplied "
                 "parameter file.\n\n"
                 "Arguments:\n"
                 " file -- Name of the parameter file"
                 )
            // with_custodian_and_ward<1, 3>: argument 3 is kept alive at
            // least as long as argument 1 (the normalizer object itself).
            .def("init", &init,
                 bp::with_custodian_and_ward<1, 3>(),
                 "Initialize the normalizer.\n\n"
                 "This overload is not useful for this normalizer -- it only "
                 "exists for compatibility reasons."
                 )
            .def("save", &save,
                 "Save the normalizer state to a file.\n\n"
                 "Saves the current state of the normalizer to the given "
                 "parameter file.\n\n"
                 "Arguments:\n"
                 " file -- Name of the file to save to"
                 )
            .def("train", mapper_train,
                 "Train the normalizer on a single word pair.\n\n"
                 "Arguments:\n"
                 " source -- Input wordform\n"
                 " target -- Normalized (target) wordform\n"
                 " n -- Number of times this word pair has been seen\n\n"
                 "Calling train(source, target, n) is functionally identical "
                 "to calling train(n * [(source, target)])."
                 )
            // Read/write property backed by Mapper::get_mapfile and
            // Mapper::set_mapfile.
            .add_property("mapfile",
                          bp::make_function(&Mapper::get_mapfile,
                                            bp::return_value_policy()),
                          bp::make_function(&Mapper::set_mapfile,
                                            bp::return_self<>()),
                          "Name of a parameter file containing the mappings.");
    }
};

/// Entry point of the Python extension module "mapper".
BOOST_PYTHON_MODULE(mapper) {
    mapper::wrap_mapper();
}
}  // namespace Python
}  // namespace Norma
// Boost.Python binding for the Rulebased normalizer.
//
// wrap_rulebased() registers a Python class named "RulebasedNormalizer"
// exposing init (two overloads), save, clear_cache, and the "caching" and
// "rulesfile" properties. init_nolex, init and save are not defined in this
// struct; presumably they come from normalizer_wrapper -- confirm in
// normalizer_wrapper.h.
struct rulebased : normalizer_wrapper {
    static void wrap_rulebased() {
        using Norma::Normalizer::Rulebased::Rulebased;
        // Flags are (user-defined docstrings, Python signatures,
        // C++ signatures): show the first two, hide the C++ signatures.
        bp::docstring_options local_docstring_options(true, true, false);

        base_wrapper::make_class("RulebasedNormalizer")
            // init(file)
            .def("init", &init_nolex,
                 "Initialize the normalizer.\n\n"
                 "Initializes the normalizer with data from the supplied "
                 "parameter file.\n\n"
                 "Arguments:\n"
                 " file -- Name of the parameter file"
            )
            // init(file, lex). The call policy keeps argument 3 (the lexicon,
            // per the docstring below) alive for as long as argument 1
            // (self) exists.
            .def("init", &init,
                 bp::with_custodian_and_ward<1, 3>(),
                 "Initialize the normalizer.\n\n"
                 "Initializes the normalizer with data from the supplied "
                 "parameter file and a lexicon.\n\n"
                 "Arguments:\n"
                 " file -- Name of the parameter file\n"
                 " lex -- A Lexicon object"
            )
            .def("save", &save,
                 "Save the normalizer state to a file.\n\n"
                 "Saves the current state of the normalizer to the given "
                 "parameter file.\n\n"
                 "Arguments:\n"
                 " file -- Name of the file to save to"
            )
            .def("clear_cache", &Rulebased::clear_cache,
                 "Clear the internal cache."
            )
            // Boolean property backed directly by is_caching/set_caching.
            .add_property("caching",
                 &Rulebased::is_caching, &Rulebased::set_caching,
                 "Whether to cache normalization results.\n\n"
                 "Caching increases performance at the cost of "
                 "higher memory usage. The cache is only used when "
                 "a single best normalization candidate is requested, "
                 "never when determining the n-best candidates. "
                 "It is recommended to always keep this set to True."
            )
            // Getter/setter pair for the rules file name; the setter returns
            // self so that assignments can be chained on the Python side.
            .add_property("rulesfile",
                 bp::make_function(&Rulebased::get_rulesfile,
                                   bp::return_value_policy()),
                 bp::make_function(&Rulebased::set_rulesfile,
                                   bp::return_self<>()),
                 "Name of a parameter file containing the rules.")
        ;  // NOLINT[whitespace/semicolon]
    }
};
// Declarations of the Boost.Python converters and wrapper for normalization
// results. Only declarations live here; the definitions are in another
// translation unit (presumably result_conv.cpp -- confirm).
namespace Norma {
namespace Python {
// Short aliases for the Boost.Python rvalue-converter plumbing types used in
// the construct() signatures below.
typedef boost::python::converter::rvalue_from_python_stage1_data
        py_stage1_data;
typedef boost::python::converter::rvalue_from_python_storage
        py_storage;

// Queue type handled by LogMessageQueue_to_python_list.
typedef std::queue
        LogMessageQueue;

// From-Python converter pair: convertible() reports whether obj_ptr can be
// converted (a null return conventionally means "no"), construct() builds the
// C++ object in the storage referenced by data. Judging by the name, the
// accepted Python object is a tuple and the product a Result -- confirm in
// the definition.
struct Result_from_python_tuple {
    static void* convertible(PyObject* obj_ptr);
    static void construct(PyObject* obj_ptr,
                          py_stage1_data* data);
};

// To-Python converter: turns a ResultSet into a Python object (a list,
// going by the name).
struct ResultSet_to_python_list {
    static PyObject* convert(Norma::Normalizer::ResultSet const& rs);
};

// To-Python converter: turns a LogMessageQueue into a Python object (a list,
// going by the name).
struct LogMessageQueue_to_python_list {
    static PyObject* convert(LogMessageQueue const& lmq);
};

// Registers the converters declared above with Boost.Python.
// NOTE(review): exactly which converters are registered is decided in the
// definition, which is not part of this header.
void register_result_converters();

// Exposes Norma::Normalizer::Result itself to Python: repr() yields a string
// representation of a Result, wrap() performs the class registration.
struct result_wrapper {
    static string_impl repr(const Norma::Normalizer::Result& result);
    static void wrap();
};
}  // namespace Python
}  // namespace Norma
10 | package_data = { 11 | 'norma': ['*.so'] 12 | }, 13 | 14 | author = "Marcel Bollmann", 15 | author_email = "bollmann@linguistics.rub.de", 16 | url = "http://www.linguistics.rub.de/clhist/resources/norma/" 17 | ) 18 | -------------------------------------------------------------------------------- /src/python/string_impl_conv.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
// Declarations of the Boost.Python converters for string_impl. Which set of
// converters is declared depends on whether string_impl is ICU-backed
// (USE_ICU_STRING defined) or not.
namespace Norma {
namespace Python {

// Short aliases for the Boost.Python rvalue-converter plumbing types used in
// the construct() signatures below.
typedef boost::python::converter::rvalue_from_python_stage1_data
        py_stage1_data;
typedef boost::python::converter::rvalue_from_python_storage
        py_storage;

#ifdef USE_ICU_STRING

// To-Python converter for ICU-backed string_impl values.
struct ICUString_to_python_str {
    static PyObject* convert(string_impl const& s);
};

// From-Python converter pair for ICU-backed string_impl: convertible()
// reports whether obj_ptr is acceptable, construct() builds the string in
// the storage referenced by data.
struct ICUString_from_python_str {
    static void* convertible(PyObject* obj_ptr);
    static void construct(PyObject* obj_ptr,
                          py_stage1_data* data);
};

#else  // USE_ICU_STRING

// From-Python converter pair used when string_impl is not ICU-backed.
// Going by the name it accepts Python unicode objects -- confirm in the
// definition. No to-Python converter is declared for this configuration.
struct STDString_from_python_unicode {
    static void* convertible(PyObject* obj_ptr);
    static void construct(PyObject* obj_ptr,
                          py_stage1_data* data);
};

#endif  // USE_ICU_STRING

// Registers the string_impl converters for the active configuration.
// training_conv.h asks for this to be called before
// register_training_converters().
void register_string_impl_converters();

}  // namespace Python
}  // namespace Norma
14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | // ---------------------------------------------------------------------------- 19 | // This will register converters with Boost::Python that convert 20 | // TrainingPair and TrainingData to and from native Python objects. 21 | // ---------------------------------------------------------------------------- 22 | #include"training_conv.h" 23 | #include //NOLINT[build/include_order] 24 | #include"string_impl.h" 25 | #include"training_data.h" 26 | 27 | namespace bp = boost::python; 28 | 29 | namespace Norma { 30 | namespace Python { 31 | PyObject* 32 | TrainingPair_to_python_tuple::convert(Norma::TrainingPair const& pair) { 33 | return bp::incref(bp::make_tuple(pair.source(), pair.target()).ptr()); 34 | } 35 | 36 | void* TrainingData_from_python_list::convertible(PyObject* obj_ptr) { 37 | // can only convert lists 38 | if (!PyList_Check(obj_ptr)) 39 | return 0; 40 | Py_ssize_t size = PyList_Size(obj_ptr); 41 | for (Py_ssize_t i = 0; i < size; ++i) { 42 | // each list member must be a tuple 43 | PyObject* member = PyList_GetItem(obj_ptr, i); 44 | if (PyTuple_Check(member) && PyTuple_Size(member) == 2) { 45 | // each tuple must consist of exactly two strings 46 | PyObject* source = PyTuple_GetItem(member, 0); 47 | PyObject* target = PyTuple_GetItem(member, 1); 48 | if (!(PyString_Check(source) || PyUnicode_Check(source)) 49 | || !(PyString_Check(target) || PyUnicode_Check(target))) 50 | return 0; 51 | } else { 52 | return 0; 53 | } 54 | } 55 | return obj_ptr; 56 | } 57 | 58 | void TrainingData_from_python_list::construct(PyObject* obj_ptr, 59 | py_stage1_data* data) { 60 | // Grab pointer to memory into which to construct the new object 61 | void* storage = reinterpret_cast(data)->storage.bytes; 62 | 63 | // Extract data from the python list 64 | Norma::TrainingData* training = new (storage) Norma::TrainingData(); 65 | Py_ssize_t size 
// Registers the training-data converters with Boost.Python:
//  - a to-Python converter (see TrainingPair_to_python_tuple::convert, which
//    produces a (source, target) tuple), and
//  - the from-Python rvalue converter formed by
//    TrainingData_from_python_list::convertible / ::construct.
// The string_impl converters must already be registered when this runs (see
// the note in training_conv.h), since construct() extracts string_impl
// values from the tuple members.
void register_training_converters() {
    bp::to_python_converter();
    bp::converter::registry::push_back(
        &TrainingData_from_python_list::convertible,
        &TrainingData_from_python_list::construct,
        bp::type_id());
}
// Declarations of the Boost.Python converters for training data
// (Norma::TrainingPair / Norma::TrainingData); definitions are in
// training_conv.cpp.
namespace Norma {
namespace Python {
// Short aliases for the Boost.Python rvalue-converter plumbing types used in
// the construct() signature below.
typedef boost::python::converter::rvalue_from_python_stage1_data
        py_stage1_data;
typedef boost::python::converter::rvalue_from_python_storage
        py_storage;

// To-Python converter: turns a TrainingPair into a Python tuple of
// (source, target).
struct TrainingPair_to_python_tuple {
    static PyObject* convert(Norma::TrainingPair const& pair);
};

// From-Python converter pair for TrainingData.
// convertible() accepts only a list of 2-tuples of strings and returns a
// null pointer for anything else; construct() builds a TrainingData in the
// storage referenced by data, adding one pair per list element.
struct TrainingData_from_python_list {
    static void* convertible(PyObject* obj_ptr);
    static void construct(PyObject* obj_ptr,
                          py_stage1_data* data);
};

// register string_impl converters before calling this!
void register_training_converters();
}  // namespace Python
}  // namespace Norma
// Regex abstraction that follows the string implementation in use.
//
// Both branches provide the same three names so that client code does not
// need its own #ifdefs:
//   regex_impl        -- the compiled-regex type
//   REGEX_IMPL_MATCH  -- the full-match function to call with it
//   make_regex_impl() -- builds a regex_impl from a std::string pattern
#ifdef USE_ICU_STRING
#include // NOLINT[build/include_order]

// Unicode-aware Boost.Regex types for ICU-backed strings.
typedef boost::u32regex regex_impl;
#define REGEX_IMPL_MATCH boost::u32regex_match

// Compiles regex_str into a u32regex. The pattern is handed over as a C
// string, so it ends at the first NUL character.
inline regex_impl make_regex_impl(const std::string& regex_str) {
    return boost::make_u32regex(regex_str.c_str());
}

#else  // USE_ICU_STRING

// Plain Boost.Regex types when ICU strings are not in use.
typedef boost::regex regex_impl;
#define REGEX_IMPL_MATCH boost::regex_match

// Compiles regex_str into a boost::regex.
inline regex_impl make_regex_impl(const std::string& regex_str) {
    return boost::regex(regex_str);
}

#endif  // USE_ICU_STRING
/// Install the consumer callback and start the consumer task.
/**
 * Stores the callback and launches consume() with std::launch::async; the
 * resulting future is kept in output_done, which finish() waits on.
 **/
template
void ResultsQueue::set_consumer(std::function consumer) {
    _consumer = consumer;
    output_done = std::async(std::launch::async,
                             &ResultsQueue::consume, this);
}

/// Queue one unit of work: run producer(line) and hand its result to the
/// consumer.
/**
 * Waits until the number of in-flight producers is no larger than
 * _max_threads, then pushes a std::async task (launched with _policy) onto
 * the results queue and notifies the consumer.
 *
 * NOTE(review): the wait is a busy loop without sleep or yield, so the
 * calling thread spins at full speed while the queue is saturated.
 * NOTE(review): because the test is "<=", a producer is still admitted when
 * the count equals _max_threads, i.e. up to _max_threads + 1 can be in
 * flight.
 **/
template
void ResultsQueue::add_producer(std::function producer,
                                   const string_impl line) {
    // TODO(fpetran) set a timeout here maybe in case producer malfunctions?
    for (; ;) {
        if (num_threads.load() <= _max_threads)
            break;
    }
    // The queue itself is only touched while holding _mutex.
    std::unique_lock consumer_lock(_mutex);
    results.push(std::async(_policy, producer, line));
    ++num_threads;
    if (!output_ready)
        output_ready = true;
    consumer_condition.notify_one();
}

/// Consumer loop: drain finished results in submission order.
/**
 * Runs until workers_done is set (see finish()). Each round takes _mutex,
 * waits for output_ready, then pops every queued future in FIFO order,
 * blocking on get() and passing the value to _consumer.
 *
 * NOTE(review): _mutex stays locked while get() blocks and while _consumer
 * runs, so add_producer() cannot enqueue during that time.
 * NOTE(review): in the code shown here output_ready is only ever set to
 * true; once set, the wait below no longer blocks and this loop keeps
 * re-acquiring the mutex until workers_done becomes true.
 *
 * @return always true
 **/
template bool ResultsQueue::consume() {
    do {
        std::unique_lock consumer_lock(_mutex);
        // this loop prevents spurious wakeup
        // (the predicate form of wait() already re-checks the condition, so
        // the surrounding while is an extra safeguard)
        while (!output_ready)
            consumer_condition.wait(consumer_lock,
                                    [this]{ return output_ready.load(); });
        while (!results.empty()) {
            R result = results.front().get();
            _consumer(result);
            results.pop();
            --num_threads;
        }
    } while (!workers_done);
    return true;
}
9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef RESULTS_QUEUE_H_ 19 | #define RESULTS_QUEUE_H_ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include"string_impl.h" 26 | 27 | namespace Norma { 28 | /// a multi producer-single consumer queue 29 | /** 30 | * policy only relates to the producer threads, the consumer is always async. 31 | * if max_threads is 0, it will be set to hardware_concurrency (the number of 32 | * threads that can physically be executed in parallel). 33 | **/ 34 | template class ResultsQueue { 35 | public: 36 | ResultsQueue() { init(); } 37 | explicit ResultsQueue(unsigned max_threads) { init(max_threads); } 38 | explicit ResultsQueue(std::launch policy) { init(0, policy); } 39 | ResultsQueue(unsigned max_threads, std::launch policy) { 40 | init(max_threads, policy); 41 | } 42 | void set_consumer(std::function consumer); 43 | void add_producer(std::function producer, 44 | const string_impl line); 45 | /// consume remaining results and wait for the consumer to be done 46 | void finish() { 47 | output_ready = true; 48 | workers_done = true; 49 | consumer_condition.notify_all(); 50 | output_done.wait(); 51 | } 52 | 53 | private: 54 | std::launch _policy = std::launch::async|std::launch::deferred; 55 | unsigned _max_threads; 56 | std::atomic num_threads{0}; 57 | std::mutex _mutex; 58 | std::function _consumer; 59 | std::queue> results; 60 | std::future output_done; 61 | std::atomic output_ready{false}, workers_done{false}; 62 | std::condition_variable consumer_condition; 63 | 64 | void init(unsigned max_threads = 0, 65 | std::launch policy = 
std::launch::async|std::launch::deferred) { 66 | if (max_threads == 0) 67 | _max_threads = std::thread::hardware_concurrency() * 2; 68 | _policy = policy; 69 | } 70 | bool consume(); 71 | }; 72 | } // namespace Norma 73 | 74 | #include"results_queue-inl.h" 75 | #endif // NORMA_RESULTS_QUEUE_H_ 76 | 77 | -------------------------------------------------------------------------------- /src/string_impl.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"string_impl.h" 19 | 20 | #ifdef USE_ICU_STRING 21 | #include 22 | #include 23 | #include 24 | 25 | const char* to_cstr(const string_impl& str) { 26 | // one problem with this function: 27 | // it returns a char ptr, so if i call it 28 | // twice, both char ptr will have the value 29 | // of the second call. might be fixable by 30 | // making the ptr non static, but idk what 31 | // that will do to performance. 
32 | thread_local char out[256]; 33 | out[str.extract(0, 99, out)] = 0; 34 | return out; 35 | } 36 | 37 | std::istream& operator>>(std::istream& strm, string_impl& val) { 38 | std::string str; 39 | strm >> str; 40 | string_impl conv(str.c_str()); 41 | val = conv; 42 | return strm; 43 | } 44 | 45 | std::ostream& operator<<(std::ostream& strm, const string_impl& ustr) { 46 | std::string str = to_cstr(ustr); 47 | strm << str; 48 | return strm; 49 | } 50 | 51 | #endif // USE_ICU_STRING 52 | 53 | void extract_tail(const string_impl& str, string_size len, string_impl* out) { 54 | if (len >= str.length()) { 55 | *out = str; 56 | } else { 57 | extract(str, str.length()-len, str.length(), out); 58 | } 59 | } 60 | 61 | bool has_alpha(const string_impl& str) { 62 | for (string_size i = 0; i < str.length(); ++i) 63 | if (check_if_alpha(str[i])) 64 | return true; 65 | return false; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /src/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # this is a horrible, horrible hack: cmake doesn't allow adding dependencies 2 | # to built-in targets like "test", but at the same time, add_test doesn't add 3 | # a dependency on the executable for the test target. so the only way to have 4 | # make test depend on the tests without having tests in all is to add a "test" 5 | # that builds the tests and make all other tests depend on it. 6 | # cf. 
# add_complete_test(<binary> <source> <testname> [<extra libs>...])
#
# Declares one Boost.Test executable and wires it into both the "buildtests"
# target and CTest:
#   binary   -- name of the executable target to create
#   source   -- source file the executable is built from
#   testname -- name under which the test is registered with CTest
#   ARGN     -- any further arguments are passed on as additional link
#               libraries
macro(add_complete_test binary source testname)
    add_executable(${binary} ${source})
    # building "buildtests" builds every test binary
    add_dependencies(buildtests ${binary})
    target_link_libraries(${binary} norma ${Boost_UNIT_TEST_FRAMEWORK} ${ARGN})
    add_test(${testname} ${binary})
    # run only after the pseudo-test that builds the test code (see the
    # explanation at the top of this file)
    set_tests_properties(${testname} PROPERTIES DEPENDS ctest_build_test_code)
    if (WITH_COVERAGE)
        setup_target_for_coverage(${binary})
    endif(WITH_COVERAGE)
endmacro()
# Stub script for the External normalizer tests. Every function returns a
# fixed value; the constants returned by do_normalize and do_normalize_nbest
# are the ones tests/normalizer/external_test.cpp checks for.

def do_setup():
    """Setup hook; does nothing in this stub."""
    return

def do_teardown():
    """Teardown hook; does nothing in this stub."""
    return

def do_normalize(word):
    """Return a fixed (word, score) pair, ignoring the input word."""
    return ("foobar", 0.85)

def do_normalize_nbest(word, n):
    """Return a fixed two-element list of (word, score) pairs.

    Both `word` and `n` are ignored: the list always has two entries.
    """
    return [ ("foo", 0.8), ("bar", 0.2) ]

def do_train():
    """Training hook; takes no arguments here and always reports True."""
    return True

def do_save():
    """Save hook; does nothing in this stub."""
    return
4 | 5 |       6 |          -------------------------------------------------------------------------------- /src/tests/data/test-lexicon.gfsa.old: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comphist/norma/2459c2bb29c9f1daad38a6d7d15e28930782e2a8/src/tests/data/test-lexicon.gfsa.old -------------------------------------------------------------------------------- /src/tests/data/test-lexicon.lab: -------------------------------------------------------------------------------- 1 | <#> 1 2 | 2 3 | 3 4 | a 18 5 | b 17 6 | c 15 7 | d 10 8 | e 4 9 | f 13 10 | h 16 11 | i 5 12 | n 6 13 | r 11 14 | s 7 15 | t 19 16 | u 20 17 | v 12 18 | w 9 19 | z 8 20 | ü 14 21 | -------------------------------------------------------------------------------- /src/tests/data/test-lexicon.lab.old: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | a 3 4 | c 4 5 | b 5 6 | e 6 7 | d 7 8 | f 8 9 | i 9 10 | h 10 11 | n 11 12 | s 12 13 | r 13 14 | u 14 15 | t 15 16 | w 16 17 | v 17 18 | z 18 19 | ü 19 20 | -------------------------------------------------------------------------------- /src/tests/data/test-lexicon.txt: -------------------------------------------------------------------------------- 1 | eins 2 | zwei 3 | drei 4 | vier 5 | fünf 6 | sechs 7 | sieben 8 | acht 9 | neun 10 | zehn 11 | nein 12 | zweitens 13 | -------------------------------------------------------------------------------- /src/tests/data/test-mapfile-malformed.txt: -------------------------------------------------------------------------------- 1 | vndund 25 2 | 10 vnnd und 3 | 20 jn ihn 4 | jnin 75 5 | jn inne inne inne 5 6 | jn inne 21 inne inne 7 | -------------------------------------------------------------------------------- /src/tests/data/test-mapfile.txt: -------------------------------------------------------------------------------- 1 | vnd und 25 2 | vnnd und 10 3 | jn ihn 20 4 | jn in 75 5 
# Unit tests for the individual normalizers.
# add_complete_test(<binary> <source> <testname> [<extra libs>...]) is defined
# in the parent tests/CMakeLists.txt.
add_complete_test(lexicon_test lexicon_test.cpp Normalizer_Lexicon)
add_complete_test(mapper_test mapper.cpp Normalizer_Mapper Mapper pthread)
# The source is named with its extension like every other test here, instead
# of relying on CMake guessing it (deprecated by policy CMP0115).
# NOTE(review): assumes the file is rulebased_test.cpp, matching its
# siblings -- confirm against the directory listing.
add_complete_test(rulebased_test rulebased_test.cpp Normalizer_Rulebased RuleBased pthread ${Boost_REGEX_LIBRARY})
add_complete_test(wld_test wld_test.cpp Normalizer_WLD WLD pthread)
# The External normalizer test links against the Python libraries, so it is
# only built when Python support is enabled.
if(WITH_PYTHON)
    add_complete_test(external_test external_test.cpp Normalizer_External External pthread ${PYTHON_LIBRARIES})
endif()
# Optional WLD benchmark: a plain executable, not registered as a test.
if(MAKE_WLD_BENCHMARK)
    add_executable(wld_optimization wld_optimization.cpp)
    target_link_libraries(wld_optimization norma ${Boost_PROGRAM_OPTIONS_LIBRARY})
endif()
// Test fixture shared by all cases in the External1 suite.
//
// Builds an External normalizer named "External", points it at the script
// "normalize" under TEST_PATH through the "External.path" and
// "External.script" parameters, and initializes it. Boost.Test creates one
// fixture per test case, so every case gets a freshly initialized normalizer.
struct ExternalFixture {
    External *e;      // owned; allocated in the constructor, freed in the
                      // destructor
    std::map params;  // configuration handed to set_from_params()

    ExternalFixture() {
        e = new External();
        e->set_name("External");
        // TEST_PATH is built from TEST_BASE_DIR, which is presumably
        // supplied by the build configuration -- confirm in config.h.
        params["External.path"] = TEST_PATH;
        params["External.script"] = "normalize";
        e->set_from_params(params);
        e->init();
    }
    ~ExternalFixture() { delete e; }
};
4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef TESTS_NORMALIZER_MOCK_LEXICON_H_ 19 | #define TESTS_NORMALIZER_MOCK_LEXICON_H_ 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include"string_impl.h" 25 | #include"lexicon/lexicon_interface.h" 26 | 27 | using Norma::Normalizer::LexiconInterface; 28 | 29 | class MockLexicon : public LexiconInterface { 30 | private: 31 | const std::vector _words { 32 | "eins", "zwei", "drei", "und" 33 | }; 34 | const std::vector _partial_words { 35 | "", 36 | "e", "z", "d", "u", 37 | "ei", "zw", "dr", "un", 38 | "ein", "zwe", "dre", "und", 39 | "eins", "zwei", "drei" 40 | }; 41 | 42 | void do_init() {} 43 | void do_clear() {} 44 | void do_set_from_params(const std::map& 45 | params) {} 46 | void do_save_params() {} 47 | bool check_contains(const string_impl& word) const { 48 | return (std::find(_words.begin(), _words.end(), word) != _words.end()); 49 | } 50 | bool check_contains_partial(const string_impl& word) const { 51 | return (std::find(_partial_words.begin(), _partial_words.end(), word) 52 | != _partial_words.end()); 53 | } 54 | bool add_word(const string_impl& word) { return true; } 55 | std::vector retrieve_all_entries() const { 56 | return _words; 57 | } 58 | unsigned int get_size() const { 59 | return _words.size(); 60 | } 61 | }; 62 | 63 | #endif // TESTS_NORMALIZER_MOCK_LEXICON_H_ 64 | 
-------------------------------------------------------------------------------- /src/tests/normalizer/wld_optimization.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #include"normalizer/wld.h" 19 | #include"normalizer/result.h" 20 | #include"string_impl.h" 21 | 22 | namespace { //NOLINT[build/namespaces] 23 | // Derived class to allow manipulation of the internal cascade 24 | class BenchmarkWLD : public Norma::Normalizer::WLD::WLD { 25 | public: 26 | void set_max_weight(double w) { _cascade->set_max_weight(w); } 27 | void set_max_ops(unsigned int n) { _cascade->set_max_ops(n); } 28 | Norma::Normalizer::Result operator()(const string_impl& word) const; 29 | }; 30 | } 31 | -------------------------------------------------------------------------------- /src/tests/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(PythonInterp 2.7 REQUIRED) 2 | configure_file(py-norma.py.in py-norma.py) 3 | configure_file(test_lexicon.py test_lexicon.py COPYONLY) 4 | configure_file(test_lexicon_icu.py test_lexicon_icu.py COPYONLY) 5 | configure_file(test_lexicon_std.py test_lexicon_std.py COPYONLY) 6 | 
configure_file(test_base.py test_base.py COPYONLY) 7 | configure_file(test_result.py test_result.py COPYONLY) 8 | configure_file(test_mapper.py test_mapper.py COPYONLY) 9 | configure_file(test_rulebased.py test_rulebased.py COPYONLY) 10 | configure_file(test_wld.py test_wld.py COPYONLY) 11 | configure_file(test_chain.py test_chain.py COPYONLY) 12 | add_test(PythonBindings ${PYTHON_EXECUTABLE} py-norma.py) 13 | -------------------------------------------------------------------------------- /src/tests/python/py-norma.py.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | from numbers import Number 5 | sys.path.insert(0, "../../python") 6 | from norma import NormaInitError, Result 7 | from norma.LexiconWrapper import Lexicon 8 | import norma.NormalizerWrapper as Normalizer 9 | import unittest 10 | 11 | TEST_BASE_DIR = "@NORMA_TEST_BASE_DIR@" 12 | STRING_IMPL = "@STRING_IMPL@" 13 | test_vars = {} 14 | 15 | class AssertFloat: 16 | precision = 0.001 17 | def assertClose(self, tuple1, tuple2): 18 | if isinstance(tuple1, Result): 19 | tuple1 = (tuple1.word, tuple1.score, tuple1.origin) 20 | if isinstance(tuple2, Result): 21 | tuple2 = (tuple2.word, tuple2.score, tuple2.origin) 22 | for (x, y) in zip(tuple1, tuple2): 23 | if isinstance(x, Number) and isinstance(y, Number): 24 | try: 25 | self.assertTrue((x-self.precision) < y < (x+self.precision)) 26 | except AssertionError as e: 27 | msg = "%s\n\n%f != %f within the precision boundary of %f" 28 | msg = msg % (e.message, x, y, self.precision) 29 | raise AssertionError, AssertionError(msg), sys.exc_info()[2] 30 | else: 31 | self.assertEquals(x, y) 32 | 33 | execfile("test_lexicon.py") 34 | execfile("test_base.py") 35 | execfile("test_result.py") 36 | execfile("test_mapper.py") 37 | execfile("test_rulebased.py") 38 | execfile("test_wld.py") 39 | execfile("test_chain.py") 40 | 41 | if STRING_IMPL == "ICU": 42 | execfile("test_lexicon_icu.py") 43 | 
elif STRING_IMPL == "STD": 44 | execfile("test_lexicon_std.py") 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /src/tests/python/test_base.py: -------------------------------------------------------------------------------- 1 | #!python 2 | # -*- encoding: utf-8 -*- 3 | ################################################################################ 4 | # Copyright 2013-2015 Marcel Bollmann, Florian Petran 5 | # 6 | # This file is part of Norma. 7 | # 8 | # Norma is free software: you can redistribute it and/or modify it under the 9 | # terms of the GNU Lesser General Public License as published by the Free 10 | # Software Foundation, either version 3 of the License, or (at your option) any 11 | # later version. 12 | # 13 | # Norma is distributed in the hope that it will be useful, but WITHOUT ANY 14 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 15 | # A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 16 | # details. 17 | # 18 | # You should have received a copy of the GNU Lesser General Public License along 19 | # with Norma. If not, see . 20 | ################################################################################ 21 | 22 | # Tests base normalizer functions (i.e., functions that all 23 | # normalizers share), but instantiates MapperNormalizer for this 24 | # purpose, because 1) we don't expose the Base normalizer class to 25 | # Python, and 2) Mapper is the simplest normalizer available. 
26 | class BaseTest(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.norm = Normalizer.Mapper() 30 | 31 | def testLexicon(self): 32 | lex1 = make_test_lexicon() 33 | self.norm.lexicon = lex1 34 | lex2 = self.norm.lexicon 35 | self.assertEquals(len(lex1.entries), len(lex2.entries)) 36 | self.assertEquals("eins" in lex1, "eins" in lex2) 37 | lex1.add("fünfzehn") 38 | self.assertTrue("fünfzehn" in lex2) 39 | -------------------------------------------------------------------------------- /src/tests/python/test_lexicon_icu.py: -------------------------------------------------------------------------------- 1 | #!python 2 | # -*- encoding: utf-8 -*- 3 | ################################################################################ 4 | # Copyright 2013-2015 Marcel Bollmann, Florian Petran 5 | # 6 | # This file is part of Norma. 7 | # 8 | # Norma is free software: you can redistribute it and/or modify it under the 9 | # terms of the GNU Lesser General Public License as published by the Free 10 | # Software Foundation, either version 3 of the License, or (at your option) any 11 | # later version. 12 | # 13 | # Norma is distributed in the hope that it will be useful, but WITHOUT ANY 14 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 15 | # A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 16 | # details. 17 | # 18 | # You should have received a copy of the GNU Lesser General Public License along 19 | # with Norma. If not, see . 
20 | ################################################################################ 21 | 22 | test_vars['lexfile'] = TEST_BASE_DIR+"/test-lexicon.gfsa" 23 | test_vars['symfile'] = TEST_BASE_DIR+"/test-lexicon.lab" 24 | 25 | class LexiconUnicodeTest(unittest.TestCase): 26 | def setUp(self): 27 | self.lex = Lexicon() 28 | self.lex.init() 29 | 30 | def testEntries1(self): 31 | self.assertEquals(len(self.lex.entries), 0) 32 | self.lex.add("fünf") 33 | self.assertTrue("fünf" in self.lex) 34 | 35 | def testEntries2(self): 36 | self.assertEquals(len(self.lex.entries), 0) 37 | self.lex.add("fünf") 38 | self.assertTrue(u'fünf' in self.lex) 39 | 40 | def testEntries3(self): 41 | self.assertEquals(len(self.lex.entries), 0) 42 | self.lex.add(u'fünf') 43 | self.assertTrue("fünf" in self.lex) 44 | 45 | def testEntries4(self): 46 | self.assertEquals(len(self.lex.entries), 0) 47 | self.lex.add(u'fünf') 48 | self.assertTrue(u'fünf' in self.lex) 49 | 50 | def testUnicode(self): 51 | self.assertFalse(u'\u0409' in self.lex) 52 | self.lex.add(u'\u0409') 53 | self.assertTrue(u'\u0409' in self.lex) 54 | 55 | def testLoadFromFile(self): 56 | self.lex.lexfile = test_vars['lexfile'] 57 | self.lex.symfile = test_vars['symfile'] 58 | self.lex.init() 59 | self.assertTrue("fünf" in self.lex) 60 | self.assertTrue(u'fünf' in self.lex) 61 | self.assertTrue(self.lex.contains_partial("fü")) 62 | 63 | def testToPythonConversion(self): 64 | self.lex.lexfile = test_vars['lexfile'] 65 | self.lex.symfile = test_vars['symfile'] 66 | self.lex.init() 67 | for entry in self.lex.entries: 68 | self.assertTrue(isinstance(entry, unicode)) 69 | -------------------------------------------------------------------------------- /src/tests/python/test_lexicon_std.py: -------------------------------------------------------------------------------- 1 | #!python 2 | # -*- encoding: utf-8 -*- 3 | ################################################################################ 4 | # Copyright 2013-2015 Marcel Bollmann, 
Florian Petran 5 | # 6 | # This file is part of Norma. 7 | # 8 | # Norma is free software: you can redistribute it and/or modify it under the 9 | # terms of the GNU Lesser General Public License as published by the Free 10 | # Software Foundation, either version 3 of the License, or (at your option) any 11 | # later version. 12 | # 13 | # Norma is distributed in the hope that it will be useful, but WITHOUT ANY 14 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 15 | # A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 16 | # details. 17 | # 18 | # You should have received a copy of the GNU Lesser General Public License along 19 | # with Norma. If not, see . 20 | ################################################################################ 21 | 22 | test_vars['lexfile'] = TEST_BASE_DIR+"/test-lexicon.gfsa" 23 | test_vars['symfile'] = TEST_BASE_DIR+"/test-lexicon.lab" 24 | 25 | class LexiconSTDTest(unittest.TestCase): 26 | def setUp(self): 27 | self.lex = Lexicon() 28 | self.lex.init() 29 | 30 | def testEntries1(self): 31 | self.assertEquals(len(self.lex.entries), 0) 32 | self.lex.add("fünf") 33 | self.assertTrue("fünf" in self.lex) 34 | 35 | def testEntries2(self): 36 | self.assertEquals(len(self.lex.entries), 0) 37 | self.lex.add("fünf") 38 | # not the same with STD strings: 39 | self.assertFalse(u'fünf' in self.lex) 40 | 41 | def testEntries3(self): 42 | self.assertEquals(len(self.lex.entries), 0) 43 | self.lex.add(u'fünf') 44 | # not the same with STD strings: 45 | self.assertFalse("fünf" in self.lex) 46 | 47 | def testEntries4(self): 48 | self.assertEquals(len(self.lex.entries), 0) 49 | self.lex.add(u"fünf") 50 | self.assertTrue(u"fünf" in self.lex) 51 | 52 | def testEntries5(self): 53 | self.assertEquals(len(self.lex.entries), 0) 54 | self.lex.add(u'fünf') 55 | # implicit latin-1 conversion of unicode: 56 | self.assertTrue(u'fünf'.encode("latin-1") in self.lex) 57 | 58 | def testUnicode(self): 59 | # 
unicode strings with characters outside of latin-1 raise an 60 | # exception 61 | with self.assertRaises(UnicodeEncodeError): 62 | test = (u'\u0409' in self.lex) 63 | 64 | def testUnicode2(self): 65 | self.assertFalse(u'\u0409'.encode("utf-8") in self.lex) 66 | 67 | def testLoadFromFile(self): 68 | self.lex.lexfile = test_vars['lexfile'] 69 | self.lex.symfile = test_vars['symfile'] 70 | self.lex.init() 71 | # doesn't work anymore with STD strings: 72 | self.assertFalse("fünf" in self.lex) 73 | self.assertFalse(u'fünf' in self.lex) 74 | self.assertFalse(self.lex.contains_partial("fü")) 75 | 76 | def testToPythonConversion(self): 77 | self.lex.lexfile = test_vars['lexfile'] 78 | self.lex.symfile = test_vars['symfile'] 79 | self.lex.init() 80 | for entry in self.lex.entries: 81 | self.assertTrue(isinstance(entry, str)) 82 | -------------------------------------------------------------------------------- /src/tests/python/test_result.py: -------------------------------------------------------------------------------- 1 | #!python 2 | # -*- encoding: utf-8 -*- 3 | ################################################################################ 4 | # Copyright 2013-2015 Marcel Bollmann, Florian Petran 5 | # 6 | # This file is part of Norma. 7 | # 8 | # Norma is free software: you can redistribute it and/or modify it under the 9 | # terms of the GNU Lesser General Public License as published by the Free 10 | # Software Foundation, either version 3 of the License, or (at your option) any 11 | # later version. 12 | # 13 | # Norma is distributed in the hope that it will be useful, but WITHOUT ANY 14 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 15 | # A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 16 | # details. 17 | # 18 | # You should have received a copy of the GNU Lesser General Public License along 19 | # with Norma. If not, see . 
20 | ################################################################################ 21 | 22 | class ResultTest(unittest.TestCase, AssertFloat): 23 | def setUp(self): 24 | self.norm = Normalizer.Mapper() 25 | 26 | def testIdenticalResultsWithFloat(self): 27 | self.norm.init() 28 | result_cpp = self.norm("vrouwe") 29 | result_tuple = ("vrouwe", 0.0, "Mapper") 30 | result_py = Result("vrouwe", 0.0, "Mapper") 31 | self.assertEquals(result_cpp, result_tuple) 32 | self.assertEquals(result_cpp, result_py) 33 | self.assertEquals(result_py, result_tuple) 34 | 35 | def testIdenticalResultsWithInt(self): 36 | self.norm.init() 37 | result_cpp = self.norm("vrouwe") 38 | result_tuple = ("vrouwe", 0, "Mapper") 39 | result_py = Result("vrouwe", 0, "Mapper") 40 | self.assertEquals(result_cpp, result_tuple) 41 | self.assertEquals(result_cpp, result_py) 42 | self.assertEquals(result_py, result_tuple) 43 | 44 | def testLogMessages(self): 45 | self.norm.init() 46 | result = self.norm("vrouwe") 47 | self.assertTrue(isinstance(result.messages, list)) 48 | self.assertTrue(len(result.messages) > 0) 49 | message = result.messages[0] 50 | self.assertTrue(isinstance(message, tuple)) 51 | self.assertEquals(len(message), 3) 52 | -------------------------------------------------------------------------------- /src/tests/tests.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef TESTS_TESTS_H_ 19 | #define TESTS_TESTS_H_ 20 | #include 21 | 22 | #endif // TESTS_TESTS_H_ 23 | -------------------------------------------------------------------------------- /src/tests/training_data.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #define BOOST_TEST_DYN_LINK 19 | #define BOOST_TEST_MODULE TrainingData 20 | #include 21 | #include 22 | #include // NOLINT[build/include_order] 23 | #include"training_data.h" 24 | #include"string_impl.h" 25 | #include"tests.h" 26 | 27 | struct TrainingDataFixture { 28 | std::vector strings { "foo", "bar", "baz", "bla" }; 29 | Norma::TrainingData data; 30 | }; 31 | 32 | BOOST_FIXTURE_TEST_SUITE(TrainingData1, TrainingDataFixture) 33 | 34 | BOOST_AUTO_TEST_CASE(training_data) { 35 | data.add_source(strings[0].c_str()); 36 | BOOST_CHECK_EQUAL(data.length(), 0); 37 | BOOST_CHECK(data.empty()); 38 | data.add_target(strings[1].c_str()); 39 | BOOST_CHECK_EQUAL(data.length(), 1); 40 | data.add_target(strings[2].c_str()); 41 | data.add_source(strings[3].c_str()); 42 | BOOST_CHECK_EQUAL(data.length(), 2); 43 | Norma::TrainingData::iterator pp = data.begin(); 44 | BOOST_CHECK_EQUAL(pp->source(), strings[0].c_str()); 45 | BOOST_CHECK_EQUAL(pp->target(), strings[1].c_str()); 46 | ++pp; 47 | BOOST_CHECK_EQUAL(pp->source(), strings[3].c_str()); 48 | BOOST_CHECK_EQUAL(pp->target(), strings[2].c_str()); 49 | --pp; 50 | BOOST_CHECK_EQUAL(pp->source(), strings[0].c_str()); 51 | BOOST_CHECK_EQUAL(pp->target(), strings[1].c_str()); 52 | ++pp; ++pp; 53 | BOOST_CHECK_THROW(*pp, std::out_of_range); 54 | Norma::TrainingData::reverse_iterator pq = data.rbegin(); 55 | BOOST_CHECK_EQUAL(pq->source(), strings[3].c_str()); 56 | BOOST_CHECK_EQUAL(pq->target(), strings[2].c_str()); 57 | ++pq; 58 | BOOST_CHECK_EQUAL(pq->source(), strings[0].c_str()); 59 | BOOST_CHECK_EQUAL(pq->target(), strings[1].c_str()); 60 | --pq; 61 | BOOST_CHECK_EQUAL(pq->source(), strings[3].c_str()); 62 | BOOST_CHECK_EQUAL(pq->target(), strings[2].c_str()); 63 | ++pq; ++pq; 64 | BOOST_CHECK_THROW(*pq, std::out_of_range); 65 | } 66 | 67 | BOOST_AUTO_TEST_SUITE_END() 68 | -------------------------------------------------------------------------------- /src/training_data-inl.h: 
-------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 17 | */ 18 | #ifndef TRAINING_DATA_INL_H_ 19 | #define TRAINING_DATA_INL_H_ 20 | namespace Norma { 21 | template 22 | TrainingData::basic_iterator::basic_iterator(TrainingData* data, 23 | size_t pos) 24 | : _data(data), _pos(pos) {} 25 | 26 | template 27 | TrainingPair& TrainingData::basic_iterator::operator*() { 28 | // the pair needs to be stored as member since 29 | // operator->() needs to return a ptr. it shouldn't 30 | // be a big deal anyway since TrainingPair is a fairly 31 | // light weight class. 
32 | if (!has_pair) 33 | _pair = _data->get_pair(_pos, D); 34 | return _pair; 35 | } 36 | 37 | template 38 | TrainingPair* TrainingData::basic_iterator::operator->() { 39 | return &(operator*()); 40 | } 41 | 42 | template 43 | TrainingData::basic_iterator& TrainingData::basic_iterator::operator++() { 44 | ++_pos; 45 | return *this; 46 | } 47 | 48 | template 49 | TrainingData::basic_iterator& TrainingData::basic_iterator::operator--() { 50 | --_pos; 51 | return *this; 52 | } 53 | 54 | template 55 | bool TrainingData::basic_iterator::operator== 56 | (const TrainingData::basic_iterator& that) { 57 | return this->_data == that._data 58 | && this->_pos == that._pos; 59 | } 60 | 61 | template 62 | bool TrainingData::basic_iterator::operator!= 63 | (const TrainingData::basic_iterator& that) { 64 | return !(*this == that); 65 | } 66 | } // namespace Norma 67 | #endif // TRAINING_DATA_INL_H_ 68 | 69 | -------------------------------------------------------------------------------- /src/training_data.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright 2013-2015 Marcel Bollmann, Florian Petran 2 | * 3 | * This file is part of Norma. 4 | * 5 | * Norma is free software: you can redistribute it and/or modify it under the 6 | * terms of the GNU Lesser General Public License as published by the Free 7 | * Software Foundation, either version 3 of the License, or (at your option) any 8 | * later version. 9 | * 10 | * Norma is distributed in the hope that it will be useful, but WITHOUT ANY 11 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 | * A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 13 | * details. 14 | * 15 | * You should have received a copy of the GNU Lesser General Public License along 16 | * with Norma. If not, see . 
17 | */ 18 | #include"training_data.h" 19 | #include 20 | #include 21 | 22 | namespace Norma { 23 | ////////////////////////////////// TrainingPair /////////////////////////////// 24 | 25 | const string_impl& TrainingPair::source() const { 26 | return _source; 27 | } 28 | 29 | const string_impl& TrainingPair::target() const { 30 | return _target; 31 | } 32 | 33 | bool TrainingPair::is_used() const { 34 | return _used; 35 | } 36 | 37 | void TrainingPair::make_used() { 38 | _used = true; 39 | if (_data == nullptr) 40 | return; 41 | _data->_used[_pos] = true; 42 | } 43 | 44 | TrainingPair::TrainingPair(TrainingData* data, size_t pos) { 45 | _data = data; 46 | _pos = pos; 47 | _used = _data->_used[pos]; 48 | _source = _data->_source[pos]; 49 | _target = _data->_target[pos]; 50 | } 51 | 52 | ////////////////////////////////// TrainingData /////////////////////////////// 53 | TrainingData& TrainingData::add_pair(const string_impl& source, 54 | const string_impl& target) { 55 | return add_source(source).add_target(target); 56 | } 57 | 58 | TrainingData& TrainingData::add_source(const string_impl& source) { 59 | return add_token(source, &_source); 60 | } 61 | 62 | TrainingData& TrainingData::add_target(const string_impl& target) { 63 | return add_token(target, &_target); 64 | } 65 | 66 | TrainingData& TrainingData::add_token(const string_impl& token, 67 | std::vector* where) { 68 | where->push_back(token); 69 | if (_target.size() == _source.size()) 70 | _used.push_back(false); 71 | return *this; 72 | } 73 | 74 | size_t TrainingData::length() const { 75 | size_t slen = _source.size(), 76 | tlen = _target.size(); 77 | return (slen < tlen) ? 
slen : tlen; 78 | } 79 | 80 | bool TrainingData::empty() const { 81 | return length() == 0; 82 | } 83 | 84 | TrainingData::iterator TrainingData::begin() { 85 | return iterator(this, 0); 86 | } 87 | 88 | TrainingData::iterator TrainingData::end() { 89 | return iterator(this, length()); 90 | } 91 | 92 | TrainingData::reverse_iterator TrainingData::rbegin() { 93 | return reverse_iterator(this, 0); 94 | } 95 | 96 | TrainingData::reverse_iterator TrainingData::rend() { 97 | return reverse_iterator(this, length()); 98 | } 99 | 100 | TrainingPair TrainingData::get_pair(size_t pos, IteratorDirection direction) { 101 | // this is a bit hacky: 102 | // pos gets inverted on length for reverse iterators 103 | // if pos == length() (for rend), it will become the largest 104 | // possible value (implementation dependent) since size_t is 105 | // unsigned. if it gets incremented after that, it will be 0 again 106 | // so it points to the first element. 107 | if (direction == IteratorDirection::REV) 108 | pos = length() - (pos + 1); 109 | if (pos >= length()) 110 | throw std::out_of_range("TrainingData out of range"); 111 | return TrainingPair(this, pos); 112 | } 113 | } // namespace Norma 114 | --------------------------------------------------------------------------------