├── .dockerignore ├── .github └── workflows │ ├── badge.svg │ ├── cleanup.yml │ └── timbl.yml ├── .gitignore ├── AUTHORS ├── COPYING ├── Dockerfile ├── MAINTAINERS ├── Makefile.am ├── NEWS ├── README ├── README.md ├── TODO ├── bootstrap.sh ├── build-deps.sh ├── codemeta.json ├── configure.ac ├── demos ├── .gitignore ├── Makefile.am ├── api_test1.cxx ├── api_test2.cxx ├── api_test3.cxx ├── api_test4.cxx ├── api_test5.cxx ├── api_test6.cxx ├── classify.cxx ├── cross_val.test ├── dimin.script ├── dimin.test ├── dimin.train ├── small_1.train ├── small_2.train ├── small_3.train ├── small_4.train ├── small_5.train └── tse.cxx ├── docs ├── Makefile.am ├── Timbl_6.4_Manual.pdf ├── texfiles │ ├── Timbl_6.3_API.tex │ ├── Timbl_6.3_Manual.tex │ ├── Timbl_6.4_Manual.tex │ ├── distanceweight-ided.eps │ ├── fspace.eps │ ├── fullname.bst │ ├── fullname.sty │ ├── ilk.bib │ ├── mble-method.eps │ ├── pos-neg.eps │ └── roc-auc.eps └── timbl.1 ├── include ├── Makefile.am └── timbl │ ├── .gitignore │ ├── BestArray.h │ ├── Choppers.h │ ├── Common.h │ ├── Features.h │ ├── GetOptClass.h │ ├── IBtree.h │ ├── Instance.h │ ├── MBLClass.h │ ├── Makefile.am │ ├── Matrices.h │ ├── Metrics.h │ ├── MsgClass.h │ ├── Options.h │ ├── Statistics.h │ ├── StringOps.h │ ├── Targets.h │ ├── Testers.h │ ├── TimblAPI.h │ ├── TimblExperiment.h │ ├── Types.h │ └── neighborSet.h ├── m4 ├── .gitignore ├── Makefile.am └── ac_osx_pkg.m4 ├── src ├── .gitignore ├── BestArray.cxx ├── CVExperiment.cxx ├── Choppers.cxx ├── Common.cxx ├── Features.cxx ├── GetOptClass.cxx ├── IBprocs.cxx ├── IBtree.cxx ├── IGExperiment.cxx ├── Instance.cxx ├── LOOExperiment.cxx ├── MBLClass.cxx ├── Makefile.am ├── Metrics.cxx ├── MsgClass.cxx ├── Statistics.cxx ├── StringOps.cxx ├── TRIBLExperiments.cxx ├── Targets.cxx ├── Testers.cxx ├── Timbl.cxx ├── TimblAPI.cxx ├── TimblExperiment.cxx ├── Types.cxx ├── neighborSet.cxx └── simpletest.cxx └── timbl.pc.in /.dockerignore: 
-------------------------------------------------------------------------------- 1 | .git 2 | .cache 3 | .* 4 | _* 5 | *.cache 6 | *.pyc 7 | build 8 | *.egg-info 9 | gource* 10 | *.tar.gz 11 | *.pdf 12 | TODO 13 | *.lock 14 | -------------------------------------------------------------------------------- /.github/workflows/badge.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 19 | 20 | 21 | 22 | 23 | 33 | 34 | 35 | 36 | 37 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /.github/workflows/cleanup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Delete old workflow runs 3 | on: 4 | schedule: 5 | - cron: '0 15 14 * *' 6 | # Run monthly, at 15:00 on the 14t day of month. 
(testing) 7 | 8 | jobs: 9 | del_runs: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | actions: write 13 | steps: 14 | - name: Delete workflow runs 15 | uses: Mattraks/delete-workflow-runs@v2 16 | with: 17 | token: ${{ github.token }} 18 | repository: ${{ github.repository }} 19 | retain_days: 30 20 | keep_minimum_runs: 6 21 | -------------------------------------------------------------------------------- /.github/workflows/timbl.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: C/C++ CI 3 | 4 | on: 5 | schedule: 6 | - cron: "0 20 3 * 5" # run test once a month 7 | push: 8 | branches: 9 | - master 10 | - develop 11 | paths: 12 | - configure.ac 13 | - 'src/**' 14 | - 'include/**' 15 | - '.github/workflows/*' 16 | 17 | pull_request: 18 | branches: [master] 19 | 20 | jobs: 21 | notification: 22 | runs-on: ubuntu-latest 23 | name: Notifications 24 | steps: 25 | - name: IRC notification of starting the builds 26 | uses: LanguageMachines/ticcactions/irc-init@v1 27 | 28 | build: 29 | runs-on: ${{ matrix.os }} 30 | needs: notification 31 | strategy: 32 | matrix: 33 | os: [ubuntu-latest, macos-latest] 34 | compiler: [g++-12, clang++] 35 | 36 | steps: 37 | 38 | - name: Cancel Previous Runs 39 | uses: styfle/cancel-workflow-action@0.12.1 40 | with: 41 | access_token: ${{ github.token }} 42 | 43 | - uses: actions/checkout@v4.1.1 44 | 45 | - uses: LanguageMachines/ticcactions/cpp-build-env@v1 46 | - uses: LanguageMachines/ticcactions/cpp-dependencies@v1 47 | - uses: LanguageMachines/ticcactions/irc-nick@v1 48 | 49 | - uses: LanguageMachines/ticcactions/cpp-submodule-build@v1 50 | with: 51 | branch: ${{ github.ref_name }} 52 | module: ticcutils 53 | 54 | - uses: LanguageMachines/ticcactions/setup-cppcheck@v1 55 | - name: Static Code-check 56 | if: ${{ env.action_status == '' }} 57 | run: cppcheck ${{ env.cpc_opts }} . 
58 | 59 | - uses: LanguageMachines/ticcactions/cpp-safe-build@v1 60 | 61 | - name: Notify IRC of results 62 | uses: LanguageMachines/ticcactions/irc-status@v1 63 | with: 64 | branch: ${{ github.ref_name }} 65 | nickname: ${{ env.nick }} 66 | step: test 67 | status: ${{ env.action_status }} 68 | details: ${{ env.action_details }} 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.gz 3 | Makefile 4 | Makefile.in 5 | compile 6 | config.guess 7 | config.h 8 | config.h.in 9 | config.log 10 | config.status 11 | config.sub 12 | configure 13 | INSTALL 14 | aclocal.m4 15 | autom4te.cache/ 16 | depcomp 17 | install-sh 18 | libtool 19 | ltmain.sh 20 | missing 21 | stamp-h1 22 | test-driver 23 | timbl.pc 24 | ChangeLog 25 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | TiMBL authors 2 | 3 | Lead programmer: 4 | 5 | Ko van der Sloot 6 | 7 | Code, algorithm, and design contributions by: 8 | 9 | Peter Berck 10 | Antal van den Bosch 11 | Walter Daelemans 12 | Maarten van Gompel 13 | Ton Weijters 14 | Jakub Zavrel 15 | 16 | Contributors: 17 | 18 | People who contributed to Timbl by suggesting improvements, filing bug 19 | reports, asking the right questions etc.: 20 | 21 | Robert Andersson 22 | Vincent Van Asch 23 | Joris Bleys 24 | Johan Bos 25 | Joan Bresnan 26 | Stefan Breuer 27 | Sabine Buchholz 28 | Bertjan Busser 29 | Sander Canisius 30 | Giovanni Cassani 31 | Win Carus 32 | Felix Filoz 33 | Alan Frankel 34 | Sven Hartrumpf 35 | Iris Hendrickx 36 | Lyndon Hiew 37 | Steve Hunt 38 | Valentin Jijkoun 39 | Gunn Inger Lyse 40 | Svetoslav Marinov 41 | Erwin Marsi 42 | Liam McGrath 43 | Jens Nilsson 44 | Ties Kemper 45 | Tom DePlonty 46 | Adam Radziszewski 47 | Albert Russel 48 | Yvan Saeys 49 | Frank Scheelen 50 | 
Armin Schmidt 51 | Olaf Seibert 52 | Gabriel Skantze 53 | Carline Sporleder 54 | Herman Stehouwer 55 | Erik Tjong Kim Sang 56 | Joseph Turian 57 | Frederik Vaassen 58 | Corne Versloot 59 | Colin Wilson 60 | Linda Yung 61 | Bram Vandekerckhove 62 | Menno van Zaanen 63 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:latest 2 | #VERSION can be: 3 | # - stable: builds latest stable versions from source (default) 4 | # - distro: uses packages as provided by Alpine Linux (may be slightly out of date) 5 | # - devel: latest development version (git master/main branch) 6 | ARG VERSION="stable" 7 | LABEL org.opencontainers.image.authors="Maarten van Gompel " 8 | LABEL description="timbl - tilburg memory-based learner" 9 | 10 | RUN mkdir -p /data 11 | RUN mkdir -p /usr/src/timbl 12 | COPY . /usr/src/timbl 13 | 14 | RUN if [ "$VERSION" = "distro" ]; then \ 15 | rm -Rf /usr/src/timbl &&\ 16 | echo -e "----------------------------------------------------------\nNOTE: Installing latest release as provided by Alpine package manager.\nThis version may diverge from the one in the git master tree or even from the latest release on github!\nFor development, build with --build-arg VERSION=development.\n----------------------------------------------------------\n" &&\ 17 | apk update && apk add timbl; \ 18 | else \ 19 | PACKAGES="libbz2 icu-libs libxml2 libgomp libstdc++" &&\ 20 | BUILD_PACKAGES="build-base autoconf-archive autoconf automake libtool bzip2-dev icu-dev libxml2-dev git" &&\ 21 | apk add $PACKAGES $BUILD_PACKAGES &&\ 22 | cd /usr/src/ && ./timbl/build-deps.sh &&\ 23 | cd timbl && sh ./bootstrap.sh && ./configure && make && make install &&\ 24 | apk del $BUILD_PACKAGES && rm -Rf /usr/src; \ 25 | fi 26 | 27 | WORKDIR / 28 | 29 | ENTRYPOINT [ "timbl" ] 30 | 
-------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Maarten van Gompel (KNAW Humanities Cluster) 2 | Ko van der Sloot 3 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | ACLOCAL_AMFLAGS =-I m4 --install 3 | 4 | SUBDIRS = src include demos docs m4 5 | 6 | EXTRA_DIST = bootstrap.sh AUTHORS TODO NEWS README.md timbl.pc.in codemeta.json 7 | 8 | pkgconfigdir = $(libdir)/pkgconfig 9 | pkgconfig_DATA = timbl.pc 10 | 11 | ChangeLog: NEWS 12 | git pull; git2cl > ChangeLog 13 | 14 | docker: 15 | docker build -t timbl:latest . 16 | 17 | docker-dev: 18 | docker build -t timbl:dev --build-arg VERSION=development . 19 | 20 | deps: 21 | ./build-deps.sh 22 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | 6.10 2024-12-16 2 | [Ko van der Sloot] 3 | * C++17 is required 4 | * updated to latest ticcutils 5 | * improved GitHub CI 6 | * code quality 7 | 8 | 6.9 2023-10-21 9 | [Ko van der Sloot] 10 | * better code: const correctness etc. 11 | 12 | 6.8.2 2023-02-22 13 | [Ko van der Sloot] 14 | * plugged a memory leak 15 | * C++ code quality improved 16 | * removed dependency on deprecated sprintf function 17 | * removed dependency on libtar 18 | 19 | 6.8.1 2023-01-04 20 | [Ko van der Sloot] 21 | * fix for some odd distro issues 22 | * for now re-added 2 backward compatibility functions 23 | 24 | 6.8 2023-01-02 25 | [Ko van der Sloot] 26 | * major code refactoring 27 | - BREAKS API and ABI 28 | - library bumped 29 | - getting rid of a lot of pointers and C-style arrays 30 | - removed C-style casts 31 | - Unicode is the default now for most functions. 
Some 'string' functions 32 | are still available in the API. 33 | - In general modernizing to C++11 34 | - weeded out CppCheck warnings 35 | * improved GitHub action 36 | 37 | 6.7 2022-07-22 38 | [Maarten van Gompel] 39 | * updated metadata (codemeta.json) following new (proposed) CLARIAH requirements (CLARIAH/clariah-plus#38) 40 | * added build-deps.sh for automatically building and installing dependencies 41 | * added Dockerfile and instructions 42 | * no functional changes 43 | 44 | 6.6 2020-12-15 45 | [Ko vd Sloot] 46 | * Internally we use NFC normalized UnicodeString's now. 47 | Timbl should be robust for UTF8 files, even exotic languages. 48 | * added some Unicode functions to the API 49 | * bumped library version 50 | * several code refactorings 51 | * added the possibility to use the options -f and -i, without -t 52 | 53 | 6.5 2020-04-15 54 | [Ko vd Sloot] 55 | * adapted to the newest TiCC::CommandLine implementation 56 | * small code refactorings 57 | 58 | 6.4.14 2019-10-21 59 | [Ko vd Sloot] 60 | * added JSON support. Still EXPERIMENTAL! 61 | the JSON syntax might change in the future. So handle with care. 62 | * confidence score calculation is now a real TimblExperiment member 63 | * removed Boost dependency. 64 | 65 | 6.4.13 2018-11-28 66 | [Ko van der Sloot] 67 | - added a '--limit' option to use only the most significant features 68 | 69 | 6.4.12 2018-05-16 70 | [Ko van der Sloot] 71 | Bugfix release: 72 | - updated usage(). Info on -G 2 option was wrong. 73 | - changed an error message to be more clear. 74 | - fixed building of the TeX documentation 75 | 76 | [Maarten van Gompel] 77 | - Added codemeta.json metadata 78 | 79 | 6.4.11 2018-01-09 80 | [Ko van der Sloot] 81 | Bugfix release: 82 | - Fixed a major bug in similarity metric calculations. 
(Cosine and Dot product) 83 | 84 | 6.4.10 2017-11-09 85 | [Ko van der Sloot] 86 | Bugfix release: 87 | - allow for spaces in TABBED input (they are significant) 88 | - corrected some typos in messages and man page 89 | - minor code refactorings 90 | 91 | 6.4.9 2017-05-04 92 | [Ko van der Sloot] 93 | Maintenance release: 94 | - removed unused/non-functional functions from the API 95 | - code refactoring. Mostly based on CPPCHECK static analyzer. 96 | - small bugs: 97 | -e options didn't always do what you expected 98 | - added missing files in docs 99 | [Maarten van Gompel] 100 | - updated README.md 101 | 102 | 6.4.8 2016-07-11 103 | [Ko van der Sloot] 104 | Maintenance release: 105 | - code refactoring and improvement 106 | - relying more on ticcutils 107 | - fixed exit codes 108 | - accept long options: --version and --help 109 | - fix out-of-range problem in Sparse Format 110 | 111 | 6.4.7 2016-01-14 112 | [Ko van der Sloot][Maarten van Gompel] 113 | * repository moved to GitHub 114 | * added travis support 115 | * code updates. (clearer code mainly) 116 | * depending a bit more on ticcutils (CommandLine, StringOps) 117 | * some small bug fixes (LOO with a 1 line file) 118 | 119 | 6.4.6 2014-09-23 120 | [Ko van der Sloot] 121 | * release 122 | 123 | 6.4.5 2014-09-16 124 | * small bug fixes 125 | 126 | 6.4.4 2013-04-03 127 | * rely more on ticcutils stuff. A lot of functions are moved there 128 | * added a GetAccuracy option to the API 129 | * bug fix in Choppers.cxx 130 | 131 | 6.4.3 2012-10-11 132 | * added an --occurrences option for training/testing files with an occurrence 133 | value. 134 | * made Tree.cxx and Trie.h 'omp thread-safe' and moved them to ticcutils 135 | * added a 'Tabbed' inputformat (© Maarten van Gompel) 136 | * The Micro Average F-score calculation is now according to the Manual. 137 | There were small differences caused by a mixup of test and train data. 
138 | 139 | 6.4.2 2011-12-20 140 | * start to use Requires.private in timbl.pc 141 | * added a 'check' target to Make system 142 | * achieved a considerable speedup for multithreaded testing. 143 | * fixed a small problem in LogBuffer. Also simplified and cleaned up 144 | LogBuffer and LogStream code. All dependencies need recompiling! 145 | * implemented +vcf (confidence) output 146 | * The -T option for TreeOrdening is now named --Treeorder 147 | * fixed tiebreaking for -R (random) option, closes bug 43, again. 148 | * some small fixes for '-pedantic' compiler option 149 | * avoid zero result in Exponential Decay (bug 89). 150 | * removed unused relativeWeight() function. (was duplicated) 151 | 152 | 6.4.1 2011-08-25 153 | 154 | [ Ko van der Sloot ] 155 | - added Version() and VersionName() functions. We want them for every 156 | member of the family 157 | - fixed a problem with including 'config.h' in the API 158 | - fixed a problem with normalization on empty distributions. 159 | - added a Confidence( class ) function to the instances API. 160 | returns the Weight of 'class'. Which is influenced by normalization! 161 | - added logProbability normalization 162 | - the +vS option was not always honoured. Now Timbl shuts its big mouth better 163 | - Expand() is now also enabled for TRIBL and TRIBL2 164 | 165 | 6.4.0 166 | - decapped Timbl and libTimbl to timbl and libtimbl 167 | this will shake the whole timbl family tree! 168 | - small fixes to survive -pedantic compiler option without warnings 169 | 170 | 6.3.4 171 | - we now support incremental learning from a file on the command line. 172 | - implemented a --clones option to use multiple threads for testing 173 | - fixed bug 58. Emit Error when reading an InstanceBase without a test. 174 | - fixed bug 61. Give a Warning when a trainingset contains only 1 class. 175 | - cleaned up build system 176 | 177 | 6.3.3 178 | - several small fixes. 179 | 180 | 6.3.2 181 | - fixed bug 44. 
Segfault in weird cases 182 | - fixed bug 45. Needless processing of traindata when required option is missing 183 | - fixed bug 46. Tribl2 sometimes fails to correctly output +v+k+n 184 | - fixed bug 47. Unclear error message when InnerProduct fails 185 | - several small uncritical enhancements 186 | 187 | 6.3.1 - 2010-11-17 188 | - Little API change in TimblOptions (more clear i hope) 189 | - Little bug fixes and improvement (logging mostly) 190 | - Moved LogStream stuff back in from TimblServer 191 | 192 | 6.2.3 193 | forgot to edit this file 194 | 6.2.2 195 | forgot to edit this file 196 | 197 | 6.2.1 - 2009-11-30 198 | 199 | - Fixed compilation problem on Cygwin 200 | - Added functions to API 201 | - Improved server functionality (undocumented yet) 202 | 203 | 6.2.0 - 2009-11-03 204 | 205 | - Stable release 206 | 207 | 6.1.99.0.20091021.1 - 2009-10-21 208 | 209 | - Another snapshot from SVN. Now needs libxml2 for building. 210 | 211 | 6.1.99.0.20091014.1 - 2009-10-14 212 | 213 | - Another snapshot from SVN. (By mistake, aka 6.2.0). 214 | 215 | 6.2.0.pre3 - 2009-10-05 216 | 217 | - Another snapshot from SVN. 218 | 219 | 6.2.0-pre1 - 2009-09-03 220 | 221 | - snapshot from SVN. 222 | 223 | 2009-08-31: Ko vd Sloot 224 | It's been a long time since the previous news 225 | lot of overhauling of the code took place 226 | 2008-03-04: Ko vd Sloot 227 | numerous small bug fixes. 228 | getting ready for 6.1.2 229 | 2007-12-03: Ko vd Sloot 230 | Packaging seems to be fine now. 231 | 2007-10-01: Ko vd Sloot 232 | first packaging attempt 233 | 234 | # $Id$ 235 | # $URL$ 236 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Please see README.md for more information. 
2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub build](https://github.com/LanguageMachines/timbl/actions/workflows/timbl.yml/badge.svg?branch=master)](https://github.com/LanguageMachines/timbl/actions/) 2 | [![Language Machines Badge](http://applejack.science.ru.nl/lamabadge.php/timbl)](http://applejack.science.ru.nl/languagemachines/) 3 | [![DOI](https://zenodo.org/badge/20526237.svg)](https://zenodo.org/badge/latestdoi/20526237) 4 | 5 | =========================================== 6 | TiMBL: Tilburg Memory Based Learner 7 | =========================================== 8 | 9 | TiMBL 6.4 (c) CLS/ILK/CLiPS 1998 - 2024 10 | Centre for Language Studies, Radboud University Nijmegen 11 | Induction of Linguistic Knowledge Research Group, Tilburg University and 12 | Centre for Dutch Language and Speech, University of Antwerp 13 | 14 | **Website:** https://languagemachines.github.io/timbl/ 15 | 16 | 17 | TiMBL is an open source software package implementing several memory-based 18 | learning algorithms, among which IB1-IG, an implementation of k-nearest 19 | neighbor classification with feature weighting suitable for symbolic feature 20 | spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented 21 | algorithms have in common that they store some representation of the training 22 | set explicitly in memory. During testing, new cases are classified by 23 | extrapolation from the most similar stored cases. 24 | 25 | For over fifteen years TiMBL has been mostly used in natural language 26 | processing as a machine learning classifier component, but its use extends to 27 | virtually any supervised machine learning domain. Due to its particular 28 | decision-tree-based implementation, TiMBL is in many cases far more efficient 29 | in classification than a standard k-nearest neighbor algorithm would be. 
30 | 31 | 32 | ----------------------------------------------------------------------- 33 | 34 | This is a major extension to the sixth main release of TiMBL. 35 | Most significant change: **The main program is now called 'timbl' and not 36 | 'Timbl' anymore. Be warned!** 37 | This change is part of our effort to get our MBL software into software 38 | distributions like Debian, Ubuntu, RedHat. 39 | 40 | Comments and bug-reports are welcome at our issue tracker at 41 | https://github.com/LanguageMachines/timbl/issues or by mailing 42 | lamasoftware (at) science.ru.nl. 43 | Documentation and more info may be found on https://languagemachines.github.io/timbl . 44 | 45 | TiMBL is distributed under the GNU General Public License v3 (see the file COPYING). 46 | 47 | ----------------------------------------------------------------------- 48 | 49 | This software has been tested on: 50 | - Intel platforms running several versions of Linux, including Ubuntu, Debian, 51 | Arch Linux, Fedora (both 32 and 64 bits) 52 | - MAC platform running OS X 10.10 53 | 54 | Alternatively, with some effort, you may get it to work on a Windows platform using Cygwin. 55 | 56 | Compilers: 57 | - GCC (use 7.0 or later) 58 | - Clang 59 | 60 | Contents of this distribution: 61 | - Sources 62 | - Licensing information ( COPYING ) 63 | - Build system based on GNU Autotools 64 | - Container build file ( Dockerfile ) 65 | - Example data files ( in the demos directory ) 66 | - Documentation ( in the docs directory ) 67 | 68 | Dependencies: 69 | To be able to successfully build TiMBL from the tarball, you need the 70 | following packages: 71 | - ticcutils (https://github.com/LanguageMachines/ticcutils) 72 | - pkg-config 73 | - libxml2-dev 74 | 75 | To install TiMBL, first consult whether your distribution's package manager has an up-to-date package for TiMBL. 
76 | 77 | To compile and install manually from source instead, provided you have all the dependencies installed: 78 | 79 | $ bash bootstrap.sh 80 | $ ./configure 81 | $ make 82 | $ make install 83 | 84 | If you want to automatically download and install the latest stable versions of 85 | the required dependencies, then run `./build-deps.sh` prior to the above. You 86 | can pass a target directory prefix as first argument and you may need to 87 | prepend `sudo` to ensure you can install there. The dependencies are: 88 | 89 | * [ticcutils](https://github.com/LanguageMachines/ticcutils) 90 | 91 | A `Dockerfile` for a container build is also available, specify `--build-arg VERSION=development` if you want the latest 92 | development version instead. 93 | 94 | You will still need to take care to install the following 3rd party 95 | dependencies through your distribution's package manager, as they are not 96 | provided by our script: 97 | 98 | * ``icu`` - A C++ library for Unicode and Globalization support. On Debian/Ubuntu systems, install the package libicu-dev. 99 | * A sane build environment with a C++ compiler (e.g. gcc 4.9 or above or clang), make, autotools, libtool, pkg-config 100 | 101 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * reorganize clone() splitChild() and such in TimblExperiment. 2 | it is very confusing now. 
3 | -------------------------------------------------------------------------------- /bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # bootstrap - script to bootstrap the distribution rolling engine 4 | 5 | # usage: 6 | # $ sh ./bootstrap.sh && ./configure && make dist[check] 7 | # 8 | # this yields a tarball which one can install doing 9 | # 10 | # $ tar zxf PACKAGENAME-*.tar.gz 11 | # $ cd PACKAGENAME-* 12 | # $ ./configure 13 | # $ make 14 | # # make install 15 | 16 | # requirements: 17 | # GNU autoconf, from e.g. ftp.gnu.org:/pub/gnu/autoconf/ 18 | # GNU automake, from e.g. http://ftp.gnu.org/gnu/automake/ 19 | 20 | automake=automake 21 | aclocal=aclocal 22 | 23 | # if you want to autogenerate a ChangeLog from svn: 24 | # 25 | # svn2cl, a python script, as used in the GNU Enterprise project. 26 | # By jcater (Jason Cater), contributions by reinhard (Reinhard Müller). 27 | # Get it from 28 | # http://www.gnuenterprise.org/cgi-bin/viewcvs.cgi/*checkout*/gnue/trunk/gnue-common/utils/svn2cl . 29 | # svn2cl is used in Makefile.am too. 30 | # 31 | # (Another svn2cl implementation, in perl, is at 32 | # http://www.contactor.se/~dast/svn/archive-2002-04/0910.shtml) 33 | # 34 | # see also toplevel Makefile.am 35 | 36 | # test -f ChangeLog || { 37 | # svn log --verbose > ChangeLog 38 | #} 39 | 40 | # inspired by hack as used in mcl (from http://micans.org/) 41 | 42 | # autoconf-archive Debian package, aclocal-archive RPM, obsolete/badly supported OS, installed in home dir 43 | acdirs="/usr/share/autoconf-archive/ /usr/share/aclocal/ /usr/local/share/aclocal/ $HOME/local/share/autoconf-archive/ /opt/homebrew/share/aclocal/" 44 | 45 | found=false 46 | for d in $acdirs 47 | do 48 | if test -f ${d}pkg.m4 49 | then 50 | found=true 51 | break 52 | fi 53 | done 54 | 55 | if ! 
$found 56 | then 57 | cat <&2 13 | echo " Building latest stable release of main dependencies from source.">&2 14 | echo "------------------------------------------------------------------------">&2 15 | else 16 | echo "------------------------------------------------------------------------">&2 17 | echo " Building development versions of main dependencie from source.">&2 18 | echo " (This is experimental and may contain bugs! DO NOT PUBLISH!)">&2 19 | echo "-----------------------------------------------------------------------">&2 20 | fi 21 | 22 | PWD="$(pwd)" 23 | BUILDDIR="$(mktemp -dt "build-deps.XXXXXX")" 24 | cd "$BUILDDIR" 25 | BUILD_SOURCES="LanguageMachines/ticcutils" 26 | for SUFFIX in $BUILD_SOURCES; do \ 27 | NAME="$(basename "$SUFFIX")" 28 | git clone "https://github.com/$SUFFIX" 29 | cd "$NAME" 30 | REF=$(git tag -l | grep -E "^v?[0-9]+(\.[0-9])*" | sort -t. -k 1.2,1n -k 2,2n -k 3,3n -k 4,4n | tail -n 1) 31 | if [ "$VERSION" = "stable" ] && [ -n "$REF" ]; then 32 | git -c advice.detachedHead=false checkout "$REF" 33 | fi 34 | sh ./bootstrap.sh && ./configure --prefix "$PREFIX" && make && make install 35 | cd .. 36 | done 37 | cd "$PWD" 38 | [ -n "$BUILDDIR" ] && rm -Rf "$BUILDDIR" 39 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": [ 3 | "https://doi.org/10.5063/schema/codemeta-2.0", 4 | "http://schema.org", 5 | "https://w3id.org/software-types" 6 | ], 7 | "@type": "SoftwareSourceCode", 8 | "identifier": "timbl", 9 | "name": "TiMBL", 10 | "version": "6.10", 11 | "description": "TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. 
All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.", 12 | "license": "https://spdx.org/licenses/GPL-3.0", 13 | "url": "https://languagemachines.github.io/timbl", 14 | "author": [ 15 | { 16 | "@type": "Person", 17 | "givenName": "Ko", 18 | "familyName": "van der Sloot", 19 | "email": "ko.vandersloot@let.ru.nl", 20 | "affiliation": { 21 | "@id": "https://www.ru.nl/clst", 22 | "@type": "Organization", 23 | "name": "Centre for Language and Speech Technology", 24 | "url": "https://www.ru.nl/clst", 25 | "parentOrganization": { 26 | "@id": "https://www.ru.nl/cls", 27 | "@type": "Organization", 28 | "name": "Centre for Language Studies", 29 | "url": "https://www.ru.nl/cls", 30 | "parentOrganization": { 31 | "@id": "https://www.ru.nl", 32 | "name": "Radboud University", 33 | "@type": "Organization", 34 | "url": "https://www.ru.nl", 35 | "location": { 36 | "@type": "Place", 37 | "name": "Nijmegen" 38 | } 39 | } 40 | 41 | } 42 | }, 43 | "position": 1 44 | }, 45 | { 46 | "@id": "https://orcid.org/0000-0003-2493-656X", 47 | "@type": "Person", 48 | "givenName": "Antal", 49 | "familyName": "van den Bosch", 50 | "email": "antal.vandenbosch@let.ru.nl", 51 | "affiliation": { "@id": "https://cls.ru.nl" }, 52 | "position": 2 53 | }, 54 | { 55 | "@type": "Person", 56 | "givenName": "Walter", 57 | "familyName": "Daelemans", 58 | "position": 3 59 | }, 60 | { 61 | "@id": "https://orcid.org/0000-0002-1046-0006", 62 | "@type": "Person", 63 | "givenName": "Maarten", 64 | "familyName": "van Gompel", 65 | "email": "proycon@anaproy.nl", 66 | "affiliation": { "@id": "https://knaw.huc.nl" }, 67 | "position": 4 68 | }, 69 | { 70 | "@type": "Person", 71 | "givenName": "Ton", 72 | "familyName": "Weijters", 73 | "position": 5 74 | }, 75 | { 76 | "@type": "Person", 77 | "givenName": "Jakub", 78 | "familyName": "Zavrel", 79 | "position": 6 80 
| } 81 | ], 82 | "sourceOrganization": { "@id": "https://www.ru.nl/clst" }, 83 | "programmingLanguage": { 84 | "@type": "ComputerLanguage", 85 | "identifier": "c++", 86 | "name": "C++" 87 | }, 88 | "operatingSystem": [ "Linux", "BSD", "macOS" ], 89 | "codeRepository": "https://github.com/LanguageMachines/timbl", 90 | "softwareRequirements": [ 91 | { 92 | "@type": "SoftwareApplication", 93 | "identifier": "libxml2", 94 | "name": "libxml2" 95 | }, 96 | { 97 | "@type": "SoftwareApplication", 98 | "identifier": "ticcutils", 99 | "name": "ticcutils" 100 | } 101 | ], 102 | "readme": "https://github.com/LanguageMachines/timbl/blob/master/README.md", 103 | "issueTracker": "https://github.com/LanguageMachines/timbl/issues", 104 | "contIntegration": "https://travis-ci.org/LanguageMachines/timbl", 105 | "releaseNotes": "https://github.com/LanguageMachines/timbl/releases", 106 | "developmentStatus": "https://www.repostatus.org/#active", 107 | "keywords": [ "nlp", "natural language processing", "memory based learning", "machine learning", "knn", "k-nearest neighbours", "decision tree" ], 108 | "referencePublication": [ 109 | { 110 | "@type": "TechArticle", 111 | "name": "TiMBL: Tilburg Memory Based Learner, Reference Guide", 112 | "author": [ "Walter Daelemans", "Jakub Zavrel", "Ko van der Sloot", "Antal van den Bosch" ], 113 | "url": "https://github.com/LanguageMachines/timbl/raw/master/docs/Timbl_6.4_Manual.pdf" 114 | }, 115 | { 116 | "@type": "Book", 117 | "name": "Memory-Based Language Processing", 118 | "author": [ "Walter Daelemans", "Antal van den Bosch" ], 119 | "url": "http://ilk.uvt.nl/mblp", 120 | "publisher": "Cambridge University Press" 121 | } 122 | ], 123 | "dateCreated": "1998", 124 | "targetProduct": [ 125 | { 126 | "@type": "SoftwareLibrary", 127 | "executableName": "libtimbl", 128 | "name": "libtimbl", 129 | "runtimePlatform": [ "Linux", "BSD", "macOS" ], 130 | "description": "Memory-based Learning Library with API for C++" 131 | }, 132 | { 133 | "@type": 
"CommandLineApplication", 134 | "executableName": "timbl", 135 | "name": "timbl", 136 | "runtimePlatform": [ "Linux", "BSD", "macOS" ], 137 | "description": "Memory-based learner, command-line tool" 138 | } 139 | ] 140 | } 141 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | AC_PREREQ([2.69]) 5 | AC_INIT([timbl],[6.10],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! 6 | AM_INIT_AUTOMAKE 7 | AC_CONFIG_SRCDIR([.]) 8 | AC_CONFIG_MACRO_DIR([m4]) 9 | AC_CONFIG_HEADERS([config.h]) 10 | 11 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX_17]) 12 | 13 | # Checks for programs. 14 | AC_PROG_CXX( [g++ c++] ) 15 | AX_CXX_COMPILE_STDCXX_17 16 | 17 | # libtool stuff 18 | LT_INIT 19 | 20 | # when running tests, use CXX 21 | AC_LANG([C++]) 22 | 23 | AC_OPENMP 24 | if test "x$ac_cv_prog_cxx_openmp" != "x"; then 25 | if test "x$ac_cv_prog_cxx_openmp" != "xunsupported"; then 26 | CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS" 27 | AC_DEFINE([HAVE_OPENMP], [1] , [Define to 1 if you have OpenMP] ) 28 | else 29 | AC_MSG_NOTICE([We don't have OpenMP for Clang. Multithreaded operation is di 30 | sabled]) 31 | fi 32 | fi 33 | 34 | #checks for libraries. 35 | 36 | # Checks for header files. 37 | AC_CHECK_HEADERS([sys/time.h]) 38 | 39 | # Checks for typedefs, structures, and compiler characteristics. 40 | AC_HEADER_STDBOOL 41 | AC_C_INLINE 42 | AC_TYPE_SIZE_T 43 | 44 | # Checks for library functions. 
45 | AC_CHECK_FUNCS([floor gettimeofday pow rint sqrt ]) 46 | 47 | PKG_PROG_PKG_CONFIG 48 | 49 | if test "x$prefix" = "xNONE"; then 50 | prefix="/usr/local" 51 | fi 52 | 53 | if test "x$PKG_CONFIG_PATH" = x; then 54 | export PKG_CONFIG_PATH="$prefix/lib/pkgconfig" 55 | else 56 | export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH" 57 | fi 58 | 59 | AC_OSX_PKG( [icu4c] ) 60 | 61 | PKG_PROG_PKG_CONFIG 62 | PKG_CHECK_MODULES([XML2], [libxml-2.0 >= 2.6.16] ) 63 | CXXFLAGS="$CXXFLAGS $XML2_CFLAGS" 64 | LIBS="$LIBS $XML2_LIBS" 65 | 66 | PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.30] ) 67 | CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS" 68 | LIBS="$LIBS $ticcutils_LIBS" 69 | 70 | PKG_CHECK_MODULES([ICU], [icu-uc >= 50 icu-io] ) 71 | CXXFLAGS="$CXXFLAGS $ICU_CFLAGS" 72 | LIBS="$ICU_LIBS $LIBS" 73 | 74 | AC_CONFIG_FILES([ 75 | Makefile 76 | timbl.pc 77 | m4/Makefile 78 | src/Makefile 79 | docs/Makefile 80 | include/Makefile 81 | include/timbl/Makefile 82 | demos/Makefile 83 | ]) 84 | AC_OUTPUT 85 | -------------------------------------------------------------------------------- /demos/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.lo 4 | Makefile 5 | Makefile.in 6 | .deps/ 7 | .libs/ 8 | tse 9 | api_test1 10 | api_test2 11 | api_test3 12 | api_test4 13 | api_test5 14 | api_test6 15 | classify 16 | -------------------------------------------------------------------------------- /demos/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | # $URL: $ 3 | 4 | AM_CPPFLAGS = -I@top_srcdir@/include 5 | AM_CXXFLAGS = -std=c++17 6 | 7 | noinst_PROGRAMS = api_test1 api_test2 api_test3 api_test4 api_test5 api_test6\ 8 | tse classify 9 | 10 | LDADD = ../src/libtimbl.la 11 | 12 | tse_SOURCES = tse.cxx 13 | 14 | classify_SOURCES = classify.cxx 15 | 16 | api_test1_SOURCES = api_test1.cxx 17 | 18 | api_test2_SOURCES = api_test2.cxx 19 | 20 | api_test3_SOURCES = 
api_test3.cxx 21 | 22 | api_test4_SOURCES = api_test4.cxx 23 | 24 | api_test5_SOURCES = api_test5.cxx 25 | 26 | api_test6_SOURCES = api_test6.cxx 27 | 28 | exdir = $(datadir)/doc/@PACKAGE@/examples 29 | 30 | ex_DATA = dimin.script dimin.train dimin.test cross_val.test \ 31 | small_1.train small_2.train small_3.train small_4.train small_5.train 32 | 33 | 34 | EXTRA_DIST = $(ex_DATA) 35 | -------------------------------------------------------------------------------- /demos/api_test1.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 
20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include "timbl/TimblAPI.h" 28 | int main(){ 29 | Timbl::TimblAPI My_Experiment( "-a IGTREE +vDI+DB+F", "test1" ); 30 | My_Experiment.SetOptions( "-w3 -vDB" ); 31 | My_Experiment.ShowSettings( std::cout ); 32 | My_Experiment.Learn( "dimin.train" ); 33 | My_Experiment.Test( "dimin.test", "my_first_test.out" ); 34 | My_Experiment.SetOptions( "-mM" ); 35 | My_Experiment.Test( "dimin.test", "my_first_test.out" ); 36 | } 37 | -------------------------------------------------------------------------------- /demos/api_test2.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 
20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include 28 | 29 | #include "timbl/TimblAPI.h" 30 | 31 | int main(){ 32 | Timbl::TimblAPI *My_Experiment = new Timbl::TimblAPI( "-a IB2 +vF+DI+DB" , 33 | "test2" ); 34 | My_Experiment->SetOptions( "-b100" ); 35 | My_Experiment->ShowSettings( std::cout ); 36 | My_Experiment->Learn( "dimin.train" ); 37 | My_Experiment->Test( "dimin.test", "my_second_test.out" ); 38 | delete My_Experiment; 39 | exit(1); 40 | } 41 | -------------------------------------------------------------------------------- /demos/api_test3.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 
20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include 28 | 29 | #include "timbl/TimblAPI.h" 30 | using Timbl::TimblAPI; 31 | 32 | int main(){ 33 | TimblAPI *My_Experiment = new TimblAPI( "-t cross_validate" ); 34 | My_Experiment->Test( "cross_val.test" ); 35 | delete My_Experiment; 36 | exit(0); 37 | } 38 | -------------------------------------------------------------------------------- /demos/api_test4.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 
20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include 28 | 29 | #include 30 | #include "timbl/TimblAPI.h" 31 | using namespace Timbl; 32 | 33 | int main(){ 34 | TimblAPI *My_Experiment = new TimblAPI( "-a IB1 +vDI+DB +mM" , 35 | "test4" ); 36 | My_Experiment->ShowSettings( std::cout ); 37 | My_Experiment->Learn( "dimin.train" ); 38 | My_Experiment->Test( "dimin.test", "inc1.out" ); 39 | My_Experiment->SaveWeights( "wg.1.wgt" ); 40 | My_Experiment->WriteArrays( "arr.1.arr" ); 41 | My_Experiment->Increment( "=,=,=,=,+,k,e,=,-,r,@,l,T" ); 42 | My_Experiment->Test( "dimin.test", "inc2.out" ); 43 | My_Experiment->SaveWeights( "wg.2.wgt" ); 44 | My_Experiment->WriteArrays( "arr.2.arr" ); 45 | My_Experiment->Increment( "+,zw,A,rt,-,k,O,p,-,n,O,n,E" ); 46 | My_Experiment->Test( "dimin.test", "inc3.out" ); 47 | My_Experiment->SaveWeights( "wg.3.wgt" ); 48 | My_Experiment->WriteArrays( "arr.3.arr" ); 49 | My_Experiment->Decrement( "+,zw,A,rt,-,k,O,p,-,n,O,n,E" ); 50 | My_Experiment->Test( "dimin.test", "inc4.out" ); 51 | My_Experiment->SaveWeights( "wg.4.wgt" ); 52 | My_Experiment->WriteArrays( "arr.4.arr" ); 53 | My_Experiment->Decrement( "=,=,=,=,+,k,e,=,-,r,@,l,T" ); 54 | My_Experiment->Test( "dimin.test", "inc5.out" ); 55 | My_Experiment->SaveWeights( "wg.5.wgt" ); 56 | My_Experiment->WriteArrays( "arr.5.arr" ); 57 | delete My_Experiment; 58 | exit(1); 59 | } 60 | -------------------------------------------------------------------------------- /demos/api_test5.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the 
License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | #include 27 | #include 28 | #include "timbl/TimblAPI.h" 29 | 30 | using std::endl; 31 | using std::cout; 32 | using std::string; 33 | using namespace Timbl; 34 | 35 | int main(){ 36 | TimblAPI *My_Experiment = new TimblAPI( "-a IB1 +vDI+DB+n +mM +k4 " , 37 | "test5" ); 38 | My_Experiment->Learn( "dimin.train" ); 39 | { 40 | icu::UnicodeString line = "=,=,=,=,+,k,e,=,-,r,@,l,T"; 41 | const neighborSet *neighbours1 = My_Experiment->classifyNS( line ); 42 | if ( neighbours1 ){ 43 | cout << "Classify OK on " << line << endl; 44 | cout << neighbours1; 45 | } 46 | else { 47 | cout << "Classify failed on " << line << endl; 48 | neighbours1 = new neighborSet(); 49 | } 50 | neighborSet neighbours2; 51 | line = "+,zw,A,rt,-,k,O,p,-,n,O,n,E"; 52 | if ( My_Experiment->classifyNS( line, neighbours2 ) ){ 53 | cout << "Classify OK on " << line << endl; 54 | cout << neighbours2; 55 | } 56 | else { 57 | cout << "Classify failed on " << line << endl; 58 | } 59 | line = "+,z,O,n,-,d,A,xs,-,=,A,rm,P"; 60 | const neighborSet *neighbours3 = My_Experiment->classifyNS( line ); 61 | if ( neighbours3 ){ 62 | cout << "Classify OK on " << line << endl; 63 | cout << neighbours3; 64 | } 65 | else { 66 | cout << "Classify failed on " << line << endl; 67 | neighbours3 = new neighborSet(); 68 | } 69 | neighborSet uit2; 70 | { 71 | neighborSet uit; 72 | uit.setShowDistance(true); 73 | uit.setShowDistribution(true); 74 | cout << " before first merge 
" << endl; 75 | cout << uit; 76 | uit.merge( *neighbours1 ); 77 | cout << " after first merge " << endl; 78 | cout << uit; 79 | uit.merge( *neighbours3 ); 80 | cout << " after second merge " << endl; 81 | cout << uit; 82 | uit.merge( neighbours2 ); 83 | cout << " after third merge " << endl; 84 | cout << uit; 85 | uit.truncate( 3 ); 86 | cout << " after truncate " << endl; 87 | cout << uit; 88 | cout << " test assignment" << endl; 89 | uit2 = *neighbours1; 90 | } 91 | cout << "assignment result: " << endl; 92 | cout << uit2; 93 | { 94 | cout << " test copy construction" << endl; 95 | neighborSet uit(uit2); 96 | cout << "result: " << endl; 97 | cout << uit; 98 | } 99 | cout << "almost done!" << endl; 100 | } 101 | delete My_Experiment; 102 | cout << "done!" << endl; 103 | } 104 | -------------------------------------------------------------------------------- /demos/api_test6.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 
20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include 28 | #include "timbl/TimblAPI.h" 29 | 30 | using std::cout; 31 | using std::endl; 32 | using namespace Timbl; 33 | 34 | int main(){ 35 | TimblAPI My_Experiment( "-a IB1 +vDI+DB -G 0 -k3", "test6" ); 36 | My_Experiment.Learn( "dimin.train" ); 37 | const ClassDistribution *vd; 38 | const TargetValue *tv 39 | = My_Experiment.Classify( std::string("-,=,O,m,+,h,K,=,-,n,I,N,K"), vd ); 40 | cout << "resulting target: " << tv << endl; 41 | cout << "resulting Distribution: " << vd << endl; 42 | ClassDistribution::dist_iterator it=vd->begin(); 43 | while ( it != vd->end() ){ 44 | cout << it->second << " OR "; 45 | cout << it->second->Value() << " " << it->second->Weight() << endl; 46 | ++it; 47 | } 48 | 49 | cout << "the same with neighborSets" << endl; 50 | const neighborSet *nb = My_Experiment.classifyNS( "-,=,O,m,+,h,K,=,-,n,I,N,K" ); 51 | WClassDistribution *vd2 = nb->bestDistribution(); 52 | vd2->Normalize(); 53 | cout << "default answer " << vd2 << endl; 54 | decayStruct *dc = new expDecay(0.3); 55 | delete vd2; 56 | vd2 = nb->bestDistribution( dc ); 57 | delete dc; 58 | cout << "with exponenial decay, alpha = 0.3 " << vd2 << endl; 59 | delete vd2; 60 | } 61 | -------------------------------------------------------------------------------- /demos/classify.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2015 3 | ILK - Tilburg University 4 | CLiPS - University of Antwerp 5 | 6 | This file is part of timbl 7 | 8 | timbl is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation; either version 3 of the License, or 11 | (at your option) any later version. 
12 | 13 | timbl is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, see . 20 | 21 | For questions and suggestions, see: 22 | http://ilk.uvt.nl/software.html 23 | or send mail to: 24 | timbl@uvt.nl 25 | */ 26 | 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | #include "timbl/TimblAPI.h" 33 | 34 | using namespace std; 35 | using namespace Timbl; 36 | 37 | char inf[] = "./dimin.train"; 38 | char test_f[] = "./dimin.test"; 39 | 40 | int main(){ 41 | string Bresult; 42 | double Distance; 43 | 44 | TimblAPI *Exp = new TimblAPI( "-a TRIBL" ); 45 | Exp->SetOptions( "+vS +x -N30 -q2" ); 46 | Exp->ShowOptions( cout ); 47 | Exp->Learn( inf ); 48 | ifstream testfile; 49 | string Buffer; 50 | testfile.open( test_f, ios::in ); 51 | cout << "\nStart testing, using TRIBL" << endl; 52 | while ( getline( testfile, Buffer ) ){ 53 | const TargetValue *tv = Exp->Classify( Buffer, Distance ); 54 | if ( tv ) 55 | cout << Buffer << "\t --> " << tv << " " << Distance << endl; 56 | else 57 | cout << Buffer << "\t --> (nill)" << endl; 58 | } 59 | testfile.close(); 60 | delete Exp; 61 | Exp = new TimblAPI( "-a IB1" ); 62 | Exp->SetOptions( "+vS" ); 63 | Exp->ShowOptions( cout ); 64 | Exp->Learn( inf ); 65 | testfile.clear(); 66 | testfile.open( test_f, ios::in ); 67 | cout << "\nStart testing, using IB" << endl; 68 | while ( getline( testfile, Buffer ) ){ 69 | if ( Exp->Classify( Buffer, Bresult, Distance ) ){ 70 | cout << Buffer << "\t --> " << Bresult << " " << Distance << endl; 71 | } 72 | else 73 | cout << Buffer << "\t --> (nill)" << endl; 74 | } 75 | testfile.close(); 76 | delete Exp; 77 | Exp = new TimblAPI( "-a IGTREE" ); 78 | Exp->SetOptions( "+vS -N40" ); 79 | Exp->ShowOptions( cout 
); 80 | Exp->Learn( inf ); 81 | Exp->WriteInstanceBase( "dimin.tree" ); 82 | Exp->SaveWeights( "dimin.wgt" ); 83 | cout << "\nStart testing, using IGTree, first run" << endl; 84 | testfile.clear(); 85 | testfile.open( test_f, ios::in ); 86 | while ( getline( testfile, Buffer ) ){ 87 | if ( Exp->Classify( Buffer, Bresult, Distance ) ){ 88 | cout << Buffer << "\t --> " << Bresult << " " << Distance << endl; 89 | } 90 | else 91 | cout << Buffer << "\t --> (nill)" << endl; 92 | } 93 | testfile.close(); 94 | delete Exp; 95 | Exp = new TimblAPI( "-a IGTREE" ); 96 | Exp->SetOptions( "+vS" ); 97 | Exp->ShowOptions( cout ); 98 | Exp->GetInstanceBase( "dimin.tree" ); 99 | Exp->GetWeights( "dimin.wgt" ); 100 | cout << "\nStart testing, using IGTree, second run, (retrieved Tree)" << endl; 101 | testfile.clear(); 102 | testfile.open( test_f, ios::in ); 103 | while ( getline( testfile, Buffer ) ){ 104 | if ( Exp->Classify( Buffer, Bresult, Distance ) ){ 105 | cout << Buffer << "\t --> " << Bresult << " " << Distance << endl; 106 | } 107 | else 108 | cout << Buffer << "\t --> (nill)" << endl; 109 | } 110 | testfile.close(); 111 | exit(1); 112 | } 113 | -------------------------------------------------------------------------------- /demos/cross_val.test: -------------------------------------------------------------------------------- 1 | small_1.train 2 | small_2.train 3 | small_3.train 4 | small_4.train 5 | small_5.train 6 | -------------------------------------------------------------------------------- /demos/dimin.script: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 1998 - 2011 3 | # ILK - Tilburg University 4 | # CLiPS - University of Antwerp 5 | # 6 | # This file is part of timbl 7 | # 8 | # timbl is free software; you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation; either version 3 of the License, or 11 | # (at your 
option) any later version. 12 | # 13 | # timbl is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program; if not, see . 20 | # 21 | # For questions and suggestions, see: 22 | # http://ilk.uvt.nl/software.html 23 | # or send mail to: 24 | # timbl@uvt.nl 25 | 26 | # example script file for tse demo program. 27 | # 28 | # create an experiment with name exp1 29 | # 30 | new exp1 31 | # 32 | # make sure that we see something happen 33 | # 34 | exp1.set +v+f+di+O 35 | # 36 | # set the desired weighting to IG 37 | # 38 | exp1.set +w IG 39 | # 40 | # now train ( prepare is implicit) 41 | # 42 | exp1.train ./dimin.train 43 | # 44 | # save the Instancebase for later use. 45 | # 46 | exp1.save tree.tmp 47 | exp1.show options 48 | # 49 | # 50 | # first we start with OVERLAP metric 51 | # 52 | exp1.set -mO 53 | # 54 | # and test 55 | exp1.test ./dimin.test a1.tmp 56 | # 57 | # now we try the Value Difference Metric 58 | exp1.set -mM 59 | # 60 | 61 | exp1.test ./dimin.test a2.tmp 62 | 63 | # 64 | # start a new experiment: 65 | new exp2 66 | # 67 | # fill it with the tree generated with exp1 68 | # 69 | exp2.get tree.tmp 70 | # 71 | # let's make a lot of noise! 72 | exp2.set +v +o+f+di+n+db 73 | # 74 | # now delete exp1, to demonstrate that it works. 
75 | free exp1 76 | # 77 | # end perform a test with exp2 78 | exp2.test ./dimin.test exp2.out.tmp 79 | # 80 | # ready 81 | -------------------------------------------------------------------------------- /demos/small_1.train: -------------------------------------------------------------------------------- 1 | Rockwell,PUNT,PUNT,PUNT,NNP,PUNT,PUNT,PUNT,I 2 | International,Rockwell,PUNT,PUNT,NNP,NNP,PUNT,PUNT,I 3 | CorpPUNT,International,Rockwell,PUNT,NNP,NNP,NNP,PUNT,I 4 | 's,CorpPUNT,International,Rockwell,POS,NNP,NNP,NNP,I 5 | Tulsa,'s,CorpPUNT,International,NNP,POS,NNP,NNP,B 6 | unit,Tulsa,'s,CorpPUNT,NN,NNP,POS,NNP,I 7 | said,unit,Tulsa,'s,VBD,NN,NNP,POS,I 8 | it,said,unit,Tulsa,PRP,VBD,NN,NNP,O 9 | signed,it,said,unit,VBD,PRP,VBD,NN,I 10 | a,signed,it,said,DT,VBD,PRP,VBD,O 11 | -------------------------------------------------------------------------------- /demos/small_2.train: -------------------------------------------------------------------------------- 1 | tentative,a,signed,it,JJ,DT,VBD,PRP,I 2 | agreement,tentative,a,signed,NN,JJ,DT,VBD,I 3 | extending,agreement,tentative,a,VBG,NN,JJ,DT,I 4 | its,extending,agreement,tentative,PRP$,VBG,NN,JJ,O 5 | contract,its,extending,agreement,NN,PRP$,VBG,NN,I 6 | with,contract,its,extending,IN,NN,PRP$,VBG,I 7 | Boeing,with,contract,its,NNP,IN,NN,PRP$,O 8 | CoPUNT,Boeing,with,contract,NNP,NNP,IN,NN,I 9 | to,CoPUNT,Boeing,with,TO,NNP,NNP,IN,I 10 | provide,to,CoPUNT,Boeing,VB,TO,NNP,NNP,O 11 | -------------------------------------------------------------------------------- /demos/small_3.train: -------------------------------------------------------------------------------- 1 | structural,provide,to,CoPUNT,JJ,VB,TO,NNP,O 2 | parts,structural,provide,to,NNS,JJ,VB,TO,I 3 | for,parts,structural,provide,IN,NNS,JJ,VB,I 4 | Boeing,for,parts,structural,NNP,IN,NNS,JJ,O 5 | 's,Boeing,for,parts,POS,NNP,IN,NNS,I 6 | 747,'s,Boeing,for,CD,POS,NNP,IN,B 7 | jetliners,747,'s,Boeing,NNS,CD,POS,NNP,I 8 | 
PUNT,jetliners,747,'s,PUNT,NNS,CD,POS,I 9 | Rockwell,PUNT,PUNT,jetliners,NNP,PUNT,PUNT,NNS,O 10 | said,Rockwell,PUNT,PUNT,VBD,NNP,PUNT,PUNT,I 11 | -------------------------------------------------------------------------------- /demos/small_4.train: -------------------------------------------------------------------------------- 1 | the,said,Rockwell,PUNT,DT,VBD,NNP,PUNT,O 2 | agreement,the,said,Rockwell,NN,DT,VBD,NNP,I 3 | calls,agreement,the,said,VBZ,NN,DT,VBD,I 4 | for,calls,agreement,the,IN,VBZ,NN,DT,O 5 | it,for,calls,agreement,PRP,IN,VBZ,NN,O 6 | to,it,for,calls,TO,PRP,IN,VBZ,I 7 | supply,to,it,for,VB,TO,PRP,IN,O 8 | 200,supply,to,it,CD,VB,TO,PRP,O 9 | additional,200,supply,to,JJ,CD,VB,TO,I 10 | shipsets,so-called,additional,200,NNS,JJ,JJ,CD,I 11 | -------------------------------------------------------------------------------- /demos/small_5.train: -------------------------------------------------------------------------------- 1 | for,shipsets,so-called,additional,IN,NNS,JJ,JJ,I 2 | the,for,shipsets,so-called,DT,IN,NNS,JJ,O 3 | planes,the,for,shipsets,NNS,DT,IN,NNS,I 4 | PUNT,planes,the,for,PUNT,NNS,DT,IN,I 5 | These,PUNT,PUNT,planes,DT,PUNT,PUNT,NNS,O 6 | include,These,PUNT,PUNT,VBP,DT,PUNT,PUNT,I 7 | among,KOMMA,include,These,IN,KOMMA,VBP,DT,O 8 | other,among,KOMMA,include,JJ,IN,KOMMA,VBP,O 9 | -------------------------------------------------------------------------------- /docs/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | # $URL$ 3 | 4 | man1_MANS = timbl.1 5 | 6 | EXTRA_DIST = timbl.1 7 | -------------------------------------------------------------------------------- /docs/Timbl_6.4_Manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LanguageMachines/timbl/1c58d476a7baf82729502c1cc2804f8d210de70e/docs/Timbl_6.4_Manual.pdf -------------------------------------------------------------------------------- 
/docs/texfiles/fullname.sty: -------------------------------------------------------------------------------- 1 | %%% fullname.sty 2 | %%% 3 | %%% Stuart M. Shieber 4 | %%% Mon Mar 30 17:23:36 EST 1992 5 | 6 | %%% Modifications to the citation macros intended to be used with the 7 | %%% fullname.bst style. 8 | 9 | %%% Some material taken from Peter Patel-Schneider's AAAI style for 10 | %%% use in conjunction with AAAI-named bibliography style. 11 | %%% 12 | %%% Citation forms: 13 | %%% 14 | %%% Macro Output format 15 | %%% ----------- ----------------------------------------- 16 | %%% \cite: (Dewey, 1988) 17 | %%% (Dewey, 1988, page 15) 18 | %%% (Dewey, 1988; Cheatham, 1987; Howe, 1903) 19 | %%% \shortcite: (1988) 20 | %%% (1988, page 15) 21 | %%% \namecite: Dewey (1988) 22 | %%% Dewey (1988, page 15) 23 | 24 | % don't box citations, add space between multiple citations, separate with ; 25 | \def\@citex[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 26 | \def\@citea{}\@cite{\@for\@citeb:=#2\do 27 | {\@citea\def\@citea{; }\@ifundefined 28 | {b@\@citeb}{{\bf ?}\@warning 29 | {Citation `\@citeb' on page \thepage \space undefined}}% 30 | {\csname b@\@citeb\endcsname}}}{#1}} 31 | % Allow short (name-less) citations, when used in 32 | % conjunction with a bibliography style that creates labels like 33 | % \citename{, } 34 | \let\@internalcite\cite 35 | \def\cite{\def\citename##1{##1, }(\@internalcite} 36 | \def\shortcite{\def\citename##1{}(\@internalcite} 37 | \def\namecite{\def\citename##1{##1 (}\@internalcite} 38 | \def\@biblabel#1{\def\citename##1{##1}[#1]\hfill} 39 | 40 | %%% More changes made by SMS (originals in latex.tex) 41 | % Use parentheses instead of square brackets in the text. 42 | \def\@cite#1#2{{#1\if@tempswa , #2\fi})} 43 | 44 | % Don't put a label in the bibliography at all. Just use the unlabeled format 45 | % instead. 
46 | \def\thebibliography#1{\section*{References\@mkboth 47 | {References}{References}}\list 48 | {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent} 49 | \setlength{\itemindent}{-\parindent}} 50 | \def\newblock{\hskip .11em plus .33em minus -.07em} 51 | \sloppy\clubpenalty4000\widowpenalty4000 52 | \sfcode`\.=1000\relax} 53 | \let\endthebibliography=\endlist 54 | 55 | \def\@lbibitem[#1]#2{\item[]\if@filesw 56 | { \def\protect##1{\string ##1\space}\immediate 57 | \write\@auxout{\string\bibcite{#2}{#1}}\fi\ignorespaces}} 58 | 59 | \def\@bibitem#1{\item\if@filesw \immediate\write\@auxout 60 | {\string\bibcite{#1}{\the\c@enumi}}\fi\ignorespaces} 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/timbl.1: -------------------------------------------------------------------------------- 1 | .TH timbl 1 "2017 November 9" 2 | 3 | .SH NAME 4 | timbl \- Tilburg Memory Based Learner 5 | .SH SYNOPSIS 6 | timbl [options] 7 | 8 | timbl \-f data\-file \-t test\(hyfile 9 | 10 | .SH DESCRIPTION 11 | TiMBL is an open source software package implementing several memory\(hybased learning algorithms, among which IB1\(hyIG, an implementation of k\(hynearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision\(hytree approximation of IB1\(hyIG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases. 12 | 13 | .SH OPTIONS 14 | .B \-a 15 | 16 | or 17 | .B \-a 18 | 19 | .RS 20 | determines the classification algorithm. 
21 | 22 | Possible values are: 23 | 24 | .B 0 25 | or 26 | .B IB 27 | the IB1 (k\(hyNN) algorithm (default) 28 | 29 | .B 1 30 | or 31 | .B IGTREE 32 | a decision\(hytree\(hybased approximation of IB1 33 | 34 | .B 2 35 | or 36 | .B TRIBL 37 | a hybrid of IB1 and IGTREE 38 | 39 | .B 3 40 | or 41 | .B IB2 42 | an incremental editing version of IB1 43 | 44 | .B 4 45 | or 46 | .B TRIBL2 47 | a non\(hyparametric version of TRIBL 48 | .RE 49 | 50 | .B \-b 51 | n 52 | .RS 53 | number of lines used for bootstrapping (IB2 only) 54 | .RE 55 | 56 | .B \-B 57 | n 58 | .RS 59 | number of bins used for discretization of numeric feature values (Default B=20) 60 | .RE 61 | 62 | .BR \-\-Beam = 63 | .RS 64 | limit +v db output to n highest\(hyvote classes 65 | .RE 66 | 67 | .BR \-\-clones = 68 | .RS 69 | number of threads to use for parallel testing 70 | .RE 71 | 72 | .B \-c 73 | n 74 | .RS 75 | clipping frequency for prestoring MVDM matrices 76 | .RE 77 | 78 | .B +D 79 | .RS 80 | store distributions on all nodes (necessary for 81 | using +v db with IGTree, but wastes memory otherwise) 82 | .RE 83 | 84 | .B \-\-Diversify 85 | .RS 86 | rescale weight (see docs) 87 | .RE 88 | 89 | .B \-d 90 | val 91 | .RS 92 | weigh neighbors as function of their distance: 93 | Z : equal weights to all (default) 94 | ID : Inverse Distance 95 | IL : Inverse Linear 96 | ED:a : Exponential Decay with factor a (no whitespace!) 97 | ED:a:b : Exponential Decay with factor a and b (no whitespace!) 
98 | .RE 99 | 100 | .B \-e 101 | n 102 | .RS 103 | estimate time until n patterns tested 104 | .RE 105 | 106 | .B \-f 107 | file 108 | .RS 109 | read from data file 'file' OR use filenames from 'file' for cross validation test 110 | .RE 111 | 112 | .B \-F 113 | format 114 | .RS 115 | assume the specified input format 116 | (Compact, C4.5, ARFF, Columns, Binary, Sparse ) 117 | .RE 118 | 119 | .B \-G 120 | normalization 121 | 122 | .RS 123 | normalize distributions (+v db option only) 124 | 125 | Supported normalizations are: 126 | 127 | .B Probability 128 | or 129 | .B 0 130 | 131 | normalize between 0 and 1 132 | 133 | .BR addFactor : 134 | or 135 | .BR 1 : 136 | 137 | add f to all possible targets, then normalize between 0 and 1 (default f=1.0). 138 | 139 | .B logProbability 140 | or 141 | .B 2 142 | 143 | Add 1 to the target Weight, take the 10Log and then normalize between 0 and 1 144 | 145 | .RE 146 | 147 | .B +H 148 | or 149 | .B \-H 150 | .RS 151 | write hashed trees (default +H) 152 | .RE 153 | 154 | .B \-i 155 | file 156 | .RS 157 | read the InstanceBase from 'file' (skips phase 1 & 2 ) 158 | .RE 159 | 160 | .B \-I 161 | file 162 | .RS 163 | dump the InstanceBase in 'file' 164 | .RE 165 | 166 | .B \-k 167 | n 168 | .RS 169 | search 'n' nearest neighbors (default n = 1) 170 | .RE 171 | 172 | .B \-L 173 | n 174 | .RS 175 | set value frequency threshold to back off from MVDM to Overlap at level n 176 | .RE 177 | 178 | .B \-l 179 | n 180 | .RS 181 | fixed feature value length (Compact format only) 182 | .RE 183 | 184 | .B \-m 185 | string 186 | .RS 187 | use feature metrics as specified in 'string': 188 | The format is : GlobalMetric:MetricRange:MetricRange 189 | e.g.: mO:N3:I2,5\-7 190 | 191 | C: cosine distance. (Global only. numeric features implied) 192 | D: dot product. (Global only. 
numeric features implied) 193 | DC: Dice coefficient 194 | O: weighted overlap (default) 195 | E: Euclidian distance 196 | L: Levenshtein distance 197 | M: modified value difference 198 | J: Jeffrey divergence 199 | S: Jensen\(hyShannon divergence 200 | N: numeric values 201 | I: Ignore named values 202 | .RE 203 | 204 | .BR \-\-matrixin =file 205 | .RS 206 | read ValueDifference Matrices from file 'file' 207 | .RE 208 | 209 | .BR \-\-matrixout =file 210 | .RS 211 | store ValueDifference Matrices in 'file' 212 | .RE 213 | 214 | .B \-n 215 | file 216 | .RS 217 | create a C4.5\-style names file 'file' 218 | .RE 219 | 220 | .B \-M 221 | n 222 | .RS 223 | size of MaxBests Array 224 | .RE 225 | 226 | .B \-N 227 | n 228 | .RS 229 | number of features (default 2500) 230 | .RE 231 | 232 | .B \-o 233 | s 234 | .RS 235 | use s as output filename 236 | .RE 237 | 238 | .BR \-\-occurrences = 239 | .RS 240 | The input file contains occurrence counts (at the last position) 241 | value can be one of: 242 | .B train 243 | , 244 | .B test 245 | or 246 | .B both 247 | .RE 248 | 249 | .B \-O 250 | path 251 | .RS 252 | save output using 'path' 253 | .RE 254 | 255 | .B \-p 256 | n 257 | .RS 258 | show progress every n lines (default p = 100,000) 259 | .RE 260 | 261 | .B \-P 262 | path 263 | .RS 264 | read data using 'path' 265 | .RE 266 | 267 | .B \-q 268 | n 269 | .RS 270 | set TRIBL threshold at level n 271 | .RE 272 | 273 | .B \-R 274 | n 275 | .RS 276 | solve ties at random with seed n 277 | .RE 278 | 279 | .B \-s 280 | .RS 281 | use the exemplar weights from the input file 282 | .RE 283 | 284 | .B \-s0 285 | .RS 286 | ignore the exemplar weights from the input file 287 | .RE 288 | 289 | .B \-T 290 | n 291 | .RS 292 | use feature n as the class label. (default: the last feature) 293 | .RE 294 | 295 | .B \-t 296 | file 297 | .RS 298 | test using 'file' 299 | .RE 300 | 301 | .B \-t 302 | leave_one_out 303 | .RS 304 | test with the leave\(hyone\(hyout testing regimen (IB1 only). 
305 | you may add \-\-sloppy to speed up leave\(hyone\(hyout testing (but see docs) 306 | .RE 307 | 308 | .B \-t 309 | cross_validate 310 | .RS 311 | perform cross\(hyvalidation test (IB1 only) 312 | .RE 313 | 314 | .B \-t 315 | @file 316 | .RS 317 | test using files and options described in 'file' 318 | Supported options: d e F k m o p q R t u v w x % \- 319 | .RE 320 | 321 | .B \-\-Treeorder =value 322 | n 323 | .RS 324 | ordering of the Tree: 325 | DO: none 326 | GRO: using GainRatio 327 | IGO: using InformationGain 328 | 1/V: using 1/# of Values 329 | G/V: using GainRatio/# of Values 330 | I/V: using InfoGain/# of Values 331 | X2O: using X\(hysquare 332 | X/V: using X\(hysquare/# of Values 333 | SVO: using Shared Variance 334 | S/V: using Shared Variance/# of Values 335 | GxE: using GainRatio * SplitInfo 336 | IxE: using InformationGain * SplitInfo 337 | 1/S: using 1/SplitInfo 338 | .RE 339 | 340 | .B \-u 341 | file 342 | .RS 343 | read value\(hyclass probabilities from 'file' 344 | .RE 345 | 346 | .B \-U 347 | file 348 | .RS 349 | save value\(hyclass probabilities in 'file' 350 | .RE 351 | 352 | .B \-V 353 | .RS 354 | Show VERSION 355 | .RE 356 | 357 | .B +v 358 | level or 359 | .B \-v 360 | level 361 | .RS 362 | set or unset verbosity level, where level is: 363 | 364 | s: work silently 365 | o: show all options set 366 | b: show node/branch count and branching factor 367 | f: show calculated feature weights (default) 368 | p: show value difference matrices 369 | e: show exact matches 370 | as: show advanced statistics (memory consuming) 371 | cm: show confusion matrix (implies +vas) 372 | cs: show per\(hyclass statistics (implies +vas) 373 | cf: add confidence to output file (needs \-G) 374 | di: add distance to output file 375 | db: add distribution of best matched to output file 376 | md: add matching depth to output file. 
377 | k: add a summary for all k neighbors to output file (sets \-x) 378 | n: add nearest neighbors to output file (sets \-x) 379 | 380 | You may combine levels using '+' e.g. +v p+db or \-v o+di 381 | .RE 382 | 383 | .B \-w 384 | n 385 | .RS 386 | weighting 387 | 0 or nw: no weighting 388 | 1 or gr: weigh using gain ratio (default) 389 | 2 or ig: weigh using information gain 390 | 3 or x2: weigh using the chi\(hysquare statistic 391 | 4 or sv: weigh using the shared variance statistic 392 | 5 or sd: weigh using standard deviation. (all features must be numeric) 393 | .RE 394 | 395 | .B \-w 396 | file 397 | .RS 398 | read weights from 'file' 399 | .RE 400 | 401 | .B \-w 402 | file:n 403 | .RS 404 | read weight n from 'file' 405 | .RE 406 | 407 | .B \-W 408 | file 409 | .RS 410 | calculate and save all weights in 'file' 411 | .RE 412 | 413 | .B +% 414 | or 415 | .B \-% 416 | .RS 417 | do or don't save test result (%) to file 418 | .RE 419 | 420 | .B +x 421 | or 422 | .B \-x 423 | .RS 424 | do or don't use the exact match shortcut 425 | (IB1 and IB2 only, default is \-x) 426 | .RE 427 | 428 | .BR \-X " file" 429 | .RS 430 | dump the InstanceBase as XML in 'file' 431 | .RE 432 | 433 | .SH BUGS 434 | possibly 435 | 436 | .SH AUTHORS 437 | Ko van der Sloot Timbl@uvt.nl 438 | 439 | Antal van den Bosch Timbl@uvt.nl 440 | 441 | .SH SEE ALSO 442 | .BR timblserver (1) 443 | -------------------------------------------------------------------------------- /include/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | # $URL$ 3 | 4 | AUTOMAKE_OPTIONS = foreign 5 | 6 | SUBDIRS = timbl 7 | -------------------------------------------------------------------------------- /include/timbl/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | -------------------------------------------------------------------------------- /include/timbl/BestArray.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_BESTARRAY_H 29 | #define TIMBL_BESTARRAY_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #include "unicode/unistr.h" 36 | #include "libxml/parser.h" 37 | #include "ticcutils/json.hpp" 38 | #include "timbl/Targets.h" 39 | 40 | namespace Timbl { 41 | 42 | class neighborSet; 43 | 44 | class BestRec { 45 | friend std::ostream& operator<< ( std::ostream&, const BestRec * ); 46 | public: 47 | BestRec(); 48 | BestRec( const BestRec& ) = delete; // forbid copies 49 | BestRec& operator=( const BestRec& ) = delete; // forbid copies 50 | ~BestRec(); 51 | size_t totalBests() const { return aggregateDist.totalSize(); }; 52 | double bestDistance; 53 | ClassDistribution aggregateDist; 54 | std::vector bestDistributions; 55 | std::vector bestInstances; 56 | private: 57 | }; 58 | 59 | class BestArray { 60 | friend std::ostream& operator<< ( std::ostream&, const BestArray& ); 61 | public: 62 | BestArray(): 
_storeInstances(false), 63 | _showDi(false), 64 | _showDb(false), 65 | size(0), 66 | maxBests(0) 67 | {}; 68 | ~BestArray(); 69 | void init( unsigned int, unsigned int, bool, bool, bool ); 70 | double addResult( double, 71 | const ClassDistribution *, 72 | const icu::UnicodeString& ); 73 | void initNeighborSet( neighborSet& ) const; 74 | void addToNeighborSet( neighborSet& , size_t ) const; 75 | xmlNode *toXML() const; 76 | nlohmann::json to_JSON() const; 77 | nlohmann::json record_to_json( const BestRec *, size_t ) const; 78 | private: 79 | bool _storeInstances; 80 | bool _showDi; 81 | bool _showDb; 82 | unsigned int size; 83 | unsigned int maxBests; 84 | std::vector bestArray; 85 | }; 86 | 87 | } 88 | #endif // TIMBL_BESTARRAY_H 89 | -------------------------------------------------------------------------------- /include/timbl/Choppers.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMBL_CHOPPERS_H 2 | #define TIMBL_CHOPPERS_H 3 | /* 4 | Copyright (c) 1998 - 2024 5 | ILK - Tilburg University 6 | CLST - Radboud University 7 | CLiPS - University of Antwerp 8 | 9 | This file is part of timbl 10 | 11 | timbl is free software; you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation; either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | timbl is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program; if not, see . 
23 | 24 | For questions and suggestions, see: 25 | https://github.com/LanguageMachines/timbl/issues 26 | or send mail to: 27 | lamasoftware (at ) science.ru.nl 28 | 29 | */ 30 | 31 | #include 32 | #include 33 | 34 | #include // for ostream 35 | #include // for vector 36 | #include "unicode/unistr.h" 37 | #include "unicode/ustream.h" 38 | #include "timbl/Types.h" 39 | 40 | namespace Timbl{ 41 | 42 | static const icu::UnicodeString DefaultSparseString = "0.0000E-17"; 43 | 44 | class Chopper { 45 | public: 46 | Chopper(): 47 | vSize(0) 48 | {}; 49 | virtual ~Chopper() {}; 50 | virtual bool chop( const icu::UnicodeString&, size_t ) = 0; 51 | const icu::UnicodeString& operator[]( int i ) const { 52 | return choppedInput[i]; } 53 | const icu::UnicodeString& getField( size_t i ) const { 54 | return choppedInput[i]; 55 | }; 56 | virtual double getExW() const { return -1; }; 57 | virtual int getOcc() const { return 1; }; 58 | virtual icu::UnicodeString getString() const = 0; 59 | void print( std::ostream& os ){ 60 | os << getString(); 61 | }; 62 | void swapTarget( size_t target_pos ){ 63 | icu::UnicodeString tmp = choppedInput[target_pos]; 64 | for ( size_t i = target_pos+1; i < vSize; ++i ){ 65 | choppedInput[i-1] = choppedInput[i]; 66 | } 67 | choppedInput[vSize-1] = tmp; 68 | } 69 | static Chopper *create( InputFormatType , bool, int, bool ); 70 | static InputFormatType getInputFormat( const icu::UnicodeString&, 71 | bool=false ); 72 | static size_t countFeatures( const icu::UnicodeString&, 73 | InputFormatType, 74 | int, 75 | bool=false ); 76 | protected: 77 | virtual void init( const icu::UnicodeString&, size_t, bool ); 78 | size_t vSize; 79 | icu::UnicodeString strippedInput; 80 | std::vector choppedInput; 81 | }; 82 | 83 | class ExChopper: public virtual Chopper { 84 | public: 85 | ExChopper(): 86 | Chopper(), 87 | exW(-1.0) 88 | {}; 89 | double getExW() const override { return exW; }; 90 | protected: 91 | void init( const icu::UnicodeString&, size_t, bool ) 
override; 92 | double exW; 93 | }; 94 | 95 | class OccChopper: public virtual Chopper { 96 | public: 97 | OccChopper(): 98 | Chopper(), 99 | occ(-1) 100 | {}; 101 | int getOcc() const override { return occ; }; 102 | protected: 103 | void init( const icu::UnicodeString&, size_t, bool ) override; 104 | int occ; 105 | }; 106 | 107 | class C45_Chopper : public virtual Chopper { 108 | public: 109 | bool chop( const icu::UnicodeString&, size_t ) override; 110 | icu::UnicodeString getString() const override; 111 | }; 112 | 113 | class C45_ExChopper : public C45_Chopper, public ExChopper { 114 | }; 115 | 116 | class C45_OccChopper : public C45_Chopper, public OccChopper { 117 | }; 118 | 119 | class ARFF_Chopper : public C45_Chopper { 120 | public: 121 | bool chop( const icu::UnicodeString&, size_t ) override; 122 | }; 123 | 124 | class ARFF_ExChopper : public C45_ExChopper { 125 | }; 126 | 127 | class ARFF_OccChopper : public C45_OccChopper { 128 | }; 129 | 130 | class Bin_Chopper : public virtual Chopper { 131 | public: 132 | bool chop( const icu::UnicodeString&, size_t ) override; 133 | icu::UnicodeString getString() const override; 134 | }; 135 | 136 | class Bin_ExChopper : public Bin_Chopper, public ExChopper { 137 | }; 138 | 139 | class Bin_OccChopper : public Bin_Chopper, public OccChopper { 140 | }; 141 | 142 | class Compact_Chopper : public virtual Chopper { 143 | public: 144 | explicit Compact_Chopper( int L ): fLen(L){}; 145 | bool chop( const icu::UnicodeString&, size_t ) override; 146 | icu::UnicodeString getString() const override; 147 | private: 148 | int fLen; 149 | Compact_Chopper(); 150 | }; 151 | 152 | class Compact_ExChopper : public Compact_Chopper, public ExChopper { 153 | public: 154 | explicit Compact_ExChopper( int L ): Compact_Chopper( L ){}; 155 | private: 156 | Compact_ExChopper(); 157 | }; 158 | 159 | class Compact_OccChopper : public Compact_Chopper, public OccChopper { 160 | public: 161 | explicit Compact_OccChopper( int L ): Compact_Chopper( 
L ){}; 162 | private: 163 | Compact_OccChopper(); 164 | }; 165 | 166 | class Columns_Chopper : public virtual Chopper { 167 | public: 168 | bool chop( const icu::UnicodeString&, size_t ) override; 169 | icu::UnicodeString getString() const override; 170 | }; 171 | 172 | class Columns_ExChopper : public Columns_Chopper, public ExChopper { 173 | }; 174 | 175 | class Columns_OccChopper : public Columns_Chopper, public OccChopper { 176 | }; 177 | 178 | class Tabbed_Chopper : public virtual Chopper { 179 | public: 180 | bool chop( const icu::UnicodeString&, size_t ) override; 181 | icu::UnicodeString getString() const override; 182 | }; 183 | 184 | class Tabbed_ExChopper : public Tabbed_Chopper, public ExChopper { 185 | }; 186 | 187 | class Tabbed_OccChopper : public Tabbed_Chopper, public OccChopper { 188 | }; 189 | 190 | 191 | class Sparse_Chopper : public virtual Chopper { 192 | public: 193 | bool chop( const icu::UnicodeString&, size_t ) override; 194 | icu::UnicodeString getString() const override; 195 | }; 196 | 197 | class Sparse_ExChopper : public Sparse_Chopper, public ExChopper { 198 | }; 199 | 200 | class Sparse_OccChopper : public Sparse_Chopper, public OccChopper { 201 | }; 202 | 203 | } 204 | #endif // TIMBL_CHOPPERS_H 205 | -------------------------------------------------------------------------------- /include/timbl/Common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_COMMON_H 29 | #define TIMBL_COMMON_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include // for isspace 36 | #include // for string 37 | 38 | namespace Common { 39 | const double Epsilon = std::numeric_limits::epsilon(); 40 | // smallest x so that 1+x != 1 41 | const int DEFAULT_MAX_FEATS = 2500; // default maximun number of Features 42 | 43 | std::string Version(); 44 | std::string VersionName(); 45 | std::string BuildInfo(); 46 | std::string VersionInfo( bool ); // obsolete 47 | 48 | inline int look_ahead( std::istream &is ){ 49 | while( is ){ 50 | int nc=is.peek(); 51 | if ( !isspace(nc) ) 52 | return nc; 53 | is.get(); 54 | } 55 | return -1; 56 | } 57 | 58 | inline void skip_spaces( std::istream &is ){ 59 | while( is ){ 60 | int nc=is.peek(); 61 | if ( !isspace(nc) ) 62 | return; 63 | is.get(); 64 | } 65 | } 66 | 67 | inline double Log2(double number){ 68 | // LOG base 2. 
69 | if ( fabs(number) < Epsilon) 70 | return(0.0); 71 | return log2(number); 72 | } 73 | 74 | } 75 | #endif 76 | -------------------------------------------------------------------------------- /include/timbl/Features.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_FEATURES_H 29 | #define TIMBL_FEATURES_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include "timbl/MsgClass.h" 35 | #include "timbl/Matrices.h" 36 | #include "ticcutils/Unicode.h" 37 | 38 | namespace Hash { 39 | class UnicodeHash; 40 | } 41 | 42 | namespace Timbl { 43 | 44 | class ValueClass; 45 | class TargetValue; 46 | class Targets; 47 | class metricClass; 48 | 49 | class SparseValueProbClass { 50 | friend std::ostream& operator<< ( std::ostream&, SparseValueProbClass * ); 51 | public: 52 | using IDmaptype = std::map< size_t, double >; 53 | using IDiterator = IDmaptype::const_iterator; 54 | explicit SparseValueProbClass( size_t d ): dimension(d) {}; 55 | void Assign( const size_t i, const double d ) { vc_map[i] = d; }; 56 | void Clear() { vc_map.clear(); }; 57 | IDiterator begin() const { return vc_map.begin(); }; 58 | IDiterator end() const { return vc_map.end(); }; 59 | private: 60 | IDmaptype vc_map; 61 | size_t dimension; 62 | }; 63 | 64 | enum FeatVal_Stat { 65 | Unknown, 66 | Singleton, 67 | SingletonNumeric, 68 | NumericValue, 69 | NotNumeric 70 | }; 71 | 72 | class FeatureValue: public ValueClass { 73 | friend class Feature; 74 | friend class Feature_List; 75 | friend struct D_D; 76 | public: 77 | explicit FeatureValue( const icu::UnicodeString& ); 78 | FeatureValue( const icu::UnicodeString&, size_t ); 79 | ~FeatureValue() override; 80 | void ReconstructDistribution( const ClassDistribution& vd ) { 81 | TargetDist.Merge( vd ); 82 | _frequency = TargetDist.totalSize(); 83 | }; 84 | bool isUnknown() const { return _index == 0; }; 85 | SparseValueProbClass *valueClassProb() const { return ValueClassProb; }; 86 | private: 87 | SparseValueProbClass *ValueClassProb; 88 | ClassDistribution TargetDist; 89 | }; 90 | 91 | 92 | class Feature: public MsgClass { 93 | 
friend class MBLClass; 94 | friend class Feature_List; 95 | public: 96 | explicit Feature( Hash::UnicodeHash *T ); 97 | ~Feature() override; 98 | bool Ignore() const { return ignore; }; 99 | void Ignore( const bool val ){ ignore = val; }; 100 | bool setMetricType( const MetricType ); 101 | MetricType getMetricType() const; 102 | double Weight() const { return weight; }; 103 | void SetWeight( const double w ) { weight = w; }; 104 | double InfoGain() const { return info_gain; }; 105 | void InfoGain( const double w ){ info_gain = w; }; 106 | double SplitInfo() const { return split_info; }; 107 | void SplitInfo( const double w ){ split_info = w; }; 108 | double GainRatio() const { return gain_ratio; }; 109 | void GainRatio( const double w ){ gain_ratio = w; }; 110 | double ChiSquare() const { return chi_square; }; 111 | void ChiSquare( const double w ){ chi_square = w; }; 112 | double SharedVariance() const { return shared_variance; }; 113 | void SharedVariance( const double w ){ shared_variance = w; }; 114 | double StandardDeviation() const { return standard_deviation; }; 115 | void StandardDeviation( const double w ){ standard_deviation = w; }; 116 | double Min() const { return n_min; }; 117 | void Min( const double val ){ n_min = val; }; 118 | double Max() const { return n_max; }; 119 | void Max( const double val ){ n_max = val; }; 120 | double fvDistance( const FeatureValue *, 121 | const FeatureValue *, 122 | size_t=1 ) const; 123 | FeatureValue *add_value( const icu::UnicodeString&, TargetValue *, int=1 ); 124 | FeatureValue *add_value( size_t, TargetValue *, int=1 ); 125 | FeatureValue *Lookup( const icu::UnicodeString& ) const; 126 | bool decrement_value( FeatureValue *, const TargetValue * ); 127 | bool increment_value( FeatureValue *, const TargetValue * ); 128 | size_t EffectiveValues() const; 129 | size_t TotalValues() const; 130 | bool isNumerical() const; 131 | bool isStorableMetric() const; 132 | bool AllocSparseArrays( size_t ); 133 | void 
InitSparseArrays(); 134 | bool ArrayRead(){ return vcpb_read; }; 135 | bool matrixPresent( bool& ) const; 136 | size_t matrix_byte_size() const; 137 | bool store_matrix( int = 1 ); 138 | void clear_matrix(); 139 | bool fill_matrix( std::istream& ); 140 | void print_matrix( std::ostream&, bool = false ) const; 141 | void print_vc_pb_array( std::ostream& ) const; 142 | bool read_vc_pb_array( std::istream & ); 143 | FeatVal_Stat prepare_numeric_stats(); 144 | void Statistics( double, const Targets&, bool ); 145 | void NumStatistics( double, const Targets&, int, bool ); 146 | void ClipFreq( size_t f ){ matrix_clip_freq = f; }; 147 | size_t ClipFreq() const { return matrix_clip_freq; }; 148 | SparseSymetricMatrix *metric_matrix; 149 | private: 150 | Feature( const Feature& ); 151 | Feature& operator=( const Feature& ); 152 | Hash::UnicodeHash *TokenTree; 153 | metricClass *metric; 154 | bool ignore; 155 | bool numeric; 156 | bool vcpb_read; 157 | enum ps_stat{ ps_undef, ps_failed, ps_ok, ps_read }; 158 | enum ps_stat PrestoreStatus; 159 | MetricType Prestored_metric; 160 | void delete_matrix(); 161 | double entropy; 162 | double info_gain; 163 | double split_info; 164 | double gain_ratio; 165 | double chi_square; 166 | double shared_variance; 167 | double standard_deviation; 168 | size_t matrix_clip_freq; 169 | std::vector n_dot_j; 170 | std::vector n_i_dot; 171 | double n_min; 172 | double n_max; 173 | double weight; 174 | void Statistics( double ); 175 | void NumStatistics( std::vector&, double ); 176 | void ChiSquareStatistics( const std::vector&, 177 | const Targets& ); 178 | void ChiSquareStatistics( const Targets& ); 179 | void SharedVarianceStatistics( const Targets&, int ); 180 | void StandardDeviationStatistics(); 181 | std::vector values_array; 182 | std::unordered_map< size_t, FeatureValue *> reverse_values; 183 | bool is_reference; 184 | }; 185 | 186 | class Feature_List: public MsgClass { 187 | friend class MBLClass; 188 | public: 189 | Feature_List(): 190 
| _eff_feats(0), 191 | _num_of_feats(0), 192 | _num_of_num_feats(0), 193 | _feature_hash(0), 194 | _is_reference(false) 195 | { 196 | } 197 | explicit Feature_List( Hash::UnicodeHash *hash ): 198 | Feature_List() 199 | { 200 | _feature_hash = hash; 201 | } 202 | Feature_List &operator=( const Feature_List& ); 203 | ~Feature_List() override; 204 | void init( size_t, const std::vector& ); 205 | Hash::UnicodeHash *hash() const { return _feature_hash; }; 206 | size_t effective_feats(){ return _eff_feats; }; 207 | Feature *operator[]( size_t i ) const { return feats[i]; }; 208 | void write_permutation( std::ostream & ) const; 209 | void calculate_permutation( const std::vector& ); 210 | size_t _eff_feats; 211 | size_t _num_of_feats; 212 | size_t _num_of_num_feats; 213 | std::vector feats; 214 | std::vector perm_feats; 215 | std::vector permutation; 216 | private: 217 | Hash::UnicodeHash *_feature_hash; 218 | bool _is_reference; 219 | }; 220 | 221 | } // namespace Timbl 222 | 223 | #endif // TIMBL_FEATURES_H 224 | -------------------------------------------------------------------------------- /include/timbl/GetOptClass.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 
18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_GETOPTCLASS_H 29 | #define TIMBL_GETOPTCLASS_H 30 | 31 | #include 32 | #include 33 | 34 | namespace TiCC { 35 | class CL_Options; 36 | } 37 | namespace Timbl { 38 | class TimblExperiment; 39 | class GetOptClass: public MsgClass { 40 | public: 41 | explicit GetOptClass( const TiCC::CL_Options& ); 42 | GetOptClass& operator=( const GetOptClass& ) = delete; // forbid copies 43 | virtual ~GetOptClass() override; 44 | GetOptClass *Clone( std::ostream * = 0 ) const; 45 | bool parse_options( const TiCC::CL_Options&, const int=0 ); 46 | void set_default_options( const int=0 ); 47 | bool definitive_options( TimblExperiment * ); 48 | AlgorithmType Algo() const { return local_algo; }; 49 | int MaxFeatures() const { return MaxFeats; }; 50 | VerbosityFlags getVerbosity() { return myVerbosity; }; 51 | private: 52 | GetOptClass( const GetOptClass& ); 53 | AlgorithmType local_algo; 54 | MetricType local_metric; 55 | OrdeningType local_order; 56 | WeightType local_weight; 57 | InputFormatType LocalInputFormat; 58 | DecayType local_decay; 59 | double local_decay_alfa; 60 | double local_decay_beta; 61 | normType local_normalisation; 62 | double local_norm_factor; 63 | int MaxFeats; 64 | int target_pos; 65 | int no_neigh; 66 | int mvd_limit; 67 | int estimate; 68 | int maxbests; 69 | int clip_freq; 70 | int clones; 71 | int BinSize; 72 | int BeamSize; 73 | int bootstrap_lines; 74 | int f_length; 75 | int local_progress; 76 | int seed; 77 | int threshold; 78 | int igThreshold; 79 | VerbosityFlags myVerbosity; 80 | bool opt_init; 81 | bool opt_changed; 82 | bool do_exact; 83 | bool do_hashed; 84 | bool min_present; 85 | bool N_present; 86 | bool keep_distributions; 87 | bool 
do_sample_weights; 88 | bool do_ignore_samples; 89 | bool do_ignore_samples_test; 90 | bool do_query; 91 | bool do_all_weights; 92 | bool do_sloppy_loo; 93 | bool do_silly; 94 | bool do_diversify; 95 | std::vectormetricsArray; 96 | std::ostream *parent_socket_os; 97 | std::string inPath; 98 | std::string outPath; 99 | int occIn; 100 | void Error( const std::string& ) const override; 101 | inline bool parse_range( std::string&, 102 | std::string::iterator&, 103 | MetricType ); 104 | inline bool parse_metrics( const std::string&, 105 | MetricType& ); 106 | }; 107 | 108 | } 109 | #endif 110 | -------------------------------------------------------------------------------- /include/timbl/IBtree.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_IBTREE_H 29 | #define TIMBL_IBTREE_H 30 | 31 | #include 32 | 33 | #include "ticcutils/XMLtools.h" 34 | #include "timbl/MsgClass.h" 35 | 36 | //#define IBSTATS 37 | 38 | namespace Hash { 39 | class UnicodeHash; 40 | } 41 | 42 | namespace Timbl { 43 | 44 | class IB_InstanceBase; 45 | class IG_InstanceBase; 46 | class TRIBL_InstanceBase; 47 | class TRIBL2_InstanceBase; 48 | class Feature; 49 | class FeatureValue; 50 | class Instance; 51 | class Feature_List; 52 | class Targets; 53 | class TargetValue; 54 | class ClassDistribution; 55 | class WClassDistribution; 56 | 57 | class IBtree { 58 | friend class InstanceBase_base; 59 | friend class IB_InstanceBase; 60 | friend class IG_InstanceBase; 61 | friend class TRIBL_InstanceBase; 62 | friend class TRIBL2_InstanceBase; 63 | friend std::ostream &operator<<( std::ostream&, const IBtree& ); 64 | friend std::ostream &operator<<( std::ostream&, const IBtree * ); 65 | friend xmlNode *to_xml( IBtree *pnt ); 66 | friend int count_next( const IBtree * ); 67 | public: 68 | const TargetValue* targetValue() const { return TValue; }; 69 | private: 70 | FeatureValue *FValue; 71 | const TargetValue *TValue; 72 | ClassDistribution *TDistribution; 73 | IBtree *link; 74 | IBtree *next; 75 | 76 | IBtree(); 77 | explicit IBtree( FeatureValue * ); 78 | IBtree( const IBtree& ) = delete; // forbid copies 79 | IBtree& operator=( const IBtree& ) = delete; // forbid copies 80 | ~IBtree(); 81 | IBtree *Reduce( const TargetValue *, unsigned long&, long ); 82 | #ifdef IBSTATS 83 | static inline IBtree *add_feat_val( FeatureValue *, 84 | unsigned int&, 85 | IBtree *&, 86 | unsigned long& ); 87 | #else 88 | static inline IBtree *add_feat_val( FeatureValue *, 89 | IBtree *&, 90 | unsigned long& ); 91 | #endif 92 | inline ClassDistribution *sum_distributions( bool 
); 93 | inline IBtree *make_unique( const TargetValue *, unsigned long& ); 94 | void cleanDistributions(); 95 | void re_assign_defaults( bool, bool ); 96 | void assign_defaults( bool, bool, size_t ); 97 | void redo_distributions(); 98 | void countBranches( unsigned int, 99 | std::vector&, 100 | std::vector& ); 101 | const ClassDistribution *exact_match( const Instance& ) const; 102 | protected: 103 | const IBtree *search_node( const FeatureValue * ) const; 104 | }; 105 | 106 | using FI_map = std::unordered_map; 107 | 108 | class InstanceBase_base: public MsgClass { 109 | friend class IG_InstanceBase; 110 | friend class TRIBL_InstanceBase; 111 | friend class TRIBL2_InstanceBase; 112 | InstanceBase_base( const InstanceBase_base& ) = delete; // forbid copies 113 | InstanceBase_base& operator=( const InstanceBase_base& ) = delete; // forbid copies 114 | friend std::ostream& operator<<( std::ostream &os, 115 | const InstanceBase_base& ); 116 | friend std::ostream& operator<<( std::ostream &os, 117 | const InstanceBase_base * ); 118 | public: 119 | InstanceBase_base( size_t, unsigned long&, bool, bool ); 120 | virtual ~InstanceBase_base( void ) override; 121 | void AssignDefaults( void ); 122 | void RedoDistributions(); 123 | bool AddInstance( const Instance& ); 124 | void RemoveInstance( const Instance& ); 125 | void summarizeNodes( std::vector&, 126 | std::vector& ); 127 | virtual bool MergeSub( InstanceBase_base * ); 128 | const ClassDistribution *ExactMatch( const Instance& I ) const { 129 | return InstBase->exact_match( I ); }; 130 | virtual const ClassDistribution *InitGraphTest( std::vector&, 131 | const std::vector *, 132 | const size_t, 133 | const size_t ); 134 | virtual const ClassDistribution *NextGraphTest( std::vector&, 135 | size_t& ); 136 | unsigned long int GetDistSize( ) const { return NumOfTails; }; 137 | virtual const ClassDistribution *IG_test( const Instance& , size_t&, bool&, 138 | const TargetValue *& ); 139 | virtual IB_InstanceBase *TRIBL_test( 
const Instance& , size_t, 140 | const TargetValue *&, 141 | const ClassDistribution *&, 142 | size_t& ); 143 | virtual IB_InstanceBase *TRIBL2_test( const Instance& , 144 | const ClassDistribution *&, 145 | size_t& ); 146 | bool read_hash( std::istream&, 147 | Hash::UnicodeHash&, 148 | Hash::UnicodeHash& ) const; 149 | virtual InstanceBase_base *Copy() const = 0; 150 | virtual InstanceBase_base *clone() const = 0; 151 | void Save( std::ostream&, 152 | bool=false ); 153 | void Save( std::ostream&, 154 | const Hash::UnicodeHash&, 155 | const Hash::UnicodeHash&, 156 | bool=false ); 157 | void toXML( std::ostream& ); 158 | void printStatsTree( std::ostream&, unsigned int startLevel ); 159 | virtual bool ReadIB( std::istream&, 160 | Feature_List&, 161 | Targets&, 162 | int ); 163 | virtual bool ReadIB_hashed( std::istream&, 164 | Feature_List&, 165 | Targets&, 166 | int ); 167 | virtual void Prune( const TargetValue *, long = 0 ); 168 | virtual bool IsPruned() const { return false; }; 169 | void CleanPartition( bool ); 170 | unsigned long int GetSizeInfo( unsigned long int&, double & ) const; 171 | const ClassDistribution *TopDist() const { return TopDistribution; }; 172 | bool HasDistributions() const; 173 | const TargetValue *TopTarget( bool & ); 174 | bool PersistentD() const { return PersistentDistributions; }; 175 | unsigned long int nodeCount() const { return ibCount;} ; 176 | size_t depth() const { return Depth;} ; 177 | const IBtree *instBase() const { return InstBase; }; 178 | 179 | #ifdef IBSTATS 180 | std::vector mismatch; 181 | #endif 182 | protected: 183 | bool DefAss; 184 | bool DefaultsValid; 185 | bool Random; 186 | bool PersistentDistributions; 187 | int Version; 188 | ClassDistribution *TopDistribution; 189 | WClassDistribution *WTop; 190 | const TargetValue *TopT; 191 | FI_map fast_index; 192 | bool tiedTop; 193 | IBtree *InstBase; 194 | IBtree *LastInstBasePos; 195 | std::vector RestartSearch; 196 | std::vector SkipSearch; 197 | std::vector InstPath; 
198 | unsigned long int& ibCount; 199 | 200 | size_t Depth; 201 | unsigned long int NumOfTails; 202 | IBtree *read_list( std::istream&, 203 | Feature_List&, 204 | Targets&, 205 | int ); 206 | IBtree *read_local( std::istream&, 207 | Feature_List&, 208 | Targets&, 209 | int ); 210 | IBtree *read_list_hashed( std::istream&, 211 | Feature_List&, 212 | Targets&, 213 | int ); 214 | IBtree *read_local_hashed( std::istream&, 215 | Feature_List&, 216 | Targets&, 217 | int ); 218 | void write_tree( std::ostream &os, const IBtree * ) const; 219 | void write_tree_hashed( std::ostream &os, const IBtree * ) const; 220 | bool read_IB( std::istream&, 221 | Feature_List& , 222 | Targets&, 223 | int ); 224 | bool read_IB_hashed( std::istream&, 225 | Feature_List& , 226 | Targets&, 227 | int ); 228 | void fill_index(); 229 | const IBtree *fast_search_node( const FeatureValue * ); 230 | }; 231 | 232 | class IB_InstanceBase: public InstanceBase_base { 233 | public: 234 | IB_InstanceBase( size_t size, unsigned long& cnt, bool rand ): 235 | InstanceBase_base( size, cnt, rand , false ), 236 | offSet(0), 237 | effFeat(0), 238 | testInst(0) 239 | {}; 240 | IB_InstanceBase *Copy() const override; 241 | IB_InstanceBase *clone() const override; 242 | const ClassDistribution *InitGraphTest( std::vector&, 243 | const std::vector *, 244 | const size_t, 245 | const size_t ) override; 246 | const ClassDistribution *NextGraphTest( std::vector&, 247 | size_t& ) override; 248 | private: 249 | size_t offSet; 250 | size_t effFeat; 251 | const std::vector *testInst; 252 | }; 253 | 254 | class IG_InstanceBase: public InstanceBase_base { 255 | public: 256 | IG_InstanceBase( size_t size, unsigned long& cnt, 257 | bool rand, bool pruned, bool keep_dists ): 258 | InstanceBase_base( size, cnt, rand, keep_dists ), Pruned( pruned ) {}; 259 | IG_InstanceBase *clone() const override; 260 | IG_InstanceBase *Copy() const override; 261 | void Prune( const TargetValue *, long = 0 ) override; 262 | void specialPrune( 
const TargetValue * ); 263 | bool IsPruned() const override { return Pruned; }; 264 | const ClassDistribution *IG_test( const Instance& , 265 | size_t&, 266 | bool&, 267 | const TargetValue *& ) override; 268 | bool ReadIB( std::istream&, 269 | Feature_List&, 270 | Targets&, 271 | int ) override; 272 | bool ReadIB_hashed( std::istream&, 273 | Feature_List&, 274 | Targets&, 275 | int ) override; 276 | bool MergeSub( InstanceBase_base * ) override; 277 | protected: 278 | bool Pruned; 279 | }; 280 | 281 | class TRIBL_InstanceBase: public InstanceBase_base { 282 | public: 283 | TRIBL_InstanceBase( size_t size, unsigned long& cnt, 284 | bool rand, bool keep_dists ): 285 | InstanceBase_base( size, cnt, rand, keep_dists ), Threshold(0) {}; 286 | TRIBL_InstanceBase *clone() const override; 287 | TRIBL_InstanceBase *Copy() const override; 288 | IB_InstanceBase *TRIBL_test( const Instance&, 289 | size_t, 290 | const TargetValue *&, 291 | const ClassDistribution *&, 292 | size_t& ) override; 293 | private: 294 | IB_InstanceBase *IBPartition( IBtree * ) const; 295 | void AssignDefaults( size_t ); 296 | size_t Threshold; 297 | }; 298 | 299 | class TRIBL2_InstanceBase: public InstanceBase_base { 300 | public: 301 | TRIBL2_InstanceBase( size_t size, unsigned long& cnt, 302 | bool rand, bool keep_dists ): 303 | InstanceBase_base( size, cnt, rand, keep_dists ) { 304 | }; 305 | TRIBL2_InstanceBase *clone() const override; 306 | TRIBL2_InstanceBase *Copy() const override; 307 | IB_InstanceBase *TRIBL2_test( const Instance& , 308 | const ClassDistribution *&, 309 | size_t& ) override; 310 | private: 311 | IB_InstanceBase *IBPartition( IBtree * ) const; 312 | }; 313 | 314 | } 315 | #endif 316 | -------------------------------------------------------------------------------- /include/timbl/Instance.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - 
University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_INSTANCE_H 29 | #define TIMBL_INSTANCE_H 30 | 31 | #include "ticcutils/Unicode.h" 32 | #include "timbl/Targets.h" 33 | #include "timbl/Features.h" 34 | 35 | namespace Hash { 36 | class UnicodeHash; 37 | } 38 | 39 | namespace Timbl { 40 | 41 | class TargetValue; 42 | class FeatureValue; 43 | 44 | class Instance { 45 | friend std::ostream& operator<<(std::ostream&, const Instance& ); 46 | friend std::ostream& operator<<(std::ostream&, const Instance * ); 47 | public: 48 | Instance(); 49 | explicit Instance( size_t s ): Instance() { Init( s ); }; 50 | Instance( const Instance& ) = delete; // inhibit copies 51 | Instance& operator=( const Instance& ) = delete; // inhibit copies 52 | ~Instance(); 53 | void Init( size_t ); 54 | void clear(); 55 | double ExemplarWeight() const { return sample_weight; }; 56 | void ExemplarWeight( const double sw ){ sample_weight = sw; }; 57 | int Occurrences() const { return occ; }; 58 | void Occurrences( const int o ) { occ = o; }; 59 | size_t size() const { return FV.size(); }; 60 | std::vector FV; 61 | TargetValue *TV; 62 | private: 63 | double sample_weight; // relative weight 
64 | int occ; 65 | }; 66 | 67 | } 68 | #endif 69 | -------------------------------------------------------------------------------- /include/timbl/MBLClass.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_MBLCLASS_H 29 | #define TIMBL_MBLCLASS_H 30 | 31 | #include "timbl/Instance.h" 32 | #include "timbl/BestArray.h" 33 | #include "timbl/neighborSet.h" 34 | #include "timbl/Options.h" 35 | 36 | using xmlNode = struct _xmlNode; 37 | 38 | namespace Hash { 39 | class UnicodeHash; 40 | } 41 | 42 | namespace Timbl { 43 | using namespace Common; 44 | 45 | class InstanceBase_base; 46 | class TesterClass; 47 | class Chopper; 48 | class neighborSet; 49 | 50 | class MBLClass: public MsgClass { 51 | public: 52 | bool SetOption( const std::string& ); 53 | xmlNode *settingsToXml() const; 54 | virtual nlohmann::json settings_to_JSON(); 55 | bool ShowWeights( std::ostream& ) const; 56 | bool Verbosity( VerbosityFlags v ) const { 57 | return verbosity & v; }; 58 | void SetVerbosityFlag( VerbosityFlags v ) { verbosity |= v; }; 59 | void ResetVerbosityFlag( VerbosityFlags v ) { verbosity &= ~v; }; 60 | bool MBLInit() const { return MBL_init; }; 61 | void MBLInit( bool b ) { MBL_init = b; }; 62 | bool ExpInvalid( bool b = true ) const { 63 | if ( err_cnt > 0 ){ 64 | if ( b ){ 65 | InvalidMessage(); 66 | } 67 | return true; 68 | } 69 | else 70 | return false; 71 | }; 72 | WeightType CurrentWeighting() const { return Weighting; }; 73 | InputFormatType InputFormat() const { return input_format; }; 74 | bool connectToSocket( std::ostream *, bool = false ); 75 | std::ostream *sock_os; 76 | bool sock_is_json; 77 | mutable nlohmann::json last_error; 78 | int getOcc() const { return doOcc; }; 79 | protected: 80 | explicit MBLClass( const std::string& = "" ); 81 | void init_options_table( size_t ); 82 | MBLClass& operator=( const MBLClass& ); 83 | enum PhaseValue { TrainWords, LearnWords, TestWords, TrainLearnWords }; 84 | friend std::ostream& operator<< ( std::ostream&, const PhaseValue& ); 85 | enum 
IB_Stat { Invalid, Normal, Pruned }; 86 | 87 | bool writeArrays( std::ostream& ); 88 | bool readArrays( std::istream& ); 89 | bool writeMatrices( std::ostream& ) const; 90 | bool readMatrices( std::istream& ); 91 | bool writeWeights( std::ostream& ) const; 92 | bool readWeights( std::istream&, WeightType ); 93 | bool writeNamesFile( std::ostream& ) const; 94 | virtual bool ShowOptions( std::ostream& ); 95 | virtual bool ShowSettings( std::ostream& ); 96 | void writePermutation( std::ostream& ) const; 97 | void LearningInfo( std::ostream& ); 98 | virtual ~MBLClass() override; 99 | void Initialize( size_t ); 100 | bool PutInstanceBase( std::ostream& ) const; 101 | VerbosityFlags get_verbosity() const { return verbosity; }; 102 | void set_verbosity( VerbosityFlags v ) { verbosity = v; }; 103 | const Instance *chopped_to_instance( PhaseValue ); 104 | bool Chop( const icu::UnicodeString& ); 105 | bool HideInstance( const Instance& ); 106 | bool UnHideInstance( const Instance& ); 107 | icu::UnicodeString formatInstance( const std::vector&, 108 | const std::vector&, 109 | size_t, size_t ) const; 110 | bool setInputFormat( const InputFormatType ); 111 | size_t countFeatures( const icu::UnicodeString&, 112 | const InputFormatType ) const; 113 | InputFormatType getInputFormat( const icu::UnicodeString& ) const; 114 | size_t examineData( const std::string& ); 115 | void time_stamp( const char *, int =-1 ) const; 116 | void TestInstance( const Instance& , 117 | InstanceBase_base * = NULL, 118 | size_t = 0 ); 119 | icu::UnicodeString get_org_input( ) const; 120 | const ClassDistribution *ExactMatch( const Instance& ) const; 121 | void fillNeighborSet( neighborSet& ) const; 122 | void addToNeighborSet( neighborSet& ns, size_t n ) const; 123 | double getBestDistance() const; 124 | WClassDistribution *getBestDistribution( unsigned int =0 ); 125 | IB_Stat IBStatus() const; 126 | bool get_ranges( const std::string& ); 127 | size_t get_IB_Info( std::istream&, bool&, int&, bool&, 
std::string& ); 128 | size_t NumOfFeatures() const { return features._num_of_feats; }; 129 | size_t targetPos() const { return target_pos; }; 130 | size_t NumNumFeatures() const { return features._num_of_num_feats; }; 131 | size_t EffectiveFeatures() const { return features._eff_feats; }; 132 | void IBInfo( std::ostream& os ) const; 133 | void MatrixInfo( std::ostream& ) const; 134 | int RandomSeed() const { return random_seed; }; 135 | void Info( const std::string& ) const override; 136 | void Warning( const std::string& ) const override; 137 | void Error( const std::string& ) const override; 138 | void FatalError( const std::string& ) const override; 139 | size_t MaxFeats() const { return MaxFeatures; }; 140 | int Progress() const { return progress; }; 141 | void Progress( int p ){ progress = p; }; 142 | std::string extract_limited_m( size_t ); 143 | Targets targets; 144 | Feature_List features; 145 | InstanceBase_base *InstanceBase; 146 | std::ostream *mylog; 147 | std::ostream *myerr; 148 | size_t TRIBL_offset() const { return tribl_offset; }; 149 | unsigned int igOffset() const { return igThreshold; }; 150 | unsigned int IB2_offset() const { return ib2_offset; }; 151 | void IB2_offset( unsigned int n ) { ib2_offset = n; }; 152 | bool Do_Sloppy_LOO() const { return do_sloppy_loo; }; 153 | bool doSamples() const { 154 | return do_sample_weighting && !do_ignore_samples; }; 155 | bool Do_Exact() const { return do_exact_match; }; 156 | void Do_Exact( bool b ) { do_exact_match = b; }; 157 | void InitWeights(); 158 | void diverseWeights(); 159 | bool KeepDistributions() const { return keep_distributions; }; 160 | void KeepDistributions( bool f ){ keep_distributions = f; }; 161 | 162 | bool IsClone() const { return is_copy; }; 163 | void default_order(); 164 | void set_order(void); 165 | void calculatePermutation( const std::vector& ); 166 | void calculate_fv_entropy( bool ); 167 | bool recalculate_stats( Feature_List&, 168 | std::vector&, 169 | bool ); 170 | 
OptionTableClass Options; 171 | PhaseValue runningPhase; 172 | WeightType Weighting; 173 | metricClass *GlobalMetric; 174 | OrdeningType TreeOrder; 175 | size_t num_of_neighbors; 176 | bool dynamic_neighbors; 177 | DecayType decay_flag; 178 | std::string exp_name; 179 | Instance CurrInst; 180 | BestArray bestArray; 181 | size_t MaxBests; 182 | neighborSet nSet; 183 | decayStruct *decay; 184 | int beamSize; 185 | normType normalisation; 186 | double norm_factor; 187 | bool is_copy; 188 | bool is_synced; 189 | unsigned int ib2_offset; 190 | int random_seed; 191 | double decay_alfa; 192 | double decay_beta; 193 | bool MBL_init; 194 | bool tableFilled; 195 | MetricType globalMetricOption; 196 | bool do_diversify; 197 | bool initProbabilityArrays( bool ); 198 | void calculatePrestored(); 199 | void initDecay(); 200 | void initTesters(); 201 | Chopper *ChopInput; 202 | int F_length; 203 | private: 204 | size_t MaxFeatures; 205 | std::vector UserOptions; 206 | InputFormatType input_format; 207 | VerbosityFlags verbosity; 208 | size_t target_pos; 209 | int clip_factor; 210 | int Bin_Size; 211 | int progress; 212 | size_t tribl_offset; 213 | unsigned igThreshold; 214 | int mvd_threshold; 215 | bool do_sloppy_loo; 216 | bool do_exact_match; 217 | bool do_silly_testing; 218 | bool hashed_trees; 219 | bool need_all_weights; 220 | bool do_sample_weighting; 221 | bool do_ignore_samples; 222 | bool no_samples_test; 223 | bool keep_distributions; 224 | double DBEntropy; 225 | TesterClass *tester; 226 | int doOcc; 227 | bool chopExamples() const { 228 | return do_sample_weighting && 229 | !( runningPhase == TestWords && no_samples_test ); } 230 | bool chopOcc() const { 231 | switch( runningPhase ) { 232 | case TrainWords: 233 | case LearnWords: 234 | case TrainLearnWords: 235 | return doOcc == 1 || doOcc == 3; 236 | case TestWords: 237 | return doOcc > 1; 238 | default: 239 | return false; 240 | } 241 | }; 242 | void InvalidMessage() const ; 243 | 244 | void do_numeric_statistics( 
); 245 | 246 | void test_instance( const Instance& , 247 | InstanceBase_base * = NULL, 248 | size_t = 0 ); 249 | void test_instance_sim( const Instance& , 250 | InstanceBase_base * = NULL, 251 | size_t = 0 ); 252 | 253 | void test_instance_ex( const Instance&, 254 | InstanceBase_base * = NULL, 255 | size_t = 0 ); 256 | 257 | bool allocate_arrays(); 258 | 259 | double RelativeWeight( unsigned int ) const; 260 | void writePermSpecial(std::ostream&) const; 261 | bool read_the_vals( std::istream& ); 262 | MBLClass( const MBLClass& ); 263 | }; 264 | 265 | inline std::ostream& operator<< ( std::ostream& os, 266 | const MBLClass::PhaseValue& ph ){ 267 | switch( ph ){ 268 | case MBLClass::TrainWords: 269 | os << "TrainWords"; 270 | break; 271 | case MBLClass::LearnWords: 272 | os << "LearnWords"; 273 | break; 274 | case MBLClass::TestWords: 275 | os << "TestWords"; 276 | break; 277 | case MBLClass::TrainLearnWords: 278 | os << "TrainlearnWords"; 279 | break; 280 | default: 281 | os << "unknown phase"; 282 | } 283 | return os; 284 | } 285 | 286 | bool empty_line( const icu::UnicodeString& , const InputFormatType ); 287 | } 288 | 289 | #endif // TIMBL_MBLCLASS_H 290 | -------------------------------------------------------------------------------- /include/timbl/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | # $URL$ 3 | 4 | pkginclude_HEADERS = Common.h GetOptClass.h IBtree.h Matrices.h \ 5 | Features.h Targets.h Instance.h \ 6 | MBLClass.h MsgClass.h BestArray.h \ 7 | StringOps.h TimblAPI.h Options.h \ 8 | TimblExperiment.h Types.h neighborSet.h Statistics.h \ 9 | Choppers.h Testers.h Metrics.h 10 | -------------------------------------------------------------------------------- /include/timbl/Matrices.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 
6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see <http://www.gnu.org/licenses/>. 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | 29 | #ifndef TIMBL_MATRICES_H 30 | #define TIMBL_MATRICES_H 31 | 32 | template <typename Class> class SparseSymetricMatrix; 33 | template <typename Class> std::ostream& operator << (std::ostream&, 34 | const SparseSymetricMatrix<Class>& ); 35 | 36 | template <typename Class> 37 | class SparseSymetricMatrix { 38 | using CDmap = std::map< Class, double >; 39 | using CCDmap = std::map< Class, CDmap >; 40 | friend std::ostream& operator << <> ( std::ostream&, 41 | const SparseSymetricMatrix& ); 42 | 43 | public: 44 | void Clear() { my_mat.clear(); }; 45 | void Assign( Class i, Class j, double d ){ 46 | if ( i == j ) 47 | return; 48 | if ( i < j ) 49 | my_mat[j][i] = d; 50 | else 51 | my_mat[i][j] = d; 52 | }; 53 | double Extract( Class i, Class j ) const { 54 | if ( i == j ){ 55 | return 0.0; 56 | } 57 | if ( i < j ){ 58 | typename CCDmap::const_iterator it1 = my_mat.find(j); 59 | if ( it1 != my_mat.end() ){ 60 | typename CDmap::const_iterator it2 = it1->second.find(i); 61 | if ( it2 != it1->second.end() ){ 62 | return it2->second; 63 | } 64 | } 65 | } 66 | else { 67 | typename CCDmap::const_iterator it1 = my_mat.find(i); 68 | if ( it1 != my_mat.end() ){ 69 | typename CDmap::const_iterator it2 = it1->second.find(j); 70 | if ( it2 != it1->second.end() ){ 71 | return it2->second; 72 | } 73 | } 74 | } 75 | return 0.0; 76 | }; 77 | unsigned int NumBytes(void) const{ 78 | unsigned int tot = sizeof(std::map<Class, CDmap>); 79 | typename CCDmap::const_iterator it1 = my_mat.begin(); 80 | while ( it1 != my_mat.end() ){ 81 | tot +=
sizeof(CDmap); 82 | typename CDmap::const_iterator it2 = it1->second.begin(); 83 | while ( it2 != it1->second.end() ){ 84 | tot += sizeof(double); 85 | ++it2; 86 | } 87 | ++it1; 88 | } 89 | return tot; 90 | }; 91 | SparseSymetricMatrix *copy(void) const{ 92 | SparseSymetricMatrix *res = new SparseSymetricMatrix(); 93 | typename CCDmap::const_iterator it1 = my_mat.begin(); 94 | while ( it1 != my_mat.end() ){ 95 | typename CDmap::const_iterator it2 = it1->second.begin(); 96 | while ( it2 != it1->second.end() ){ 97 | res->my_mat[it1->first][it2->first] = it2->second; 98 | ++it2; 99 | } 100 | ++it1; 101 | } 102 | return res; 103 | } 104 | private: 105 | CCDmap my_mat; 106 | }; 107 | 108 | template <typename Class> 109 | inline std::ostream& operator << (std::ostream& os, 110 | const SparseSymetricMatrix<Class>& m ){ 111 | typename SparseSymetricMatrix<Class>::CCDmap::const_iterator it1 = m.my_mat.begin(); 112 | while ( it1 != m.my_mat.end() ){ 113 | typename SparseSymetricMatrix<Class>::CDmap::const_iterator it2 = it1->second.begin(); 114 | while ( it2 != it1->second.end() ){ 115 | os << "[" << it1->first << ",\t" << it2->first << "] " 116 | << it2->second << std::endl; 117 | ++it2; 118 | } 119 | ++it1; 120 | } 121 | return os; 122 | } 123 | 124 | #endif // TIMBL_MATRICES_H 125 | -------------------------------------------------------------------------------- /include/timbl/Metrics.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version.
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see <http://www.gnu.org/licenses/>. 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | 29 | #ifndef TIMBL_METRICS_H 30 | #define TIMBL_METRICS_H 31 | 32 | #include <stdexcept> 33 | #include <limits> 34 | 35 | namespace Timbl{ 36 | 37 | class FeatureValue; 38 | 39 | class metricClass { 40 | public: 41 | explicit metricClass( MetricType m ): _type(m){}; 42 | virtual ~metricClass() {}; 43 | MetricType type() const { return _type; }; 44 | virtual bool isSimilarityMetric() const = 0; 45 | virtual bool isNumerical() const = 0; 46 | virtual bool isStorable() const = 0; 47 | virtual double distance( const FeatureValue *, 48 | const FeatureValue *, 49 | size_t=1, double = 1.0 ) const = 0; 50 | virtual double get_max_similarity() const { 51 | throw std::logic_error( "get_max_similarity not implemented for " + 52 | TiCC::toString( _type ) ); 53 | } 54 | private: 55 | MetricType _type; 56 | }; 57 | 58 | metricClass *getMetricClass( MetricType ); 59 | 60 | class distanceMetricClass: public metricClass { 61 | public: 62 | explicit distanceMetricClass( MetricType m ): metricClass(m){}; 63 | virtual ~distanceMetricClass() override {}; 64 | bool isSimilarityMetric() const override { return false; }; 65 | }; 66 | 67 | class OverlapMetric: public distanceMetricClass { 68 | public: 69 | OverlapMetric(): distanceMetricClass( Overlap ){}; 70 | bool isNumerical() const override { return false; }; 71 | bool isStorable() const override { return false; }; 72 | double distance( const FeatureValue *, 73 | const FeatureValue *, 74 | size_t, 75 | double ) const
override; 76 | }; 77 | 78 | class NumericMetricClass: public distanceMetricClass { 79 | public: 80 | explicit NumericMetricClass( MetricType m ): distanceMetricClass( m ){}; 81 | virtual ~NumericMetricClass() override {}; 82 | bool isNumerical() const override { return true; }; 83 | bool isStorable() const override { return false; }; 84 | }; 85 | 86 | class NumericMetric: public NumericMetricClass { 87 | public: 88 | NumericMetric(): NumericMetricClass( Numeric ){}; 89 | double distance( const FeatureValue *, 90 | const FeatureValue *, 91 | size_t, 92 | double ) const override; 93 | }; 94 | 95 | class EuclideanMetric: public NumericMetricClass { 96 | public: 97 | EuclideanMetric(): NumericMetricClass( Euclidean ){}; 98 | double distance( const FeatureValue *, 99 | const FeatureValue *, 100 | size_t, 101 | double ) const override; 102 | }; 103 | 104 | class ValueDiffMetric: public distanceMetricClass { 105 | public: 106 | ValueDiffMetric(): distanceMetricClass( ValueDiff ){}; 107 | bool isNumerical() const override { return false; }; 108 | bool isStorable() const override { return true; }; 109 | double distance( const FeatureValue *, 110 | const FeatureValue *, 111 | size_t, 112 | double ) const override; 113 | }; 114 | 115 | class DiceMetric: public distanceMetricClass { 116 | public: 117 | DiceMetric(): distanceMetricClass( Dice ){}; 118 | bool isNumerical() const override { return false; }; 119 | bool isStorable() const override { return true; }; 120 | double distance( const FeatureValue *, 121 | const FeatureValue *, 122 | size_t, 123 | double ) const override; 124 | }; 125 | 126 | class JeffreyMetric: public distanceMetricClass { 127 | public: 128 | JeffreyMetric(): distanceMetricClass( JeffreyDiv ){}; 129 | bool isNumerical() const override{ return false; }; 130 | bool isStorable() const override { return true; }; 131 | double distance( const FeatureValue *, 132 | const FeatureValue *, 133 | size_t, 134 | double ) const override; 135 | }; 136 | 137 | class 
JSMetric: public distanceMetricClass { 138 | public: 139 | JSMetric(): distanceMetricClass( JSDiv ){}; 140 | bool isNumerical() const override { return false; }; 141 | bool isStorable() const override { return true; }; 142 | double distance( const FeatureValue *, 143 | const FeatureValue *, 144 | size_t, 145 | double ) const override; 146 | }; 147 | 148 | class LevenshteinMetric: public distanceMetricClass { 149 | public: 150 | LevenshteinMetric(): distanceMetricClass( Levenshtein ){}; 151 | bool isNumerical() const override { return false; }; 152 | bool isStorable() const override { return true; }; 153 | double distance( const FeatureValue *, 154 | const FeatureValue *, 155 | size_t, 156 | double ) const override; 157 | }; 158 | 159 | class similarityMetricClass: public metricClass { 160 | public: 161 | explicit similarityMetricClass( MetricType m ): metricClass( m ){}; 162 | virtual ~similarityMetricClass() override {}; 163 | bool isSimilarityMetric() const override { return true; }; 164 | bool isNumerical() const override { return true; }; 165 | bool isStorable() const override { return false; }; 166 | }; 167 | 168 | class CosineMetric: public similarityMetricClass { 169 | public: 170 | CosineMetric(): similarityMetricClass( Cosine ){}; 171 | double distance( const FeatureValue *, 172 | const FeatureValue *, 173 | size_t, 174 | double ) const override; 175 | double get_max_similarity() const override { return 1.0; }; 176 | }; 177 | 178 | class DotProductMetric: public similarityMetricClass { 179 | public: 180 | DotProductMetric(): similarityMetricClass( DotProduct ){}; 181 | double distance( const FeatureValue *, 182 | const FeatureValue *, 183 | size_t, 184 | double ) const override; 185 | double get_max_similarity() const override { 186 | return std::numeric_limits<double>::max(); 187 | }; 188 | }; 189 | 190 | } 191 | 192 | #endif // TIMBL_METRICS_H 193 | -------------------------------------------------------------------------------- /include/timbl/MsgClass.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_MSGCLASS_H 29 | #define TIMBL_MSGCLASS_H 30 | 31 | namespace Timbl { 32 | class MsgClass { 33 | public: 34 | MsgClass(): 35 | err_cnt(0) 36 | {}; 37 | virtual ~MsgClass() {}; 38 | virtual void Info( const std::string& ) const; 39 | virtual void Warning( const std::string& ) const ; 40 | virtual void Error( const std::string& ) const ; 41 | virtual void FatalError( const std::string& ) const ; 42 | mutable int err_cnt; 43 | }; 44 | 45 | } 46 | #endif 47 | -------------------------------------------------------------------------------- /include/timbl/Statistics.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public 
License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_STATISTICS_H 29 | #define TIMBL_STATISTICS_H 30 | 31 | #include "timbl/MsgClass.h" 32 | 33 | namespace Timbl { 34 | class Targets; 35 | class TargetValue; 36 | 37 | class ConfusionMatrix: public MsgClass { 38 | size_t size; 39 | std::vector > mat; 40 | public: 41 | explicit ConfusionMatrix( size_t ); 42 | virtual ~ConfusionMatrix() override; 43 | void Increment( const TargetValue*, const TargetValue* ); 44 | void Print( std::ostream&, const Targets& ) const; 45 | void FScore( std::ostream&, const Targets&, bool ) const; 46 | void merge( const ConfusionMatrix * ); 47 | }; 48 | 49 | class StatisticsClass { 50 | public: 51 | StatisticsClass(): _data(0), _skipped(0), _correct(0), 52 | _tieOk(0), _tieFalse(0), _exact(0) {}; 53 | void clear() { _data =0; _skipped = 0; _correct = 0; 54 | _tieOk = 0; _tieFalse = 0; _exact = 0; }; 55 | void addLine() { ++_data; } 56 | void addSkipped() { ++_skipped; } 57 | void addCorrect() { ++_correct; } 58 | void addTieCorrect() { ++_tieOk; } 59 | void addTieFailure() { ++_tieFalse; } 60 | void addExact() { ++_exact; } 61 | unsigned int dataLines() const { return _data; }; 62 | unsigned int skippedLines() const { return _skipped; }; 63 | unsigned int totalLines() const { return _data + _skipped; }; 64 | unsigned int testedCorrect() const { return 
_correct; }; 65 | unsigned int tiedCorrect() const { return _tieOk; }; 66 | unsigned int tiedFailure() const { return _tieFalse; }; 67 | unsigned int exactMatches() const { return _exact; }; 68 | void merge( const StatisticsClass& ); 69 | private: 70 | unsigned int _data; 71 | unsigned int _skipped; 72 | unsigned int _correct; 73 | unsigned int _tieOk; 74 | unsigned int _tieFalse; 75 | unsigned int _exact; 76 | }; 77 | 78 | } 79 | #endif 80 | -------------------------------------------------------------------------------- /include/timbl/StringOps.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
namespace Timbl {

  // String helpers. Only the declarations live here; the definitions are
  // elsewhere (presumably StringOps.cxx), so the notes below are limited to
  // what the signatures show.

  // Compare two strings; judging by the name the comparison ignores case.
  // NOTE(review): the difference between the plain and the "_n" variant is
  // not visible from this header -- confirm in the implementation.
  bool compare_nocase( const std::string&, const std::string& );
  bool compare_nocase_n( const std::string&, const std::string& );

  // Conversion pair between a Unicode string and its "coded" form.
  // NOTE(review): the meaning of the boolean flag (default true) is not
  // visible here.
  icu::UnicodeString StrToCode( const icu::UnicodeString&, bool=true );
  icu::UnicodeString CodeToStr( const icu::UnicodeString& );

  // Build a path from two strings and a flag (default true). The caller in
  // CVExperiment.cxx passes a file name, an output path and false.
  std::string correct_path( const std::string&,
                            const std::string&,
                            bool = true );
}
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | #ifndef TIMBL_TARGETS_H 29 | #define TIMBL_TARGETS_H 30 | 31 | #include 32 | #include 33 | #include 34 | #include "unicode/unistr.h" 35 | #include "timbl/MsgClass.h" 36 | #include "ticcutils/Unicode.h" 37 | 38 | namespace Hash { 39 | class UnicodeHash; 40 | } 41 | 42 | namespace Timbl { 43 | 44 | class ValueClass { 45 | public: 46 | ValueClass( const icu::UnicodeString& n, size_t i ): 47 | _name( n ), _index( i ), _frequency( 1 ) {}; 48 | ValueClass( const ValueClass& ) = delete; // forbid copies 49 | ValueClass& operator=( const ValueClass& ) = delete; // forbid copies 50 | virtual ~ValueClass() {}; 51 | void ValFreq( size_t f ){ _frequency = f; }; 52 | void IncValFreq( int f ){ _frequency += f; }; 53 | size_t ValFreq( ) const { return _frequency; }; 54 | void incr_val_freq(){ ++_frequency; }; 55 | void decr_val_freq(){ --_frequency; }; 56 | size_t Index() const { return _index; }; 57 | const icu::UnicodeString& name() const { return _name; }; 58 | const std::string name_string() const { return TiCC::UnicodeToUTF8(_name);}; 59 | // temporary for backward compatability 60 | const icu::UnicodeString& name_u() const { return _name; }; // HACK 61 | const std::string Name() const { return TiCC::UnicodeToUTF8(_name); }; // HACK 62 | // REMOVE ^^^^ 63 | friend std::ostream& operator<<( std::ostream& os, ValueClass const *vc ); 64 | protected: 65 | const icu::UnicodeString& _name; 66 | size_t _index; 67 | size_t _frequency; 68 | }; 69 | 70 | class TargetValue: public ValueClass { 71 | public: 72 | TargetValue( const icu::UnicodeString&, size_t ); 73 | }; 74 | 75 | class Targets: public MsgClass { 76 | friend class MBLClass; 77 | friend class WClassDistribution; 78 | friend class ConfusionMatrix; 79 | public: 80 | explicit Targets( Hash::UnicodeHash *T ): 81 | target_hash( T ), 82 | 
is_reference(false) 83 | {}; 84 | ~Targets() override; 85 | Targets& operator=( const Targets& ); 86 | void init(); 87 | TargetValue *add_value( const icu::UnicodeString&, int freq = 1 ); 88 | TargetValue *add_value( size_t, int freq = 1 ); 89 | TargetValue *Lookup( const icu::UnicodeString& ) const; 90 | TargetValue *ReverseLookup( size_t ) const; 91 | bool decrement_value( TargetValue * ); 92 | bool increment_value( TargetValue * ); 93 | TargetValue *MajorityClass() const; 94 | size_t EffectiveValues() const; 95 | size_t TotalValues() const; 96 | size_t num_of_values() const { return values_array.size(); }; 97 | Hash::UnicodeHash *hash() const { return target_hash; }; 98 | private: 99 | Hash::UnicodeHash *target_hash; 100 | std::vector values_array; 101 | std::unordered_map< size_t, TargetValue *> reverse_values; 102 | bool is_reference; 103 | }; 104 | 105 | class Vfield{ 106 | friend class ClassDistribution; 107 | friend class WClassDistribution; 108 | friend std::ostream& operator<<( std::ostream&, const Vfield& ); 109 | friend std::ostream& operator<<( std::ostream&, const Vfield * ); 110 | public: 111 | Vfield( const TargetValue *val, int freq, double w ): 112 | value(val), frequency(freq), weight(w) {}; 113 | Vfield( const Vfield& in ): 114 | value(in.value), frequency(in.frequency), weight(in.weight) {}; 115 | Vfield& operator=( const Vfield& ) = delete; // forbid copies 116 | ~Vfield(){}; 117 | std::ostream& put( std::ostream& ) const; 118 | const TargetValue *Value() const { return value; }; 119 | void Value( const TargetValue *t ){ value = t; }; 120 | size_t Freq() const { return frequency; }; 121 | void IncFreq( int inc=1 ) { frequency += inc; }; 122 | void AddFreq( int f ) { frequency += f; weight += f; }; 123 | void DecFreq() { frequency -= 1; }; 124 | double Weight() const { return weight; }; 125 | void SetWeight( double w ){ weight = w; }; 126 | size_t Index(); 127 | protected: 128 | const TargetValue *value; 129 | size_t frequency; 130 | double 
weight; 131 | private: 132 | }; 133 | 134 | class WClassDistribution; 135 | 136 | class ClassDistribution{ 137 | friend std::ostream& operator<<( std::ostream&, const ClassDistribution& ); 138 | friend std::ostream& operator<<( std::ostream&, const ClassDistribution * ); 139 | friend class WClassDistribution; 140 | public: 141 | using VDlist = std::map; 142 | using dist_iterator = VDlist::const_iterator; 143 | ClassDistribution( ): total_items(0) {}; 144 | ClassDistribution( const ClassDistribution& ); 145 | virtual ~ClassDistribution(){ clear(); }; 146 | size_t totalSize() const{ return total_items; }; 147 | size_t size() const{ return distribution.size(); }; 148 | bool empty() const{ return distribution.empty(); }; 149 | void clear(); 150 | dist_iterator begin() const { return distribution.begin(); }; 151 | dist_iterator end() const { return distribution.end(); }; 152 | virtual const TargetValue* BestTarget( bool&, bool = false ) const; 153 | void Merge( const ClassDistribution& ); 154 | virtual void SetFreq( const TargetValue *, int, double=1.0 ); 155 | virtual bool IncFreq( const TargetValue *, size_t, double=1.0 ); 156 | void DecFreq( const TargetValue * ); 157 | static ClassDistribution *read_distribution( std::istream&, 158 | Targets&, 159 | bool ); 160 | static ClassDistribution *read_distribution_hashed( std::istream&, 161 | Targets&, 162 | bool ); 163 | const std::string DistToString() const; 164 | const std::string DistToStringW( int ) const; 165 | double Confidence( const TargetValue * ) const; 166 | virtual const std::string SaveHashed() const; 167 | virtual const std::string Save() const; 168 | bool ZeroDist() const { return total_items == 0; }; 169 | double Entropy() const; 170 | ClassDistribution *to_VD_Copy( ) const; 171 | virtual WClassDistribution *to_WVD_Copy() const; 172 | protected: 173 | virtual void DistToString( std::string&, double=0 ) const; 174 | virtual void DistToStringWW( std::string&, int ) const; 175 | const TargetValue* 
BestTargetN( bool &, bool = false ) const; 176 | const TargetValue* BestTargetW( bool &, bool = false ) const; 177 | virtual ClassDistribution *clone( ) const { 178 | return new ClassDistribution(); }; 179 | size_t total_items; 180 | VDlist distribution; 181 | }; 182 | 183 | class WClassDistribution: public ClassDistribution { 184 | public: 185 | WClassDistribution(): ClassDistribution() {}; 186 | const TargetValue* BestTarget( bool &, bool = false ) const override; 187 | void SetFreq( const TargetValue *, int, double ) override; 188 | bool IncFreq( const TargetValue *, size_t, double ) override; 189 | WClassDistribution *to_WVD_Copy( ) const override; 190 | const std::string SaveHashed() const override; 191 | const std::string Save() const override; 192 | void Normalize(); 193 | void Normalize_1( double, const Targets& ); 194 | void Normalize_2(); 195 | void MergeW( const ClassDistribution&, double ); 196 | private: 197 | void DistToString( std::string&, double=0 ) const override; 198 | void DistToStringWW( std::string&, int ) const override; 199 | WClassDistribution *clone() const override { 200 | return new WClassDistribution; }; 201 | }; 202 | 203 | } 204 | #endif // TINBL_TARGETS_H 205 | -------------------------------------------------------------------------------- /include/timbl/Testers.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | 29 | #ifndef TIMBL_TESTERS_H 30 | #define TIMBL_TESTERS_H 31 | 32 | namespace Timbl{ 33 | class metricTestFunction { 34 | public: 35 | virtual ~metricTestFunction(){}; 36 | virtual double test( const FeatureValue *, 37 | const FeatureValue *, 38 | const Feature * ) const = 0; 39 | }; 40 | 41 | class overlapTestFunction: public metricTestFunction { 42 | public: 43 | double test( const FeatureValue *FV, 44 | const FeatureValue *G, 45 | const Feature *Feat ) const override; 46 | }; 47 | 48 | class valueDiffTestFunction: public metricTestFunction { 49 | public: 50 | explicit valueDiffTestFunction( int t ): 51 | metricTestFunction(), 52 | threshold( t ) 53 | {}; 54 | double test( const FeatureValue *, 55 | const FeatureValue *, 56 | const Feature * ) const override; 57 | protected: 58 | int threshold; 59 | }; 60 | 61 | class TesterClass { 62 | public: 63 | TesterClass( const Feature_List& ); 64 | TesterClass( const TesterClass& ) = delete; // inhibit copies 65 | TesterClass& operator=( const TesterClass& ) = delete; // inhibit copies 66 | virtual ~TesterClass(){}; 67 | void init( const Instance&, size_t, size_t ); 68 | virtual size_t test( const std::vector&, 69 | size_t, 70 | double ) = 0; 71 | virtual double getDistance( size_t ) const = 0; 72 | protected: 73 | size_t _size; 74 | size_t effSize; 75 | size_t offSet; 76 | const std::vector *FV; 77 | const std::vector &features; 78 | const std::vector &permutation; 79 | std::vector 
permFeatures; 80 | std::vector distances; 81 | private: 82 | }; 83 | 84 | class DistanceTester: public TesterClass { 85 | public: 86 | DistanceTester( const Feature_List&, 87 | int ); 88 | ~DistanceTester() override; 89 | double getDistance( size_t ) const override; 90 | size_t test( const std::vector&, 91 | size_t, 92 | double ) override; 93 | private: 94 | std::vector metricTest; 95 | }; 96 | 97 | class SimilarityTester: public TesterClass { 98 | public: 99 | explicit SimilarityTester( const Feature_List& pf ): 100 | TesterClass( pf ){}; 101 | ~SimilarityTester() override {}; 102 | virtual size_t test( const std::vector&, 103 | size_t, 104 | double ) override = 0; 105 | protected: 106 | private: 107 | }; 108 | 109 | class CosineTester: public SimilarityTester { 110 | public: 111 | explicit CosineTester( const Feature_List& pf ): 112 | SimilarityTester( pf ){}; 113 | double getDistance( size_t ) const override; 114 | size_t test( const std::vector&, 115 | size_t, 116 | double ) override; 117 | private: 118 | }; 119 | 120 | class DotProductTester: public SimilarityTester { 121 | public: 122 | explicit DotProductTester( const Feature_List& pf ): 123 | SimilarityTester( pf ){}; 124 | double getDistance( size_t ) const override; 125 | size_t test( const std::vector&, 126 | size_t, 127 | double ) override; 128 | private: 129 | }; 130 | 131 | TesterClass* getTester( MetricType, 132 | const Feature_List&, 133 | int ); 134 | 135 | } 136 | 137 | #endif // TIMBL_TESTERS_H 138 | -------------------------------------------------------------------------------- /include/timbl/TimblAPI.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | 
the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | 29 | #ifndef TIMBL_API_H 30 | #define TIMBL_API_H 31 | 32 | #include 33 | #include 34 | #include "ticcutils/CommandLine.h" 35 | #include "timbl/Common.h" 36 | #include "timbl/Types.h" 37 | #include "timbl/Instance.h" 38 | #include "timbl/neighborSet.h" 39 | #include "timbl/TimblExperiment.h" 40 | 41 | namespace Timbl{ 42 | 43 | inline std::string Version() { return Common::Version(); } 44 | inline std::string VersionName() { return Common::VersionName(); } 45 | inline std::string BuildInfo() { return Common::BuildInfo(); } 46 | 47 | enum Algorithm { UNKNOWN_ALG, IB1, IB2, IGTREE, TRIBL, TRIBL2, LOO, CV }; 48 | enum Weighting { UNKNOWN_W, UD, NW, GR, IG, X2, SV, SD }; 49 | 50 | class TimblAPI { 51 | friend class TimblExperiment; 52 | public: 53 | // cppcheck-suppress noExplicitConstructor 54 | TimblAPI( const TiCC::CL_Options&, const std::string& = "" ); 55 | // cppcheck-suppress noExplicitConstructor 56 | TimblAPI( const std::string&, const std::string& = "" ); 57 | TimblAPI( const TimblAPI& ); 58 | ~TimblAPI(); 59 | bool isValid() const; 60 | bool Valid() const; 61 | TimblExperiment *grabAndDisconnectExp(){ 62 | TimblExperiment *res = 0; 63 | if ( Valid() ){ 64 | res = pimpl; 65 | pimpl = 0; 66 | } 67 | return res; 68 | } 69 | bool Prepare( const std::string& = "" ); 70 | bool CVprepare( const std::string& = "", 71 | 
Weighting = GR, 72 | const std::string& = "" ); 73 | bool Learn( const std::string& = "" ); 74 | bool Increment_u( const icu::UnicodeString& ); 75 | bool Increment( const std::string& ); 76 | bool Decrement_u( const icu::UnicodeString& ); 77 | bool Decrement( const std::string& ); 78 | bool Expand( const std::string& ); 79 | bool Remove( const std::string& ); 80 | bool Test( const std::string& = "", 81 | const std::string& = "", 82 | const std::string& = "" ); 83 | bool NS_Test( const std::string& = "", 84 | const std::string& = "" ); 85 | const TargetValue *Classify( const std::string& ); 86 | const TargetValue *Classify( const std::string&, 87 | const ClassDistribution *& ); 88 | const TargetValue *Classify( const std::string&, 89 | double& ); 90 | const TargetValue *Classify( const std::string&, 91 | const ClassDistribution *&, 92 | double& ); 93 | const TargetValue *Classify( const icu::UnicodeString& ); 94 | const TargetValue *Classify( const icu::UnicodeString&, 95 | const ClassDistribution *& ); 96 | const TargetValue *Classify( const icu::UnicodeString&, 97 | double& ); 98 | const TargetValue *Classify( const icu::UnicodeString&, 99 | const ClassDistribution *&, 100 | double& ); 101 | const neighborSet *classifyNS( const icu::UnicodeString& ); 102 | bool classifyNS( const icu::UnicodeString&, 103 | neighborSet& ); 104 | bool classifyNS( const std::string& in, 105 | neighborSet& st ){ 106 | return classifyNS( TiCC::UnicodeFromUTF8(in), st ); 107 | } 108 | const Instance *lastHandledInstance() const; 109 | const Targets& myTargets() const; 110 | bool Classify( const std::string&, 111 | std::string& ); 112 | bool Classify( const std::string&, 113 | std::string&, 114 | double& ); 115 | bool Classify( const std::string&, 116 | std::string&, 117 | std::string&, 118 | double& ); 119 | bool Classify( const icu::UnicodeString&, 120 | icu::UnicodeString& ); 121 | bool ShowBestNeighbors( std::ostream& ) const; 122 | size_t matchDepth() const; 123 | double confidence() 
const; 124 | bool matchedAtLeaf() const; 125 | std::string ExpName() const; 126 | static std::string VersionInfo( bool = false ); 127 | bool SaveWeights( const std::string& = "" ); 128 | bool GetWeights( const std::string& = "", Weighting = UNKNOWN_W ); 129 | double GetAccuracy(); 130 | Weighting CurrentWeighting() const; 131 | Weighting GetCurrentWeights( std::vector& ) const; 132 | bool WriteInstanceBase( const std::string& = "" ); 133 | bool WriteInstanceBaseXml( const std::string& = "" ); 134 | bool WriteInstanceBaseLevels( const std::string& = "", unsigned int=0 ); 135 | bool GetInstanceBase( const std::string& = "" ); 136 | bool WriteArrays( const std::string& = "" ); 137 | bool WriteMatrices( const std::string& = "" ); 138 | bool GetArrays( const std::string& = "" ); 139 | bool GetMatrices( const std::string& = "" ); 140 | bool WriteNamesFile( const std::string& = "" ); 141 | bool ShowWeights( std::ostream& ) const; 142 | bool ShowOptions( std::ostream& ) const; 143 | bool ShowSettings( std::ostream& ) const; 144 | bool ShowIBInfo( std::ostream& ) const; 145 | bool ShowStatistics( std::ostream& ) const; 146 | bool SetOptions( const std::string& ); 147 | bool SetIndirectOptions( const TiCC::CL_Options& ); 148 | bool SetThreads( int c ); 149 | std::string extract_limited_m( int ) const; 150 | Algorithm Algo() const; 151 | InputFormatType getInputFormat() const; 152 | size_t NumOfFeatures() const; 153 | static size_t Default_Max_Feats(); 154 | bool initExperiment(); 155 | private: 156 | TimblAPI(); 157 | TimblAPI& operator=( const TimblAPI& ); // forbid copies 158 | TimblExperiment *pimpl; 159 | bool i_am_fine; 160 | }; 161 | 162 | const std::string to_string( const Algorithm ); 163 | const std::string to_string( const Weighting ); 164 | bool string_to( const std::string&, Algorithm& ); 165 | bool string_to( const std::string&, Weighting& ); 166 | 167 | using ValueDistribution = ClassDistribution; // for backward compatability 168 | using WValueDistribution = 
WClassDistribution; // for backward compatability 169 | } 170 | #endif // TIMBL_API_H 171 | -------------------------------------------------------------------------------- /include/timbl/neighborSet.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | 27 | */ 28 | 29 | #ifndef TIMBL_NEIGHBORSET_H 30 | #define TIMBL_NEIGHBORSET_H 31 | 32 | #include "timbl/Types.h" 33 | 34 | namespace Timbl { 35 | 36 | class ClassDistribution; 37 | class WClassDistribution; 38 | 39 | class decayStruct { 40 | friend std::ostream& operator<<( std::ostream&, const decayStruct& ); 41 | friend std::ostream& operator<<( std::ostream&, const decayStruct * ); 42 | public: 43 | decayStruct():alpha(0),beta(0){}; 44 | decayStruct(double a, double b ):alpha(a),beta(b){}; 45 | virtual ~decayStruct(){}; 46 | virtual std::ostream& put( std::ostream& ) const = 0; 47 | virtual DecayType type() const = 0; 48 | double alpha; 49 | double beta; 50 | }; 51 | 52 | class zeroDecay: public decayStruct { 53 | public: 54 | zeroDecay():decayStruct(){}; 55 | std::ostream& put( std::ostream& ) const override; 56 | DecayType type() const override { return Zero;}; 57 | }; 58 | 59 | class invLinDecay: public decayStruct { 60 | public: 61 | invLinDecay():decayStruct(){}; 62 | std::ostream& put( std::ostream& ) const override; 63 | DecayType type() const override { return InvLinear;}; 64 | }; 65 | 66 | class invDistDecay: public decayStruct { 67 | public: 68 | invDistDecay():decayStruct(){}; 69 | std::ostream& put( std::ostream& ) const override; 70 | DecayType type() const override { return InvDist;}; 71 | }; 72 | 73 | class expDecay: public decayStruct { 74 | public: 75 | explicit expDecay( double alp ): decayStruct(alp,1.0){}; 76 | expDecay( double alp, double bet ): decayStruct(alp,bet){}; 77 | std::ostream& put( std::ostream& ) const override; 78 | DecayType type() const override { return ExpDecay;}; 79 | }; 80 | 81 | class neighborSet { 82 | friend std::ostream& operator<<( std::ostream&, const neighborSet& ); 83 | friend std::ostream& operator<<( std::ostream&, const neighborSet * ); 84 | friend 
class BestArray; 85 | public: 86 | neighborSet(); 87 | ~neighborSet(); 88 | neighborSet( const neighborSet& in ); 89 | neighborSet& operator=( const neighborSet& ); 90 | size_t size() const; 91 | void reserve( size_t ); 92 | void clear(); 93 | void truncate( size_t ); 94 | void merge( const neighborSet& ); 95 | double getDistance( size_t ) const; 96 | double bestDistance() const { return getDistance(0); }; 97 | const ClassDistribution *getDistribution( size_t ) const; 98 | WClassDistribution *bestDistribution( const decayStruct * =0, 99 | size_t =0 ) const ; 100 | double relativeWeight( const decayStruct *, size_t ) const; 101 | bool setShowDistance( bool b ) const { 102 | bool ret = showDistance; 103 | showDistance = b; 104 | return ret; 105 | } 106 | bool setShowDistribution( bool b ) const { 107 | bool ret = showDistribution; 108 | showDistribution = b; 109 | return ret; 110 | } 111 | private: 112 | mutable bool showDistance; 113 | mutable bool showDistribution; 114 | void push_back( double, const ClassDistribution & ); 115 | std::vector distances; 116 | std::vector distributions; 117 | }; 118 | 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /m4/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | libtool.m 3 | libtool.m4 4 | ltoptions.m4 5 | ltsugar.m4 6 | ltversion.m4 7 | lt~obsolete.m4 8 | pkg.m4 9 | -------------------------------------------------------------------------------- /m4/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id: $ 2 | # $URL: $ 3 | 4 | extra_DIST = ax_openmp.m4 pkg.m4 -------------------------------------------------------------------------------- /m4/ac_osx_pkg.m4: -------------------------------------------------------------------------------- 1 | # osx_pkg.m4 - Macros to add OSX brew locations to pkg-config. 
-*- Autoconf -*- 2 | # serial 2 (pkg-config-0.24) 3 | # 4 | # Copyright © 2024 Ko van der Sloot 5 | # 6 | # This program is free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation; either version 2 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program; if not, write to the Free Software 18 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # As a special exception to the GNU General Public License, if you 21 | # distribute this file as part of a program that contains a 22 | # configuration script generated by Autoconf, you may include it under 23 | # the same distribution terms that you use for the rest of that program. 
# AC_OSX_PKG_ALL()
# On macOS, add the lib/pkgconfig directory of every package found under
# /usr/local/opt and /opt/homebrew/opt to the PKG_CONFIG search path.
# ----------------------------------
AC_DEFUN([AC_OSX_PKG_ALL],
[
case ${host_os} in
  linux*)
    # linux is well-behaved
    ;;
  darwin*)
    # darwin isn't
    # Only one of the two prefixes normally exists; silence ls for the other.
    for i in `ls /usr/local/opt/ 2>/dev/null`
    do
      if test -d "/usr/local/opt/$i/lib/pkgconfig"
      then
        export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig"
      fi
    done
    for i in `ls /opt/homebrew/opt 2>/dev/null`
    do
      if test -d "/opt/homebrew/opt/$i/lib/pkgconfig"
      then
        export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/opt/homebrew/opt/$i/lib/pkgconfig"
      fi
    done
    ;;
esac
])

# AC_OSX_PKG([LIST_OF_PACKAGES])
# For every package in LIST_OF_PACKAGES, add its lib/pkgconfig directory
# (under /usr/local/opt, or else under /opt/homebrew/opt) to the PKG_CONFIG
# search path.
# ----------------------------------
AC_DEFUN([AC_OSX_PKG],
[
case ${host_os} in
  linux*)
    # linux is well-behaved
    ;;
  darwin*)
    # darwin/macos isn't
    for i in $*
    do
      if test -d "/usr/local/opt/$i/lib/pkgconfig"
      then
        export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/usr/local/opt/$i/lib/pkgconfig"
      else
        if test -d "/opt/homebrew/opt/$i/lib/pkgconfig"
        then
          export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:/opt/homebrew/opt/$i/lib/pkgconfig"
        fi
      fi
    done
    ;;
esac
])
- Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #include "timbl/Common.h" 34 | #include "timbl/Types.h" 35 | #include "timbl/StringOps.h" 36 | #include "timbl/TimblExperiment.h" 37 | 38 | namespace Timbl { 39 | using namespace std; 40 | 41 | bool CV_Experiment::Prepare( const string& f, bool, bool ){ 42 | cerr << "CV prepare " << f << endl; 43 | return true; 44 | } 45 | 46 | bool CV_Experiment::CVprepare( const string& wgtFile, 47 | WeightType w, 48 | const string& probFile ){ 49 | CV_WfileName = wgtFile; 50 | CV_fileW = w; 51 | CV_PfileName = probFile; 52 | return true; 53 | } 54 | 55 | bool CV_Experiment::Learn( const string& f, bool ){ 56 | cerr << "CV Learn " << f << endl; 57 | return true; 58 | } 59 | 60 | bool CV_Experiment::checkTestFile(){ 61 | if ( !IB1_Experiment::checkTestFile() ){ 62 | return false; 63 | } 64 | else if ( doSamples() ){ 65 | FatalError( "Cannot Cross validate on a file with Examplar Weighting" ); 66 | return false; 67 | } 68 | else if ( Verbosity(FEAT_W) ){ 69 | LearningInfo( *mylog ); 70 | } 71 | return true; 72 | } 73 | 74 | bool 
CV_Experiment::get_file_names( const string& FileName ){ 75 | if ( !ExpInvalid() ){ 76 | size_t size = 0; 77 | ifstream file_names( FileName, ios::in ); 78 | if ( !file_names ){ 79 | Error( "Unable to read CV filenames from " + FileName ); 80 | return false; 81 | } 82 | string name; 83 | while ( getline( file_names, name ) ){ 84 | size_t tmp = examineData( name ); 85 | if ( tmp != 0 ){ 86 | if ( !Verbosity(SILENT) ){ 87 | *mylog << "Examine datafile '" << name 88 | << "' gave the following results:" 89 | << endl 90 | << "Number of Features: " << tmp << endl; 91 | showInputFormat( *mylog ); 92 | } 93 | FileNames.push_back(name); 94 | if ( size == 0 ){ 95 | size = tmp; 96 | } 97 | else { 98 | if ( tmp != size ) { 99 | Error( "mismatching number of features in file " + 100 | name + "of CV filelist " + FileName ); 101 | return false; 102 | } 103 | } 104 | } 105 | else { 106 | Error( "unable to determine number of features in file " + 107 | name + "of CV filelist " + FileName ); 108 | return false; 109 | } 110 | } 111 | if ( FileNames.size() < 3 ){ 112 | Error( "Not enough filenames found in CV filelist " + FileName 113 | + " at least 3 required" ); 114 | return false; 115 | } 116 | return true; 117 | } 118 | return false; 119 | } 120 | 121 | bool CV_Experiment::Test( const string& FileName, 122 | const string& OutFile ){ 123 | if ( !ConfirmOptions() ){ 124 | return false; 125 | } 126 | (void)OutFile; 127 | bool result = false; 128 | VerbosityFlags keep = get_verbosity(); 129 | set_verbosity( SILENT ); 130 | if ( get_file_names( FileName ) ){ 131 | *mylog << "Starting Cross validation test on files:" << endl; 132 | for ( const auto& name : FileNames ){ 133 | *mylog << name << endl; 134 | } 135 | size_t NumOfFiles = FileNames.size(); 136 | TimblExperiment::Prepare( FileNames[1], false ); 137 | TimblExperiment::Learn( FileNames[1], false ); 138 | for ( size_t filenum = 2; filenum < NumOfFiles; ++filenum ){ 139 | Expand( FileNames[filenum] ); 140 | } 141 | string outName; 
142 | string percName; 143 | for ( size_t SkipFile = 0; SkipFile < NumOfFiles-1; ++SkipFile ) { 144 | outName = correct_path( FileNames[SkipFile], outPath, false ); 145 | outName += ".cv"; 146 | percName = outName; 147 | percName += ".%"; 148 | set_verbosity( keep ); 149 | if ( CV_WfileName != "" ){ 150 | GetWeights( CV_WfileName, CV_fileW ); 151 | } 152 | if ( !CV_PfileName.empty() ){ 153 | GetArrays( CV_PfileName ); 154 | } 155 | result = TimblExperiment::Test( FileNames[SkipFile], outName ); 156 | if ( result ){ 157 | result = createPercFile( percName ); 158 | } 159 | if ( !result ){ 160 | return false; 161 | } 162 | set_verbosity( SILENT ); 163 | Expand( FileNames[SkipFile] ); 164 | Remove( FileNames[SkipFile+1] ); 165 | } 166 | outName = correct_path( FileNames[NumOfFiles-1], outPath, false ); 167 | outName += ".cv"; 168 | percName = outName; 169 | percName += ".%"; 170 | set_verbosity( keep ); 171 | if ( CV_WfileName != "" ){ 172 | GetWeights( CV_WfileName, CV_fileW ); 173 | } 174 | if ( !CV_PfileName.empty() ){ 175 | GetArrays( CV_PfileName ); 176 | } 177 | result = TimblExperiment::Test( FileNames[NumOfFiles-1], outName ); 178 | if ( result ){ 179 | result = createPercFile( percName ); 180 | } 181 | } 182 | return result; 183 | } 184 | 185 | } 186 | -------------------------------------------------------------------------------- /src/Common.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include "timbl/Common.h" 29 | 30 | #include "config.h" 31 | 32 | using namespace std; 33 | 34 | namespace Common { 35 | 36 | string VersionInfo( bool full ){ 37 | // obsolete 38 | if ( full ){ 39 | return BuildInfo(); 40 | } 41 | else { 42 | return Version(); 43 | } 44 | } 45 | string Version() { return VERSION; } 46 | string VersionName() { return PACKAGE_STRING; } 47 | string BuildInfo() { 48 | return Version() + ", compiled on " + __DATE__ + ", " + __TIME__; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/Instance.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 
18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | 30 | #include "timbl/Types.h" 31 | #include "timbl/Instance.h" 32 | 33 | using namespace std; 34 | 35 | namespace Timbl { 36 | 37 | Instance::Instance(): 38 | TV(NULL), 39 | sample_weight(0.0), 40 | occ(1) 41 | { 42 | } 43 | 44 | Instance::~Instance(){ 45 | clear(); 46 | } 47 | 48 | void Instance::clear(){ 49 | for ( auto& it : FV ){ 50 | if ( it ){ 51 | if ( it->isUnknown() ){ 52 | delete it; 53 | } 54 | } 55 | it = 0; 56 | } 57 | TV = 0; 58 | sample_weight = 0.0; 59 | occ = 1; 60 | } 61 | 62 | void Instance::Init( size_t len ){ 63 | FV.resize( len, 0 ); 64 | } 65 | 66 | ostream& operator<<( ostream& os, const Instance *I ){ 67 | if ( I ){ 68 | os << *I; 69 | } 70 | else { 71 | os << " Empty Instance"; 72 | } 73 | return os; 74 | } 75 | 76 | ostream& operator<<( ostream& os, const Instance& I ){ 77 | for ( const auto* it : I.FV ){ 78 | os << it << ", "; 79 | } 80 | os << I.TV << " " << I.sample_weight; 81 | return os; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/LOOExperiment.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #include "timbl/MsgClass.h" 36 | #include "timbl/Common.h" 37 | #include "timbl/Types.h" 38 | #include "timbl/IBtree.h" 39 | #include "timbl/Instance.h" 40 | #include "timbl/MBLClass.h" 41 | #include "timbl/TimblExperiment.h" 42 | 43 | namespace Timbl { 44 | using namespace std; 45 | using namespace icu; 46 | 47 | void LOO_Experiment::initExperiment( bool all_vd ){ 48 | if ( !ExpInvalid() ){ 49 | if ( !MBL_init ){ // do this only when necessary 50 | initDecay(); 51 | if ( !is_copy ){ 52 | calculate_fv_entropy( true ); 53 | if ( initProbabilityArrays( all_vd ) ){ 54 | calculatePrestored(); 55 | } 56 | else { 57 | Error( "not enough memory for Probability Arrays in (" 58 | + string(__FILE__) + "," + TiCC::toString(__LINE__) + ")\n" 59 | + "ABORTING now" ); 60 | throw std::bad_alloc(); 61 | } 62 | InitWeights(); 63 | if ( do_diversify ){ 64 | diverseWeights(); 65 | } 66 | srand( random_seed ); 67 | } 68 | initTesters(); 69 | MBL_init = true; 70 | } 71 | } 72 | } 73 | 74 | bool LOO_Experiment::checkTestFile(){ 75 | // no need to test the Testfile 76 | // it is the same as the trainfile, so already checked 77 | if ( doSamples() ){ 78 | FatalError( "Cannot Leave One Out on a file with Examplar Weighting" ); 79 | return false; 80 | } 81 | return true; 82 | } 83 | 84 | void LOO_Experiment::showTestingInfo( ostream& os ){ 85 | if ( !Verbosity(SILENT) ){ 86 
| if ( Verbosity(OPTIONS ) ){ 87 | ShowSettings( os ); 88 | } 89 | os << endl << "Starting to test using Leave One Out"; 90 | if ( Do_Sloppy_LOO() ) { 91 | os << " using SLOPPY metric calculations" << endl; 92 | } 93 | else { 94 | os << endl; 95 | } 96 | os << "Writing output in: " << outStreamName << endl 97 | << "Algorithm : LOO" << endl; 98 | show_metric_info( os ); 99 | show_weight_info( os ); 100 | os << decay << endl; 101 | } 102 | } 103 | 104 | bool LOO_Experiment::Test( const string& FileName, 105 | const string& OutFile ){ 106 | bool result = false; 107 | if ( initTestFiles( FileName, OutFile ) ){ 108 | if ( InstanceBase->nodeCount() == InstanceBase->depth() + 1 ){ 109 | // protect ourselves against 1-line trainfiles 110 | FatalError( "the file '" + FileName + "' contains only 1 usable line. LOO impossible!" ); 111 | } 112 | initExperiment(); 113 | stats.clear(); 114 | delete confusionInfo; 115 | confusionInfo = 0; 116 | if ( Verbosity(ADVANCED_STATS) ){ 117 | confusionInfo = new ConfusionMatrix( targets.num_of_values() ); 118 | } 119 | showTestingInfo( *mylog ); 120 | // Start time. 
121 | // 122 | time_t lStartTime; 123 | time(&lStartTime); 124 | timeval startTime; 125 | gettimeofday( &startTime, 0 ); 126 | if ( InputFormat() == ARFF ){ 127 | skipARFFHeader( testStream ); 128 | } 129 | UnicodeString Buffer; 130 | while ( nextLine( testStream, Buffer ) ){ 131 | if ( !chopLine( Buffer ) ){ 132 | Warning( "testfile, skipped line #" + 133 | TiCC::toString( stats.totalLines() ) + 134 | "\n" + TiCC::UnicodeToUTF8(Buffer) ); 135 | } 136 | else { 137 | chopped_to_instance( TestWords ); 138 | Decrement( CurrInst ); 139 | double final_distance = 0.0; 140 | bool exact = false; 141 | const TargetValue *ResultTarget = LocalClassify( CurrInst, 142 | final_distance, 143 | exact ); 144 | normalizeResult(); 145 | string dString = bestResult.getResult(); 146 | double confi = 0; 147 | if ( Verbosity(CONFIDENCE) ){ 148 | confi = confidence(); 149 | } 150 | // Write it to the output file for later analysis. 151 | show_results( outStream, confi, dString, 152 | ResultTarget, final_distance ); 153 | if ( exact ){ // remember that a perfect match may be incorrect! 154 | if ( Verbosity(EXACT) ){ 155 | *mylog << "Exacte match:\n" << get_org_input() << endl; 156 | } 157 | } 158 | if ( !Verbosity(SILENT) ){ 159 | // Display progress counter. 160 | show_progress( *mylog, lStartTime, stats.dataLines() ); 161 | } 162 | Increment( CurrInst ); 163 | } 164 | }// end while. 
165 | if ( !Verbosity(SILENT) ){ 166 | time_stamp( "Ready: ", stats.dataLines() ); 167 | show_speed_summary( *mylog, startTime ); 168 | showStatistics( *mylog ); 169 | } 170 | result = true; 171 | } 172 | return result; 173 | } 174 | 175 | bool LOO_Experiment::ReadInstanceBase( const string& ){ 176 | Error( "cannot combine Leave One Out with retrieving an Instancebase " ); 177 | return false; 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS = -I@top_srcdir@/include 2 | AM_CXXFLAGS = -std=c++17 -W -Wall -O3 -g -pedantic 3 | 4 | bin_PROGRAMS = timbl 5 | 6 | check_PROGRAMS = simpletest 7 | TESTS = $(check_PROGRAMS) 8 | TESTS_ENVIRONMENT = topsrcdir=$(top_srcdir) 9 | simpletest_SOURCES = simpletest.cxx 10 | CLEANFILES = dimin.out 11 | 12 | LDADD = libtimbl.la 13 | 14 | timbl_SOURCES = Timbl.cxx 15 | lib_LTLIBRARIES = libtimbl.la 16 | libtimbl_la_LDFLAGS= -version-info 7:0:0 17 | 18 | libtimbl_la_SOURCES = Common.cxx \ 19 | GetOptClass.cxx IBtree.cxx IBprocs.cxx \ 20 | Targets.cxx Features.cxx Instance.cxx \ 21 | MBLClass.cxx MsgClass.cxx \ 22 | StringOps.cxx TimblAPI.cxx Choppers.cxx\ 23 | TimblExperiment.cxx IGExperiment.cxx Metrics.cxx Testers.cxx \ 24 | TRIBLExperiments.cxx LOOExperiment.cxx CVExperiment.cxx \ 25 | Types.cxx neighborSet.cxx Statistics.cxx BestArray.cxx 26 | -------------------------------------------------------------------------------- /src/MsgClass.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; 
either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include "timbl/MsgClass.h" 32 | 33 | using std::cerr; 34 | using std::endl; 35 | using std::string; 36 | 37 | namespace Timbl { 38 | 39 | void MsgClass::Info( const string& out_line ) const { 40 | cerr << out_line << endl; 41 | } 42 | 43 | void MsgClass::Warning( const string& out_line ) const { 44 | cerr << "Warning:" << out_line << endl; 45 | } 46 | 47 | void MsgClass::Error( const string& out_line ) const { 48 | ++err_cnt; 49 | cerr << "Error:" << out_line << endl; 50 | } 51 | 52 | void MsgClass::FatalError( const string& out_line ) const { 53 | cerr << "Fatal timbl Error:" 54 | << out_line << endl 55 | << "Please send a bugreport to timbl@uvt.nl" << endl 56 | << "include enough information, like:" << endl 57 | << "- Type of computer, type and version of OS, " 58 | << "and type and version of the compiler" << endl 59 | << "- Which Commands and switches were used" << endl 60 | << "- Which input was used, and which output was produced" << endl; 61 | throw std::runtime_error( "aborted" ); 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/Statistics.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University 
of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #include "timbl/Common.h" 33 | #include "timbl/MsgClass.h" 34 | #include "timbl/Types.h" 35 | #include "timbl/Instance.h" 36 | #include "timbl/Statistics.h" 37 | 38 | namespace Timbl { 39 | 40 | using std::bad_alloc; 41 | using std::ostream; 42 | using std::ios; 43 | using std::ios_base; 44 | using std::endl; 45 | using Common::Epsilon; 46 | 47 | ConfusionMatrix::ConfusionMatrix( size_t s ): size(s){ 48 | try { 49 | mat.resize(size+1); 50 | for ( size_t i=0; i <= size; ++i ){ 51 | mat[i].resize(size,0); 52 | } 53 | } 54 | catch( const bad_alloc& ){ 55 | Error ( "Not enough memory for ConfusionMatrix" ); 56 | throw; 57 | } 58 | } 59 | 60 | ConfusionMatrix::~ConfusionMatrix(){ 61 | for ( unsigned int i=0; i <= size; ++i ){ 62 | mat[i].clear(); 63 | } 64 | mat.clear(); 65 | } 66 | 67 | void ConfusionMatrix::Increment( const TargetValue *t1, 68 | const TargetValue *t2 ){ 69 | if ( t2 ){ 70 | if ( t1 ){ 71 | ++mat[t1->Index()-1][t2->Index()-1]; 72 | } 73 | else { 74 | ++mat[size][t2->Index()-1]; 75 | } 76 | } 77 | else { 78 | throw std::out_of_range( "ConfusionMatrix, index out of range" 
); 79 | } 80 | } 81 | 82 | void ConfusionMatrix::Print( ostream& os, 83 | const Targets& targets ) const { 84 | os << "Confusion Matrix:" << endl; 85 | os << " "; 86 | for ( const auto* val : targets.values_array ){ 87 | // Print the class names. 88 | os.width(6); 89 | os.setf(ios::right, ios::adjustfield); 90 | os << val << " "; 91 | } 92 | os << endl; 93 | os << " "; 94 | for ( unsigned int i=0; i < size; ++i ){ 95 | os << "-------"; 96 | } 97 | os << endl; 98 | for ( unsigned int i=0; i < targets.values_array.size(); ++i ){ 99 | os.width(6); 100 | os.setf(ios::right, ios::adjustfield); 101 | os << targets.values_array[i] << " | "; 102 | for ( const auto& mv : mat[i] ){ 103 | os.width(6); 104 | os.setf(ios::right, ios::adjustfield); 105 | os << mv << " "; 106 | } 107 | os << endl; 108 | if ( i == targets.values_array.size() - 1 ){ 109 | os << " -*- | "; 110 | for ( const auto& mv : mat[size] ){ 111 | os.width(6); 112 | os.setf(ios::right, ios::adjustfield); 113 | os << mv << " "; 114 | } 115 | os << endl; 116 | } 117 | } 118 | os << endl; 119 | } 120 | 121 | void pf( ostream& os, size_t d ){ 122 | os.width(4); 123 | os << " \t" << d; 124 | } 125 | 126 | void pf( ostream& os, double d ){ 127 | if ( d < 0 ){ 128 | os << " \t (nan)\t"; 129 | } 130 | else { 131 | os.setf(ios::showpoint); 132 | os << " \t" << d; 133 | } 134 | } 135 | 136 | void ConfusionMatrix::FScore( ostream& os, 137 | const Targets& targets, 138 | bool cs_too ) const { 139 | double maf = 0.0; 140 | double mif = 0.0; 141 | double maa = 0.0; 142 | double mia = 0.0; 143 | ios_base::fmtflags flags = os.flags(ios::fixed); 144 | int oldPrec = os.precision(5); 145 | size_t effF = 0; 146 | size_t testF = 0; 147 | size_t effA = 0; 148 | if ( cs_too ){ 149 | os << "Scores per Value Class:" << endl; 150 | os << "class |\tTP\tFP\tTN\tFN\tprecision\trecall(TPR)\tFPR\t\tF-score\t\tAUC" << endl; 151 | } 152 | for ( unsigned int i=0; i < targets.values_array.size(); ++i ){ 153 | // so we loop over all known 
(trained) target values 154 | size_t TP = 0; 155 | size_t FP = 0; 156 | size_t FN = 0; 157 | size_t TN = 0; 158 | const ValueClass *tv = targets.values_array[i]; 159 | size_t testCount = 0; 160 | for ( unsigned int j=0; j < size; ++j ){ 161 | testCount += mat[i][j]; 162 | if ( i == j ){ 163 | TP = mat[i][j]; 164 | } 165 | else { 166 | FN += mat[i][j]; 167 | } 168 | } 169 | testF += testCount; 170 | for ( unsigned int j=0; j <= size; ++j ){ 171 | if ( j != i ){ 172 | FP += mat[j][i]; 173 | } 174 | } 175 | for ( unsigned int j=0; j <= size; ++j ){ 176 | if ( j != i ){ 177 | for ( unsigned int k=0; k < size; ++k ){ 178 | if ( k != i ){ 179 | TN += mat[j][k]; 180 | } 181 | } 182 | } 183 | } 184 | double precision; 185 | if ( TP + FP == 0 ){ 186 | precision = -1; 187 | } 188 | else { 189 | precision = TP / double(TP + FP); 190 | } 191 | double TPR; 192 | if ( TP + FN == 0 ){ 193 | TPR = -1; 194 | } 195 | else { 196 | TPR = TP / double(TP + FN); 197 | } 198 | double FPR; 199 | if ( FP + TN == 0 ){ 200 | FPR = -1; 201 | } 202 | else { 203 | FPR = FP / double(FP + TN); 204 | } 205 | double f_score; 206 | if ( precision < 0 || TPR < 0 || 207 | fabs(precision + TPR) < Epsilon ){ 208 | f_score = -1; 209 | } 210 | else { 211 | f_score = ( 2 * precision * TPR ) / (precision + TPR ); 212 | ++effF; 213 | maf += f_score; 214 | mif += (f_score * testCount); 215 | } 216 | double AUC; 217 | if ( TPR < 0 || FPR < 0 ){ 218 | AUC = -1; 219 | } 220 | else { 221 | AUC = ( 0.5 * TPR * FPR ) + ( TPR * ( 1.0 - FPR ) ) + 222 | ( 0.5 * ( ( 1.0 - TPR ) * ( 1.0 - FPR ) ) ); 223 | ++effA; 224 | maa += AUC; 225 | mia += (AUC * testCount); 226 | } 227 | if ( cs_too ){ 228 | os.width( 6 ); 229 | os << tv << " | "; 230 | os.width(0); 231 | pf(os,TP); 232 | pf(os,FP); 233 | pf(os,TN); 234 | pf(os,FN); 235 | pf(os,precision); 236 | pf(os,TPR); 237 | pf(os,FPR); 238 | pf(os,f_score); 239 | pf(os,AUC); 240 | os << endl; 241 | } 242 | } 243 | maf = maf / effF; 244 | mif = mif / testF; 245 | maa = maa / 
effA; 246 | mia = mia / testF; 247 | os.precision( oldPrec ); 248 | os.flags( flags ); 249 | os << "F-Score beta=1, microav: " << mif << endl; 250 | os << "F-Score beta=1, macroav: " << maf << endl; 251 | os << "AUC, microav: " << mia << endl; 252 | os << "AUC, macroav: " << maa << endl; 253 | } 254 | 255 | void ConfusionMatrix::merge( const ConfusionMatrix *cm ){ 256 | if ( cm ){ 257 | for ( size_t i=0; i <= size; ++i ){ 258 | for ( size_t j=0; j < size; ++j ){ 259 | mat[i][j] += cm->mat[i][j]; 260 | } 261 | } 262 | } 263 | } 264 | 265 | void StatisticsClass::merge( const StatisticsClass& in ){ 266 | _data += in._data; 267 | _skipped += in._skipped; 268 | _correct += in._correct; 269 | _tieOk += in._tieOk; 270 | _tieFalse += in._tieFalse; 271 | _exact += in._exact; 272 | } 273 | 274 | } 275 | -------------------------------------------------------------------------------- /src/StringOps.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 
21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include 34 | #include "ticcutils/StringOps.h" 35 | #include "timbl/StringOps.h" 36 | #include "unicode/ustream.h" 37 | 38 | using namespace std; 39 | using namespace icu; 40 | 41 | namespace Timbl { 42 | 43 | UnicodeString StrToCode( const UnicodeString &par, bool trim ){ 44 | UnicodeString In = par; 45 | // cerr << "string to code IN: '" << In << "'" << endl; 46 | if ( trim ){ 47 | In.trim(); 48 | } 49 | UnicodeString Out; 50 | for ( int i=0; i < In.length(); ++i ){ 51 | switch ( In[i] ){ 52 | case ' ': 53 | Out += '\\'; 54 | Out += '_'; 55 | break; 56 | case '\t': 57 | Out += '\\'; 58 | Out += 't'; 59 | break; 60 | case '\\': 61 | Out += '\\'; 62 | Out += '\\'; 63 | break; 64 | default: 65 | Out += In[i]; 66 | } 67 | } 68 | // cerr << "string to code Out: '" << Out << "'" << endl; 69 | return Out; 70 | } 71 | 72 | UnicodeString CodeToStr( const UnicodeString& in ){ 73 | UnicodeString out; 74 | for( int i=0; i < in.length(); ++i ){ 75 | if ( in[i] == '\\' ){ 76 | ++i; 77 | if ( i == in.length() ){ 78 | out += '\\'; 79 | break; 80 | } 81 | else { 82 | switch ( in[i] ){ 83 | case '_': 84 | out += ' '; 85 | break; 86 | case '\\': 87 | out += '\\'; 88 | break; 89 | case 't': 90 | out += '\t'; 91 | break; 92 | default: 93 | out += '\\'; 94 | out += in[i]; 95 | } 96 | } 97 | } 98 | else { 99 | out += in[i]; 100 | } 101 | } 102 | return out; 103 | } 104 | 105 | bool nocase_cmp( char c1, char c2 ){ 106 | return toupper(c1) == toupper(c2); 107 | } 108 | 109 | bool compare_nocase( const string& s1, const string& s2 ){ 110 | if ( s1.size() == s2.size() && 111 | equal( s1.begin(), s1.end(), s2.begin(), nocase_cmp ) ){ 112 | return true; 113 | } 114 | else { 115 | return false; 116 | } 117 | } 118 | 119 | bool compare_nocase_n( const string& 
s1, const string& s2 ){ 120 | if ( s1.size() <= s2.size() && 121 | equal( s1.begin(), s1.end(), s2.begin(), nocase_cmp ) ){ 122 | return true; 123 | } 124 | else { 125 | return false; 126 | } 127 | } 128 | 129 | string correct_path( const string& filename, 130 | const string& path, 131 | bool keep_origpath ){ 132 | // if filename contains pathinformation, it is replaced with path, except 133 | // when keep_origpath is true. 134 | // if filename contains NO pathinformation, path is always appended. 135 | // of course we don't append if the filename is empty or just '-' ! 136 | 137 | if ( path != "" && filename != "" && filename[0] != '-' ){ 138 | bool add_slash = path.back() != '/'; 139 | string result = path; 140 | if ( add_slash ){ 141 | result += "/"; 142 | } 143 | string::size_type pos = filename.rfind( '/' ); 144 | if ( pos == string::npos ){ 145 | result += filename; 146 | } 147 | else if ( keep_origpath ){ 148 | result += filename; 149 | } 150 | else { 151 | result += filename.substr( pos+1 ); 152 | } 153 | return result; 154 | } 155 | else { 156 | return filename; 157 | } 158 | } 159 | 160 | } // namespace Timbl 161 | -------------------------------------------------------------------------------- /src/Testers.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 
18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | #include 28 | #include 29 | #include 30 | 31 | #include "timbl/Common.h" 32 | #include "timbl/Types.h" 33 | #include "timbl/Instance.h" 34 | #include "timbl/Metrics.h" 35 | #include "timbl/Testers.h" 36 | 37 | using namespace std; 38 | using Common::Epsilon; 39 | using Common::Log2; 40 | 41 | namespace Timbl{ 42 | 43 | //#define DBGTEST 44 | //#define DBGTEST_DOT 45 | 46 | double overlapTestFunction::test( const FeatureValue *F, 47 | const FeatureValue *G, 48 | const Feature *Feat ) const { 49 | #ifdef DBGTEST 50 | cerr << "overlap_distance(" << F << "," << G << ") = "; 51 | #endif 52 | double result = Feat->fvDistance( F, G ); 53 | #ifdef DBGTEST 54 | cerr << result; 55 | #endif 56 | result *= Feat->Weight(); 57 | #ifdef DBGTEST 58 | cerr << " gewogen " << result << endl; 59 | #endif 60 | return result; 61 | } 62 | 63 | double valueDiffTestFunction::test( const FeatureValue *F, 64 | const FeatureValue *G, 65 | const Feature *Feat ) const { 66 | #ifdef DBGTEST 67 | cerr << TiCC::toString(Feat->getMetricType()) << "_distance(" << F << "," << G << ") = "; 68 | #endif 69 | double result = Feat->fvDistance( F, G, threshold ); 70 | #ifdef DBGTEST 71 | cerr << result; 72 | #endif 73 | result *= Feat->Weight(); 74 | #ifdef DBGTEST 75 | cerr << " gewogen " << result << endl; 76 | #endif 77 | return result; 78 | } 79 | 80 | TesterClass* getTester( MetricType m, 81 | const Feature_List& features, 82 | int mvdThreshold ){ 83 | if ( m == Cosine ){ 84 | return new CosineTester( features ); 85 | } 86 | else if ( m == DotProduct ){ 87 | return new DotProductTester( features ); 88 | } 89 | else { 90 | return new DistanceTester( features, mvdThreshold ); 91 | } 92 | } 93 | 94 | 
TesterClass::TesterClass( const Feature_List& features ): 95 | _size(features.feats.size()), 96 | effSize(_size), 97 | offSet(0), 98 | FV(0), 99 | features(features.feats), 100 | permutation(features.permutation) 101 | { 102 | permFeatures.resize(_size,0); 103 | #ifdef DBGTEST 104 | cerr << "created TesterClass(" << _size << ")" << endl; 105 | #endif 106 | for ( size_t j=0; j < _size; ++j ){ 107 | permFeatures[j] = features.feats[features.permutation[j]]; 108 | } 109 | distances.resize(_size+1, 0.0); 110 | } 111 | 112 | void TesterClass::init( const Instance& inst, 113 | size_t effective, 114 | size_t oset ){ 115 | #ifdef DBGTEST 116 | cerr << "tester Initialized!" << endl; 117 | #endif 118 | effSize = effective-oset; 119 | offSet = oset; 120 | FV = &inst.FV; 121 | } 122 | 123 | DistanceTester::~DistanceTester(){ 124 | for ( const auto& it : metricTest ){ 125 | delete it; 126 | } 127 | } 128 | 129 | DistanceTester::DistanceTester( const Feature_List& features, 130 | int mvdmThreshold ): 131 | TesterClass( features ){ 132 | #ifdef DBGTEST 133 | cerr << "create a tester with threshold = " << mvdmThreshold << endl; 134 | #endif 135 | metricTest.resize(_size,0); 136 | for ( size_t i=0; i < _size; ++i ){ 137 | #ifdef DBGTEST 138 | cerr << "set metric[" << i+1 << "]=" << TiCC::toString(features.feats[i]->getMetricType()) << endl; 139 | #endif 140 | if ( features[i]->Ignore() ) 141 | continue; 142 | if ( features[i]->isStorableMetric() ){ 143 | #ifdef DBGTEST 144 | cerr << "created valueDiffTestFunction " << endl; 145 | #endif 146 | metricTest[i] = new valueDiffTestFunction( mvdmThreshold ); 147 | } 148 | else { 149 | #ifdef DBGTEST 150 | cerr << "created overlapFunction " << endl; 151 | #endif 152 | metricTest[i] = new overlapTestFunction(); 153 | } 154 | } 155 | } 156 | 157 | size_t DistanceTester::test( const vector& G, 158 | size_t CurPos, 159 | double Threshold ) { 160 | size_t i; 161 | size_t TrueF; 162 | for ( i=CurPos, TrueF = i + offSet; i < effSize; ++i,++TrueF 
){ 163 | #ifdef DBGTEST 164 | cerr << "feature " << TrueF << " (perm=" << permutation[TrueF] 165 | << ")" << endl; 166 | #endif 167 | double result = metricTest[permutation[TrueF]]->test( (*FV)[TrueF], 168 | G[i], 169 | permFeatures[TrueF] ); 170 | distances[i+1] = distances[i] + result; 171 | if ( distances[i+1] > Threshold ){ 172 | #ifdef DBGTEST 173 | cerr << "threshold reached at " << i << " distance=" 174 | << distances[i+1] << endl; 175 | #endif 176 | return i; 177 | } 178 | } 179 | #ifdef DBGTEST 180 | cerr << "threshold reached at end, distance=" << distances[effSize] << endl; 181 | #endif 182 | return effSize; 183 | } 184 | 185 | double DistanceTester::getDistance( size_t pos ) const{ 186 | return distances[pos]; 187 | } 188 | 189 | inline bool FV_to_real( const FeatureValue *FV, 190 | double &result ){ 191 | if ( FV ){ 192 | if ( TiCC::stringTo( FV->name(), result ) ){ 193 | return true; 194 | } 195 | } 196 | return false; 197 | } 198 | 199 | double innerProduct( const FeatureValue *FV, 200 | const FeatureValue *G ) { 201 | double r1=0, r2=0, result; 202 | #ifdef DBGTEST_DOT 203 | cerr << "innerproduct " << FV << " x " << G << endl; 204 | #endif 205 | if ( FV_to_real( FV, r1 ) && 206 | FV_to_real( G, r2 ) ){ 207 | #ifdef DBGTEST_DOT 208 | cerr << "innerproduct " << r1 << " x " << r2 << endl; 209 | #endif 210 | result = r1 * r2; 211 | } 212 | else { 213 | result = 0.0; 214 | } 215 | #ifdef DBGTEST_DOT 216 | cerr << " resultaat == " << result << endl; 217 | #endif 218 | return result; 219 | } 220 | 221 | size_t CosineTester::test( const vector& G, 222 | size_t, 223 | double ){ 224 | double denom1 = 0.0; 225 | double denom2 = 0.0; 226 | double result = 0.0; 227 | size_t TrueF; 228 | size_t i; 229 | for ( i=0, TrueF = i + offSet; i < effSize; ++i,++TrueF ){ 230 | double W = permFeatures[TrueF]->Weight(); 231 | denom1 += innerProduct( (*FV)[TrueF], (*FV)[TrueF] ) * W; 232 | denom2 += innerProduct( G[i], G[i] ) * W; 233 | result += innerProduct( (*FV)[TrueF], 
G[i] ) * W; 234 | } 235 | double denom = sqrt( denom1 * denom2 ); 236 | distances[effSize] = result/ (denom + Common::Epsilon); 237 | #ifdef DBGTEST 238 | cerr << "denom1 " << denom1 << endl; 239 | cerr << "denom2 " << denom2 << endl; 240 | cerr << "denom " << denom << endl; 241 | cerr << "result " << result << endl; 242 | cerr << "cosine::test() distance " << distances[effSize] << endl; 243 | #endif 244 | return effSize; 245 | } 246 | 247 | size_t DotProductTester::test( const vector& G, 248 | size_t, 249 | double ) { 250 | size_t TrueF; 251 | size_t i; 252 | for ( i=0, TrueF = i + offSet; i < effSize; ++i,++TrueF ){ 253 | double result = innerProduct( (*FV)[TrueF], G[i] ); 254 | result *= permFeatures[TrueF]->Weight(); 255 | distances[i+1] = distances[i] + result; 256 | #ifdef DBGTEST 257 | cerr << "gewogen result " << result << endl; 258 | cerr << "dot::test() distance[" << i+1 << "]=" << distances[i+1] << endl; 259 | #endif 260 | } 261 | return effSize; 262 | } 263 | 264 | double CosineTester::getDistance( size_t pos ) const{ 265 | #ifdef DBGTEST 266 | cerr << "getDistance, maxSim = " << 1.0 << endl; 267 | cerr << " distances[" << pos << "]= " << distances[pos] << endl; 268 | #endif 269 | return 1.0 - distances[pos]; 270 | } 271 | 272 | double DotProductTester::getDistance( size_t pos ) const{ 273 | #ifdef DBGTEST_DOT 274 | cerr << "getDistance, maxSim = " << std::numeric_limits::max() << endl; 275 | cerr << " distances[" << pos << "]= " << distances[pos] << endl; 276 | #endif 277 | return (std::numeric_limits::max() - distances[pos])/std::numeric_limits::max();; 278 | } 279 | 280 | } 281 | -------------------------------------------------------------------------------- /src/Types.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; 
you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "timbl/Types.h" 36 | 37 | namespace Timbl { 38 | using std::string; 39 | using std::vector; 40 | 41 | // initializers 42 | 43 | const string AlgorithmName[][2] = { { "Unknown", "Unknown Algorithm" }, 44 | { "IB1", "Memory Based Learning" }, 45 | { "IB2", "Adapted Memory Based Learning"}, 46 | { "IGTree", "Information Gain Tree" }, 47 | { "TRIBL", "Tree IB1" }, 48 | { "TRIBL2", "Tribl 2" }, 49 | { "LOO", "Leave One Out" }, 50 | { "CV", "Cross Validate" } }; 51 | 52 | const string MetricName[][2] = { { "U", "Unknown Metric" }, 53 | { "I", "Ignore" }, 54 | { "N", "Numeric" }, 55 | { "D", "Dot product" }, 56 | { "C", "Cosine metric" }, 57 | { "O", "Overlap" }, 58 | { "L", "Levenshtein" }, 59 | { "DC", "Dice coefficient" }, 60 | { "M", "Value Difference" }, 61 | { "J", "Jeffrey Divergence" }, 62 | { "S", "Jensen-Shannon Divergence" }, 63 | { "E", "Euclidean Distance" } }; 64 | 65 | const string WeightName[][2] = { { "un", "Unknown Weighting" }, 66 | { "nw", "No Weighting" }, 67 | { "gr", "GainRatio" }, 68 | { "ig", "InfoGain" }, 69 | { "x2", "Chi-square" }, 70 | { "sv", "Shared Variance" }, 71 | { "sd", "Standard 
Deviation" }, 72 | { "ud", "User Defined"} }; 73 | 74 | const string DecayName[][2] = { { "Unknown", "Unknown Decay" }, 75 | { "Z", "Zero Decay" }, 76 | { "ID", "Inverse Distance" }, 77 | { "IL", "Inverse Linear Distance" }, 78 | { "ED", "Exponential Decay" } }; 79 | 80 | const string SmoothingName[][2] = { { "Unknown", "Unknown Smoothing" }, 81 | { "Default", "Default Smoothing" }, 82 | { "L", "Lidstone Smoothing" } }; 83 | 84 | const string OrdeningName[][2] = { { "Unknown", "Unknown Ordering" }, 85 | { "UDO", "Data File Ordering" }, 86 | { "DO", "Default Ordering" }, 87 | { "GRO", "GainRatio" }, 88 | { "IGO", "InformationGain" }, 89 | { "1/V", "Inverse Values" }, 90 | { "1/S", "Inverse SplitInfo" }, 91 | { "G/V", "GainRatio/Values" }, 92 | { "I/V", "InformationGain/Values" }, 93 | { "GxE", "GainRatio*Entropy" }, 94 | { "IxE", "InformationGain*Entropy" }, 95 | { "X2O", "Chi-Squared" }, 96 | { "SVO", "Shared Variance" }, 97 | { "SDO", "Standard Deviation" }, 98 | { "X/V", "Chi-Squared/Values" }, 99 | { "S/V", "Shared Variance/Values" }, 100 | { "SD/V", "Standard Deviation/Values" } }; 101 | 102 | const string InputFormatName[][2] = { 103 | { "Unknown", "Unknown Input Format" }, 104 | { "Compact", "Compact" }, 105 | { "C45", "C4.5" }, 106 | { "Column", "Columns" }, 107 | { "Tabbed", "Tabbed" }, 108 | { "ARFF", "ARFF" }, 109 | { "BINARY", "Sparse Binary" }, 110 | { "SPARSE", "Sparse" } }; 111 | 112 | const string VerbosityName[][2] = { { "Unknown", "erroneous" }, 113 | { "S", "Silent" }, 114 | { "O", "Options" }, 115 | { "F", "Feature_Statistics" }, 116 | { "P", "Probability_arrays" }, 117 | { "E", "Exact_match" }, 118 | { "DI", "Distances" }, 119 | { "DB", "Distribution" }, 120 | { "N", "Nearest_Neighbours" }, 121 | { "AS", "Advanced_Statistics" }, 122 | { "CM", "Confusion_Matrix" }, 123 | { "CS", "Class_Statistics" }, 124 | { "CD", "Client_Debug" }, 125 | { "K", "All_K_values" }, 126 | { "MD", "MatchingDepth" }, 127 | { "B", "BranchingFactor" }, 128 | { "CF", 
"Confidence" }, 129 | // Verbosity is special! 130 | // should end with "" strings! 131 | { "", "" } }; 132 | 133 | const string NormalisationName[][2] = { 134 | { "Unknown", "Unknown normalisation" }, 135 | { "None", "No Normalisation" }, 136 | { "Probability", "Normalise to 100%" }, 137 | { "AddFactor", "Add a factor to all targets, then normalise to 100%" }, 138 | { "LogProbability", "Take 10log, then Normalise to 100%" } 139 | }; 140 | 141 | WeightType charToWeig( char w ){ 142 | switch ( w ){ 143 | case '0': 144 | return No_w; 145 | case '1': 146 | return GR_w; 147 | case '2': 148 | return IG_w; 149 | case '3': 150 | return X2_w; 151 | case '4': 152 | return SV_w; 153 | case '5': 154 | return SD_w; 155 | default: 156 | return Unknown_w; 157 | } 158 | } 159 | 160 | AlgorithmType charToAlg( char a ){ 161 | switch ( a ){ 162 | case '0': 163 | return IB1_a; 164 | case '1': 165 | return IGTREE_a; 166 | case '2': 167 | return TRIBL_a; 168 | case '3': 169 | return IB2_a; 170 | case '4': 171 | return TRIBL2_a; 172 | default: 173 | return Unknown_a; 174 | } 175 | } 176 | 177 | normType charToNorm( char a ){ 178 | switch ( a ){ 179 | case '0': 180 | return probabilityNorm; 181 | case '1': 182 | return addFactorNorm; 183 | case '2': 184 | return logProbNorm; 185 | default: 186 | return unknownNorm; 187 | } 188 | } 189 | 190 | } 191 | -------------------------------------------------------------------------------- /src/neighborSet.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 
13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include 29 | 30 | #include "timbl/Common.h" 31 | #include "timbl/Types.h" 32 | #include "timbl/Instance.h" 33 | #include "timbl/neighborSet.h" 34 | 35 | namespace Timbl { 36 | 37 | using namespace std; 38 | using namespace Common; 39 | 40 | neighborSet::neighborSet(): showDistance(false),showDistribution(false){} 41 | 42 | neighborSet::~neighborSet(){ 43 | clear(); 44 | } 45 | 46 | neighborSet::neighborSet( const neighborSet& in ){ 47 | showDistance = in.showDistance; 48 | showDistribution = in.showDistribution; 49 | merge( in ); 50 | } 51 | 52 | neighborSet& neighborSet::operator=( const neighborSet& in ){ 53 | if ( this != &in ){ 54 | clear(); 55 | showDistance = in.showDistance; 56 | showDistribution = in.showDistribution; 57 | merge( in ); 58 | } 59 | return *this; 60 | } 61 | 62 | size_t neighborSet::size() const{ 63 | return distances.size(); 64 | } 65 | 66 | void neighborSet::clear(){ 67 | distances.clear(); 68 | for ( auto const& db : distributions ){ 69 | delete db; 70 | } 71 | distributions.clear(); 72 | } 73 | 74 | void neighborSet::reserve( size_t s ){ 75 | distances.reserve( s ); 76 | distributions.reserve( s ); 77 | } 78 | 79 | void neighborSet::truncate( size_t len ){ 80 | if ( len < distributions.size() ){ 81 | for ( size_t i=len; i < distributions.size(); ++i ){ 82 | delete distributions[i]; 83 | } 84 | distributions.resize( len ); 85 | distances.resize( len); 86 | } 87 | } 88 | 89 | void 
neighborSet::push_back( double d, const ClassDistribution &dist ){ 90 | distances.push_back( d ); 91 | distributions.push_back( dist.to_VD_Copy() ); 92 | } 93 | 94 | void neighborSet::merge( const neighborSet& s ){ 95 | // reserve enough space to avoid reallocations 96 | // reallocation invalidates pointers! 97 | reserve( size() + s.size() ); 98 | auto dit1 = distances.begin(); 99 | auto dit2 = s.distances.begin(); 100 | auto dis1 = distributions.begin(); 101 | auto dis2 = s.distributions.begin(); 102 | while ( dit1 != distances.end() ){ 103 | if ( dit2 != s.distances.end() ){ 104 | if (fabs(*dit1 - *dit2) < Epsilon) { 105 | // equal 106 | (*dis1)->Merge( **dis2 ); 107 | ++dit1; 108 | ++dis1; 109 | ++dit2; 110 | ++dis2; 111 | } 112 | else if ( *dit1 < *dit2 ){ 113 | ++dit1; 114 | ++dis1; 115 | } 116 | else { 117 | dit1 = distances.insert( dit1, *dit2 ); 118 | ++dit1; 119 | ++dit2; 120 | dis1 = distributions.insert( dis1, (*dis2)->to_VD_Copy() ); 121 | ++dis1; 122 | ++dis2; 123 | } 124 | } 125 | else { 126 | break; 127 | } 128 | } 129 | while ( dit2 != s.distances.end() ){ 130 | distances.push_back( *dit2 ); 131 | ++dit2; 132 | distributions.push_back( (*dis2)->to_VD_Copy() ); 133 | ++dis2; 134 | } 135 | } 136 | 137 | double neighborSet::relativeWeight( const decayStruct *d, 138 | size_t k ) const{ 139 | double result = 1.0; 140 | if ( !d ){ 141 | return result; 142 | } 143 | switch ( d->type() ){ 144 | case Zero: 145 | break; 146 | case InvDist: 147 | result = 1.0/(distances[k] + Epsilon); 148 | break; 149 | case InvLinear: 150 | if ( k > 0 && size() != 1 ){ 151 | double nearest_dist, furthest_dist; 152 | nearest_dist = distances[0]; 153 | furthest_dist = distances[size()-1]; 154 | result = (furthest_dist - distances[k]) / 155 | (furthest_dist-nearest_dist); 156 | } 157 | break; 158 | case ExpDecay: 159 | result = exp(-d->alpha*pow(distances[k], d->beta)); 160 | if ( result == 0 ){ 161 | // A result of zero is undesirable. 
(bug 89) 162 | // We optimisticly replace it with Epsilon 163 | result = Epsilon; 164 | } 165 | break; 166 | default: 167 | throw std::logic_error( "wrong value in switch" ); 168 | } 169 | return result; 170 | } 171 | 172 | double neighborSet::getDistance( size_t n ) const { 173 | if ( size() <= n ){ 174 | throw std::range_error( "getDistance() parameter exceeds size of neighborSet" ); 175 | } 176 | return distances[n]; 177 | } 178 | 179 | const ClassDistribution *neighborSet::getDistribution( size_t n ) const { 180 | if ( size() <= n ){ 181 | throw std::range_error( "getDistribution() parameter exceeds size of neighborSet" ); 182 | } 183 | return distributions[n]; 184 | } 185 | 186 | WClassDistribution *neighborSet::bestDistribution( const decayStruct *d, 187 | size_t max ) const { 188 | // Analyse the set to find THE best ClassDistribution. 189 | // For each neighbor, we loop over the number of bests in that 190 | // bin, and merge that distribution into the result 191 | // 192 | WClassDistribution *result = new WClassDistribution(); 193 | size_t stop = distributions.size(); 194 | stop = ( max > 0 && max < stop ? 
max : stop ); 195 | for ( size_t k = 0; k < stop; ++k ) { 196 | result->MergeW( *distributions[k], relativeWeight( d, k ) ); 197 | } 198 | return result; 199 | } 200 | 201 | ostream& operator<<( ostream& os, const neighborSet& set ){ 202 | for ( unsigned int i=0; i < set.size(); ++i ){ 203 | os << "# k=" << i+1; 204 | if ( set.showDistribution ){ 205 | os << "\t" << set.distributions[i]->DistToStringW(0); 206 | } 207 | if ( set.showDistance ){ 208 | int OldPrec = os.precision(DBL_DIG-1); 209 | os.setf(ios::showpoint); 210 | os << "\t" << set.distances[i]; 211 | os.precision(OldPrec); 212 | } 213 | os << endl; 214 | } 215 | return os; 216 | } 217 | 218 | ostream& operator<<( ostream& os, const neighborSet *Set ){ 219 | os << *Set; 220 | return os; 221 | } 222 | 223 | ostream& operator<<( ostream& os, const decayStruct& dc ){ 224 | return dc.put( os ); 225 | } 226 | 227 | ostream& zeroDecay::put( ostream& os ) const { 228 | return os; 229 | } 230 | 231 | ostream& invLinDecay::put( ostream& os ) const { 232 | os << "Decay : " << TiCC::toString( type(), true); 233 | return os; 234 | } 235 | 236 | ostream& invDistDecay::put( ostream& os ) const { 237 | os << "Decay : " << TiCC::toString( type(), true); 238 | return os; 239 | } 240 | 241 | ostream& expDecay::put( ostream& os ) const { 242 | os << "Decay : " << TiCC::toString( type(), true); 243 | os << " a=" << alpha << " b= " << beta; 244 | return os; 245 | } 246 | 247 | ostream& operator<<( ostream& os, const decayStruct *dc ){ 248 | if ( dc ){ 249 | os << *dc; 250 | } 251 | return os; 252 | } 253 | 254 | } 255 | -------------------------------------------------------------------------------- /src/simpletest.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 1998 - 2024 3 | ILK - Tilburg University 4 | CLST - Radboud University 5 | CLiPS - University of Antwerp 6 | 7 | This file is part of timbl 8 | 9 | timbl is free software; you can redistribute it and/or 
modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation; either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | timbl is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, see . 21 | 22 | For questions and suggestions, see: 23 | https://github.com/LanguageMachines/timbl/issues 24 | or send mail to: 25 | lamasoftware (at ) science.ru.nl 26 | */ 27 | 28 | #include "timbl/TimblAPI.h" 29 | #include 30 | 31 | int main(){ 32 | std::string path = std::getenv( "topsrcdir" ); 33 | std::cerr << path << std::endl; 34 | std::cerr << "version: " << Timbl::VersionName() << std::endl; 35 | Timbl::AlgorithmType alg = Timbl::charToAlg( '0' ); 36 | assert( alg == Timbl::IB1_a ); 37 | Timbl::normType nor = Timbl::charToNorm( '0' ); 38 | assert( nor == Timbl::probabilityNorm ); 39 | Timbl::WeightType w = Timbl::charToWeig( '0' ); 40 | assert( w == Timbl::No_w ); 41 | Timbl::TimblAPI exp( "+vdi+db", "test1" ); 42 | if ( exp.isValid() ){ 43 | exp.Learn( path + "/demos/dimin.train" ); 44 | if ( exp.isValid() ){ 45 | exp.Test( path + "/demos/dimin.test", "dimin.out" ); 46 | if ( exp.isValid() ){ 47 | return EXIT_SUCCESS; 48 | } 49 | } 50 | } 51 | return EXIT_FAILURE; 52 | } 53 | -------------------------------------------------------------------------------- /timbl.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | libdir=@libdir@ 4 | includedir=@includedir@ 5 | openmpflags=@OPENMP_CXXFLAGS@ 6 | 7 | Name: timbl 8 | Version: @VERSION@ 9 | Description: timbl library. 
10 | Requires.private: libxml-2.0 11 | Libs: -L${libdir} -ltimbl 12 | Libs.private: @LIBS@ ${openmpflags} 13 | Cflags: -I${includedir} 14 | --------------------------------------------------------------------------------