CRFSuite: A fast implementation of Conditional Random Fields (CRFs)

├── AUTHORS ├── COPYING ├── ChangeLog ├── INSTALL ├── Makefile.am ├── README ├── autogen.sh ├── bench ├── accuracy.py ├── bench.py ├── bench_crfpp.py ├── bench_crfsgd.py ├── bench_crfsuite-0.11.py ├── bench_crfsuite.py ├── bench_mallet.py ├── bench_wapiti.py ├── collect.py ├── crfsuite_to_mallet.py └── plot_performance.py ├── configure.in ├── crfsuite.sln ├── doc ├── Doxyfile ├── footer.html └── header.html ├── example ├── chunking.py ├── crfutils.py ├── ner.py ├── pos.py └── template.py ├── frontend ├── Makefile.am ├── crfsuite.1 ├── dump.c ├── frontend.vcxproj ├── iwa.c ├── iwa.h ├── learn.c ├── main.c ├── option.c ├── option.h ├── readdata.h ├── reader.c └── tag.c ├── genbinary.sh.in ├── include ├── Makefile.am ├── crfsuite.h ├── crfsuite.hpp ├── crfsuite_api.hpp └── os.h ├── lib ├── cqdb │ ├── COPYING │ ├── Makefile.am │ ├── cqdb.vcxproj │ ├── doc │ │ ├── doxyfile │ │ └── footer.html │ ├── include │ │ └── cqdb.h │ ├── makedist.sh │ └── src │ │ ├── cqdb.c │ │ ├── lookup3.c │ │ └── main.c └── crf │ ├── Makefile.am │ ├── crf.vcxproj │ └── src │ ├── crf1d.h │ ├── crf1d_context.c │ ├── crf1d_encode.c │ ├── crf1d_feature.c │ ├── crf1d_model.c │ ├── crf1d_tag.c │ ├── crfsuite.c │ ├── crfsuite_internal.h │ ├── crfsuite_train.c │ ├── dataset.c │ ├── dictionary.c │ ├── holdout.c │ ├── logging.c │ ├── logging.h │ ├── params.c │ ├── params.h │ ├── quark.c │ ├── quark.h │ ├── rumavl.c │ ├── rumavl.h │ ├── train_arow.c │ ├── train_averaged_perceptron.c │ ├── train_l2sgd.c │ ├── train_lbfgs.c │ ├── train_passive_aggressive.c │ └── vecmath.h ├── swig ├── Makefile.am ├── crfsuite.cpp ├── export.i ├── perl │ ├── Makefile.PL.in │ ├── prepare.sh │ ├── sample_tag.pl │ └── sample_train.pl ├── python │ ├── README │ ├── crfsuite.py │ ├── export_wrap.cpp │ ├── export_wrap.h │ ├── prepare.sh │ ├── sample_tag.py │ ├── sample_train.py │ └── setup.py.in └── ruby │ ├── README │ ├── extconf.rb │ ├── prepare.sh │ ├── sample_tag.rb │ └── sample_train.rb └── win32 ├── inttypes.h ├── liblbfgs ├── lbfgs.h └── lbfgs.lib └── stdint.h /AUTHORS: -------------------------------------------------------------------------------- 1 | Naoaki Okazaki 2 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The BSD license. 2 | 3 | Copyright (c) 2007-2010, Naoaki Okazaki 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the names of the authors nor the names of its contributors 14 | may be used to endorse or promote products derived from this 15 | software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 21 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2011-08-11 Naoaki Okazaki 2 | * CRFsuite 0.12 3 | - [CORE] Optimized the implementation for faster training; approximately 1.4-1.5 x speed up. 4 | - [CORE] Faster routine for computing exp(x) using SSE2. 5 | - [CORE] Restructured the source code to separate routines for CRF graphical models and training algorithms; this is an initial attempt for implementing CRFs with different feature types (e.g., 2nd-order CRF, 1st-order transition features conditioned on observations) and different training algorithms. 6 | - [CORE] Implemented new training algorithms: Averaged Perceptron, Passive Aggressive, and Adaptive Regularization of Weights (AROW). 7 | - [CORE] Removed automatic generation of BOS/EOS features; one can use these features by inserting attributes to the first/last items (e.g., "__BOS__" at the first item and "__EOS__" at the last item). 8 | - [CORE] Fixed some memory-leak problems. 9 | - [CORE] Reduced memory usage in training. 10 | - [CORE] Fixed a crash problem when the model file does not exist in tagging. 11 | - [FRONTEND:LEARN] Training and test sets are maintained by group numbers; specify the group number for hold-out evaluation with "-e" option. 12 | - [FRONTEND:LEARN] Training algorithm is now specified by "-a" option instead of "-p algorithm=". 13 | - [FRONTEND:LEARN] Renamed some training parameters; for example, an L2 regularization coefficient is specified by "c2" instead of "regularization.sigma" (c2 = 0.5 / sigma * sigma; c1 = 1.0 / sigma). 14 | - [FRONTEND:LEARN] Show the list of parameters, default values, and descriptions with "-H" option. 15 | - [FRONTEND:LEARN] Removed the support of comment lines for simplicity; one may forget to escape '#' characters in a data set. CRFsuite now does not handle '#' as a special character. 16 | - [FRONTEND:TAGGER] Output probabilities of predicted sequences with "-p" option. 17 | - [FRONTEND:TAGGER] Output marginal probabilities of predicted items with "-i" option. 18 | - [API] Numerous changes in API for the enhancements. 19 | - [API] Renamed the library name "libcrf" to "libcrfsuite". 20 | - [API] Renamed the prefix "crf_" to "crfsuite_" in structure and function names. 21 | - [API] Implemented a high-level and easy-to-use API for C++/SWIG (crfsuite.hpp and crfsuite_api.hpp). 22 | - [API] Implemented the Python SWIG module and sample programs; writing a tagger is very easy with this module. 23 | - [SAMPLE] Rewritten samples. 24 | - [SAMPLE] A sample program (template.py) for using feature templates that are compatible with CRF++. 25 | - [SAMPLE] New samples in example directory: Named Entity Recognition (ner.py) using the CoNLL2003 data set, and part-of-speech tagging (pos.py). 26 | - [OTHER] Updated the MSVC solution file to MSVC 2010. 27 | 28 | 29 | 2010-07-16 Naoaki Okazaki 30 | * CRFsuite 0.11 31 | - Renamed crf.h into crfsuite.h to avoid possible conflects in include directories 32 | - Install crfsuite.h to the include directory (suggested by Ingo Glöckner) 33 | 34 | 35 | 2010-01-29 Naoaki Okazaki 36 | * CRFsuite 0.10 37 | - A patch submitted by Hiroshi Manabe (at Kodensha Co., Ltd.) to fix memory leak problems in the tagger. 38 | - Added a new option -r (--reference) for the tagger to output reference labels in parallel with predicted labels. 39 | 40 | 41 | 2009-09-24 Naoaki Okazaki 42 | * CRFsuite 0.9 43 | - Fixed a build problem with liblbfgs 1.8 44 | 45 | 46 | 2009-03-17 Naoaki Okazaki 47 | * CRFsuite 0.8 48 | - Improved the portability of model files across different machine architectures with different byte order; this fixes a crash problem in tagging on some machine architectures. 49 | 50 | 51 | 2009-03-10 Naoaki Okazaki 52 | * CRFsuite 0.7 53 | - Updated RumAVL library to 4.0.0; this fixes a crash problem occurring in feature generation on some machine architectures. 54 | 55 | 56 | 2009-03-07 Naoaki Okazaki 57 | * CRFsuite 0.6 58 | - A new training algorithm, Stochastic Gradient Descent (SGD). 59 | - Updated the L-BFGS routine to liblbfgs 1.7. 60 | - Reduced memory usage in training. 61 | - Supported escape sequences in training/test data. 62 | - Restructured the source code. 63 | - Added a parameter to configure the number of trials for line search. 64 | 65 | 66 | 2008-11-19 Naoaki Okazaki 67 | * CRFsuite 0.5 68 | - Updated the L-BFGS routine to liblbfgs 1.6. 69 | - New parameters lbfgs.stop, lbfgs.delta, lbfgs.linesearch were added. 70 | - Fixed a bug in which the frontend tools could not parse "item:value" format correctly. 71 | - Fixed a bug in computing the accuracy. 72 | - Fixed a bug when the tagger receives an item with no feature. 73 | 74 | 75 | 2008-03-05 Naoaki Okazaki 76 | 77 | * CRFsuite 0.4 (the first public release): 78 | - Website and documentation for CRFsuite. 79 | - Tutorial on the CoNLL 2000 chunking shared task. 80 | - Performance comparison on the CoNLL 2000 chunking shared task. 81 | - Bug fix in L2 regularization. 82 | - A number of small improvements for the public release. 83 | 84 | 85 | 2007-12-12 Naoaki Okazaki 86 | 87 | * CRFsuite 0.3 (internal release): 88 | - Implemented scaling method for forward/backward algorithm. 89 | - Removed the code for computing the forward/backward algorithm in logarithm domain. 90 | 91 | 92 | 2007-11-30 Naoaki Okazaki 93 | 94 | * CRFsuite 0.2 (internal release): 95 | - Orthant-Wise Limited-memory Quasi-Newton (OW-LQN) method for L1 regularization. 96 | - Configurable L-BFGS parameters (number of limited memories, epsilon). 97 | 98 | 99 | 2007-10-29 Naoaki Okazaki 100 | 101 | * CRFsuite 0.1 (internal release): 102 | - Initial release. 103 | 104 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | 3 | SUBDIRS = include lib/cqdb lib/crf frontend swig 4 | 5 | docdir = $(prefix)/share/doc/@PACKAGE@ 6 | doc_DATA = README INSTALL COPYING AUTHORS ChangeLog 7 | 8 | EXTRA_DIST = \ 9 | crfsuite.sln \ 10 | autogen.sh \ 11 | win32/stdint.h \ 12 | example/crfutils.py \ 13 | example/template.py \ 14 | example/pos.py \ 15 | example/ner.py \ 16 | example/chunking.py 17 | 18 | AUTOMAKE_OPTIONS = foreign 19 | ACLOCAL_AMFLAGS = -I m4 20 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | CRFsuite 2 | Version 0.12 3 | http://www.chokkan.org/software/crfsuite/ 4 | 5 | 6 | 7 | * INTRODUCTION 8 | CRFSuite is an implementation of Conditional Random Fields (CRFs) for 9 | labeling sequential data. Please refer to the web site for more 10 | information about this software. 11 | 12 | 13 | 14 | * COPYRIGHT AND LICENSING INFORMATION 15 | 16 | This program is distributed under the modified BSD license. Refer to 17 | COPYING file for the precise description of the license. 18 | 19 | 20 | Portions of this software are based on libLBFGS. 21 | 22 | The MIT License 23 | 24 | Copyright (c) 1990 Jorge Nocedal 25 | Copyright (c) 2007 Naoaki Okazaki 26 | 27 | Permission is hereby granted, free of charge, to any person obtaining a 28 | copy of this software and associated documentation files (the "Software"), 29 | to deal in the Software without restriction, including without limitation 30 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 31 | and/or sell copies of the Software, and to permit persons to whom the 32 | Software is furnished to do so, subject to the following conditions: 33 | 34 | The above copyright notice and this permission notice shall be included in 35 | all copies or substantial portions of the Software. 36 | 37 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 39 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 40 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 41 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 42 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 43 | THE SOFTWARE. 44 | 45 | 46 | Portions of this software are based on Constant Quark Database (CQDB). 47 | 48 | The BSD license. 49 | 50 | Copyright (c) 2007, Naoaki Okazaki 51 | All rights reserved. 52 | 53 | Redistribution and use in source and binary forms, with or without 54 | modification, are permitted provided that the following conditions are met: 55 | * Redistributions of source code must retain the above copyright 56 | notice, this list of conditions and the following disclaimer. 57 | * Redistributions in binary form must reproduce the above copyright 58 | notice, this list of conditions and the following disclaimer in the 59 | documentation and/or other materials provided with the distribution. 60 | * Neither the name of the Northwestern University, University of Tokyo, 61 | nor the names of its contributors may be used to endorse or promote 62 | products derived from this software without specific prior written 63 | permission. 64 | 65 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 66 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 67 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 68 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 69 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 70 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 71 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 72 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 73 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 74 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 75 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 | 77 | 78 | Portions of this software are based on RumAVL. 79 | 80 | MIT/X Consortium License. 81 | 82 | Copyright (c) 2005-2007 Jesse Long 83 | All rights reserved. 84 | 85 | Permission is hereby granted, free of charge, to any person obtaining a 86 | copy of this software and associated documentation files (the "Software"), 87 | to deal in the Software without restriction, including without limitation 88 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 89 | and/or sell copies of the Software, and to permit persons to whom the 90 | Software is furnished to do so, subject to the following conditions: 91 | 92 | 1. The above copyright notice and this permission notice shall be 93 | included in all copies or substantial portions of the Software. 94 | 2. The origin of the Software must not be misrepresented; you must not 95 | claim that you wrote the original Software. 96 | 3. Altered source versions of the Software must be plainly marked as 97 | such, and must not be misrepresented as being the original Software. 98 | 99 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 100 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 101 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 102 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 103 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 104 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 105 | DEALINGS IN THE SOFTWARE. 106 | 107 | 108 | Portions of this software are based on a portable stdint.h (for MSVC). 109 | 110 | Copyright (c) 2005-2007 Paul Hsieh 111 | 112 | Redistribution and use in source and binary forms, with or without 113 | modification, are permitted provided that the following conditions 114 | are met: 115 | 116 | Redistributions of source code must retain the above copyright 117 | notice, this list of conditions and the following disclaimer. 118 | 119 | Redistributions in binary form must not misrepresent the orignal 120 | source in the documentation and/or other materials provided 121 | with the distribution. 122 | 123 | The names of the authors nor its contributors may be used to 124 | endorse or promote products derived from this software without 125 | specific prior written permission. 126 | 127 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 128 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 129 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 130 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 131 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 132 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 133 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 134 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 135 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 136 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 137 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 138 | OF THE POSSIBILITY OF SUCH DAMAGE. 139 | 140 | 141 | Portions of this software are based on Mersenne Twister. 142 | 143 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 144 | All rights reserved. 145 | 146 | Redistribution and use in source and binary forms, with or without 147 | modification, are permitted provided that the following conditions 148 | are met: 149 | 150 | 1. Redistributions of source code must retain the above copyright 151 | notice, this list of conditions and the following disclaimer. 152 | 153 | 2. Redistributions in binary form must reproduce the above copyright 154 | notice, this list of conditions and the following disclaimer in the 155 | documentation and/or other materials provided with the distribution. 156 | 157 | 3. The names of its contributors may not be used to endorse or promote 158 | products derived from this software without specific prior written 159 | permission. 160 | 161 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 162 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 163 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 164 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 165 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 166 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 167 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 168 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 169 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 170 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 171 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 172 | 173 | 174 | 175 | * SPECIAL THANKS GOES TO... 176 | Olivier Grisel 177 | Andreas Holzbach 178 | Baoli Li 179 | Yoshimasa Tsuruoka 180 | Hiroshi Manabe 181 | Riza Theresa B. Batista-Navarro 182 | 183 | 184 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id:$ 3 | 4 | if [ "$1" = "--force" ]; 5 | then 6 | FORCE=--force 7 | NOFORCE= 8 | FORCE_MISSING=--force-missing 9 | else 10 | FORCE= 11 | NOFORCE=--no-force 12 | FORCE_MISSING= 13 | fi 14 | 15 | libtoolize --copy $FORCE 2>&1 | sed '/^You should/d' || { 16 | echo "libtoolize failed!" 17 | exit 1 18 | } 19 | 20 | aclocal $FORCE || { 21 | echo "aclocal failed!" 22 | exit 1 23 | } 24 | 25 | autoheader $FORCE || { 26 | echo "autoheader failed!" 27 | exit 1 28 | } 29 | 30 | automake -a -c $NOFORCE || { 31 | echo "automake failed!" 32 | exit 1 33 | } 34 | 35 | autoconf $FORCE || { 36 | echo "autoconf failed!" 37 | exit 1 38 | } 39 | -------------------------------------------------------------------------------- /bench/accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | fi = sys.stdin 7 | fo = sys.stdout 8 | n = 0 9 | m = 0 10 | 11 | for line in fi: 12 | line = line.strip() 13 | if line: 14 | fields = line.split() 15 | if len(fields) >= 2: 16 | if fields[-1] == fields[-2]: 17 | m += 1 18 | n += 1 19 | 20 | print 'Item accuracy: %f' % (m / float(n)) 21 | -------------------------------------------------------------------------------- /bench/bench.py: -------------------------------------------------------------------------------- 1 | import re 2 | import collections 3 | 4 | LOGDIR='log/' 5 | 6 | def seconds(s): 7 | p = s.find(':') 8 | q = s.find(':', p+1) 9 | return int(s[:p]) * 3600 + int(s[p+1:q]) * 60 + int(s[q+1:]) 10 | 11 | def last(X): 12 | if len(X) >= 1: 13 | return X[-1] 14 | else: 15 | return None 16 | 17 | def diffmin(X): 18 | D = [] 19 | prev = None 20 | for x in X: 21 | if prev is not None: 22 | D.append(x - prev) 23 | prev = x 24 | return min(D) 25 | 26 | def analyze_log(fi, patterns): 27 | P = {} 28 | for name, pattern, index, cast, func in patterns: 29 | P[name] = (re.compile(pattern), index, cast, func) 30 | 31 | D = collections.defaultdict(list) 32 | for line in fi: 33 | line = line.strip('\n') 34 | for name, (regex, index, cast, func) in P.iteritems(): 35 | m = regex.search(line) 36 | if m is not None: 37 | if isinstance(index, tuple): 38 | for i in index: 39 | D[name].append(cast(m.group(i))) 40 | elif isinstance(index, int): 41 | D[name].append(cast(m.group(index))) 42 | 43 | 44 | R = {} 45 | for name, (regex, index, cast, func) in P.iteritems(): 46 | R[name] = func(D[name]) 47 | return R 48 | -------------------------------------------------------------------------------- /bench/bench_crfpp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFPP_LEARN='/home/okazaki/local/bin/crf_learn' 9 | CRFPP_TEST='/home/okazaki/local/bin/crf_test' 10 | OUTDIR='crfpp/' 11 | 12 | training_patterns = ( 13 | ('num_features', r'^Number of features:[ ]*(\d+)', 1, int, last), 14 | ('time', r'^Done!([\d.]+)', 1, float, last), 15 | ('iterations', r'^iter=(\d+)', 1, int, last), 16 | ('update', r'time=([\d.]+)', 1, float, min), 17 | ('loss', r'obj=([\d.]+)', 1, float, last), 18 | ) 19 | 20 | tagging_patterns = ( 21 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 22 | ) 23 | 24 | params = { 25 | 'lbfgs': '-a CRF-L2', 26 | 'mira': '-a MIRA', 27 | } 28 | 29 | if __name__ == '_main__': 30 | print analyze_log(sys.stdin, training_patterns) 31 | 32 | if __name__ == '__main__': 33 | fe = sys.stderr 34 | 35 | R = {} 36 | for name, param in params.iteritems(): 37 | model = OUTDIR + name + '.model' 38 | trlog = OUTDIR + name + '.tr.log' 39 | trtxt = LOGDIR + 'crfpp-' + name + '.txt' 40 | tglog = OUTDIR + name + '.tg.log' 41 | 42 | s = string.Template( 43 | '$crfpp_learn $param template.crfpp train.txt $model > $trlog' 44 | ) 45 | cmd = s.substitute( 46 | crfpp_learn=CRFPP_LEARN, 47 | param=param, 48 | model=model, 49 | trlog=trlog 50 | ) 51 | 52 | fe.write(cmd) 53 | fe.write('\n') 54 | #os.system(cmd) 55 | 56 | fo = open(trtxt, 'w') 57 | fo.write('$ %s\n' % cmd) 58 | fo.write(open(trlog, 'r').read()) 59 | 60 | s = string.Template( 61 | '$crfpp_test -m $model test.txt | ./accuracy.py > $tglog' 62 | ) 63 | cmd = s.substitute( 64 | crfpp_test=CRFPP_TEST, 65 | model=model, 66 | tglog=tglog 67 | ) 68 | 69 | fe.write(cmd) 70 | fe.write('\n') 71 | #os.system(cmd) 72 | 73 | D = analyze_log(open(trlog), training_patterns) 74 | D.update(analyze_log(open(tglog), tagging_patterns)) 75 | D['logfile'] = trtxt 76 | R[name] = D 77 | 78 | print repr(R) 79 | -------------------------------------------------------------------------------- /bench/bench_crfsgd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSGD='/home/okazaki/install/sgd-1.3/crf/crfsgd' 9 | OUTDIR='crfsgd/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'features: (\d+)', 1, int, last), 13 | ('time', r'^Done! ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\[Epoch (\d+)\]', 1, int, last), 15 | ('update', r'^\[Epoch \d+\][^a-z]+wnorm:[^a-z]+total time: ([\d.]+) seconds$', 1, float, diffmin), 16 | ('loss', r'loss: ([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'default': "-f 1 -r 100 -e ''", 25 | } 26 | 27 | if __name__ == '__main__': 28 | fe = sys.stderr 29 | 30 | R = {} 31 | for name, param in params.iteritems(): 32 | model = OUTDIR + name + '.model' 33 | trlog = OUTDIR + name + '.tr.log' 34 | trtxt = LOGDIR + 'crfsgd-' + name + '.txt' 35 | tglog = OUTDIR + name + '.tg.log' 36 | 37 | s = string.Template( 38 | '$crfsgd $param $model template.crfpp train.txt > $trlog' 39 | ) 40 | cmd = s.substitute( 41 | crfsgd=CRFSGD, 42 | param=param, 43 | model=model, 44 | trlog=trlog 45 | ) 46 | 47 | fe.write(cmd) 48 | fe.write('\n') 49 | #os.system(cmd) 50 | 51 | fo = open(trtxt, 'w') 52 | fo.write('$ %s\n' % cmd) 53 | fo.write(open(trlog, 'r').read()) 54 | 55 | s = string.Template( 56 | '$crfsgd -t $model test.txt | ./accuracy.py > $tglog' 57 | ) 58 | cmd = s.substitute( 59 | crfsgd=CRFSGD, 60 | model=model, 61 | tglog=tglog 62 | ) 63 | 64 | fe.write(cmd) 65 | fe.write('\n') 66 | #os.system(cmd) 67 | 68 | D = analyze_log(open(trlog), training_patterns) 69 | D.update(analyze_log(open(tglog), tagging_patterns)) 70 | D['logfile'] = trtxt 71 | R[name] = D 72 | 73 | print repr(R) 74 | 75 | if __name__ == '_main__': 76 | print analyze_log(sys.stdin, training_patterns) 77 | -------------------------------------------------------------------------------- /bench/bench_crfsuite-0.11.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSUITE='/home/okazaki/install/crfsuite-0.11/frontend/crfsuite' 9 | OUTDIR='crfsuite-0.11/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of features: (\d+)', 1, int, last), 13 | ('time', r'^Total seconds required for L-BFGS: ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\*\*\*\*\* (Iteration|Epoch) #(\d+)', 2, int, last), 15 | ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^Log-likelihood: -([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: \d+ / \d+ $([\d.]+)$', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs-sparse': '-p regularization.sigma=0.70710678118654746 -p feature.possible_states=0 -p feature.possible_transitions=0', 25 | 'lbfgs-dense': '-p regularization.sigma=0.70710678118654746 -p feature.possible_states=1 -p feature.possible_transitions=1', 26 | } 27 | 28 | if __name__ == '_main__': 29 | print analyze_log(sys.stdin, training_patterns) 30 | 31 | if __name__ == '__main__': 32 | fe = sys.stderr 33 | 34 | R = {} 35 | for name, param in params.iteritems(): 36 | model = OUTDIR + name + '.model' 37 | trlog = OUTDIR + name + '.tr.log' 38 | trtxt = LOGDIR + 'crfsuite0.11-' + name + '.txt' 39 | tglog = OUTDIR + name + '.tg.log' 40 | 41 | s = string.Template( 42 | '$crfsuite learn $param -m $model train.crfsuite > $trlog' 43 | ) 44 | cmd = s.substitute( 45 | crfsuite=CRFSUITE, 46 | param=param, 47 | model=model, 48 | trlog=trlog 49 | ) 50 | 51 | fe.write(cmd) 52 | fe.write('\n') 53 | #os.system(cmd) 54 | 55 | fo = open(trtxt, 'w') 56 | fo.write('$ %s\n' % cmd) 57 | fo.write(open(trlog, 'r').read()) 58 | 59 | s = string.Template( 60 | '$crfsuite tag -m $model -qt test.crfsuite > $tglog' 61 | ) 62 | cmd = s.substitute( 63 | crfsuite=CRFSUITE, 64 | model=model, 65 | tglog=tglog 66 | ) 67 | 68 | fe.write(cmd) 69 | fe.write('\n') 70 | #os.system(cmd) 71 | 72 | D = analyze_log(open(trlog), training_patterns) 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /bench/bench_crfsuite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSUITE='/home/okazaki/projects/crfsuite/frontend/crfsuite' 9 | OUTDIR='crfsuite/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of features: (\d+)', 1, int, last), 13 | ('time', r'^Total seconds required for training: ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\*\*\*\*\* (Iteration|Epoch) #(\d+)', 2, int, last), 15 | ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^Loss: ([-\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: \d+ / \d+ $([\d.]+)$', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs-sparse': '-a lbfgs -p feature.possible_states=0 -p feature.possible_transitions=0', 25 | 'lbfgs-dense': '-a lbfgs -p feature.possible_states=1 -p feature.possible_transitions=1', 26 | 'l2sgd-sparse': '-a l2sgd -p feature.possible_states=0 -p feature.possible_transitions=0', 27 | 'l2sgd-dense': '-a l2sgd -p feature.possible_states=1 -p feature.possible_transitions=1', 28 | 'ap-sparse': '-a ap -p feature.possible_states=0 -p feature.possible_transitions=0 -p max_iterations=50', 29 | 'ap-dense': '-a ap -p feature.possible_states=1 -p feature.possible_transitions=1 -p max_iterations=50', 30 | } 31 | 32 | if __name__ == '_main__': 33 | print analyze_log(sys.stdin, training_patterns) 34 | 35 | if __name__ == '__main__': 36 | fe = sys.stderr 37 | 38 | R = {} 39 | for name, param in params.iteritems(): 40 | model = OUTDIR + name + '.model' 41 | trlog = OUTDIR + name + '.tr.log' 42 | trtxt = LOGDIR + 'crfsuite-' + name + '.txt' 43 | tglog = OUTDIR + name + '.tg.log' 44 | 45 | s = string.Template( 46 | '$crfsuite learn $param -m $model train.crfsuite > $trlog' 47 | ) 48 | cmd = s.substitute( 49 | crfsuite=CRFSUITE, 50 | param=param, 51 | model=model, 52 | trlog=trlog 53 | ) 54 | 55 | fe.write(cmd) 56 | fe.write('\n') 57 | #os.system(cmd) 58 | 59 | fo = open(trtxt, 'w') 60 | fo.write('$ %s\n' % cmd) 61 | fo.write(open(trlog, 'r').read()) 62 | 63 | s = string.Template( 64 | '$crfsuite tag -m $model -qt test.crfsuite > $tglog' 65 | ) 66 | cmd = s.substitute( 67 | crfsuite=CRFSUITE, 68 | model=model, 69 | tglog=tglog 70 | ) 71 | 72 | fe.write(cmd) 73 | fe.write('\n') 74 | #os.system(cmd) 75 | 76 | D = analyze_log(open(trlog), training_patterns) 77 | D.update(analyze_log(open(tglog), tagging_patterns)) 78 | D['logfile'] = trtxt 79 | R[name] = D 80 | 81 | print repr(R) 82 | -------------------------------------------------------------------------------- /bench/bench_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | MALLET='java -cp "/home/okazaki/install/mallet-2.0.6/class:/home/okazaki/install/mallet-2.0.6/lib/mallet-deps.jar" cc.mallet.fst.SimpleTagger' 9 | OUTDIR='mallet/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of weights = (\d+)', 1, int, last), 13 | ('time', r'^([\d.]+)user ([\d.]+)system', (1, 2), float, sum), 14 | ('iterations', r'^CRF finished one iteration of maximizer, i=(\d+)', 1, int, len), 15 | # ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^getValue $loglikelihood, optimizable by label likelihood$ = -([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Testing accuracy=([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'default': '--gaussian-variance 0.70710678118654746', 25 | } 26 | 27 | if __name__ == '_main__': 28 | print analyze_log(sys.stdin, training_patterns) 29 | 30 | if __name__ == '__main__': 31 | fe = sys.stderr 32 | 33 | R = {} 34 | for name, param in params.iteritems(): 35 | model = OUTDIR + name + '.model' 36 | trlog = OUTDIR + name + '.tr.log' 37 | trtxt = LOGDIR + 'mallet-' + name + '.txt' 38 | tglog = OUTDIR + name + '.tg.log' 39 | 40 | s = string.Template( 41 | 'time $mallet --train true $param --model-file $model train.mallet > $trlog 2>&1' 42 | ) 43 | cmd = s.substitute( 44 | mallet=MALLET, 45 | param=param, 46 | model=model, 47 | trlog=trlog 48 | ) 49 | 50 | fe.write(cmd) 51 | fe.write('\n') 52 | #os.system(cmd) 53 | 54 | fo = open(trtxt, 'w') 55 | fo.write('$ %s\n' % cmd) 56 | fo.write(open(trlog, 'r').read()) 57 | 58 | s = string.Template( 59 | '$mallet --model-file $model --test lab test.mallet > $tglog 2>&1' 60 | ) 61 | cmd = s.substitute( 62 | mallet=MALLET, 63 | model=model, 64 | tglog=tglog 65 | ) 66 | 67 | fe.write(cmd) 68 | fe.write('\n') 69 | #os.system(cmd) 70 | 71 | D = analyze_log(open(trlog), training_patterns) 72 | D['update'] = 0. 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /bench/bench_wapiti.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | WAPITI='/home/okazaki/install/wapiti-1.1.3/wapiti' 9 | OUTDIR='wapiti/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'nb features: (\d+)', 1, int, last), 13 | ('time', r'^([\d.]+)user ([\d.]+)system', (1, 2), float, sum), 14 | ('iterations', r'\[\s*(\d+)\]', 1, int, last), 15 | ('update', r'time=([\d.]+)', 1, float, min), 16 | ('loss', r'obj=([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs': '-a l-bfgs --rho2 0.70710678118654746 --maxiter 1000 --stopeps 0.00001 --stopwin 10', 25 | 'rprop': '-a rprop --rho3 0.70710678118654746 --maxiter 1000', 26 | } 27 | 28 | if __name__ == '_main__': 29 | print analyze_log(sys.stdin, training_patterns) 30 | 31 | if __name__ == '__main__': 32 | fe = sys.stderr 33 | 34 | R = {} 35 | for name, param in params.iteritems(): 36 | model = OUTDIR + name + '.model' 37 | trlog = OUTDIR + name + '.tr.log' 38 | trtxt = LOGDIR + 'wapiti-' + name + '.txt' 39 | tglog = OUTDIR + name + '.tg.log' 40 | 41 | s = string.Template( 42 | 'time $wapiti train $param -p template.wapiti train.txt $model > $trlog 2>&1' 43 | ) 44 | cmd = s.substitute( 45 | wapiti=WAPITI, 46 | param=param, 47 | model=model, 48 | trlog=trlog 49 | ) 50 | 51 | fe.write(cmd) 52 | fe.write('\n') 53 | #os.system(cmd) 54 | 55 | fo = open(trtxt, 'w') 56 | fo.write('$ %s\n' % cmd) 57 | fo.write(open(trlog, 'r').read()) 58 | 59 | s = string.Template( 60 | '$wapiti label -m $model test.txt | ./accuracy.py > $tglog' 61 | ) 62 | cmd = s.substitute( 63 | wapiti=WAPITI, 64 | model=model, 65 | tglog=tglog 66 | ) 67 | 68 | fe.write(cmd) 69 | fe.write('\n') 70 | #os.system(cmd) 71 | 72 | D = analyze_log(open(trlog), training_patterns) 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /bench/collect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | 6 | scripts = ( 7 | ('CRFsuite 0.12', './bench_crfsuite.py'), 8 | ('CRFsuite 0.11', './bench_crfsuite-0.11.py'), 9 | ('Wapiti v1.1.3', './bench_wapiti.py'), 10 | ('sgd 1.3', './bench_crfsgd.py'), 11 | ('CRF++ 0.54', './bench_crfpp.py'), 12 | ('MALLET 2.0.6', './bench_mallet.py'), 13 | ) 14 | 15 | fields = ( 16 | ('# Features', 'num_features'), 17 | ('Time', 'time'), 18 | ('# Iters', 'iterations'), 19 | ('Update', 'update'), 20 | ('Loss', 'loss'), 21 | ('Log', 'log'), 22 | ) 23 | 24 | def number(x): 25 | y = '' 26 | p = x.find('.') 27 | if p == -1: 28 | p = len(x) 29 | for i in range(p): 30 | if i % 3 == 0 and i != 0: 31 | y = ' ' + y 32 | y = x[p-i-1] + y 33 | return y + x[p:] 34 | 35 | def read(): 36 | R = {} 37 | for name, script in scripts: 38 | fi = os.popen(script, 'r') 39 | R[name] = eval(fi.read()) 40 | return R 41 | 42 | def output_update(fo, R): 43 | for name, script in scripts: 44 | for param, result in R[name].iteritems(): 45 | fo.write('%s\t%s\t%f\n' % (name, param, result.get('update', 0.))) 46 | 47 | def output_table(fo, R): 48 | for name, script in scripts: 49 | for param, result in R[name].iteritems(): 50 | fo.write('\n') 51 | fo.write('%s\n' % name) 52 | fo.write('%s\n' % param) 53 | fo.write('\n') 54 | fo.write('%s\n' % number('%d' % result['num_features'])) 55 | fo.write('%s\n' % number('%.1f' % result['time'])) 56 | fo.write('%s\n' % number('%d' % result['iterations'])) 57 | fo.write('%s\n' % number('%.1f' % result['update'])) 58 | fo.write('%s\n' % number('%.1f' % result['loss'])) 59 | fo.write('%.3f\n' % (100. * result['accuracy'])) 60 | fo.write('Log\n' % result['logfile']) 61 | fo.write('\n') 62 | fo.write('\n') 63 | 64 | 65 | if __name__ == '__main__': 66 | R = read() 67 | output_table(sys.stdout, R) 68 | -------------------------------------------------------------------------------- /bench/crfsuite_to_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | fi = sys.stdin 6 | fo = sys.stdout 7 | 8 | for line in fi: 9 | line = line.strip('\n') 10 | if not line: 11 | fo.write('\n') 12 | 13 | fields = line.split('\t') 14 | fo.write('%s %s\n' % (' '.join(fields[1:]), fields[0])) 15 | 16 | 17 | -------------------------------------------------------------------------------- /bench/plot_performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | 6 | re_iteration = re.compile(r'^\*\*\*\*\* (Iteration|Epoch) #(\d+) \*\*\*\*\*') 7 | patterns = { 8 | 'loss': re.compile(r'^Loss: ([\d.]+)'), 9 | 'accuracy': re.compile(r'^Item accuracy: \d+ / \d+ $([\d.]+)$'), 10 | 'norm': re.compile(r'^Feature [L2-]+norm: ([\d.]+)'), 11 | } 12 | 13 | def read(fi): 14 | D = [] 15 | for line in fi: 16 | line = line.strip('\n') 17 | m = re_iteration.match(line) 18 | if m is not None: 19 | if len(D)+1 != int(m.group(2)): 20 | sys.stderr.write('ERROR: sync\n') 21 | sys.exit(1) 22 | D.append({}) 23 | continue 24 | 25 | if D: 26 | for name, pattern in patterns.iteritems(): 27 | m = pattern.match(line) 28 | if m is not None: 29 | D[-1][name] = float(m.group(1)) 30 | 31 | return D 32 | 33 | if __name__ == '__main__': 34 | fi = sys.stdin 35 | fo = sys.stdout 36 | 37 | i = 1 38 | D = read(fi) 39 | for item in D: 40 | fo.write('%d' % i) 41 | i += 1 42 | for name in patterns.iterkeys(): 43 | fo.write(' %f' % item[name]) 44 | fo.write('\n') 45 | -------------------------------------------------------------------------------- /configure.in: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl 3 | dnl 4 | dnl Exported and configured variables: 5 | dnl CFLAGS 6 | dnl LDFLAGS 7 | dnl INCLUDES 8 | 9 | 10 | dnl ------------------------------------------------------------------ 11 | dnl Initialization for autoconf 12 | dnl ------------------------------------------------------------------ 13 | AC_PREREQ(2.59) 14 | AC_INIT(crfsuite, 0.12) 15 | AC_CONFIG_SRCDIR([frontend/main.c]) 16 | AC_CONFIG_MACRO_DIR([m4]) 17 | 18 | dnl ------------------------------------------------------------------ 19 | dnl Checks for system 20 | dnl ------------------------------------------------------------------ 21 | AC_CANONICAL_HOST 22 | AC_AIX 23 | AC_MINIX 24 | AC_ISC_POSIX 25 | 26 | 27 | dnl ------------------------------------------------------------------ 28 | dnl Initialization for automake 29 | dnl ------------------------------------------------------------------ 30 | AM_INIT_AUTOMAKE 31 | AC_CONFIG_HEADERS(config.h) 32 | AM_MAINTAINER_MODE 33 | 34 | 35 | dnl ------------------------------------------------------------------ 36 | dnl Checks for program 37 | dnl ------------------------------------------------------------------ 38 | AM_PROG_CC_C_O 39 | AC_PROG_LIBTOOL 40 | AC_PROG_INSTALL 41 | AC_PROG_LN_S 42 | 43 | 44 | dnl ------------------------------------------------------------------ 45 | dnl Initialization for variables 46 | dnl ------------------------------------------------------------------ 47 | CFLAGS="-std=c99 ${ac_save_CFLAGS}" 48 | LDFLAGS="${ac_save_LDFLAGS}" 49 | INCLUDES="-I\$(top_srcdir) -I\$(top_srcdir)/include -I\$(srcdir)" 50 | 51 | 52 | dnl ------------------------------------------------------------------ 53 | dnl Checks for header files. 54 | dnl ------------------------------------------------------------------ 55 | AC_HEADER_STDC 56 | AC_CHECK_HEADERS(fcntl.h limits.h malloc.h strings.h unistd.h stdint.h) 57 | 58 | 59 | dnl ------------------------------------------------------------------ 60 | dnl Checks for typedefs, structures, and compiler characteristics. 61 | dnl ------------------------------------------------------------------ 62 | AC_C_CONST 63 | AC_CHECK_SIZEOF 64 | AC_TYPE_SIZE_T 65 | AC_STRUCT_TM 66 | AC_CHECK_SIZEOF(short) 67 | AC_CHECK_SIZEOF(unsigned short) 68 | AC_CHECK_SIZEOF(int) 69 | AC_CHECK_SIZEOF(unsigned int) 70 | AC_CHECK_SIZEOF(long) 71 | AC_CHECK_SIZEOF(unsigned long) 72 | 73 | AC_CHECK_TYPES([uint8_t, uint16_t, uint32_t]) 74 | 75 | dnl ------------------------------------------------------------------ 76 | dnl Checks for debugging mode 77 | dnl ------------------------------------------------------------------ 78 | AC_ARG_ENABLE( 79 | debug, 80 | [AS_HELP_STRING([--enable-debug],[turn on debugging])] 81 | ) 82 | 83 | if test "x$enable_debug" = "xyes"; then 84 | CFLAGS="-DDEBUG -O -g ${CFLAGS}" 85 | else 86 | CFLAGS="-O3 -fomit-frame-pointer -ffast-math -Winline ${CFLAGS}" 87 | fi 88 | 89 | dnl ------------------------------------------------------------------ 90 | dnl Checks for profiling mode 91 | dnl ------------------------------------------------------------------ 92 | AC_ARG_ENABLE( 93 | profile, 94 | [AS_HELP_STRING([--enable-profile],[turn on profiling])] 95 | ) 96 | 97 | if test "x$enable_profile" = "xyes"; then 98 | CFLAGS="-DPROFILE -pg ${CFLAGS}" 99 | fi 100 | 101 | 102 | dnl ------------------------------------------------------------------ 103 | dnl Checks for SSE2 build 104 | dnl ------------------------------------------------------------------ 105 | AC_ARG_ENABLE([sse2], 106 | AS_HELP_STRING( 107 | [--disable-sse2], 108 | [disable SSE2 optimization routines] 109 | ) 110 | ) 111 | 112 | AS_IF([test "x$enable_sse2" != "xno"], [ 113 | CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" 114 | ]) 115 | 116 | 117 | dnl ------------------------------------------------------------------ 118 | dnl Checks for library functions. 119 | dnl ------------------------------------------------------------------ 120 | AC_FUNC_ALLOCA 121 | AC_FUNC_MEMCMP 122 | AC_FUNC_VPRINTF 123 | AC_CHECK_FUNCS(strdup strerror strtol strtoul) 124 | 125 | dnl Check for math library 126 | AC_CHECK_LIB(m, rand) 127 | 128 | AC_ARG_WITH( 129 | liblbfgs, 130 | [AS_HELP_STRING([--with-liblbfgs=DIR],[liblbfgs directory])], 131 | [INCLUDES="${INCLUDES} -I${withval}/include"; LDFLAGS="${LDFLAGS} -L${withval}/lib"] 132 | ) 133 | AC_CHECK_LIB(lbfgs, lbfgs) 134 | 135 | dnl ------------------------------------------------------------------ 136 | dnl Export variables 137 | dnl ------------------------------------------------------------------ 138 | AC_SUBST(CFLAGS) 139 | AC_SUBST(LDFLAGS) 140 | AC_SUBST(INCLUDES) 141 | AC_SUBST(includedir) 142 | AC_SUBST(libdir) 143 | 144 | dnl ------------------------------------------------------------------ 145 | dnl Output the configure results. 146 | dnl ------------------------------------------------------------------ 147 | AC_CONFIG_FILES(Makefile genbinary.sh include/Makefile lib/cqdb/Makefile lib/crf/Makefile frontend/Makefile swig/Makefile swig/python/setup.py swig/perl/Makefile.PL) 148 | AC_OUTPUT 149 | -------------------------------------------------------------------------------- /crfsuite.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crf", "lib\crf\crf.vcxproj", "{D6B16F2E-DA86-4591-8B50-348AB7E3432E}" 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "frontend", "frontend\frontend.vcxproj", "{CEC83336-7B18-408B-9F3C-D11225609540}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cqdb", "lib\cqdb\cqdb.vcxproj", "{46A23DE6-7E34-4429-8F15-FCC3C083FC5B}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Win32 = Debug|Win32 13 | Debug|x64 = Debug|x64 14 | Release|Win32 = Release|Win32 15 | Release|x64 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|Win32.ActiveCfg = Debug|Win32 19 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|Win32.Build.0 = Debug|Win32 20 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|x64.ActiveCfg = Debug|Win32 21 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|x64.Build.0 = Debug|Win32 22 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|Win32.ActiveCfg = Release|Win32 23 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|Win32.Build.0 = Release|Win32 24 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|x64.ActiveCfg = Release|x64 25 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|x64.Build.0 = Release|x64 26 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|Win32.ActiveCfg = Debug|Win32 27 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|Win32.Build.0 = Debug|Win32 28 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|x64.ActiveCfg = Debug|Win32 29 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|Win32.ActiveCfg = Release|Win32 30 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|Win32.Build.0 = Release|Win32 31 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|x64.ActiveCfg = Release|Win32 32 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|Win32.ActiveCfg = Debug|Win32 33 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|Win32.Build.0 = Debug|Win32 34 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|x64.ActiveCfg = Debug|Win32 35 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|Win32.ActiveCfg = Release|Win32 36 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|Win32.Build.0 = Release|Win32 37 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|x64.ActiveCfg = Release|Win32 38 | EndGlobalSection 39 | GlobalSection(SolutionProperties) = preSolution 40 | HideSolutionNode = FALSE 41 | EndGlobalSection 42 | EndGlobal 43 | -------------------------------------------------------------------------------- /doc/footer.html: -------------------------------------------------------------------------------- 1 |

2 |

3 | Copyright (c) 2002-2011 by Naoaki Okazaki 4 |
$datetime 5 |

6 | 7 | 8 | -------------------------------------------------------------------------------- /doc/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | CRFSuite: A fast implementation of Conditional Random Fields (CRFs) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /example/chunking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A feature extractor for chunking. 5 | Copyright 2010,2011 Naoaki Okazaki. 6 | """ 7 | 8 | # Separator of field values. 9 | separator = ' ' 10 | 11 | # Field names of the input data. 12 | fields = 'w pos y' 13 | 14 | # Attribute templates. 15 | templates = ( 16 | (('w', -2), ), 17 | (('w', -1), ), 18 | (('w', 0), ), 19 | (('w', 1), ), 20 | (('w', 2), ), 21 | (('w', -1), ('w', 0)), 22 | (('w', 0), ('w', 1)), 23 | (('pos', -2), ), 24 | (('pos', -1), ), 25 | (('pos', 0), ), 26 | (('pos', 1), ), 27 | (('pos', 2), ), 28 | (('pos', -2), ('pos', -1)), 29 | (('pos', -1), ('pos', 0)), 30 | (('pos', 0), ('pos', 1)), 31 | (('pos', 1), ('pos', 2)), 32 | (('pos', -2), ('pos', -1), ('pos', 0)), 33 | (('pos', -1), ('pos', 0), ('pos', 1)), 34 | (('pos', 0), ('pos', 1), ('pos', 2)), 35 | ) 36 | 37 | 38 | import crfutils 39 | 40 | def feature_extractor(X): 41 | # Apply attribute templates to obtain features (in fact, attributes) 42 | crfutils.apply_templates(X, templates) 43 | if X: 44 | # Append BOS and EOS features manually 45 | X[0]['F'].append('__BOS__') # BOS feature 46 | X[-1]['F'].append('__EOS__') # EOS feature 47 | 48 | if __name__ == '__main__': 49 | crfutils.main(feature_extractor, fields=fields, sep=separator) 50 | -------------------------------------------------------------------------------- /example/crfutils.py: -------------------------------------------------------------------------------- 1 | """ 2 | A miscellaneous utility for sequential labeling. 3 | Copyright 2010,2011 Naoaki Okazaki. 4 | """ 5 | 6 | import optparse 7 | import sys 8 | 9 | def apply_templates(X, templates): 10 | """ 11 | Generate features for an item sequence by applying feature templates. 12 | A feature template consists of a tuple of (name, offset) pairs, 13 | where name and offset specify a field name and offset from which 14 | the template extracts a feature value. Generated features are stored 15 | in the 'F' field of each item in the sequence. 16 | 17 | @type X: list of mapping objects 18 | @param X: The item sequence. 19 | @type template: tuple of (str, int) 20 | @param template: The feature template. 21 | """ 22 | for template in templates: 23 | name = '|'.join(['%s[%d]' % (f, o) for f, o in template]) 24 | X_len = len(X) 25 | for t in range(X_len): 26 | values = [] 27 | for field, offset in template: 28 | p = t + offset 29 | if p < 0 or p >= X_len: 30 | values = [] 31 | break 32 | values.append(X[p][field]) 33 | if values: 34 | X[t]['F'].append('%s=%s' % (name, '|'.join(values))) 35 | 36 | def readiter(fi, names, sep=' '): 37 | """ 38 | Return an iterator for item sequences read from a file object. 39 | This function reads a sequence from a file object L{fi}, and 40 | yields the sequence as a list of mapping objects. Each line 41 | (item) from the file object is split by the separator character 42 | L{sep}. Separated values of the item are named by L{names}, 43 | and stored in a mapping object. Every item has a field 'F' that 44 | is reserved for storing features. 45 | 46 | @type fi: file 47 | @param fi: The file object. 48 | @type names: tuple 49 | @param names: The list of field names. 50 | @type sep: str 51 | @param sep: The separator character. 52 | @rtype list of mapping objects 53 | @return An iterator for sequences. 54 | """ 55 | X = [] 56 | for line in fi: 57 | line = line.strip('\n') 58 | if not line: 59 | yield X 60 | X = [] 61 | else: 62 | fields = line.split(sep) 63 | if len(fields) < len(names): 64 | raise ValueError( 65 | 'Too few fields (%d) for %r\n%s' % (len(fields), names, line)) 66 | item = {'F': []} # 'F' is reserved for features. 67 | for i in range(len(names)): 68 | item[names[i]] = fields[i] 69 | X.append(item) 70 | 71 | def escape(src): 72 | """ 73 | Escape colon characters from feature names. 74 | 75 | @type src: str 76 | @param src: A feature name 77 | @rtype str 78 | @return The feature name escaped. 79 | """ 80 | return src.replace(':', '__COLON__') 81 | 82 | def output_features(fo, X, field=''): 83 | """ 84 | Output features (and reference labels) of a sequence in CRFSuite 85 | format. For each item in the sequence, this function writes a 86 | reference label (if L{field} is a non-empty string) and features. 87 | 88 | @type fo: file 89 | @param fo: The file object. 90 | @type X: list of mapping objects 91 | @param X: The sequence. 92 | @type field: str 93 | @param field: The field name of reference labels. 94 | """ 95 | for t in range(len(X)): 96 | if field: 97 | fo.write('%s' % X[t][field]) 98 | for a in X[t]['F']: 99 | if isinstance(a, str): 100 | fo.write('\t%s' % escape(a)) 101 | else: 102 | fo.write('\t%s:%f' % (escape(a[0]), a[1])) 103 | fo.write('\n') 104 | fo.write('\n') 105 | 106 | def to_crfsuite(X): 107 | """ 108 | Convert an item sequence into an object compatible with crfsuite 109 | Python module. 110 | 111 | @type X: list of mapping objects 112 | @param X: The sequence. 113 | @rtype crfsuite.ItemSequence 114 | @return The same sequence in crfsuite.ItemSequence type. 115 | """ 116 | import crfsuite 117 | xseq = crfsuite.ItemSequence() 118 | for x in X: 119 | item = crfsuite.Item() 120 | for f in x['F']: 121 | if isinstance(f, str): 122 | item.append(crfsuite.Attribute(escape(f))) 123 | else: 124 | item.append(crfsuite.Attribute(escape(f[0]), f[1])) 125 | xseq.append(item) 126 | return xseq 127 | 128 | def main(feature_extractor, fields='w pos y', sep=' '): 129 | fi = sys.stdin 130 | fo = sys.stdout 131 | 132 | # Parse the command-line arguments. 133 | parser = optparse.OptionParser(usage="""usage: %prog [options] 134 | This utility reads a data set from STDIN, and outputs attributes to STDOUT. 135 | Each line of a data set must consist of field values separated by SEPARATOR 136 | characters. The names and order of field values can be specified by -f option. 137 | The separator character can be specified with -s option. Instead of outputting 138 | attributes, this utility tags the input data when a model file is specified by 139 | -t option (CRFsuite Python module must be installed).""" 140 | ) 141 | parser.add_option( 142 | '-t', dest='model', 143 | help='tag the input using the model (requires "crfsuite" module)' 144 | ) 145 | parser.add_option( 146 | '-f', dest='fields', default=fields, 147 | help='specify field names of input data [default: "%default"]' 148 | ) 149 | parser.add_option( 150 | '-s', dest='separator', default=sep, 151 | help='specify the separator of columns of input data [default: "%default"]' 152 | ) 153 | (options, args) = parser.parse_args() 154 | 155 | # The fields of input: ('w', 'pos', 'y) by default. 156 | F = options.fields.split(' ') 157 | 158 | if not options.model: 159 | # The generator function readiter() reads a sequence from a 160 | for X in readiter(fi, F, options.separator): 161 | feature_extractor(X) 162 | output_features(fo, X, 'y') 163 | 164 | else: 165 | # Create a tagger with an existing model. 166 | import crfsuite 167 | tagger = crfsuite.Tagger() 168 | tagger.open(options.model) 169 | 170 | # For each sequence from STDIN. 171 | for X in readiter(fi, F, options.separator): 172 | # Obtain features. 173 | feature_extractor(X) 174 | xseq = to_crfsuite(X) 175 | yseq = tagger.tag(xseq) 176 | for t in range(len(X)): 177 | v = X[t] 178 | fo.write('\t'.join([v[f] for f in F])) 179 | fo.write('\t%s\n' % yseq[t]) 180 | fo.write('\n') 181 | -------------------------------------------------------------------------------- /example/ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A feature extractor for named eneity recognition (NER). 5 | Copyright 2010,2011 Naoaki Okazaki. 6 | """ 7 | 8 | # Separator of field values. 9 | separator = ' ' 10 | 11 | # Field names of the input data. 12 | fields = 'y w pos chk' 13 | 14 | 15 | import crfutils 16 | 17 | def get_shape(token): 18 | r = '' 19 | for c in token: 20 | if c.isupper(): 21 | r += 'U' 22 | elif c.islower(): 23 | r += 'L' 24 | elif c.isdigit(): 25 | r += 'D' 26 | elif c in ('.', ','): 27 | r += '.' 28 | elif c in (';', ':', '?', '!'): 29 | r += ';' 30 | elif c in ('+', '-', '*', '/', '=', '|', '_'): 31 | r += '-' 32 | elif c in ('(', '{', '[', '<'): 33 | r += '(' 34 | elif c in (')', '}', ']', '>'): 35 | r += ')' 36 | else: 37 | r += c 38 | return r 39 | 40 | def degenerate(src): 41 | dst = '' 42 | for c in src: 43 | if not dst or dst[-1] != c: 44 | dst += c 45 | return dst 46 | 47 | def get_type(token): 48 | T = ( 49 | 'AllUpper', 'AllDigit', 'AllSymbol', 50 | 'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol', 51 | 'AllUpperDigitSymbol', 52 | 'InitUpper', 53 | 'AllLetter', 54 | 'AllAlnum', 55 | ) 56 | R = set(T) 57 | if not token: 58 | return 'EMPTY' 59 | 60 | for i in range(len(token)): 61 | c = token[i] 62 | if c.isupper(): 63 | R.discard('AllDigit') 64 | R.discard('AllSymbol') 65 | R.discard('AllDigitSymbol') 66 | elif c.isdigit() or c in (',', '.'): 67 | R.discard('AllUpper') 68 | R.discard('AllSymbol') 69 | R.discard('AllUpperSymbol') 70 | R.discard('AllLetter') 71 | elif c.islower(): 72 | R.discard('AllUpper') 73 | R.discard('AllDigit') 74 | R.discard('AllSymbol') 75 | R.discard('AllUpperDigit') 76 | R.discard('AllUpperSymbol') 77 | R.discard('AllDigitSymbol') 78 | R.discard('AllUpperDigitSymbol') 79 | else: 80 | R.discard('AllUpper') 81 | R.discard('AllDigit') 82 | R.discard('AllUpperDigit') 83 | R.discard('AllLetter') 84 | R.discard('AllAlnum') 85 | 86 | if i == 0 and not c.isupper(): 87 | R.discard('InitUpper') 88 | 89 | for tag in T: 90 | if tag in R: 91 | return tag 92 | return 'NO' 93 | 94 | def get_2d(token): 95 | return len(token) == 2 and token.isdigit() 96 | 97 | def get_4d(token): 98 | return len(token) == 4 and token.isdigit() 99 | 100 | def get_da(token): 101 | bd = False 102 | ba = False 103 | for c in token: 104 | if c.isdigit(): 105 | bd = True 106 | elif c.isalpha(): 107 | ba = True 108 | else: 109 | return False 110 | return bd and ba 111 | 112 | def get_dand(token, p): 113 | bd = False 114 | bdd = False 115 | for c in token: 116 | if c.isdigit(): 117 | bd = True 118 | elif c == p: 119 | bdd = True 120 | else: 121 | return False 122 | return bd and bdd 123 | 124 | def get_all_other(token): 125 | for c in token: 126 | if c.isalnum(): 127 | return False 128 | return True 129 | 130 | def get_capperiod(token): 131 | return len(token) == 2 and token[0].isupper() and token[1] == '.' 132 | 133 | def contains_upper(token): 134 | b = False 135 | for c in token: 136 | b |= c.isupper() 137 | return b 138 | 139 | def contains_lower(token): 140 | b = False 141 | for c in token: 142 | b |= c.islower() 143 | return b 144 | 145 | def contains_alpha(token): 146 | b = False 147 | for c in token: 148 | b |= c.isalpha() 149 | return b 150 | 151 | def contains_digit(token): 152 | b = False 153 | for c in token: 154 | b |= c.isdigit() 155 | return b 156 | 157 | def contains_symbol(token): 158 | b = False 159 | for c in token: 160 | b |= ~c.isalnum() 161 | return b 162 | 163 | def b(v): 164 | return 'yes' if v else 'no' 165 | 166 | def observation(v, defval=''): 167 | # Lowercased token. 168 | v['wl'] = v['w'].lower() 169 | # Token shape. 170 | v['shape'] = get_shape(v['w']) 171 | # Token shape degenerated. 172 | v['shaped'] = degenerate(v['shape']) 173 | # Token type. 174 | v['type'] = get_type(v['w']) 175 | 176 | # Prefixes (length between one to four). 177 | v['p1'] = v['w'][0] if len(v['w']) >= 1 else defval 178 | v['p2'] = v['w'][:2] if len(v['w']) >= 2 else defval 179 | v['p3'] = v['w'][:3] if len(v['w']) >= 3 else defval 180 | v['p4'] = v['w'][:4] if len(v['w']) >= 4 else defval 181 | 182 | # Suffixes (length between one to four). 183 | v['s1'] = v['w'][-1] if len(v['w']) >= 1 else defval 184 | v['s2'] = v['w'][-2:] if len(v['w']) >= 2 else defval 185 | v['s3'] = v['w'][-3:] if len(v['w']) >= 3 else defval 186 | v['s4'] = v['w'][-4:] if len(v['w']) >= 4 else defval 187 | 188 | # Two digits 189 | v['2d'] = b(get_2d(v['w'])) 190 | # Four digits. 191 | v['4d'] = b(get_4d(v['w'])) 192 | # Alphanumeric token. 193 | v['d&a'] = b(get_da(v['w'])) 194 | # Digits and '-'. 195 | v['d&-'] = b(get_dand(v['w'], '-')) 196 | # Digits and '/'. 197 | v['d&/'] = b(get_dand(v['w'], '/')) 198 | # Digits and ','. 199 | v['d&,'] = b(get_dand(v['w'], ',')) 200 | # Digits and '.'. 201 | v['d&.'] = b(get_dand(v['w'], '.')) 202 | # A uppercase letter followed by '.' 203 | v['up'] = b(get_capperiod(v['w'])) 204 | 205 | # An initial uppercase letter. 206 | v['iu'] = b(v['w'] and v['w'][0].isupper()) 207 | # All uppercase letters. 208 | v['au'] = b(v['w'].isupper()) 209 | # All lowercase letters. 210 | v['al'] = b(v['w'].islower()) 211 | # All digit letters. 212 | v['ad'] = b(v['w'].isdigit()) 213 | # All other (non-alphanumeric) letters. 214 | v['ao'] = b(get_all_other(v['w'])) 215 | 216 | # Contains a uppercase letter. 217 | v['cu'] = b(contains_upper(v['w'])) 218 | # Contains a lowercase letter. 219 | v['cl'] = b(contains_lower(v['w'])) 220 | # Contains a alphabet letter. 221 | v['ca'] = b(contains_alpha(v['w'])) 222 | # Contains a digit. 223 | v['cd'] = b(contains_digit(v['w'])) 224 | # Contains a symbol. 225 | v['cs'] = b(contains_symbol(v['w'])) 226 | 227 | def disjunctive(X, t, field, begin, end): 228 | name = '%s[%d..%d]' % (field, begin, end) 229 | for offset in range(begin, end+1): 230 | p = t + offset 231 | if p not in range(0, len(X)): 232 | continue 233 | X[t]['F'].append('%s=%s' % (name, X[p][field])) 234 | 235 | U = [ 236 | 'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type', 237 | 'p1', 'p2', 'p3', 'p4', 238 | 's1', 's2', 's3', 's4', 239 | '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up', 240 | 'iu', 'au', 'al', 'ad', 'ao', 241 | 'cu', 'cl', 'ca', 'cd', 'cs', 242 | ] 243 | B = ['w', 'pos', 'chk', 'shaped', 'type'] 244 | 245 | templates = [] 246 | for name in U: 247 | templates += [((name, i),) for i in range(-2, 3)] 248 | for name in B: 249 | templates += [((name, i), (name, i+1)) for i in range(-2, 2)] 250 | 251 | def feature_extractor(X): 252 | # Append observations. 253 | for x in X: 254 | observation(x) 255 | 256 | # Apply the feature templates. 257 | crfutils.apply_templates(X, templates) 258 | 259 | # Append disjunctive features. 260 | for t in range(len(X)): 261 | disjunctive(X, t, 'w', -4, -1) 262 | disjunctive(X, t, 'w', 1, 4) 263 | 264 | # Append BOS and EOS features. 265 | if X: 266 | X[0]['F'].append('__BOS__') 267 | X[-1]['F'].append('__EOS__') 268 | 269 | if __name__ == '__main__': 270 | crfutils.main(feature_extractor, fields=fields, sep=separator) 271 | -------------------------------------------------------------------------------- /example/pos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | An example for part-of-speech tagging. 5 | Copyright 2010,2011 Naoaki Okazaki. 6 | """ 7 | 8 | # Separator of field values. 9 | separator = ' ' 10 | 11 | # Field names of the input data. 12 | fields = 'w num cap sym p1 p2 p3 p4 s1 s2 s3 s4 y' 13 | 14 | # Feature template. This template is identical to the one bundled in CRF++ 15 | # distribution, but written in a Python object. 16 | templates = ( 17 | (('num', 0), ), 18 | (('cap', 0), ), 19 | (('sym', 0), ), 20 | (('p1', 0), ), 21 | (('p2', 0), ), 22 | (('p3', 0), ), 23 | (('p4', 0), ), 24 | (('s1', 0), ), 25 | (('s2', 0), ), 26 | (('s3', 0), ), 27 | (('s4', 0), ), 28 | 29 | (('w', 0), ), 30 | (('w', -1), ), 31 | (('w', 1), ), 32 | (('w', -2), ), 33 | (('w', 2), ), 34 | (('w', -2), ('w', -1)), 35 | (('w', -1), ('w', 0)), 36 | (('w', 0), ('w', 1)), 37 | (('w', 1), ('w', 2)), 38 | (('w', -2), ('w', -1), ('w', 0)), 39 | (('w', -1), ('w', 0), ('w', 1)), 40 | (('w', 0), ('w', 1), ('w', 2)), 41 | (('w', -2), ('w', -1), ('w', 0), ('w', 1)), 42 | (('w', -1), ('w', 0), ('w', 1), ('w', 2)), 43 | (('w', -2), ('w', -1), ('w', 0), ('w', 1), ('w', 2)), 44 | 45 | (('w', 0), ('w', -1)), 46 | (('w', 0), ('w', -2)), 47 | (('w', 0), ('w', -3)), 48 | (('w', 0), ('w', -4)), 49 | (('w', 0), ('w', -5)), 50 | (('w', 0), ('w', -6)), 51 | (('w', 0), ('w', -7)), 52 | (('w', 0), ('w', -8)), 53 | (('w', 0), ('w', -9)), 54 | 55 | (('w', 0), ('w', 1)), 56 | (('w', 0), ('w', 2)), 57 | (('w', 0), ('w', 3)), 58 | (('w', 0), ('w', 4)), 59 | (('w', 0), ('w', 5)), 60 | (('w', 0), ('w', 6)), 61 | (('w', 0), ('w', 7)), 62 | (('w', 0), ('w', 8)), 63 | (('w', 0), ('w', 9)), 64 | ) 65 | 66 | 67 | import crfutils 68 | 69 | def feature_extractor(X): 70 | # Apply feature templates to obtain features (in fact, attributes) 71 | crfutils.apply_templates(X, templates) 72 | if X: 73 | # Append BOS and EOS features manually 74 | X[0]['F'].append('__BOS__') # BOS feature 75 | X[-1]['F'].append('__EOS__') # EOS feature 76 | 77 | if __name__ == '__main__': 78 | crfutils.main(feature_extractor, fields=fields, sep=separator) 79 | -------------------------------------------------------------------------------- /example/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | class FeatureExtractor: 7 | def __init__(self): 8 | self.macro = re.compile(r'%x\[(?P[\d-]+),(?P[\d]+)\]') 9 | self.inst = [] 10 | self.t = 0 11 | self.templates = [] 12 | 13 | def read(self, fi): 14 | self.templates = [] 15 | for line in fi: 16 | line = line.strip() 17 | if line.startswith('#'): 18 | continue 19 | if line.startswith('U'): 20 | self.templates.append(line.replace(':', '=')) 21 | elif line == 'B': 22 | continue 23 | elif line.startswith('B'): 24 | sys.stderr( 25 | 'ERROR: bigram templates not supported: %s\n' % line) 26 | sys.exit(1) 27 | 28 | def replace(self, m): 29 | row = self.t + int(m.group('row')) 30 | col = int(m.group('col')) 31 | if row in range(0, len(self.inst)): 32 | return self.inst[row]['x'][col] 33 | else: 34 | return '' 35 | 36 | def apply(self, inst, t): 37 | self.inst = inst 38 | self.t = t 39 | for template in self.templates: 40 | f = re.sub(self.macro, self.replace, template) 41 | self.inst[t]['F'].append(f) 42 | 43 | def readiter(fi, sep=None): 44 | X = [] 45 | for line in fi: 46 | line = line.strip('\n') 47 | if not line: 48 | yield X 49 | X = [] 50 | else: 51 | fields = line.split(sep) 52 | item = { 53 | 'x': fields[0:-1], 54 | 'y': fields[-1], 55 | 'F': [] 56 | } 57 | X.append(item) 58 | 59 | if __name__ == '__main__': 60 | import optparse 61 | 62 | fi = sys.stdin 63 | fo = sys.stdout 64 | 65 | # Parse the command-line arguments. 66 | parser = optparse.OptionParser(usage="""usage: %prog