├── README ├── README.md ├── crfsuite ├── AUTHORS ├── COPYING ├── ChangeLog ├── INSTALL ├── Makefile.am ├── README ├── autogen.sh ├── bench │ ├── accuracy.py │ ├── bench.py │ ├── bench_crfpp.py │ ├── bench_crfsgd.py │ ├── bench_crfsuite-0.11.py │ ├── bench_crfsuite.py │ ├── bench_mallet.py │ ├── bench_wapiti.py │ ├── collect.py │ ├── crfsuite_to_mallet.py │ └── plot_performance.py ├── configure.in ├── crfsuite.sln ├── doc │ ├── Doxyfile │ ├── footer.html │ └── header.html ├── example │ ├── chunking.py │ ├── crfutils.py │ ├── ner.py │ ├── pos.py │ └── template.py ├── frontend │ ├── Makefile.am │ ├── dump.c │ ├── frontend.vcxproj │ ├── frontend.vcxproj.user │ ├── iwa.c │ ├── iwa.h │ ├── learn.c │ ├── main.c │ ├── option.c │ ├── option.h │ ├── readdata.h │ ├── reader.c │ ├── readme.txt │ └── tag.c ├── genbinary.sh.in ├── include │ ├── Makefile.am │ ├── crfsuite.h │ ├── crfsuite.hpp │ ├── crfsuite_api.hpp │ └── os.h ├── lib │ ├── cqdb │ │ ├── COPYING │ │ ├── Makefile.am │ │ ├── Release │ │ │ ├── CL.read.1.tlog │ │ │ ├── CL.write.1.tlog │ │ │ ├── Lib-link.read.1.tlog │ │ │ ├── Lib-link.write.1.tlog │ │ │ ├── cl.command.1.tlog │ │ │ ├── cqdb.lastbuildstate │ │ │ ├── cqdb.log │ │ │ ├── cqdb.obj │ │ │ ├── lib.command.1.tlog │ │ │ ├── lookup3.obj │ │ │ └── vc100.pdb │ │ ├── cqdb.vcxproj │ │ ├── cqdb.vcxproj.user │ │ ├── doc │ │ │ ├── doxyfile │ │ │ └── footer.html │ │ ├── include │ │ │ └── cqdb.h │ │ ├── makedist.sh │ │ └── src │ │ │ ├── cqdb.c │ │ │ ├── lookup3.c │ │ │ └── main.c │ └── crf │ │ ├── Makefile.am │ │ ├── Release │ │ ├── CL.read.1.tlog │ │ ├── CL.write.1.tlog │ │ ├── Lib-link.read.1.tlog │ │ ├── Lib-link.write.1.tlog │ │ ├── cl.command.1.tlog │ │ ├── crf.lastbuildstate │ │ ├── crf.log │ │ ├── crf1d_context.obj │ │ ├── crf1d_encode.obj │ │ ├── crf1d_feature.obj │ │ ├── crf1d_model.obj │ │ ├── crf1d_tag.obj │ │ ├── crfsuite.obj │ │ ├── crfsuite_train.obj │ │ ├── dataset.obj │ │ ├── dictionary.obj │ │ ├── holdout.obj │ │ ├── lib.command.1.tlog │ │ ├── logging.obj │ │ ├── params.obj │ │ ├── quark.obj │ │ ├── rumavl.obj │ │ ├── train_arow.obj │ │ ├── train_averaged_perceptron.obj │ │ ├── train_l2sgd.obj │ │ ├── train_lbfgs.obj │ │ ├── train_passive_aggressive.obj │ │ └── vc100.pdb │ │ ├── crf.suo │ │ ├── crf.vcxproj │ │ ├── crf.vcxproj.user │ │ └── src │ │ ├── crf1d.h │ │ ├── crf1d_context.c │ │ ├── crf1d_encode.c │ │ ├── crf1d_feature.c │ │ ├── crf1d_model.c │ │ ├── crf1d_tag.c │ │ ├── crfsuite.c │ │ ├── crfsuite_internal.h │ │ ├── crfsuite_train.c │ │ ├── dataset.c │ │ ├── dictionary.c │ │ ├── holdout.c │ │ ├── logging.c │ │ ├── logging.h │ │ ├── params.c │ │ ├── params.h │ │ ├── quark.c │ │ ├── quark.h │ │ ├── rumavl.c │ │ ├── rumavl.h │ │ ├── train_arow.c │ │ ├── train_averaged_perceptron.c │ │ ├── train_l2sgd.c │ │ ├── train_lbfgs.c │ │ ├── train_passive_aggressive.c │ │ └── vecmath.h ├── modification.txt ├── swig │ ├── Makefile.am │ ├── crfsuite.cpp │ ├── export.i │ ├── perl │ │ ├── Makefile.PL.in │ │ ├── prepare.sh │ │ ├── sample_tag.pl │ │ └── sample_train.pl │ └── python │ │ ├── README │ │ ├── crfsuite.py │ │ ├── export_wrap.cpp │ │ ├── export_wrap.h │ │ ├── prepare.sh │ │ ├── sample_tag.py │ │ ├── sample_train.py │ │ └── setup.py └── win32 │ ├── liblbfgs │ ├── lbfgs.h │ ├── lbfgs.lib │ └── lbfgs_debug.lib │ └── stdint.h ├── data ├── README ├── test_laptop ├── test_restaurant ├── train_laptop ├── train_restaurant └── train_restaurant_updated ├── mod_pycrfsuite ├── _pycrfsuite.pyx └── crfsuite_api.pxd ├── rnn ├── adagrad.py ├── adagrad_crf.py ├── crf_propagation.py └── propagation.py ├── train_RNCRF.py ├── train_depnn.py └── util ├── 10depParse.py ├── 20dtreeLabel.py ├── 30word_embedding.py ├── __init__.py ├── data_semEval ├── aspectTerm_sample ├── opinion_sample └── sample.txt ├── dtree_util.py ├── gen_util.py ├── lexparser.sh └── math_util.py /README: -------------------------------------------------------------------------------- 1 | ********************README************************************************************************** 2 | 3 | This is an instruction file for successfully running the RNCRF model of the paper 4 | published in EMNLP 2016: 5 | https://www.aclweb.org/anthology/D/D16/D16-1059.pdf 6 | 7 | **************************************************************************************************** 8 | 9 | This code makes use of "python-crfsuite", which wraps CRFsuite C++ API using Cython. 10 | Contributing: 11 | "python-crfsuite": https://python-crfsuite.readthedocs.io/en/latest/ licensed under MIT license 12 | "CRFsuite": http://www.chokkan.org/software/crfsuite/ licensed under BSD license 13 | 14 | The recursive neural network (RNN) is implemented using python based on QANTA for question answering: 15 | https://cs.umd.edu/~miyyer/qblearn/ 16 | 17 | **************************************************************************************************** 18 | 19 | Please follow these steps to install CRF software and make corresponding modifications 20 | in order to be executable with RNN implementation in python. 21 | 22 | 1. Download python-crfsuite package (this should also include original CRFsuite package) 23 | 24 | 2. Replace original 'crfsuite' folder with the provided modified folder 'crfsuite' 25 | 26 | 3. Make some modifications to the files in folder 'pycrfsuite': 27 | - Replace the file '_pycrfsuite.pyx' with provided one with the same name in the folder 'mod_pycrfsuite' 28 | - Replace the file 'crfsuite_api.pxd' with provided one with the same name in the folder 'mod_pycrfsuite' 29 | 30 | 4. Run the following command to generate cpp file from pycrfsuite.pyx file: 31 | $ ./update_cpp.sh 32 | 33 | 5. Run the following command to install the modified version of python-crfsuite: 34 | $ python setup.py install 35 | 36 | *************************************************************************************************** 37 | 38 | Now we are ready to build RNCRF model for trainin and evaluation: 39 | 40 | 1. Locate to folder 'util': 41 | - Download Stanford dependency tree parser (stanford-corenlp-3.5.1) 42 | - Run '10depParse.py' to generate dependency trees for each sentence using stanford parser 43 | - Run '20dtreeLabel.py' to build tree object for each sentence 44 | - Run '30word_embedding.py' to generate pre-trained word embedding dictionary. In order to run this file, please make sure to obtain the pre-trained word vectors from word2vec first 45 | 46 | 2. Run 'train_depnn.py' to pre-train recursive neural network without CRF first 47 | 48 | 3. Run 'train_RNCRF_laptop.py' to train RNCRF and make evaluations. 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursive-Neural-Conditional-Random-Field 2 | Implementation and data for the published paper "Recursive Neural Conditional Random Fields for Aspect-based Sentiment Analysis". 3 | -------------------------------------------------------------------------------- /crfsuite/AUTHORS: -------------------------------------------------------------------------------- 1 | Naoaki Okazaki 2 | -------------------------------------------------------------------------------- /crfsuite/COPYING: -------------------------------------------------------------------------------- 1 | The BSD license. 2 | 3 | Copyright (c) 2007-2010, Naoaki Okazaki 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the names of the authors nor the names of its contributors 14 | may be used to endorse or promote products derived from this 15 | software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 21 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /crfsuite/ChangeLog: -------------------------------------------------------------------------------- 1 | 2011-08-11 Naoaki Okazaki 2 | * CRFsuite 0.12 3 | - [CORE] Optimized the implementation for faster training; approximately 1.4-1.5 x speed up. 4 | - [CORE] Faster routine for computing exp(x) using SSE2. 5 | - [CORE] Restructured the source code to separate routines for CRF graphical models and training algorithms; this is an initial attempt for implementing CRFs with different feature types (e.g., 2nd-order CRF, 1st-order transition features conditioned on observations) and different training algorithms. 6 | - [CORE] Implemented new training algorithms: Averaged Perceptron, Passive Aggressive, and Adaptive Regularization of Weights (AROW). 7 | - [CORE] Removed automatic generation of BOS/EOS features; one can use these features by inserting attributes to the first/last items (e.g., "__BOS__" at the first item and "__EOS__" at the last item). 8 | - [CORE] Fixed some memory-leak problems. 9 | - [CORE] Reduced memory usage in training. 10 | - [CORE] Fixed a crash problem when the model file does not exist in tagging. 11 | - [FRONTEND:LEARN] Training and test sets are maintained by group numbers; specify the group number for hold-out evaluation with "-e" option. 12 | - [FRONTEND:LEARN] Training algorithm is now specified by "-a" option instead of "-p algorithm=". 13 | - [FRONTEND:LEARN] Renamed some training parameters; for example, an L2 regularization coefficient is specified by "c2" instead of "regularization.sigma" (c2 = 0.5 / sigma * sigma; c1 = 1.0 / sigma). 14 | - [FRONTEND:LEARN] Show the list of parameters, default values, and descriptions with "-H" option. 15 | - [FRONTEND:LEARN] Removed the support of comment lines for simplicity; one may forget to escape '#' characters in a data set. CRFsuite now does not handle '#' as a special character. 16 | - [FRONTEND:TAGGER] Output probabilities of predicted sequences with "-p" option. 17 | - [FRONTEND:TAGGER] Output marginal probabilities of predicted items with "-i" option. 18 | - [API] Numerous changes in API for the enhancements. 19 | - [API] Renamed the library name "libcrf" to "libcrfsuite". 20 | - [API] Renamed the prefix "crf_" to "crfsuite_" in structure and function names. 21 | - [API] Implemented a high-level and easy-to-use API for C++/SWIG (crfsuite.hpp and crfsuite_api.hpp). 22 | - [API] Implemented the Python SWIG module and sample programs; writing a tagger is very easy with this module. 23 | - [SAMPLE] Rewritten samples. 24 | - [SAMPLE] A sample program (template.py) for using feature templates that are compatible with CRF++. 25 | - [SAMPLE] New samples in example directory: Named Entity Recognition (ner.py) using the CoNLL2003 data set, and part-of-speech tagging (pos.py). 26 | - [OTHER] Updated the MSVC solution file to MSVC 2010. 27 | 28 | 29 | 2010-07-16 Naoaki Okazaki 30 | * CRFsuite 0.11 31 | - Renamed crf.h into crfsuite.h to avoid possible conflects in include directories 32 | - Install crfsuite.h to the include directory (suggested by Ingo Glöckner) 33 | 34 | 35 | 2010-01-29 Naoaki Okazaki 36 | * CRFsuite 0.10 37 | - A patch submitted by Hiroshi Manabe (at Kodensha Co., Ltd.) to fix memory leak problems in the tagger. 38 | - Added a new option -r (--reference) for the tagger to output reference labels in parallel with predicted labels. 39 | 40 | 41 | 2009-09-24 Naoaki Okazaki 42 | * CRFsuite 0.9 43 | - Fixed a build problem with liblbfgs 1.8 44 | 45 | 46 | 2009-03-17 Naoaki Okazaki 47 | * CRFsuite 0.8 48 | - Improved the portability of model files across different machine architectures with different byte order; this fixes a crash problem in tagging on some machine architectures. 49 | 50 | 51 | 2009-03-10 Naoaki Okazaki 52 | * CRFsuite 0.7 53 | - Updated RumAVL library to 4.0.0; this fixes a crash problem occurring in feature generation on some machine architectures. 54 | 55 | 56 | 2009-03-07 Naoaki Okazaki 57 | * CRFsuite 0.6 58 | - A new training algorithm, Stochastic Gradient Descent (SGD). 59 | - Updated the L-BFGS routine to liblbfgs 1.7. 60 | - Reduced memory usage in training. 61 | - Supported escape sequences in training/test data. 62 | - Restructured the source code. 63 | - Added a parameter to configure the number of trials for line search. 64 | 65 | 66 | 2008-11-19 Naoaki Okazaki 67 | * CRFsuite 0.5 68 | - Updated the L-BFGS routine to liblbfgs 1.6. 69 | - New parameters lbfgs.stop, lbfgs.delta, lbfgs.linesearch were added. 70 | - Fixed a bug in which the frontend tools could not parse "item:value" format correctly. 71 | - Fixed a bug in computing the accuracy. 72 | - Fixed a bug when the tagger receives an item with no feature. 73 | 74 | 75 | 2008-03-05 Naoaki Okazaki 76 | 77 | * CRFsuite 0.4 (the first public release): 78 | - Website and documentation for CRFsuite. 79 | - Tutorial on the CoNLL 2000 chunking shared task. 80 | - Performance comparison on the CoNLL 2000 chunking shared task. 81 | - Bug fix in L2 regularization. 82 | - A number of small improvements for the public release. 83 | 84 | 85 | 2007-12-12 Naoaki Okazaki 86 | 87 | * CRFsuite 0.3 (internal release): 88 | - Implemented scaling method for forward/backward algorithm. 89 | - Removed the code for computing the forward/backward algorithm in logarithm domain. 90 | 91 | 92 | 2007-11-30 Naoaki Okazaki 93 | 94 | * CRFsuite 0.2 (internal release): 95 | - Orthant-Wise Limited-memory Quasi-Newton (OW-LQN) method for L1 regularization. 96 | - Configurable L-BFGS parameters (number of limited memories, epsilon). 97 | 98 | 99 | 2007-10-29 Naoaki Okazaki 100 | 101 | * CRFsuite 0.1 (internal release): 102 | - Initial release. 103 | 104 | -------------------------------------------------------------------------------- /crfsuite/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | 3 | SUBDIRS = include lib/cqdb lib/crf frontend swig 4 | 5 | docdir = $(prefix)/share/doc/@PACKAGE@ 6 | doc_DATA = README INSTALL COPYING AUTHORS ChangeLog 7 | 8 | EXTRA_DIST = \ 9 | crfsuite.sln \ 10 | autogen.sh \ 11 | win32/stdint.h \ 12 | example/crfutils.py \ 13 | example/template.py \ 14 | example/pos.py \ 15 | example/ner.py \ 16 | example/chunking.py 17 | 18 | AUTOMAKE_OPTIONS = foreign 19 | ACLOCAL_AMFLAGS = -I m4 20 | -------------------------------------------------------------------------------- /crfsuite/autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id:$ 3 | 4 | if [ "$1" = "--force" ]; 5 | then 6 | FORCE=--force 7 | NOFORCE= 8 | FORCE_MISSING=--force-missing 9 | else 10 | FORCE= 11 | NOFORCE=--no-force 12 | FORCE_MISSING= 13 | fi 14 | 15 | libtoolize --copy $FORCE 2>&1 | sed '/^You should/d' || { 16 | echo "libtoolize failed!" 17 | exit 1 18 | } 19 | 20 | aclocal $FORCE || { 21 | echo "aclocal failed!" 22 | exit 1 23 | } 24 | 25 | autoheader $FORCE || { 26 | echo "autoheader failed!" 27 | exit 1 28 | } 29 | 30 | automake -a -c $NOFORCE || { 31 | echo "automake failed!" 32 | exit 1 33 | } 34 | 35 | autoconf $FORCE || { 36 | echo "autoconf failed!" 37 | exit 1 38 | } 39 | -------------------------------------------------------------------------------- /crfsuite/bench/accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | fi = sys.stdin 7 | fo = sys.stdout 8 | n = 0 9 | m = 0 10 | 11 | for line in fi: 12 | line = line.strip() 13 | if line: 14 | fields = line.split() 15 | if len(fields) >= 2: 16 | if fields[-1] == fields[-2]: 17 | m += 1 18 | n += 1 19 | 20 | print 'Item accuracy: %f' % (m / float(n)) 21 | -------------------------------------------------------------------------------- /crfsuite/bench/bench.py: -------------------------------------------------------------------------------- 1 | import re 2 | import collections 3 | 4 | LOGDIR='log/' 5 | 6 | def seconds(s): 7 | p = s.find(':') 8 | q = s.find(':', p+1) 9 | return int(s[:p]) * 3600 + int(s[p+1:q]) * 60 + int(s[q+1:]) 10 | 11 | def last(X): 12 | if len(X) >= 1: 13 | return X[-1] 14 | else: 15 | return None 16 | 17 | def diffmin(X): 18 | D = [] 19 | prev = None 20 | for x in X: 21 | if prev is not None: 22 | D.append(x - prev) 23 | prev = x 24 | return min(D) 25 | 26 | def analyze_log(fi, patterns): 27 | P = {} 28 | for name, pattern, index, cast, func in patterns: 29 | P[name] = (re.compile(pattern), index, cast, func) 30 | 31 | D = collections.defaultdict(list) 32 | for line in fi: 33 | line = line.strip('\n') 34 | for name, (regex, index, cast, func) in P.iteritems(): 35 | m = regex.search(line) 36 | if m is not None: 37 | if isinstance(index, tuple): 38 | for i in index: 39 | D[name].append(cast(m.group(i))) 40 | elif isinstance(index, int): 41 | D[name].append(cast(m.group(index))) 42 | 43 | 44 | R = {} 45 | for name, (regex, index, cast, func) in P.iteritems(): 46 | R[name] = func(D[name]) 47 | return R 48 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_crfpp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFPP_LEARN='/home/okazaki/local/bin/crf_learn' 9 | CRFPP_TEST='/home/okazaki/local/bin/crf_test' 10 | OUTDIR='crfpp/' 11 | 12 | training_patterns = ( 13 | ('num_features', r'^Number of features:[ ]*(\d+)', 1, int, last), 14 | ('time', r'^Done!([\d.]+)', 1, float, last), 15 | ('iterations', r'^iter=(\d+)', 1, int, last), 16 | ('update', r'time=([\d.]+)', 1, float, min), 17 | ('loss', r'obj=([\d.]+)', 1, float, last), 18 | ) 19 | 20 | tagging_patterns = ( 21 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 22 | ) 23 | 24 | params = { 25 | 'lbfgs': '-a CRF-L2', 26 | 'mira': '-a MIRA', 27 | } 28 | 29 | if __name__ == '_main__': 30 | print analyze_log(sys.stdin, training_patterns) 31 | 32 | if __name__ == '__main__': 33 | fe = sys.stderr 34 | 35 | R = {} 36 | for name, param in params.iteritems(): 37 | model = OUTDIR + name + '.model' 38 | trlog = OUTDIR + name + '.tr.log' 39 | trtxt = LOGDIR + 'crfpp-' + name + '.txt' 40 | tglog = OUTDIR + name + '.tg.log' 41 | 42 | s = string.Template( 43 | '$crfpp_learn $param template.crfpp train.txt $model > $trlog' 44 | ) 45 | cmd = s.substitute( 46 | crfpp_learn=CRFPP_LEARN, 47 | param=param, 48 | model=model, 49 | trlog=trlog 50 | ) 51 | 52 | fe.write(cmd) 53 | fe.write('\n') 54 | #os.system(cmd) 55 | 56 | fo = open(trtxt, 'w') 57 | fo.write('$ %s\n' % cmd) 58 | fo.write(open(trlog, 'r').read()) 59 | 60 | s = string.Template( 61 | '$crfpp_test -m $model test.txt | ./accuracy.py > $tglog' 62 | ) 63 | cmd = s.substitute( 64 | crfpp_test=CRFPP_TEST, 65 | model=model, 66 | tglog=tglog 67 | ) 68 | 69 | fe.write(cmd) 70 | fe.write('\n') 71 | #os.system(cmd) 72 | 73 | D = analyze_log(open(trlog), training_patterns) 74 | D.update(analyze_log(open(tglog), tagging_patterns)) 75 | D['logfile'] = trtxt 76 | R[name] = D 77 | 78 | print repr(R) 79 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_crfsgd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSGD='/home/okazaki/install/sgd-1.3/crf/crfsgd' 9 | OUTDIR='crfsgd/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'features: (\d+)', 1, int, last), 13 | ('time', r'^Done! ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\[Epoch (\d+)\]', 1, int, last), 15 | ('update', r'^\[Epoch \d+\][^a-z]+wnorm:[^a-z]+total time: ([\d.]+) seconds$', 1, float, diffmin), 16 | ('loss', r'loss: ([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'default': "-f 1 -r 100 -e ''", 25 | } 26 | 27 | if __name__ == '__main__': 28 | fe = sys.stderr 29 | 30 | R = {} 31 | for name, param in params.iteritems(): 32 | model = OUTDIR + name + '.model' 33 | trlog = OUTDIR + name + '.tr.log' 34 | trtxt = LOGDIR + 'crfsgd-' + name + '.txt' 35 | tglog = OUTDIR + name + '.tg.log' 36 | 37 | s = string.Template( 38 | '$crfsgd $param $model template.crfpp train.txt > $trlog' 39 | ) 40 | cmd = s.substitute( 41 | crfsgd=CRFSGD, 42 | param=param, 43 | model=model, 44 | trlog=trlog 45 | ) 46 | 47 | fe.write(cmd) 48 | fe.write('\n') 49 | #os.system(cmd) 50 | 51 | fo = open(trtxt, 'w') 52 | fo.write('$ %s\n' % cmd) 53 | fo.write(open(trlog, 'r').read()) 54 | 55 | s = string.Template( 56 | '$crfsgd -t $model test.txt | ./accuracy.py > $tglog' 57 | ) 58 | cmd = s.substitute( 59 | crfsgd=CRFSGD, 60 | model=model, 61 | tglog=tglog 62 | ) 63 | 64 | fe.write(cmd) 65 | fe.write('\n') 66 | #os.system(cmd) 67 | 68 | D = analyze_log(open(trlog), training_patterns) 69 | D.update(analyze_log(open(tglog), tagging_patterns)) 70 | D['logfile'] = trtxt 71 | R[name] = D 72 | 73 | print repr(R) 74 | 75 | if __name__ == '_main__': 76 | print analyze_log(sys.stdin, training_patterns) 77 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_crfsuite-0.11.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSUITE='/home/okazaki/install/crfsuite-0.11/frontend/crfsuite' 9 | OUTDIR='crfsuite-0.11/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of features: (\d+)', 1, int, last), 13 | ('time', r'^Total seconds required for L-BFGS: ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\*\*\*\*\* (Iteration|Epoch) #(\d+)', 2, int, last), 15 | ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^Log-likelihood: -([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: \d+ / \d+ \(([\d.]+)\)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs-sparse': '-p regularization.sigma=0.70710678118654746 -p feature.possible_states=0 -p feature.possible_transitions=0', 25 | 'lbfgs-dense': '-p regularization.sigma=0.70710678118654746 -p feature.possible_states=1 -p feature.possible_transitions=1', 26 | } 27 | 28 | if __name__ == '_main__': 29 | print analyze_log(sys.stdin, training_patterns) 30 | 31 | if __name__ == '__main__': 32 | fe = sys.stderr 33 | 34 | R = {} 35 | for name, param in params.iteritems(): 36 | model = OUTDIR + name + '.model' 37 | trlog = OUTDIR + name + '.tr.log' 38 | trtxt = LOGDIR + 'crfsuite0.11-' + name + '.txt' 39 | tglog = OUTDIR + name + '.tg.log' 40 | 41 | s = string.Template( 42 | '$crfsuite learn $param -m $model train.crfsuite > $trlog' 43 | ) 44 | cmd = s.substitute( 45 | crfsuite=CRFSUITE, 46 | param=param, 47 | model=model, 48 | trlog=trlog 49 | ) 50 | 51 | fe.write(cmd) 52 | fe.write('\n') 53 | #os.system(cmd) 54 | 55 | fo = open(trtxt, 'w') 56 | fo.write('$ %s\n' % cmd) 57 | fo.write(open(trlog, 'r').read()) 58 | 59 | s = string.Template( 60 | '$crfsuite tag -m $model -qt test.crfsuite > $tglog' 61 | ) 62 | cmd = s.substitute( 63 | crfsuite=CRFSUITE, 64 | model=model, 65 | tglog=tglog 66 | ) 67 | 68 | fe.write(cmd) 69 | fe.write('\n') 70 | #os.system(cmd) 71 | 72 | D = analyze_log(open(trlog), training_patterns) 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_crfsuite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | CRFSUITE='/home/okazaki/projects/crfsuite/frontend/crfsuite' 9 | OUTDIR='crfsuite/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of features: (\d+)', 1, int, last), 13 | ('time', r'^Total seconds required for training: ([\d.]+)', 1, float, last), 14 | ('iterations', r'^\*\*\*\*\* (Iteration|Epoch) #(\d+)', 2, int, last), 15 | ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^Loss: ([-\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: \d+ / \d+ \(([\d.]+)\)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs-sparse': '-a lbfgs -p feature.possible_states=0 -p feature.possible_transitions=0', 25 | 'lbfgs-dense': '-a lbfgs -p feature.possible_states=1 -p feature.possible_transitions=1', 26 | 'l2sgd-sparse': '-a l2sgd -p feature.possible_states=0 -p feature.possible_transitions=0', 27 | 'l2sgd-dense': '-a l2sgd -p feature.possible_states=1 -p feature.possible_transitions=1', 28 | 'ap-sparse': '-a ap -p feature.possible_states=0 -p feature.possible_transitions=0 -p max_iterations=50', 29 | 'ap-dense': '-a ap -p feature.possible_states=1 -p feature.possible_transitions=1 -p max_iterations=50', 30 | } 31 | 32 | if __name__ == '_main__': 33 | print analyze_log(sys.stdin, training_patterns) 34 | 35 | if __name__ == '__main__': 36 | fe = sys.stderr 37 | 38 | R = {} 39 | for name, param in params.iteritems(): 40 | model = OUTDIR + name + '.model' 41 | trlog = OUTDIR + name + '.tr.log' 42 | trtxt = LOGDIR + 'crfsuite-' + name + '.txt' 43 | tglog = OUTDIR + name + '.tg.log' 44 | 45 | s = string.Template( 46 | '$crfsuite learn $param -m $model train.crfsuite > $trlog' 47 | ) 48 | cmd = s.substitute( 49 | crfsuite=CRFSUITE, 50 | param=param, 51 | model=model, 52 | trlog=trlog 53 | ) 54 | 55 | fe.write(cmd) 56 | fe.write('\n') 57 | #os.system(cmd) 58 | 59 | fo = open(trtxt, 'w') 60 | fo.write('$ %s\n' % cmd) 61 | fo.write(open(trlog, 'r').read()) 62 | 63 | s = string.Template( 64 | '$crfsuite tag -m $model -qt test.crfsuite > $tglog' 65 | ) 66 | cmd = s.substitute( 67 | crfsuite=CRFSUITE, 68 | model=model, 69 | tglog=tglog 70 | ) 71 | 72 | fe.write(cmd) 73 | fe.write('\n') 74 | #os.system(cmd) 75 | 76 | D = analyze_log(open(trlog), training_patterns) 77 | D.update(analyze_log(open(tglog), tagging_patterns)) 78 | D['logfile'] = trtxt 79 | R[name] = D 80 | 81 | print repr(R) 82 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | MALLET='java -cp "/home/okazaki/install/mallet-2.0.6/class:/home/okazaki/install/mallet-2.0.6/lib/mallet-deps.jar" cc.mallet.fst.SimpleTagger' 9 | OUTDIR='mallet/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'^Number of weights = (\d+)', 1, int, last), 13 | ('time', r'^([\d.]+)user ([\d.]+)system', (1, 2), float, sum), 14 | ('iterations', r'^CRF finished one iteration of maximizer, i=(\d+)', 1, int, len), 15 | # ('update', r'^Seconds required for this iteration: ([\d.]+)', 1, float, min), 16 | ('loss', r'^getValue\(\) \(loglikelihood, optimizable by label likelihood\) = -([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Testing accuracy=([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'default': '--gaussian-variance 0.70710678118654746', 25 | } 26 | 27 | if __name__ == '_main__': 28 | print analyze_log(sys.stdin, training_patterns) 29 | 30 | if __name__ == '__main__': 31 | fe = sys.stderr 32 | 33 | R = {} 34 | for name, param in params.iteritems(): 35 | model = OUTDIR + name + '.model' 36 | trlog = OUTDIR + name + '.tr.log' 37 | trtxt = LOGDIR + 'mallet-' + name + '.txt' 38 | tglog = OUTDIR + name + '.tg.log' 39 | 40 | s = string.Template( 41 | 'time $mallet --train true $param --model-file $model train.mallet > $trlog 2>&1' 42 | ) 43 | cmd = s.substitute( 44 | mallet=MALLET, 45 | param=param, 46 | model=model, 47 | trlog=trlog 48 | ) 49 | 50 | fe.write(cmd) 51 | fe.write('\n') 52 | #os.system(cmd) 53 | 54 | fo = open(trtxt, 'w') 55 | fo.write('$ %s\n' % cmd) 56 | fo.write(open(trlog, 'r').read()) 57 | 58 | s = string.Template( 59 | '$mallet --model-file $model --test lab test.mallet > $tglog 2>&1' 60 | ) 61 | cmd = s.substitute( 62 | mallet=MALLET, 63 | model=model, 64 | tglog=tglog 65 | ) 66 | 67 | fe.write(cmd) 68 | fe.write('\n') 69 | #os.system(cmd) 70 | 71 | D = analyze_log(open(trlog), training_patterns) 72 | D['update'] = 0. 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /crfsuite/bench/bench_wapiti.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import string 6 | from bench import * 7 | 8 | WAPITI='/home/okazaki/install/wapiti-1.1.3/wapiti' 9 | OUTDIR='wapiti/' 10 | 11 | training_patterns = ( 12 | ('num_features', r'nb features: (\d+)', 1, int, last), 13 | ('time', r'^([\d.]+)user ([\d.]+)system', (1, 2), float, sum), 14 | ('iterations', r'\[\s*(\d+)\]', 1, int, last), 15 | ('update', r'time=([\d.]+)', 1, float, min), 16 | ('loss', r'obj=([\d.]+)', 1, float, last), 17 | ) 18 | 19 | tagging_patterns = ( 20 | ('accuracy', r'^Item accuracy: ([\d.]+)', 1, float, last), 21 | ) 22 | 23 | params = { 24 | 'lbfgs': '-a l-bfgs --rho2 0.70710678118654746 --maxiter 1000 --stopeps 0.00001 --stopwin 10', 25 | 'rprop': '-a rprop --rho3 0.70710678118654746 --maxiter 1000', 26 | } 27 | 28 | if __name__ == '_main__': 29 | print analyze_log(sys.stdin, training_patterns) 30 | 31 | if __name__ == '__main__': 32 | fe = sys.stderr 33 | 34 | R = {} 35 | for name, param in params.iteritems(): 36 | model = OUTDIR + name + '.model' 37 | trlog = OUTDIR + name + '.tr.log' 38 | trtxt = LOGDIR + 'wapiti-' + name + '.txt' 39 | tglog = OUTDIR + name + '.tg.log' 40 | 41 | s = string.Template( 42 | 'time $wapiti train $param -p template.wapiti train.txt $model > $trlog 2>&1' 43 | ) 44 | cmd = s.substitute( 45 | wapiti=WAPITI, 46 | param=param, 47 | model=model, 48 | trlog=trlog 49 | ) 50 | 51 | fe.write(cmd) 52 | fe.write('\n') 53 | #os.system(cmd) 54 | 55 | fo = open(trtxt, 'w') 56 | fo.write('$ %s\n' % cmd) 57 | fo.write(open(trlog, 'r').read()) 58 | 59 | s = string.Template( 60 | '$wapiti label -m $model test.txt | ./accuracy.py > $tglog' 61 | ) 62 | cmd = s.substitute( 63 | wapiti=WAPITI, 64 | model=model, 65 | tglog=tglog 66 | ) 67 | 68 | fe.write(cmd) 69 | fe.write('\n') 70 | #os.system(cmd) 71 | 72 | D = analyze_log(open(trlog), training_patterns) 73 | D.update(analyze_log(open(tglog), tagging_patterns)) 74 | D['logfile'] = trtxt 75 | R[name] = D 76 | 77 | print repr(R) 78 | -------------------------------------------------------------------------------- /crfsuite/bench/collect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | 6 | scripts = ( 7 | ('CRFsuite 0.12', './bench_crfsuite.py'), 8 | ('CRFsuite 0.11', './bench_crfsuite-0.11.py'), 9 | ('Wapiti v1.1.3', './bench_wapiti.py'), 10 | ('sgd 1.3', './bench_crfsgd.py'), 11 | ('CRF++ 0.54', './bench_crfpp.py'), 12 | ('MALLET 2.0.6', './bench_mallet.py'), 13 | ) 14 | 15 | fields = ( 16 | ('# Features', 'num_features'), 17 | ('Time', 'time'), 18 | ('# Iters', 'iterations'), 19 | ('Update', 'update'), 20 | ('Loss', 'loss'), 21 | ('Log', 'log'), 22 | ) 23 | 24 | def number(x): 25 | y = '' 26 | p = x.find('.') 27 | if p == -1: 28 | p = len(x) 29 | for i in range(p): 30 | if i % 3 == 0 and i != 0: 31 | y = ' ' + y 32 | y = x[p-i-1] + y 33 | return y + x[p:] 34 | 35 | def read(): 36 | R = {} 37 | for name, script in scripts: 38 | fi = os.popen(script, 'r') 39 | R[name] = eval(fi.read()) 40 | return R 41 | 42 | def output_update(fo, R): 43 | for name, script in scripts: 44 | for param, result in R[name].iteritems(): 45 | fo.write('%s\t%s\t%f\n' % (name, param, result.get('update', 0.))) 46 | 47 | def output_table(fo, R): 48 | for name, script in scripts: 49 | for param, result in R[name].iteritems(): 50 | fo.write('\n') 51 | fo.write('%s\n' % name) 52 | fo.write('%s\n' % param) 53 | fo.write('\n') 54 | fo.write('%s\n' % number('%d' % result['num_features'])) 55 | fo.write('%s\n' % number('%.1f' % result['time'])) 56 | fo.write('%s\n' % number('%d' % result['iterations'])) 57 | fo.write('%s\n' % number('%.1f' % result['update'])) 58 | fo.write('%s\n' % number('%.1f' % result['loss'])) 59 | fo.write('%.3f\n' % (100. * result['accuracy'])) 60 | fo.write('Log\n' % result['logfile']) 61 | fo.write('\n') 62 | fo.write('\n') 63 | 64 | 65 | if __name__ == '__main__': 66 | R = read() 67 | output_table(sys.stdout, R) 68 | -------------------------------------------------------------------------------- /crfsuite/bench/crfsuite_to_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | fi = sys.stdin 6 | fo = sys.stdout 7 | 8 | for line in fi: 9 | line = line.strip('\n') 10 | if not line: 11 | fo.write('\n') 12 | 13 | fields = line.split('\t') 14 | fo.write('%s %s\n' % (' '.join(fields[1:]), fields[0])) 15 | 16 | 17 | -------------------------------------------------------------------------------- /crfsuite/bench/plot_performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | 6 | re_iteration = re.compile(r'^\*\*\*\*\* (Iteration|Epoch) #(\d+) \*\*\*\*\*') 7 | patterns = { 8 | 'loss': re.compile(r'^Loss: ([\d.]+)'), 9 | 'accuracy': re.compile(r'^Item accuracy: \d+ / \d+ \(([\d.]+)\)'), 10 | 'norm': re.compile(r'^Feature [L2-]+norm: ([\d.]+)'), 11 | } 12 | 13 | def read(fi): 14 | D = [] 15 | for line in fi: 16 | line = line.strip('\n') 17 | m = re_iteration.match(line) 18 | if m is not None: 19 | if len(D)+1 != int(m.group(2)): 20 | sys.stderr.write('ERROR: sync\n') 21 | sys.exit(1) 22 | D.append({}) 23 | continue 24 | 25 | if D: 26 | for name, pattern in patterns.iteritems(): 27 | m = pattern.match(line) 28 | if m is not None: 29 | D[-1][name] = float(m.group(1)) 30 | 31 | return D 32 | 33 | if __name__ == '__main__': 34 | fi = sys.stdin 35 | fo = sys.stdout 36 | 37 | i = 1 38 | D = read(fi) 39 | for item in D: 40 | fo.write('%d' % i) 41 | i += 1 42 | for name in patterns.iterkeys(): 43 | fo.write(' %f' % item[name]) 44 | fo.write('\n') 45 | -------------------------------------------------------------------------------- /crfsuite/configure.in: -------------------------------------------------------------------------------- 1 | dnl $Id$ 2 | dnl 3 | dnl 4 | dnl Exported and configured variables: 5 | dnl CFLAGS 6 | dnl LDFLAGS 7 | dnl INCLUDES 8 | 9 | 10 | dnl ------------------------------------------------------------------ 11 | dnl Initialization for autoconf 12 | dnl ------------------------------------------------------------------ 13 | AC_PREREQ(2.59) 14 | AC_INIT 15 | AC_CONFIG_SRCDIR([frontend/main.c]) 16 | AC_CONFIG_MACRO_DIR([m4]) 17 | 18 | dnl ------------------------------------------------------------------ 19 | dnl Checks for system 20 | dnl ------------------------------------------------------------------ 21 | AC_CANONICAL_HOST 22 | AC_AIX 23 | AC_MINIX 24 | AC_ISC_POSIX 25 | 26 | 27 | dnl ------------------------------------------------------------------ 28 | dnl Initialization for automake 29 | dnl ------------------------------------------------------------------ 30 | AM_INIT_AUTOMAKE(crfsuite, 0.12) 31 | AC_CONFIG_HEADERS(config.h) 32 | AM_MAINTAINER_MODE 33 | AM_C_PROTOTYPES 34 | 35 | 36 | dnl ------------------------------------------------------------------ 37 | dnl Checks for program 38 | dnl ------------------------------------------------------------------ 39 | AM_PROG_CC_C_O 40 | AC_PROG_LIBTOOL 41 | AC_PROG_INSTALL 42 | AC_PROG_LN_S 43 | 44 | 45 | dnl ------------------------------------------------------------------ 46 | dnl Initialization for variables 47 | dnl ------------------------------------------------------------------ 48 | CFLAGS="-std=c99 ${ac_save_CFLAGS}" 49 | LDFLAGS="${ac_save_LDFLAGS}" 50 | INCLUDES="-I\$(top_srcdir) -I\$(top_srcdir)/include -I\$(srcdir)" 51 | 52 | 53 | dnl ------------------------------------------------------------------ 54 | dnl Checks for header files. 55 | dnl ------------------------------------------------------------------ 56 | AC_HEADER_STDC 57 | AC_CHECK_HEADERS(fcntl.h limits.h malloc.h strings.h unistd.h stdint.h) 58 | 59 | 60 | dnl ------------------------------------------------------------------ 61 | dnl Checks for typedefs, structures, and compiler characteristics. 62 | dnl ------------------------------------------------------------------ 63 | AC_C_CONST 64 | AC_CHECK_SIZEOF 65 | AC_TYPE_SIZE_T 66 | AC_STRUCT_TM 67 | AC_CHECK_SIZEOF(short) 68 | AC_CHECK_SIZEOF(unsigned short) 69 | AC_CHECK_SIZEOF(int) 70 | AC_CHECK_SIZEOF(unsigned int) 71 | AC_CHECK_SIZEOF(long) 72 | AC_CHECK_SIZEOF(unsigned long) 73 | 74 | AC_CHECK_TYPES([uint8_t, uint16_t, uint32_t]) 75 | 76 | dnl ------------------------------------------------------------------ 77 | dnl Checks for debugging mode 78 | dnl ------------------------------------------------------------------ 79 | AC_ARG_ENABLE( 80 | debug, 81 | [AS_HELP_STRING([--enable-debug],[turn on debugging])] 82 | ) 83 | 84 | if test "x$enable_debug" = "xyes"; then 85 | CFLAGS="-DDEBUG -O -g ${CFLAGS}" 86 | else 87 | CFLAGS="-O3 -fomit-frame-pointer -ffast-math -Winline ${CFLAGS}" 88 | fi 89 | 90 | dnl ------------------------------------------------------------------ 91 | dnl Checks for profiling mode 92 | dnl ------------------------------------------------------------------ 93 | AC_ARG_ENABLE( 94 | profile, 95 | [AS_HELP_STRING([--enable-profile],[turn on profiling])] 96 | ) 97 | 98 | if test "x$enable_profile" = "xyes"; then 99 | CFLAGS="-DPROFILE -pg ${CFLAGS}" 100 | fi 101 | 102 | 103 | dnl ------------------------------------------------------------------ 104 | dnl Checks for SSE2 build 105 | dnl ------------------------------------------------------------------ 106 | AC_ARG_ENABLE([sse2], 107 | AS_HELP_STRING( 108 | [--disable-sse2], 109 | [disable SSE2 optimization routines] 110 | ) 111 | ) 112 | 113 | AS_IF([test "x$enable_sse2" != "xno"], [ 114 | CFLAGS="-mfpmath=sse -msse2 -DUSE_SSE ${CFLAGS}" 115 | ]) 116 | 117 | 118 | dnl ------------------------------------------------------------------ 119 | dnl Checks for library functions. 120 | dnl ------------------------------------------------------------------ 121 | AC_FUNC_ALLOCA 122 | AC_FUNC_MEMCMP 123 | AC_FUNC_VPRINTF 124 | AC_CHECK_FUNCS(strdup strerror strtol strtoul) 125 | 126 | dnl Check for math library 127 | AC_CHECK_LIB(m, rand) 128 | 129 | AC_ARG_WITH( 130 | liblbfgs, 131 | [AS_HELP_STRING([--with-liblbfgs=DIR],[liblbfgs directory])], 132 | [INCLUDES="${INCLUDES} -I${withval}/include"; LDFLAGS="${LDFLAGS} -L${withval}/lib"] 133 | ) 134 | AC_CHECK_LIB(lbfgs, lbfgs) 135 | 136 | dnl ------------------------------------------------------------------ 137 | dnl Export variables 138 | dnl ------------------------------------------------------------------ 139 | AC_SUBST(CFLAGS) 140 | AC_SUBST(LDFLAGS) 141 | AC_SUBST(INCLUDES) 142 | AC_SUBST(includedir) 143 | AC_SUBST(libdir) 144 | 145 | dnl ------------------------------------------------------------------ 146 | dnl Output the configure results. 147 | dnl ------------------------------------------------------------------ 148 | AC_CONFIG_FILES(Makefile genbinary.sh include/Makefile lib/cqdb/Makefile lib/crf/Makefile frontend/Makefile swig/Makefile swig/perl/Makefile.PL) 149 | AC_OUTPUT 150 | -------------------------------------------------------------------------------- /crfsuite/crfsuite.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crf", "lib\crf\crf.vcxproj", "{D6B16F2E-DA86-4591-8B50-348AB7E3432E}" 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "frontend", "frontend\frontend.vcxproj", "{CEC83336-7B18-408B-9F3C-D11225609540}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cqdb", "lib\cqdb\cqdb.vcxproj", "{46A23DE6-7E34-4429-8F15-FCC3C083FC5B}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Win32 = Debug|Win32 13 | Debug|x64 = Debug|x64 14 | Release|Win32 = Release|Win32 15 | Release|x64 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|Win32.ActiveCfg = Debug|Win32 19 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|Win32.Build.0 = Debug|Win32 20 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|x64.ActiveCfg = Debug|Win32 21 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Debug|x64.Build.0 = Debug|Win32 22 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|Win32.ActiveCfg = Release|Win32 23 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|Win32.Build.0 = Release|Win32 24 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|x64.ActiveCfg = Release|x64 25 | {D6B16F2E-DA86-4591-8B50-348AB7E3432E}.Release|x64.Build.0 = Release|x64 26 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|Win32.ActiveCfg = Debug|Win32 27 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|Win32.Build.0 = Debug|Win32 28 | {CEC83336-7B18-408B-9F3C-D11225609540}.Debug|x64.ActiveCfg = Debug|Win32 29 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|Win32.ActiveCfg = Release|Win32 30 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|Win32.Build.0 = Release|Win32 31 | {CEC83336-7B18-408B-9F3C-D11225609540}.Release|x64.ActiveCfg = Release|Win32 32 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|Win32.ActiveCfg = Debug|Win32 33 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|Win32.Build.0 = Debug|Win32 34 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Debug|x64.ActiveCfg = Debug|Win32 35 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|Win32.ActiveCfg = Release|Win32 36 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|Win32.Build.0 = Release|Win32 37 | {46A23DE6-7E34-4429-8F15-FCC3C083FC5B}.Release|x64.ActiveCfg = Release|Win32 38 | EndGlobalSection 39 | GlobalSection(SolutionProperties) = preSolution 40 | HideSolutionNode = FALSE 41 | EndGlobalSection 42 | EndGlobal 43 | -------------------------------------------------------------------------------- /crfsuite/doc/footer.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Copyright (c) 2002-2011 by Naoaki Okazaki 4 |
$datetime 5 |
6 | 7 | 8 | -------------------------------------------------------------------------------- /crfsuite/doc/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | CRFSuite: A fast implementation of Conditional Random Fields (CRFs) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /crfsuite/example/chunking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | A feature extractor for chunking. 5 | Copyright 2010,2011 Naoaki Okazaki. 6 | """ 7 | 8 | # Separator of field values. 9 | separator = ' ' 10 | 11 | # Field names of the input data. 12 | fields = 'w pos y' 13 | 14 | # Attribute templates. 15 | templates = ( 16 | (('w', -2), ), 17 | (('w', -1), ), 18 | (('w', 0), ), 19 | (('w', 1), ), 20 | (('w', 2), ), 21 | (('w', -1), ('w', 0)), 22 | (('w', 0), ('w', 1)), 23 | (('pos', -2), ), 24 | (('pos', -1), ), 25 | (('pos', 0), ), 26 | (('pos', 1), ), 27 | (('pos', 2), ), 28 | (('pos', -2), ('pos', -1)), 29 | (('pos', -1), ('pos', 0)), 30 | (('pos', 0), ('pos', 1)), 31 | (('pos', 1), ('pos', 2)), 32 | (('pos', -2), ('pos', -1), ('pos', 0)), 33 | (('pos', -1), ('pos', 0), ('pos', 1)), 34 | (('pos', 0), ('pos', 1), ('pos', 2)), 35 | ) 36 | 37 | 38 | import crfutils 39 | 40 | def feature_extractor(X): 41 | # Apply attribute templates to obtain features (in fact, attributes) 42 | crfutils.apply_templates(X, templates) 43 | if X: 44 | # Append BOS and EOS features manually 45 | X[0]['F'].append('__BOS__') # BOS feature 46 | X[-1]['F'].append('__EOS__') # EOS feature 47 | 48 | if __name__ == '__main__': 49 | crfutils.main(feature_extractor, fields=fields, sep=separator) 50 | -------------------------------------------------------------------------------- /crfsuite/example/crfutils.py: -------------------------------------------------------------------------------- 1 | """ 2 | A miscellaneous utility for sequential labeling. 3 | Copyright 2010,2011 Naoaki Okazaki. 4 | """ 5 | 6 | import optparse 7 | import sys 8 | 9 | def apply_templates(X, templates): 10 | """ 11 | Generate features for an item sequence by applying feature templates. 12 | A feature template consists of a tuple of (name, offset) pairs, 13 | where name and offset specify a field name and offset from which 14 | the template extracts a feature value. Generated features are stored 15 | in the 'F' field of each item in the sequence. 16 | 17 | @type X: list of mapping objects 18 | @param X: The item sequence. 19 | @type template: tuple of (str, int) 20 | @param template: The feature template. 21 | """ 22 | for template in templates: 23 | name = '|'.join(['%s[%d]' % (f, o) for f, o in template]) 24 | for t in range(len(X)): 25 | values = [] 26 | for field, offset in template: 27 | p = t + offset 28 | if p not in range(len(X)): 29 | values = [] 30 | break 31 | values.append(X[p][field]) 32 | if values: 33 | X[t]['F'].append('%s=%s' % (name, '|'.join(values))) 34 | 35 | def readiter(fi, names, sep=' '): 36 | """ 37 | Return an iterator for item sequences read from a file object. 38 | This function reads a sequence from a file object L{fi}, and 39 | yields the sequence as a list of mapping objects. Each line 40 | (item) from the file object is split by the separator character 41 | L{sep}. Separated values of the item are named by L{names}, 42 | and stored in a mapping object. Every item has a field 'F' that 43 | is reserved for storing features. 44 | 45 | @type fi: file 46 | @param fi: The file object. 47 | @type names: tuple 48 | @param names: The list of field names. 49 | @type sep: str 50 | @param sep: The separator character. 51 | @rtype list of mapping objects 52 | @return An iterator for sequences. 53 | """ 54 | X = [] 55 | for line in fi: 56 | line = line.strip('\n') 57 | if not line: 58 | yield X 59 | X = [] 60 | else: 61 | fields = line.split(sep) 62 | if len(fields) < len(names): 63 | raise ValueError( 64 | 'Too few fields (%d) for %r\n%s' % (len(fields), names, line)) 65 | item = {'F': []} # 'F' is reserved for features. 66 | for i in range(len(names)): 67 | item[names[i]] = fields[i] 68 | X.append(item) 69 | 70 | def escape(src): 71 | """ 72 | Escape colon characters from feature names. 73 | 74 | @type src: str 75 | @param src: A feature name 76 | @rtype str 77 | @return The feature name escaped. 78 | """ 79 | return src.replace(':', '__COLON__') 80 | 81 | def output_features(fo, X, field=''): 82 | """ 83 | Output features (and reference labels) of a sequence in CRFSuite 84 | format. For each item in the sequence, this function writes a 85 | reference label (if L{field} is a non-empty string) and features. 86 | 87 | @type fo: file 88 | @param fo: The file object. 89 | @type X: list of mapping objects 90 | @param X: The sequence. 91 | @type field: str 92 | @param field: The field name of reference labels. 93 | """ 94 | for t in range(len(X)): 95 | if field: 96 | fo.write('%s' % X[t][field]) 97 | for a in X[t]['F']: 98 | if isinstance(a, str): 99 | fo.write('\t%s' % escape(a)) 100 | else: 101 | fo.write('\t%s:%f' % (escape(a[0]), a[1])) 102 | fo.write('\n') 103 | fo.write('\n') 104 | 105 | def to_crfsuite(X): 106 | """ 107 | Convert an item sequence into an object compatible with crfsuite 108 | Python module. 109 | 110 | @type X: list of mapping objects 111 | @param X: The sequence. 112 | @rtype crfsuite.ItemSequence 113 | @return The same sequence in crfsuite.ItemSequence type. 114 | """ 115 | import crfsuite 116 | xseq = crfsuite.ItemSequence() 117 | for x in X: 118 | item = crfsuite.Item() 119 | for f in x['F']: 120 | if isinstance(f, str): 121 | item.append(crfsuite.Attribute(escape(f))) 122 | else: 123 | item.append(crfsuite.Attribute(escape(f[0]), f[1])) 124 | xseq.append(item) 125 | return xseq 126 | 127 | def main(feature_extractor, fields='w pos y', sep=' '): 128 | fi = sys.stdin 129 | fo = sys.stdout 130 | 131 | # Parse the command-line arguments. 132 | parser = optparse.OptionParser(usage="""usage: %prog [options] 133 | This utility reads a data set from STDIN, and outputs attributes to STDOUT. 134 | Each line of a data set must consist of field values separated by SEPARATOR 135 | characters. The names and order of field values can be specified by -f option. 136 | The separator character can be specified with -s option. Instead of outputting 137 | attributes, this utility tags the input data when a model file is specified by 138 | -t option (CRFsuite Python module must be installed).""" 139 | ) 140 | parser.add_option( 141 | '-t', dest='model', 142 | help='tag the input using the model (requires "crfsuite" module)' 143 | ) 144 | parser.add_option( 145 | '-f', dest='fields', default=fields, 146 | help='specify field names of input data [default: "%default"]' 147 | ) 148 | parser.add_option( 149 | '-s', dest='separator', default=sep, 150 | help='specify the separator of columns of input data [default: "%default"]' 151 | ) 152 | (options, args) = parser.parse_args() 153 | 154 | # The fields of input: ('w', 'pos', 'y) by default. 155 | F = options.fields.split(' ') 156 | 157 | if not options.model: 158 | # The generator function readiter() reads a sequence from a 159 | for X in readiter(fi, F, options.separator): 160 | feature_extractor(X) 161 | output_features(fo, X, 'y') 162 | 163 | else: 164 | # Create a tagger with an existing model. 165 | import crfsuite 166 | tagger = crfsuite.Tagger() 167 | tagger.open(options.model) 168 | 169 | # For each sequence from STDIN. 170 | for X in readiter(fi, F, options.separator): 171 | # Obtain features. 172 | feature_extractor(X) 173 | xseq = to_crfsuite(X) 174 | yseq = tagger.tag(xseq) 175 | for t in range(len(X)): 176 | v = X[t] 177 | fo.write('\t'.join([v[f] for f in F])) 178 | fo.write('\t%s\n' % yseq[t]) 179 | fo.write('\n') 180 | -------------------------------------------------------------------------------- /crfsuite/example/pos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | An example for part-of-speech tagging. 5 | Copyright 2010,2011 Naoaki Okazaki. 6 | """ 7 | 8 | # Separator of field values. 9 | separator = ' ' 10 | 11 | # Field names of the input data. 12 | fields = 'w num cap sym p1 p2 p3 p4 s1 s2 s3 s4 y' 13 | 14 | # Feature template. This template is identical to the one bundled in CRF++ 15 | # distribution, but written in a Python object. 16 | templates = ( 17 | (('num', 0), ), 18 | (('cap', 0), ), 19 | (('sym', 0), ), 20 | (('p1', 0), ), 21 | (('p2', 0), ), 22 | (('p3', 0), ), 23 | (('p4', 0), ), 24 | (('s1', 0), ), 25 | (('s2', 0), ), 26 | (('s3', 0), ), 27 | (('s4', 0), ), 28 | 29 | (('w', 0), ), 30 | (('w', -1), ), 31 | (('w', 1), ), 32 | (('w', -2), ), 33 | (('w', 2), ), 34 | (('w', -2), ('w', -1)), 35 | (('w', -1), ('w', 0)), 36 | (('w', 0), ('w', 1)), 37 | (('w', 1), ('w', 2)), 38 | (('w', -2), ('w', -1), ('w', 0)), 39 | (('w', -1), ('w', 0), ('w', 1)), 40 | (('w', 0), ('w', 1), ('w', 2)), 41 | (('w', -2), ('w', -1), ('w', 0), ('w', 1)), 42 | (('w', -1), ('w', 0), ('w', 1), ('w', 2)), 43 | (('w', -2), ('w', -1), ('w', 0), ('w', 1), ('w', 2)), 44 | 45 | (('w', 0), ('w', -1)), 46 | (('w', 0), ('w', -2)), 47 | (('w', 0), ('w', -3)), 48 | (('w', 0), ('w', -4)), 49 | (('w', 0), ('w', -5)), 50 | (('w', 0), ('w', -6)), 51 | (('w', 0), ('w', -7)), 52 | (('w', 0), ('w', -8)), 53 | (('w', 0), ('w', -9)), 54 | 55 | (('w', 0), ('w', 1)), 56 | (('w', 0), ('w', 2)), 57 | (('w', 0), ('w', 3)), 58 | (('w', 0), ('w', 4)), 59 | (('w', 0), ('w', 5)), 60 | (('w', 0), ('w', 6)), 61 | (('w', 0), ('w', 7)), 62 | (('w', 0), ('w', 8)), 63 | (('w', 0), ('w', 9)), 64 | ) 65 | 66 | 67 | import crfutils 68 | 69 | def feature_extractor(X): 70 | # Apply feature templates to obtain features (in fact, attributes) 71 | crfutils.apply_templates(X, templates) 72 | if X: 73 | # Append BOS and EOS features manually 74 | X[0]['F'].append('__BOS__') # BOS feature 75 | X[-1]['F'].append('__EOS__') # EOS feature 76 | 77 | if __name__ == '__main__': 78 | crfutils.main(feature_extractor, fields=fields, sep=separator) 79 | -------------------------------------------------------------------------------- /crfsuite/example/template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | class FeatureExtractor: 7 | def __init__(self): 8 | self.macro = re.compile(r'%x\[(?P[\d-]+),(?P[\d]+)\]') 9 | self.inst = [] 10 | self.t = 0 11 | self.templates = [] 12 | 13 | def read(self, fi): 14 | self.templates = [] 15 | for line in fi: 16 | line = line.strip() 17 | if line.startswith('#'): 18 | continue 19 | if line.startswith('U'): 20 | self.templates.append(line.replace(':', '=')) 21 | elif line == 'B': 22 | continue 23 | elif line.startswith('B'): 24 | sys.stderr( 25 | 'ERROR: bigram templates not supported: %s\n' % line) 26 | sys.exit(1) 27 | 28 | def replace(self, m): 29 | row = self.t + int(m.group('row')) 30 | col = int(m.group('col')) 31 | if row in range(0, len(self.inst)): 32 | return self.inst[row]['x'][col] 33 | else: 34 | return '' 35 | 36 | def apply(self, inst, t): 37 | self.inst = inst 38 | self.t = t 39 | for template in self.templates: 40 | f = re.sub(self.macro, self.replace, template) 41 | self.inst[t]['F'].append(f) 42 | 43 | def readiter(fi, sep=None): 44 | X = [] 45 | for line in fi: 46 | line = line.strip('\n') 47 | if not line: 48 | yield X 49 | X = [] 50 | else: 51 | fields = line.split(sep) 52 | item = { 53 | 'x': fields[0:-1], 54 | 'y': fields[-1], 55 | 'F': [] 56 | } 57 | X.append(item) 58 | 59 | if __name__ == '__main__': 60 | import optparse 61 | 62 | fi = sys.stdin 63 | fo = sys.stdout 64 | 65 | # Parse the command-line arguments. 66 | parser = optparse.OptionParser(usage="""usage: %prog