├── COPYING
├── Makefile
├── README.txt
├── crf
├── Makefile
├── README.txt
├── conlleval
├── crfasgd.cpp
├── crfsgd.cpp
└── template
├── data
├── README.txt
├── conll2000
│ └── README.txt
├── pascal
│ └── README.txt
└── rcv1
│ └── README.txt
├── lib
├── assert.h
├── gzstream.cpp
├── gzstream.h
├── matrices.cpp
├── matrices.h
├── pstream.cpp
├── pstream.h
├── timer.cpp
├── timer.h
├── vectors.cpp
├── vectors.h
└── wrapper.h
├── svm
├── Makefile
├── README.txt
├── data.cpp
├── data.h
├── loss.h
├── old
│ ├── Makefile
│ ├── README.txt
│ ├── svmcg.cpp
│ ├── svmolbfgs.cpp
│ ├── svmsgd2.cpp
│ └── svmsgdqn.cpp
├── prep_alpha.cpp
├── prep_rcv1.cpp
├── prep_webspam.cpp
├── svmasgd.cpp
└── svmsgd.cpp
└── win
├── README.txt
├── crfasgd
└── crfasgd.vcproj
├── crfsgd
└── crfsgd.vcproj
├── prep_alpha
└── prep_alpha.vcproj
├── prep_rcv1
└── prep_rcv1.vcproj
├── prep_webspam
└── prep_webspam.vcproj
├── sgd.sln
├── svmasgd
└── svmasgd.vcproj
├── svmsgd
└── svmsgd.vcproj
└── zlib
└── README.txt
/COPYING:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # General makefile: builds (or cleans) every program subdirectory.
2 |
3 | # MAKE is deliberately not overridden so sub-makes use the invoking make.
4 | SHELL=/bin/sh
5 |
6 | SUBDIRS=svm crf
7 |
8 | # Default goal: look for the data files, build everything, report success.
9 | world: check all
10 | 	@echo "================================================"
11 | 	@echo "CONGRATULATIONS: The compilation was successful."
12 | 	@echo "To know what to do next, check the README file."
13 | 	@echo "================================================"
14 |
15 | # Run the requested goal in each subdirectory; stop at the first failure.
16 | all clean:
17 | 	@for n in ${SUBDIRS} ; \
18 | 	do ( cd $$n && ${MAKE} ${@}) || exit ; done
19 |
20 | all: check
21 | # Print a warning (without failing) when the RCV1 data file is not readable.
22 | check:
23 | 	@if [ -r data/rcv1/rcv1-v2.topics.qrels.gz ] ; then : ; else \
24 | 	echo "=======================================" ; \
25 | 	echo "ATTENTION: Missing data files" ; \
26 | 	echo "You should have read the README file!" ; \
27 | 	echo "=======================================" ; \
28 | 	fi
29 |
30 | # These goals are commands, not files; clean must be listed here as well.
31 | .PHONY: world all clean depend check
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 |
2 | SGD-2.0
3 | -------
4 |
5 | L. Bottou, October 2011
6 |
7 |
8 |
9 | 1. INTRODUCTION
10 |
11 | The goal of this package is to illustrate the efficiency of stochastic
12 | gradient descent for large-scale learning tasks.
13 |
14 | Two algorithms,
15 |
16 | * Stochastic gradient descent (SGD), and
17 | * Averaged stochastic gradient descent (ASGD),
18 |
19 | are applied to two well known problems
20 |
21 | * Linear Support Vector Machines, and
22 | * Conditional Random Fields.
23 |
24 | The corresponding programs are designed for simplicity and readability. In
25 | particular they avoid optimizations that would make the programs less
26 | readable. The sole exception is the handling of sparse training data.
27 |
28 |
29 |
30 | 2. DATASETS
31 |
32 | The programs are demonstrated using a number of standard datasets.
33 |
34 | * The RCV1-V2 dataset.
35 | * The ALPHA and WEBSPAM datasets from the first Pascal Large Scale Learning Challenge.
36 | * The dataset associated with the CONLL2000 chunking task.
37 |
38 | These datasets are available from the web. File "data/README.txt" contains
39 | instructions for downloading. The Pascal datasets must be
40 | preprocessed using a relatively slow python script.
41 |
42 |
43 |
44 | 3. ALGORITHMS
45 |
46 | Unlike most optimization algorithms, each iteration of these stochastic
47 | algorithms processes a single example and updates the parameters. Although the
48 | theory calls for picking a random example at each iteration, this
49 | implementation performs sequential passes over randomly shuffled training
50 | examples. This process is in fact more effective in practice. Each pass is
51 | called an epoch.
52 |
53 | Assume we have an objective function of the form
54 |
55 | Obj(w) = 1/2 lambda w^2 + 1/n sum_i=1^n L(z_i,w)
56 |
57 | where w is the parameter, {z_1,...,z_n} are the training examples, 1/2 lambda
58 | w^2 is a regularization term, and L(z,w) is the loss function. Each iteration of
59 | the SGD algorithm picks a single example z and updates the parameter vector
60 | using the formula:
61 |
62 | SGD: w := (1 - lambda eta_t) w - eta_t dL/dw(z,w)
63 |
64 | The trick of course is to choose the gain sequence eta_t wisely. We use the
65 | formula eta_t = eta_0 / (1 + lambda eta_0 t), and we pick eta_0 by trying
66 | several gain values on a subset of the training data. In order to leverage
67 | sparse datasets, we represent vector w as the ratio of a vector W and a scalar
68 | wDivisor, that is, w = W / wDivisor. Each iteration effectively becomes:
69 |
70 | SGD: wDivisor = wDivisor / (1 - lambda eta_t)
71 | W = W - eta_t wDivisor dL/dw(z,w)
72 |
73 | The ASGD algorithm maintains two parameter vectors. The first parameter
74 | vector, w, is updated like the SGD parameter. However, the output of the
75 | algorithm is the second parameter vector, a, which computes an average of
76 | all the previous values of w.
77 |
78 | ASGD: w := (1 - lambda eta_t) w - eta_t dL/dw(z,w)
79 | a := a + mu_t [ w - a ]
80 |
81 | This algorithm has been shown to work extremely well (Polyak and Juditsky,
82 | 1992) provided that the sequence eta_t decreases with exactly the right speed.
83 | We follow (Xu, 2010) and choose eta_t = eta_0 / (1 + lambda eta_0 t) ^ 0.75.
84 | We select eta_0 by trying several gain values on a subset of the training
85 | data, and we start the averaging process after a certain time, that is,
86 | mu_t = 1/max(1,t-t0). Following (Xu, 2010), sparse training data is treated
87 | using the substitutions w = W / wDivisor and a = (A + wFraction W) / aDivisor.
88 | The algorithm effectively becomes:
89 |
90 | ASGD: wDivisor = wDivisor / (1 - eta_t * lambda)
91 | W = W - eta_t wDivisor dL/dw(z,w)
92 | A = A + eta_t wFraction wDivisor dL/dw(z,w)
93 | aDivisor = aDivisor / (1 - mu_t)
94 | wFraction = wFraction + mu_t aDivisor / wDivisor
95 |
96 |
97 |
98 | 4. SUPPORT VECTOR MACHINES
99 |
100 | The directory "svm" contains programs to train a L2-regularized linear model
101 | for binary classification tasks. Compilation time switches determine whether
102 | the models include a bias term, whether the bias term is regularized, and
103 | which loss function should be used. The default is to use an unregularized
104 | bias term using the log-loss function L(x,y,w) = log(1+exp(-ywx)). See file
105 | "svm/README.txt" for details about these programs and their usage for each of the
106 | datasets.
107 |
108 |
109 |
110 | 5. CONDITIONAL RANDOM FIELDS
111 |
112 | The directory "crf" contains programs "crfsgd" and "crfasgd" for training
113 | conditional random fields for sequences. Both programs take data files and
114 | template files and produce tagging files similar to those of Taku Kudo's
115 | CRF++ program described at http://crfpp.sourceforge.net. However they also
116 | accept gzipped data files instead of plain files. See the file "crf/README.txt"
117 | for detailed information about these programs and their usage.
118 |
119 |
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/crf/Makefile:
--------------------------------------------------------------------------------
1 | # CRF with stochastic gradient
2 |
3 | # This program is free software; you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation; either version 2 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program; if not, write to the Free Software
15 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
16 |
17 | # Directory holding the shared sources and headers used by both programs.
18 | L=../lib
19 |
20 | # Compiler and flags; -I$L puts the shared headers on the include path.
21 | CXX=g++
22 | OPTS=-g -O2
23 | CXXFLAGS= ${OPTS} -Wall -I$L
24 | LIBS = -lz -lm
25 |
26 | PROGRAMS = crfsgd crfasgd
27 | # Objects compiled from the shared sources and linked into every program.
28 | OBJS = vectors.o matrices.o gzstream.o pstream.o timer.o
29 |
30 | all: ${PROGRAMS}
31 | # Remove programs and objects; rm -f stays quiet when nothing is there.
32 | clean:
33 | 	rm -f ${PROGRAMS}
34 | 	rm -f *.o
35 |
36 | crfsgd: crfsgd.o ${OBJS}
37 | 	rm -f $@
38 | 	${CXX} ${CXXFLAGS} -o $@ crfsgd.o ${OBJS} ${LIBS}
39 |
40 | crfasgd: crfasgd.o ${OBJS}
41 | 	rm -f $@
42 | 	${CXX} ${CXXFLAGS} -o $@ crfasgd.o ${OBJS} ${LIBS}
43 |
44 | crfsgd.o: crfsgd.cpp $L/vectors.h $L/gzstream.h $L/timer.h
45 | 	${CXX} ${CXXFLAGS} -c -o $@ crfsgd.cpp
46 |
47 | crfasgd.o: crfasgd.cpp $L/vectors.h $L/gzstream.h $L/timer.h
48 | 	${CXX} ${CXXFLAGS} -c -o $@ crfasgd.cpp
49 | # Objects built from the shared sources in $L.
50 | vectors.o: $L/vectors.cpp $L/vectors.h $L/wrapper.h $L/assert.h
51 | 	${CXX} ${CXXFLAGS} -c -o $@ $L/vectors.cpp
52 |
53 | matrices.o: $L/matrices.cpp $L/matrices.h $L/vectors.h $L/wrapper.h $L/assert.h
54 | 	${CXX} ${CXXFLAGS} -c -o $@ $L/matrices.cpp
55 |
56 | gzstream.o: $L/gzstream.cpp $L/gzstream.h $L/assert.h
57 | 	${CXX} ${CXXFLAGS} -c -o $@ $L/gzstream.cpp
58 |
59 | pstream.o: $L/pstream.cpp $L/pstream.h $L/assert.h
60 | 	${CXX} ${CXXFLAGS} -c -o $@ $L/pstream.cpp
61 |
62 | timer.o: $L/timer.cpp $L/timer.h $L/assert.h
63 | 	${CXX} ${CXXFLAGS} -c -o $@ $L/timer.cpp
64 |
65 | .PHONY: all clean
66 |
67 |
68 |
--------------------------------------------------------------------------------
/crf/README.txt:
--------------------------------------------------------------------------------
1 |
2 | 1. COMPILING CRFSGD AND CRFASGD
3 |
4 | Compiling under Unix is achieved using the traditional command "make".
5 | The compilation requires the libz library. This library usually comes
6 | preinstalled on most Linux distributions, and is otherwise available
7 | from http://www.zlib.org.
8 |
9 | Compiling under Windows is possible using Cygwin, using MSYS, or using the
10 | MSVC project files provided in the subdirectory "win" of the sgd distribution.
11 | Make sure to read the instructions as you need to compile zlib adequately.
12 | You then need to copy the executable files into this directory.
13 |
14 |
15 | 2. USAGE
16 |
17 | Synopsis (training):
18 | crfsgd [options] model template traindata [validdata]
19 | crfasgd [options] model template traindata [validdata]
20 |
21 | Synopsis (tagging):
22 | crfsgd -t model testdata
23 | crfasgd -t model testdata
24 |
25 | Programs "crfsgd" and "crfasgd" implement stochastic gradient algorithms for
26 | training conditional random field models. The program inputs are modeled after
27 | the well known CRF++ program (http://crfpp.sourceforge.net). In particular
28 | these programs use the same format for the template files and the data
29 | files. These formats are well documented on the CRF++ web page. Programs
30 | crfsgd and crfasgd can also directly read gzipped data files, provided that
31 | the file name ends with suffix ".gz".
32 |
33 | When operating in training mode, these programs construct a CRF according to
34 | the template file and perform a predefined number of training epochs on the
35 | training data. Every so many epochs are followed by a performance evaluation
36 | pass over the training set and optionally over a validation set. The
37 | performance evaluation procedure can pipe the tags into an external evaluation
38 | program such as the standard CONLL evaluation script conlleval. The training
39 | set performance is useful to monitor the progress of the optimization. The
40 | validation set performance is useful to estimate the generalization
41 | performance. The recommended procedure is to monitor the validation
42 | performance and stop the algorithm when the validation metrics no longer
43 | improve. In the limit of large number of examples, program "crfasgd" should
44 | reach this point after one or two epochs only. The model is saved in the
45 | specified model file (which in fact is a compressed text file.)
46 |
47 | Both programs accept the same options when used in training mode.
48 |
49 | -c : capacity control parameter (1.0)
50 | -f : threshold on the occurrences of each feature (3)
51 | -r : total number of epochs (50)
52 | -h : epochs between each testing phase (5)
53 | -e : performance evaluation command (conlleval -q)
54 | -s : initial learning rate
55 | -q : silent mode
56 |
57 | Program crfasgd accepts one additional option:
58 |
59 | -a d : starts averaging after d iterations (default: 1.0.)
60 |
61 | Using option -t switches to the tagging mode. When operating in tagging mode, the
62 | program reads the model, tags every sentence from the provided test data file,
63 | and outputs the tags on the standard output using a format suitable for the
64 | standard evaluation script conlleval.
65 |
66 |
67 | 3. RUNNING THE STOCHASTIC GRADIENT CRF ON THE CONLL CHUNKING TASK
68 |
69 | Please follow the instructions in file "data/README.txt" to populate the
70 | directory "data/conll2000". The gzipped files are directly usable.
71 | No further preprocessing is necessary.
72 |
73 | Training a model using stochastic gradient descent.
74 |
75 | $ ./crfsgd -c 1.0 -f 3 model.gz template \
76 | ../data/conll2000/train.txt.gz ../data/conll2000/test.txt.gz
77 | Reading template file template.
78 | ...
79 | Reading and preprocessing ../data/conll2000/train.txt.gz.
80 | ...
81 | Reading and preprocessing ../data/conll2000/test.txt.gz.
82 | ...
83 | [Calibrating] -- 1000 samples
84 | ...
85 | [Epoch 1] -- wnorm=3428.22 time=15.66s.
86 | [Epoch 2] -- wnorm=4981.97 time=21.59s.
87 | [Epoch 3] -- wnorm=6099.82 time=27.5s.
88 | [Epoch 4] -- wnorm=6888.25 time=33.41s.
89 | [Epoch 5] -- wnorm=7465.87 time=39.29s.
90 | Training perf: sentences=8936 loss=0.8069 obj=1.22464 err=2454 (1.15904%)
91 | accuracy: 98.84%; precision: 97.95%; recall: 98.04%; FB1: 98.00
92 | Testing perf: sentences=2012 loss=2.35348 obj=2.77122 err=1997 (4.21513%)
93 | accuracy: 95.78%; precision: 93.31%; recall: 93.47%; FB1: 93.39
94 | [Epoch 6] -- wnorm=7904.99 time=45.19s.
95 | [Epoch 7] -- wnorm=8238.51 time=51.07s.
96 | [Epoch 8] -- wnorm=8494.34 time=56.96s.
97 | [Epoch 9] -- wnorm=8695.67 time=62.84s.
98 | [Epoch 10] -- wnorm=8859.06 time=68.73s.
99 | Training perf: sentences=8936 loss=0.592674 obj=1.08837 err=1492 (0.704681%)
100 | accuracy: 99.30%; precision: 98.81%; recall: 98.64%; FB1: 98.72
101 | Testing perf: sentences=2012 loss=2.27945 obj=2.77514 err=1950 (4.11592%)
102 | accuracy: 95.88%; precision: 93.60%; recall: 93.43%; FB1: 93.51
103 | ...
104 | [Epoch 46] -- wnorm=9670.23 time=281.04s.
105 | [Epoch 47] -- wnorm=9670.91 time=286.93s.
106 | [Epoch 48] -- wnorm=9670.6 time=292.83s.
107 | [Epoch 49] -- wnorm=9670.94 time=298.74s.
108 | [Epoch 50] -- wnorm=9669.67 time=304.64s.
109 | Training perf: sentences=8936 loss=0.476964 obj=1.01802 err=692 (0.326836%)
110 | accuracy: 99.67%; precision: 99.42%; recall: 99.26%; FB1: 99.34
111 | Testing perf: sentences=2012 loss=2.20519 obj=2.74624 err=1889 (3.98717%)
112 | accuracy: 96.01%; precision: 93.94%; recall: 93.55%; FB1: 93.74
113 | Saving model file model.gz.
114 | Done! 304.64 seconds.
115 |
116 |
117 |
118 | Training a model using averaged stochastic gradient descent.
119 |
120 |
121 | $ ./crfasgd -c 1.0 -f 3 -r 10 model.gz template \
122 | ../data/conll2000/train.txt.gz ../data/conll2000/test.txt.gz
123 | Reading template file template.
124 | ...
125 | Reading and preprocessing ../data/conll2000/train.txt.gz.
126 | ...
127 | Reading and preprocessing ../data/conll2000/test.txt.gz.
128 | ...
129 | [Calibrating] -- 1000 samples
130 | ...
131 | [Epoch 1] -- wnorm=3471.6 anorm=3471.6 time=16.88s.
132 | [Epoch 2] -- wnorm=5093.77 anorm=4238.68 time=23.5s.
133 | [Epoch 3] -- wnorm=6281.55 anorm=4871.78 time=30.11s.
134 | [Epoch 4] -- wnorm=7128.27 anorm=5400.67 time=36.75s.
135 | [Epoch 5] -- wnorm=7748.73 anorm=5837.29 time=43.34s.
136 | Training perf: sentences=8936 loss=0.879526 obj=1.20614 err=2945 (1.39%)
137 | accuracy: 98.61%; precision: 97.66%; recall: 97.51%; FB1: 97.58
138 | Testing perf: sentences=2012 loss=2.23476 obj=2.56138 err=1895 (3.99%)
139 | accuracy: 96.00%; precision: 93.75%; recall: 93.59%; FB1: 93.67
140 | [Epoch 6] -- wnorm=8219.54 anorm=6203.39 time=50.05s.
141 | [Epoch 7] -- wnorm=8569.77 anorm=6514.33 time=56.84s.
142 | [Epoch 8] -- wnorm=8858.12 anorm=6790.36 time=63.46s.
143 | [Epoch 9] -- wnorm=9059.93 anorm=7026.73 time=70.42s.
144 | [Epoch 10] -- wnorm=9230.78 anorm=7235.45 time=77.04s.
145 | Training perf: sentences=8936 loss=0.68643 obj=1.09128 err=1977 (0.93%)
146 | accuracy: 99.07%; precision: 98.40%; recall: 98.24%; FB1: 98.32
147 | Testing perf: sentences=2012 loss=2.21381 obj=2.61866 err=1872 (3.95%)
148 | accuracy: 96.05%; precision: 93.84%; recall: 93.67%; FB1: 93.75
149 | Saving model file model.gz.
150 | Done! 77.04 seconds.
151 |
152 |
153 | Testing the final model (using crfsgd or crfasgd is equivalent.)
154 |
155 | $ ./crfsgd -t model.gz ../data/conll2000/test.txt.gz | ./conlleval
156 | processed 47377 tokens with 23852 phrases; found: 23809 phrases; correct: 22342.
157 | accuracy: 96.05%; precision: 93.84%; recall: 93.67%; FB1: 93.75
158 | ADJP: precision: 79.66%; recall: 74.20%; FB1: 76.83 408
159 | ADVP: precision: 82.96%; recall: 80.95%; FB1: 81.94 845
160 | CONJP: precision: 55.56%; recall: 55.56%; FB1: 55.56 9
161 | INTJ: precision: 100.00%; recall: 50.00%; FB1: 66.67 1
162 | LST: precision: 0.00%; recall: 0.00%; FB1: 0.00 0
163 | NP: precision: 94.36%; recall: 94.08%; FB1: 94.22 12385
164 | PP: precision: 96.61%; recall: 97.80%; FB1: 97.20 4870
165 | PRT: precision: 77.45%; recall: 74.53%; FB1: 75.96 102
166 | SBAR: precision: 88.33%; recall: 84.86%; FB1: 86.56 514
167 | VP: precision: 93.80%; recall: 94.14%; FB1: 93.97 4675
168 |
169 |
170 | Comparing with CRF++ (on a different machine, about twice as slow.)
171 |
172 | $ crf_learn -c 1.0 -f 3 template train.txt model
173 | ...
174 | Number of sentences: 8936
175 | Number of features: 1679700
176 | ... iter=18 terr=0.04522 serr=0.45636 act=1679700 obj=24917.57905 diff=0.02882
177 | ... iter=36 terr=0.02188 serr=0.27775 act=1679700 obj=13697.78077 diff=0.01717
178 | ... iter=71 terr=0.00518 serr=0.09109 act=1679700 obj=9654.43394 diff=0.00167
179 | ... iter=142 terr=0.00340 serr=0.06256 act=1679700 obj=9042.07254 diff=0.00007
180 | Done!4335.34 s
181 |
182 | $ crf_test -m model test.txt | tr '\t' ' ' | ./conlleval
183 | processed 47377 tokens with 23852 phrases; found: 23799 phrases; correct: 22334.
184 | accuracy: 96.02%; precision: 93.84%; recall: 93.64%; FB1: 93.74
185 | ADJP: precision: 79.71%; recall: 74.43%; FB1: 76.98 409
186 | ADVP: precision: 83.18%; recall: 81.06%; FB1: 82.11 844
187 | CONJP: precision: 55.56%; recall: 55.56%; FB1: 55.56 9
188 | INTJ: precision: 100.00%; recall: 50.00%; FB1: 66.67 1
189 | LST: precision: 0.00%; recall: 0.00%; FB1: 0.00 0
190 | NP: precision: 94.36%; recall: 94.03%; FB1: 94.19 12378
191 | PP: precision: 96.71%; recall: 97.82%; FB1: 97.26 4866
192 | PRT: precision: 79.05%; recall: 78.30%; FB1: 78.67 105
193 | SBAR: precision: 88.65%; recall: 84.67%; FB1: 86.62 511
194 | VP: precision: 93.63%; recall: 93.99%; FB1: 93.81 4676
195 |
196 |
197 |
--------------------------------------------------------------------------------
/crf/conlleval:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | # conlleval: evaluate result of processing CoNLL-2000 shared task
3 | # usage: conlleval [-l] [-r] [-q] [-d delimiterTag] [-o oTag] < file
4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html
5 | # options: l: generate LaTeX output for tables like in
6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex
7 | # r: accept raw result tags (without B- and I- prefix;
8 | # assumes one word per chunk)
9 | # d: alternative delimiter tag (default is single space)
10 | # o: alternative outside tag (default is O)
11 | # note: the file should contain lines with items separated
12 | # by $delimiter characters (default space). The final
13 | # two items should contain the correct tag and the
14 | # guessed tag in that order. Sentences should be
15 | # separated from each other by empty lines or lines
16 | # with $boundary fields (default -X-).
17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/
18 | # started: 1998-09-25
19 | # version: 2004-01-26
20 | # author: Erik Tjong Kim Sang
21 |
22 | use strict;
23 |
24 | my $false = 0;
25 | my $true = 42;
26 |
27 | my $boundary = "-X-"; # sentence boundary
28 | my $correct; # current corpus chunk tag (I,O,B)
29 | my $correctChunk = 0; # number of correctly identified chunks
30 | my $correctTags = 0; # number of correct chunk tags
31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.)
32 | my $delimiter = " "; # field delimiter
33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979)
34 | my $firstItem; # first feature (for sentence boundary checks)
35 | my $foundCorrect = 0; # number of chunks in corpus
36 | my $foundGuessed = 0; # number of identified chunks
37 | my $guessed; # current guessed chunk tag
38 | my $guessedType; # type of current guessed chunk tag
39 | my $i; # miscellaneous counter
40 | my $inCorrect = $false; # currently processed chunk is correct until now
41 | my $lastCorrect = "O"; # previous chunk tag in corpus
42 | my $latex = 0; # generate LaTeX formatted output
43 | my $lastCorrectType = ""; # type of previous chunk tag in corpus
44 | my $lastGuessed = "O"; # previously identified chunk tag
45 | my $lastGuessedType = ""; # type of previously identified chunk tag
46 | my $lastType; # temporary storage for detecting duplicates
47 | my $line; # line
48 | my $nbrOfFeatures = -1; # number of features per line
49 | my $precision = 0.0; # precision score
50 | my $oTag = "O"; # outside tag, default O
51 | my $raw = 0; # raw input: add B to every token
52 | my $quiet = 0; # only display summary line
53 | my $recall = 0.0; # recall score
54 | my $tokenCounter = 0; # token counter (ignores sentence breaks)
55 |
56 | my %correctChunk = (); # number of correctly identified chunks per type
57 | my %foundCorrect = (); # number of chunks in corpus per type
58 | my %foundGuessed = (); # number of identified chunks per type
59 |
60 | my @features; # features on line
61 | my @sortedTypes; # sorted list of chunk type names
62 |
63 | # parse command line options; every argument must be a recognized switch
64 | while (@ARGV and $ARGV[0] =~ /^-/) {
65 |    if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); }     # LaTeX output
66 |    elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); }    # raw tags
67 |    elsif ($ARGV[0] eq "-q") { $quiet = 1; shift(@ARGV); }  # summary only
68 |    elsif ($ARGV[0] eq "-d") {
69 |       shift(@ARGV);
70 |       if (not defined $ARGV[0]) {
71 |          die "conlleval: -d requires delimiter character";
72 |       }
73 |       $delimiter = shift(@ARGV);
74 |    } elsif ($ARGV[0] eq "-o") {
75 |       shift(@ARGV);
76 |       if (not defined $ARGV[0]) {
77 |          die "conlleval: -o requires outside tag";   # -o sets $oTag, not the delimiter
78 |       }
79 |       $oTag = shift(@ARGV);
80 |    } else { die "conlleval: unknown argument $ARGV[0]\n"; }
81 | }
82 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; }  # no positional args
83 | # process input: one token per line; the last two columns are the correct and the guessed tag
84 | while (<STDIN>) {
85 | chomp($line = $_);
86 | @features = split(/$delimiter/,$line);
87 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } # first line fixes the expected column count
88 | elsif ($nbrOfFeatures != $#features and @features != 0) {
89 | printf STDERR "unexpected number of features: %d (%d)\n",
90 | $#features+1,$nbrOfFeatures+1;
91 | exit(1);
92 | }
93 | if (@features == 0 or
94 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } # empty line or boundary token: treat as outside
95 | if (@features < 2) {
96 | die "conlleval: unexpected number of features in line $line\n";
97 | }
98 | if ($raw) { # raw mode: map the outside tag to "O" and prefix every other tag with "B-"
99 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; }
100 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; }
101 | if ($features[$#features] ne "O") {
102 | $features[$#features] = "B-$features[$#features]";
103 | }
104 | if ($features[$#features-1] ne "O") {
105 | $features[$#features-1] = "B-$features[$#features-1]";
106 | }
107 | }
108 | # 20040126 ET code which allows hyphens in the types
109 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { # split at the first hyphen only
110 | $guessed = $1;
111 | $guessedType = $2;
112 | } else {
113 | $guessed = $features[$#features];
114 | $guessedType = "";
115 | }
116 | pop(@features);
117 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
118 | $correct = $1;
119 | $correctType = $2;
120 | } else {
121 | $correct = $features[$#features];
122 | $correctType = "";
123 | }
124 | pop(@features);
125 | # ($guessed,$guessedType) = split(/-/,pop(@features));
126 | # ($correct,$correctType) = split(/-/,pop(@features));
127 | $guessedType = $guessedType ? $guessedType : "";
128 | $correctType = $correctType ? $correctType : "";
129 | $firstItem = shift(@features);
130 |
131 | # 1999-06-26 sentence breaks should always be counted as out of chunk
132 | if ( $firstItem eq $boundary ) { $guessed = "O"; }
133 |
134 | if ($inCorrect) { # a chunk that matched so far is still open
135 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
136 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
137 | $lastGuessedType eq $lastCorrectType) { # both chunks end here with the same type: count it
138 | $inCorrect=$false;
139 | $correctChunk++;
140 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
141 | $correctChunk{$lastCorrectType}+1 : 1;
142 | } elsif (
143 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) !=
144 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or
145 | $guessedType ne $correctType ) { # only one side ended, or the types diverged: no longer a match
146 | $inCorrect=$false;
147 | }
148 | }
149 |
150 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and
151 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and
152 | $guessedType eq $correctType) { $inCorrect = $true; } # both start a chunk of the same type here
153 |
154 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) {
155 | $foundCorrect++;
156 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ?
157 | $foundCorrect{$correctType}+1 : 1;
158 | }
159 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) {
160 | $foundGuessed++;
161 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ?
162 | $foundGuessed{$guessedType}+1 : 1;
163 | }
164 | if ( $firstItem ne $boundary ) { # boundary lines do not count as tokens
165 | if ( $correct eq $guessed and $guessedType eq $correctType ) {
166 | $correctTags++;
167 | }
168 | $tokenCounter++;
169 | }
170 |
171 | $lastGuessed = $guessed;
172 | $lastCorrect = $correct;
173 | $lastGuessedType = $guessedType;
174 | $lastCorrectType = $correctType;
175 | }
176 | if ($inCorrect) {
177 | $correctChunk++;
178 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ?
179 | $correctChunk{$lastCorrectType}+1 : 1;
180 | }
181 |
182 | if (not $latex) {
183 | # compute overall precision, recall and FB1 (default values are 0.0)
184 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
185 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
186 | $FB1 = 2*$precision*$recall/($precision+$recall)
187 | if ($precision+$recall > 0);
188 |
189 | # print overall performance
190 | if (not $quiet) {
191 | printf "processed $tokenCounter tokens with $foundCorrect phrases; ";
192 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n";
193 | }
194 | if ($tokenCounter>0) {
195 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter;
196 | printf "precision: %6.2f%%; ",$precision;
197 | printf "recall: %6.2f%%; ",$recall;
198 | printf "FB1: %6.2f\n",$FB1;
199 | }
200 | }
201 |
202 | # sort chunk type names and drop adjacent duplicates (a type may be a key of both hashes)
203 | undef($lastType);
204 | @sortedTypes = ();
205 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) {
206 | if (not(defined($lastType)) or $lastType ne $i) { # defined(): an empty or "0" type name must not look unset
207 | push(@sortedTypes,($i));
208 | }
209 | $lastType = $i;
210 | }
211 | # print performance per chunk type
212 | if (not $latex) {
213 | if (not $quiet) {
214 | for $i (@sortedTypes) {
215 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
216 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; }
217 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
218 | if (not($foundCorrect{$i})) { $recall = 0.0; }
219 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
220 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
221 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
222 | printf "%17s: ",$i;
223 | printf "precision: %6.2f%%; ",$precision;
224 | printf "recall: %6.2f%%; ",$recall;
225 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i};
226 | }
227 | }
228 | } else {
229 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline";
230 | if (not $quiet) {
231 | for $i (@sortedTypes) {
232 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
233 | if (not($foundGuessed{$i})) { $precision = 0.0; }
234 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; }
235 | if (not($foundCorrect{$i})) { $recall = 0.0; }
236 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; }
237 | if ($precision+$recall == 0.0) { $FB1 = 0.0; }
238 | else { $FB1 = 2*$precision*$recall/($precision+$recall); }
239 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
240 | $i,$precision,$recall,$FB1;
241 | }
242 | print "\\hline\n";
243 | }
244 | $precision = 0.0;
245 | $recall = 0;
246 | $FB1 = 0.0;
247 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0);
248 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0);
249 | $FB1 = 2*$precision*$recall/($precision+$recall)
250 | if ($precision+$recall > 0);
251 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
252 | $precision,$recall,$FB1;
253 | }
254 |
255 | exit 0;
256 |
257 | # endOfChunk: checks if a chunk ended between the previous and current word
258 | # arguments: previous and current chunk tags, previous and current types
259 | # note: this code is capable of handling other chunk representations
260 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
261 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
262 |
263 | sub endOfChunk { # returns $true when a chunk ended between the previous and the current word
264 | my $prevTag = shift(@_); # chunk tag of the previous word
265 | my $tag = shift(@_); # chunk tag of the current word
266 | my $prevType = shift(@_); # chunk type of the previous word
267 | my $type = shift(@_); # chunk type of the current word
268 | my $chunkEnd = $false;
269 |
270 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; }
271 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; }
272 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; }
273 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; }
274 |
275 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; }
276 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; }
277 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; }
278 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } # repeats the I/O test made earlier in this sub
279 |
280 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) { # a type change after a chunk tag also ends the chunk
281 | $chunkEnd = $true;
282 | }
283 |
284 | # corrected 1998-12-22: these chunks are assumed to have length 1
285 | if ( $prevTag eq "]" ) { $chunkEnd = $true; }
286 | if ( $prevTag eq "[" ) { $chunkEnd = $true; }
287 |
288 | return($chunkEnd);
289 | }
290 |
291 | # startOfChunk: checks if a chunk started between the previous and current word
292 | # arguments: previous and current chunk tags, previous and current types
293 | # note: this code is capable of handling other chunk representations
294 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
295 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
296 |
297 | sub startOfChunk { # returns $true when a chunk started between the previous and the current word
298 | my $prevTag = shift(@_); # chunk tag of the previous word
299 | my $tag = shift(@_); # chunk tag of the current word
300 | my $prevType = shift(@_); # chunk type of the previous word
301 | my $type = shift(@_); # chunk type of the current word
302 | my $chunkStart = $false;
303 |
304 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; }
305 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; }
306 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; }
307 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; }
308 |
309 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; }
310 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; }
311 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; }
312 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } # repeats the O/I test made earlier in this sub
313 |
314 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { # a type change onto a chunk tag also starts a chunk
315 | $chunkStart = $true;
316 | }
317 |
318 | # corrected 1998-12-22: these chunks are assumed to have length 1
319 | if ( $tag eq "[" ) { $chunkStart = $true; }
320 | if ( $tag eq "]" ) { $chunkStart = $true; }
321 |
322 | return($chunkStart);
323 | }
324 |
--------------------------------------------------------------------------------
/crf/template:
--------------------------------------------------------------------------------
1 | # Unigram
2 | U00:%x[-2,0]
3 | U01:%x[-1,0]
4 | U02:%x[0,0]
5 | U03:%x[1,0]
6 | U04:%x[2,0]
7 | U05:%x[-1,0]/%x[0,0]
8 | U06:%x[0,0]/%x[1,0]
9 |
10 | U10:%x[-2,1]
11 | U11:%x[-1,1]
12 | U12:%x[0,1]
13 | U13:%x[1,1]
14 | U14:%x[2,1]
15 | U15:%x[-2,1]/%x[-1,1]
16 | U16:%x[-1,1]/%x[0,1]
17 | U17:%x[0,1]/%x[1,1]
18 | U18:%x[1,1]/%x[2,1]
19 |
20 | U20:%x[-2,1]/%x[-1,1]/%x[0,1]
21 | U21:%x[-1,1]/%x[0,1]/%x[1,1]
22 | U22:%x[0,1]/%x[1,1]/%x[2,1]
23 |
24 | # Bigram
25 | B
26 |
--------------------------------------------------------------------------------
/data/README.txt:
--------------------------------------------------------------------------------
1 |
2 | This directory should be
3 | populated with various data files
4 | containing well known datasets.
5 |
6 |
7 | * The following Reuters RCV1 dataset files are available from
8 | http://jmlr.csail.mit.edu/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
9 |
10 | rcv1/lyrl2004_tokens_test_pt0.dat.gz
11 | rcv1/lyrl2004_tokens_test_pt1.dat.gz
12 | rcv1/lyrl2004_tokens_test_pt2.dat.gz
13 | rcv1/lyrl2004_tokens_test_pt3.dat.gz
14 | rcv1/lyrl2004_tokens_train.dat.gz
15 | rcv1/rcv1-v2.topics.qrels.gz
16 |
17 |
18 | * The following CONLL2000 data files are available from
19 | http://www.cnts.ua.ac.be/conll2000/chunking
20 |
21 | conll2000/train.txt.gz
22 | conll2000/test.txt.gz
23 |
24 |
25 | * The following PASCAL data files are available from
26 | ftp://largescale.ml.tu-berlin.de/largescale/
27 |
28 | pascal/alpha_train.dat.bz2
29 | pascal/alpha_train.lab.bz2
30 | pascal/webspam_train.dat.bz2
31 | pascal/webspam_train.lab.bz2
32 | pascal/convert.py
33 |
34 | These files must then be decoded using the python script convert.py.
35 | This can take a while.
36 | $ cd pascal
37 | $ ./convert.py -o alpha.txt alpha train
38 | $ ./convert.py -o webspam.txt webspam train
39 |
40 |
--------------------------------------------------------------------------------
/data/conll2000/README.txt:
--------------------------------------------------------------------------------
1 | The CONLL2000 files go here.
2 | See ../README.txt.
3 |
--------------------------------------------------------------------------------
/data/pascal/README.txt:
--------------------------------------------------------------------------------
1 | The PASCAL Alpha and Webspam files go here.
2 | See ../README.txt.
3 |
--------------------------------------------------------------------------------
/data/rcv1/README.txt:
--------------------------------------------------------------------------------
1 | The RCV1 files go here.
2 | See ../README.txt.
3 |
--------------------------------------------------------------------------------
/lib/assert.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Simple assertions
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 |
6 | // This library is free software; you can redistribute it and/or
7 | // modify it under the terms of the GNU Lesser General Public
8 | // License as published by the Free Software Foundation; either
9 | // version 2.1 of the License, or (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
17 | // along with this program; if not, write to the Free Software
18 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
19 |
20 |
21 |
22 | #ifndef ASSERT_H
23 | #define ASSERT_H 1
24 |
25 | #include <iostream>
26 | #include <cstdlib>
27 | // assertfail(msg): print "(file:line) msg" on std::cerr, then exit with status 10
28 | #define assertfail(msg) do { \
29 | std::cerr << "(" << __FILE__ << ":" << __LINE__ << ") " \
30 | << msg << std::endl; ::exit(10); } while(0)
31 | // assert(expr): calls assertfail when expr is false; this definition does not test NDEBUG
32 | #define assert(expr) \
33 | do { if (!(expr)) assertfail("Assertion failed: " << #expr); } while(0)
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/lib/gzstream.cpp:
--------------------------------------------------------------------------------
1 | // ============================================================================
2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
3 | // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
4 | //
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This library is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | // Lesser General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public
16 | // License along with this library; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 | // ============================================================================
19 | //
20 | // File : gzstream.C
21 | // Revision : $Revision$
22 | // Revision_date : $Date$
23 | // Author(s) : Deepak Bandyopadhyay, Lutz Kettner
24 | //
25 | // Standard streambuf implementation following Nicolai Josuttis, "The
26 | // Standard C++ Library".
27 | // ============================================================================
28 |
29 | #include <gzstream.h>
30 | #include <iostream>
31 | #include <string.h> // for memcpy
32 |
33 | #ifdef GZSTREAM_NAMESPACE
34 | namespace GZSTREAM_NAMESPACE {
35 | #endif
36 |
37 | // ----------------------------------------------------------------------------
38 | // Internal classes to implement gzstream. See header file for user classes.
39 | // ----------------------------------------------------------------------------
40 |
41 | // --------------------------------------
42 | // class gzstreambuf:
43 | // --------------------------------------
44 |
45 | gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { // returns this on success, null on failure
46 | if ( is_open()) // refuse to open a buffer that is already open
47 | return (gzstreambuf*)0;
48 | mode = open_mode;
49 | // no append nor read/write mode
50 | if ((mode & std::ios::ate) || (mode & std::ios::app)
51 | || ((mode & std::ios::in) && (mode & std::ios::out)))
52 | return (gzstreambuf*)0;
53 | char fmode[10]; // gzopen mode string: 'r' or 'w' followed by 'b' (just "b" if neither in nor out is set)
54 | char* fmodeptr = fmode;
55 | if ( mode & std::ios::in)
56 | *fmodeptr++ = 'r';
57 | else if ( mode & std::ios::out)
58 | *fmodeptr++ = 'w';
59 | *fmodeptr++ = 'b';
60 | *fmodeptr = '\0';
61 | file = gzopen( name, fmode);
62 | if (file == 0)
63 | return (gzstreambuf*)0;
64 | opened = 1;
65 | return this;
66 | }
67 |
68 | gzstreambuf * gzstreambuf::close() { // returns this when an open file was closed with Z_OK, null otherwise
69 | if ( is_open()) {
70 | sync(); // write out pending output first; its result is not checked
71 | opened = 0; // marked closed even if gzclose reports an error below
72 | if ( gzclose( file) == Z_OK)
73 | return this;
74 | }
75 | return (gzstreambuf*)0;
76 | }
77 |
78 | int gzstreambuf::underflow() { // used for input buffer only
79 | if ( gptr() && ( gptr() < egptr())) // unread characters remain: return the next one without refilling
80 | return * reinterpret_cast<unsigned char *>( gptr());
81 |
82 | if ( ! (mode & std::ios::in) || ! opened)
83 | return EOF;
84 | // Josuttis' implementation of inbuf
85 | int n_putback = gptr() - eback();
86 | if ( n_putback > 4) // keep at most 4 already-read characters for putback
87 | n_putback = 4;
88 | memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback);
89 |
90 | int num = gzread( file, buffer+4, bufferSize-4); // new data goes after the 4-byte putback area
91 | if (num <= 0) // ERROR or EOF
92 | return EOF;
93 |
94 | // reset buffer pointers
95 | setg( buffer + (4 - n_putback), // beginning of putback area
96 | buffer + 4, // read position
97 | buffer + 4 + num); // end of buffer
98 |
99 | // return next character
100 | return * reinterpret_cast<unsigned char *>( gptr()); // unsigned so the value cannot compare equal to a negative EOF
101 | }
102 |
103 | int gzstreambuf::flush_buffer() { // returns the number of bytes written, or EOF on a short write
104 | // Separate the writing of the buffer from overflow() and
105 | // sync() operation.
106 | int w = pptr() - pbase(); // bytes currently pending in the put area
107 | if ( gzwrite( file, pbase(), w) != w)
108 | return EOF;
109 | pbump( -w); // move the put pointer back to the start of the buffer
110 | return w;
111 | }
112 |
113 | int gzstreambuf::overflow( int c) { // used for output buffer only
114 | if ( ! ( mode & std::ios::out) || ! opened)
115 | return EOF;
116 | if (c != EOF) { // append c before flushing
117 | *pptr() = c;
118 | pbump(1);
119 | }
120 | if ( flush_buffer() == EOF)
121 | return EOF;
122 | return c; // note: when c is EOF this returns EOF even though the flush succeeded
123 | }
124 |
125 | int gzstreambuf::sync() { // returns 0 on success, -1 when flushing fails
126 | // Changed to use flush_buffer() instead of overflow( EOF)
127 | // which caused improper behavior with std::endl and flush(),
128 | // bug reported by Vincent Ricard.
129 | if ( pptr() && pptr() > pbase()) { // only flush when output is pending
130 | if ( flush_buffer() == EOF)
131 | return -1;
132 | }
133 | return 0;
134 | }
135 |
136 | // --------------------------------------
137 | // class gzstreambase:
138 | // --------------------------------------
139 |
140 | gzstreambase::gzstreambase( const char* name, int mode) {
141 | init( &buf);
142 | open( name, mode);
143 | }
144 |
145 | gzstreambase::~gzstreambase() {
146 | buf.close();
147 | }
148 |
149 | void gzstreambase::open( const char* name, int open_mode) {
150 | if ( ! buf.open( name, open_mode))
151 | clear( rdstate() | std::ios::badbit);
152 | }
153 |
154 | void gzstreambase::close() {
155 | if ( buf.is_open())
156 | if ( ! buf.close())
157 | clear( rdstate() | std::ios::badbit);
158 | }
159 |
160 | #ifdef GZSTREAM_NAMESPACE
161 | } // namespace GZSTREAM_NAMESPACE
162 | #endif
163 |
164 | // ============================================================================
165 | // EOF //
166 |
--------------------------------------------------------------------------------
/lib/gzstream.h:
--------------------------------------------------------------------------------
1 | // ============================================================================
2 | // gzstream, C++ iostream classes wrapping the zlib compression library.
3 | // Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
4 | //
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This library is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | // Lesser General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU Lesser General Public
16 | // License along with this library; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 | // ============================================================================
19 | //
20 | // File : gzstream.h
21 | // Revision : $Revision$
22 | // Revision_date : $Date$
23 | // Author(s) : Deepak Bandyopadhyay, Lutz Kettner
24 | //
25 | // Standard streambuf implementation following Nicolai Josuttis, "The
26 | // Standard C++ Library".
27 | // ============================================================================
28 |
29 | #ifndef GZSTREAM_H
30 | #define GZSTREAM_H 1
31 |
32 | // standard C++ with new header file names and std:: namespace
33 | #include <iostream>
34 | #include <fstream>
35 | #include <zlib.h>
36 |
37 | #ifdef GZSTREAM_NAMESPACE
38 | namespace GZSTREAM_NAMESPACE {
39 | #endif
40 |
41 | // ----------------------------------------------------------------------------
42 | // Internal classes to implement gzstream. See below for user classes.
43 | // ----------------------------------------------------------------------------
44 |
45 | class gzstreambuf : public std::streambuf {
46 | private:
47 | static const int bufferSize = 47+256; // size of data buff
48 | // totals 512 bytes under g++ for igzstream at the end.
49 |
50 | gzFile file; // file handle for compressed file
51 | char buffer[bufferSize]; // data buffer
52 | char opened; // open/close state of stream
53 | int mode; // I/O mode
54 |
55 | int flush_buffer();
56 | public:
57 | gzstreambuf() : opened(0) {
58 | setp( buffer, buffer + (bufferSize-1));
59 | setg( buffer + 4, // beginning of putback area
60 | buffer + 4, // read position
61 | buffer + 4); // end position
62 | // ASSERT: both input & output capabilities will not be used together
63 | }
64 | int is_open() { return opened; }
65 | gzstreambuf* open( const char* name, int open_mode);
66 | gzstreambuf* close();
67 | ~gzstreambuf() { close(); }
68 |
69 | virtual int overflow( int c = EOF);
70 | virtual int underflow();
71 | virtual int sync();
72 | };
73 |
74 | class gzstreambase : virtual public std::ios {
75 | protected:
76 | gzstreambuf buf;
77 | public:
78 | gzstreambase() { init(&buf); }
79 | gzstreambase( const char* name, int open_mode);
80 | ~gzstreambase();
81 | void open( const char* name, int open_mode);
82 | void close();
83 | gzstreambuf* rdbuf() { return &buf; }
84 | };
85 |
86 | // ----------------------------------------------------------------------------
87 | // User classes. Use igzstream and ogzstream analogously to ifstream and
88 | // ofstream respectively. They read and write files based on the gz*
89 | // function interface of the zlib. Files are compatible with gzip compression.
90 | // ----------------------------------------------------------------------------
91 |
92 | class igzstream : public gzstreambase, public std::istream {
93 | public:
94 | igzstream() : std::istream( &buf) {}
95 | igzstream( const char* name, int open_mode = std::ios::in)
96 | : gzstreambase( name, open_mode), std::istream( &buf) {}
97 | gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
98 | void open( const char* name, int open_mode = std::ios::in) {
99 | gzstreambase::open( name, open_mode);
100 | }
101 | };
102 |
103 | class ogzstream : public gzstreambase, public std::ostream {
104 | public:
105 | ogzstream() : std::ostream( &buf) {}
106 | ogzstream( const char* name, int mode = std::ios::out)
107 | : gzstreambase( name, mode), std::ostream( &buf) {}
108 | gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
109 | void open( const char* name, int open_mode = std::ios::out) {
110 | gzstreambase::open( name, open_mode);
111 | }
112 | };
113 |
114 | #ifdef GZSTREAM_NAMESPACE
115 | } // namespace GZSTREAM_NAMESPACE
116 | #endif
117 |
118 | #endif // GZSTREAM_H
119 | // ============================================================================
120 | // EOF //
121 |
122 |
--------------------------------------------------------------------------------
/lib/matrices.cpp:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Little library of matrices and sparse matrices
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #include "assert.h"
21 | #include
22 |
23 |
24 | void
25 | FMatrix::resize(int nrows, int ncols) // a negative nrows or ncols leaves that dimension unchanged
26 | {
27 | w.detach(); // NOTE(review): presumably makes the representation private before writing; confirm in wrapper.h
28 | Rep *d = rep();
29 | if (nrows >= 0)
30 | {
31 | d->rows.resize(nrows);
32 | d->nrows = nrows;
33 | }
34 | if (ncols >= 0)
35 | {
36 | d->ncols = ncols;
37 | for (int i=0; i<d->nrows; i++) // give every row the new column count
38 | d->rows[i].resize(ncols);
39 | }
40 | }
41 |
42 |
43 | VFloat
44 | FMatrix::get(int r, int c) const // value at (r,c); 0 for a row at or past the end
45 | {
46 | const Rep *d = rep();
47 | if (r>=0 && r<d->nrows)
48 | return d->rows[r].get(c);
49 | assert(r >= 0); // a negative row index is a caller error
50 | return 0;
51 | }
52 |
53 |
54 | void
55 | FMatrix::set(int r, int c, VFloat v) // store v at (r,c), growing the row count when r is past the end
56 | {
57 | w.detach();
58 | Rep *d = rep();
59 | if (r>=d->nrows) // row past the end: add rows up to and including r
60 | resize(r+1);
61 | if (c>=d->ncols) // only the recorded column count is raised here
62 | d->ncols = c+1;
63 | assert(r >= 0); // a negative row index is a caller error
64 | d->rows[r].set(c,v);
65 | }
66 |
67 |
68 | FVector&
69 | FMatrix::operator[](int r) // mutable access to row r; rows are added when r is past the end
70 | {
71 | w.detach();
72 | Rep *d = rep();
73 | if (r>=d->nrows)
74 | resize(r+1);
75 | assert(r >= 0); // a negative row index is a caller error
76 | return d->rows[r];
77 | }
78 |
79 |
80 | // ----------------------------------------
81 |
82 |
83 | void
84 | SMatrix::resize(int nrows, int ncols) // a negative nrows or ncols leaves that dimension unchanged
85 | {
86 | w.detach(); // NOTE(review): presumably makes the representation private before writing; confirm in wrapper.h
87 | Rep *d = rep();
88 | if (nrows >= 0)
89 | {
90 | d->rows.resize(nrows);
91 | d->nrows = nrows;
92 | }
93 | if (ncols >= 0 && ncols < d->ncols) // rows are touched only when the column count shrinks
94 | {
95 | d->ncols = ncols;
96 | for (int i=0; i<d->nrows; i++)
97 | if (d->rows[i].size() > ncols)
98 | {
99 | // truncate
100 | SVector s = d->rows[i]; // copy of the old row to read from
101 | SVector &v = d->rows[i];
102 | v.clear();
103 | for (const SVector::Pair *p = s; p->i >= 0 && p->i < ncols; p++) // stops at a negative index or the first index >= ncols
104 | v.set(p->i, p->v);
105 | }
106 | }
107 | }
108 |
109 |
110 | VFloat
111 | SMatrix::get(int r, int c) const // value at (r,c); 0 for a row at or past the end
112 | {
113 | const Rep *d = rep();
114 | if (r>=0 && r<d->nrows)
115 | return d->rows[r].get(c);
116 | assert(r>=0); // a negative row index is a caller error
117 | return 0;
118 | }
119 |
120 |
121 | void
122 | SMatrix::set(int r, int c, VFloat v) // store v at (r,c), growing the row count when r is past the end
123 | {
124 | w.detach();
125 | Rep *d = rep();
126 | if (r>=d->nrows) // row past the end: add rows up to and including r
127 | resize(r+1);
128 | if (c>=d->ncols) // only the recorded column count is raised here
129 | d->ncols = c+1;
130 | assert(r>=0); // a negative row index is a caller error
131 | d->rows[r].set(c,v);
132 | }
133 |
134 |
135 | SVector&
136 | SMatrix::operator[](int r) // mutable access to row r; rows are added when r is past the end
137 | {
138 | w.detach();
139 | Rep *d = rep();
140 | if (r>=d->nrows)
141 | resize(r+1);
142 | assert(r>=0); // a negative row index is a caller error
143 | return d->rows[r];
144 | }
145 |
146 |
147 |
148 | /* -------------------------------------------------------------
149 | Local Variables:
150 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*")
151 | End:
152 | ------------------------------------------------------------- */
153 |
--------------------------------------------------------------------------------
/lib/matrices.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Little library of matrices and sparse matrices
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 |
21 | // $Id$
22 |
23 | #ifndef MATRICES_H
24 | #define MATRICES_H 1
25 |
26 | #include
27 | #include
28 | #include
29 | #include "wrapper.h"
30 | #include "vectors.h"
31 |
32 |
33 | class FMatrix // matrix stored as a vector of FVector rows behind a Wrapper-held Rep
34 | {
35 | private:
36 | struct Rep
37 | {
38 | int refcount; // not set by Rep(); NOTE(review): presumably managed by Wrapper, confirm in wrapper.h
39 | int ncols;
40 | int nrows;
41 | std::vector<FVector> rows;
42 | Rep() : ncols(0), nrows(0) { }
43 | Rep *copy() { return new Rep(*this); }
44 | };
45 |
46 | Wrapper<Rep> w;
47 | Rep *rep() { return w.rep(); }
48 | const Rep *rep() const { return w.rep(); }
49 |
50 | public:
51 | FMatrix() {}
52 | FMatrix(int rows, int cols) { resize(rows, cols); }
53 | int rows() const { return rep()->nrows; }
54 | int cols() const { return rep()->ncols; }
55 | void resize(int nrows, int ncols=-1); // the default ncols=-1 keeps the column count
56 | VFloat get(int r, int c) const;
57 | void set(int r, int c, VFloat v);
58 |
59 | FVector& operator[](int r);
60 |
61 | const FVector operator[](int r) const { // read-only row access; an empty FVector when r is out of range
62 | const Rep *d = rep();
63 | if (r<0 || r>=d->nrows)
64 | return FVector();
65 | return d->rows[r];
66 | }
67 | };
68 |
69 |
70 |
71 | class SMatrix // matrix stored as a vector of SVector rows behind a Wrapper-held Rep
72 | {
73 | private:
74 | struct Rep
75 | {
76 | int refcount; // not set by Rep(); NOTE(review): presumably managed by Wrapper, confirm in wrapper.h
77 | int ncols;
78 | int nrows;
79 | std::vector<SVector> rows;
80 | Rep() : ncols(0), nrows(0) { }
81 | Rep *copy() { return new Rep(*this); }
82 | };
83 |
84 | Wrapper<Rep> w;
85 | Rep *rep() { return w.rep(); }
86 | const Rep *rep() const { return w.rep(); }
87 |
88 | public:
89 | SMatrix() {}
90 | SMatrix(int rows, int cols) { resize(rows,cols); }
91 | int rows() const { return rep()->nrows; }
92 | int cols() const { return rep()->ncols; }
93 | void resize(int nrows, int ncols=-1); // the default ncols=-1 keeps the column count
94 | VFloat get(int r, int c) const;
95 | void set(int r, int c, VFloat v);
96 |
97 | SVector& operator[](int r);
98 |
99 | const SVector operator[](int r) const { // read-only row access; an empty SVector when r is out of range
100 | const Rep *d = rep();
101 | if (r<0 || r>=d->nrows)
102 | return SVector();
103 | return d->rows[r];
104 | }
105 | };
106 |
107 |
108 | #endif
109 |
110 | /* -------------------------------------------------------------
111 | Local Variables:
112 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" "std::\\sw+")
113 | End:
114 | ------------------------------------------------------------- */
115 |
--------------------------------------------------------------------------------
/lib/pstream.cpp:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Stream that uses popen/pclose internally
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #include "pstream.h"
21 | #include <cstdio>
22 | #include <cstring>
23 |
24 |
25 | pstreambuf*
26 | pstreambuf::open( const char *cmd, int open_mode) // returns this on success, 0 on failure
27 | {
28 | if (f) // a command is already open
29 | return 0;
30 | mode = open_mode;
31 | if ((mode & std::ios::ate) || (mode & std::ios::app)
32 | || ((mode & std::ios::in) && (mode & std::ios::out))) // no append and no combined read/write mode
33 | return 0;
34 | char fmode[10]; // popen mode string: "r" or "w" (empty if neither in nor out is set)
35 | char *fmodeptr = fmode;
36 | if ( mode & std::ios::in)
37 | *fmodeptr++ = 'r';
38 | else if ( mode & std::ios::out)
39 | *fmodeptr++ = 'w';
40 | #ifdef WIN32
41 | if (mode & std::ios::binary) // the 'b' flag is appended only in the WIN32 build
42 | *fmodeptr++ = 'b';
43 | *fmodeptr = '\0';
44 | f = ::_popen(cmd, fmode);
45 | #else
46 | *fmodeptr = '\0';
47 | f = ::popen(cmd, fmode);
48 | #endif
49 | if (f == 0)
50 | return 0;
51 | return this;
52 | }
53 | // Flush pending output and close the pipe with pclose() (_pclose() on WIN32).
54 | // Returns this on success; 0 if nothing was open, the flush failed, or pclose returned -1.
55 | pstreambuf*
56 | pstreambuf::close()
57 | {
58 |   if (f)
59 |     {
60 |       bool ok = (sync() == 0);          // remember a failed final flush
61 | #ifdef WIN32
62 |       ok = (::_pclose(f) != -1) && ok;  // always close, even after a failed flush
63 | #else
64 |       ok = (::pclose(f) != -1) && ok;   // always close, even after a failed flush
65 | #endif
66 |       f = 0;
67 |       return ok ? this : 0;
68 |     }
69 |   return 0;
70 | }
71 | // Refill the get area from the pipe, keeping up to four already-consumed
72 | // characters in front of it. Returns the next character, or EOF when none can be read.
73 | int
74 | pstreambuf::underflow()
75 | { // used for input buffer only
76 |   if ( gptr() && ( gptr() < egptr()))
77 |     return *reinterpret_cast<unsigned char*>( gptr());
78 |   if ( ! (mode & std::ios::in) || ! f)
79 |     return EOF;
80 |   int n_putback = gptr() - eback();
81 |   if ( n_putback > 4)
82 |     n_putback = 4;
83 |   memmove(buffer + (4 - n_putback), gptr()-n_putback, n_putback); // ranges can overlap after a short read
84 |   int num = std::fread(buffer+4, 1, bsize-4, f);
85 |   if (num <= 0)
86 |     return EOF;
87 |   setg( buffer + (4 - n_putback), // beginning of putback area
88 |         buffer + 4, // read position
89 |         buffer + 4 + num); // end of buffer
90 |   // return next character
91 |   return *reinterpret_cast<unsigned char*>( gptr());
92 | }
93 | // Append c (unless it is EOF) to the put area and flush the area to the pipe.
94 | // Returns a value different from EOF on success and EOF on failure.
95 | int
96 | pstreambuf::overflow(int c)
97 | { // used for output buffer only
98 |   if (!(mode & std::ios::out) || !f)
99 |     return EOF;
100 |   if (c != EOF) {
101 |     *pptr() = c;   // storing c before the flush mirrors the original order
102 |     pbump(1);
103 |   }
104 |   if (sync() != 0)
105 |     return EOF;
106 |   return traits_type::not_eof(c);   // a successful overflow(EOF) must not return EOF
107 | }
108 | // Write the pending put area to the pipe and empty it. Returns 0 on success or when
109 | // nothing is pending; EOF when no pipe is open or fewer bytes were written than requested.
110 | int
111 | pstreambuf::sync() {
112 |   if ( pptr() && pptr() > pbase()) {
113 |     int w = pptr() - pbase();
114 |     if (! f || std::fwrite( pbase(), 1, w, f ) != (size_t)w)  // never pass a null FILE* to fwrite
115 |       return EOF;
116 |     pbump( -w);
117 |   }
118 |   return 0;
119 | }
120 | // Register the embedded buffer with the stream state, then run cmd right away;
121 | // a launch that fails is reported by open() through badbit.
122 | pstreambase::pstreambase( const char* cmd, int mode) {
123 |   this->init(&buf);
124 |   this->open(cmd, mode);
125 | }
126 | // Close the embedded buffer when the stream is destroyed; the value returned
127 | // by buf.close() is ignored here.
128 | pstreambase::~pstreambase() {
129 | buf.close();
130 | }
131 | // Run cmd through the embedded buffer; when the buffer cannot be opened
132 | // the stream is put into the bad state.
133 | void
134 | pstreambase::open( const char* cmd, int open_mode) {
135 |   pstreambuf *opened = buf.open(cmd, open_mode);
136 |   if (opened == 0) setstate( std::ios::badbit);
137 | }
138 | // Close the pipe when one is open; a close that fails sets badbit.
139 | // Calling this on a stream that is not open does nothing.
140 | void
141 | pstreambase::close() {
142 |   const bool failed = buf.is_open() && ! buf.close();
143 |   if (failed)
144 |     setstate(std::ios::badbit);
145 | }
146 |
147 |
148 |
149 | /* -------------------------------------------------------------
150 | Local Variables:
151 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" )
152 | End:
153 | ------------------------------------------------------------- */
154 |
--------------------------------------------------------------------------------
/lib/pstream.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Stream that uses popen/pclose internally
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #ifndef PSTREAM_H
21 | #define PSTREAM_H 1
22 |
23 | #include
24 | #include
25 | #include
26 | // Stream buffer that exchanges data with a command through a std::FILE* pipe.
27 | class pstreambuf : public std::streambuf
28 | {
29 | private:
30 | static const int bsize = 512; // number of chars in the buffer below
31 | char buffer[bsize]; // single array used for both the put area and the get area
32 | std::FILE *f; // pipe handle; 0 while closed
33 | int mode; // open mode given to the most recent open() call
34 | public:
35 | pstreambuf() : f(0), mode(0) {
36 | setp( buffer, buffer+bsize-1 ); // put area ends one element before the end of buffer
37 | setg( buffer+4, buffer+4, buffer+4 ); // empty get area starting 4 elements into buffer
38 | }
39 | int is_open() { return !!f; } // 1 while a pipe is attached, otherwise 0
40 | pstreambuf* open(const char *cmd, int open_mode);
41 | pstreambuf* close();
42 | ~pstreambuf() { close(); } // closes the pipe if it is still open
43 | virtual int overflow( int c = EOF);
44 | virtual int underflow();
45 | virtual int sync();
46 | };
47 |
48 | // Common base of ipstream and opstream: owns the pstreambuf and registers it with std::ios.
49 | class pstreambase : virtual public std::ios {
50 | protected:
51 | pstreambuf buf;
52 | public:
53 | pstreambase() { init(&buf); } // no command is started by the default constructor
54 | pstreambase(const char *cmd, int open_mode);
55 | ~pstreambase();
56 | void open(const char *cmd, int open_mode);
57 | void close();
58 | pstreambuf* rdbuf() { return &buf; } // access to the embedded buffer
59 | };
60 |
61 | // ----------------------------------------------------------------------------
62 | // User classes. Use ipstream and opstream analogously to ifstream and
63 | // ofstream respectively. They read from or write to a command run with popen().
64 | // ----------------------------------------------------------------------------
65 | // Input stream attached to a command; the open mode defaults to std::ios::in.
66 | class ipstream : public pstreambase, public std::istream {
67 | public:
68 | ipstream() : std::istream( &buf) {}
69 | ipstream( const char* cmd, int open_mode = std::ios::in)
70 | : pstreambase(cmd, open_mode), std::istream( &buf) {}
71 | pstreambuf* rdbuf() { return pstreambase::rdbuf(); } // forwards to pstreambase::rdbuf()
72 | void open( const char* cmd, int open_mode = std::ios::in) {
73 | pstreambase::open(cmd, open_mode);
74 | }
75 | };
76 | // Output stream attached to a command; the open mode defaults to std::ios::out.
77 | class opstream : public pstreambase, public std::ostream {
78 | public:
79 | opstream() : std::ostream( &buf) {}
80 | opstream( const char *cmd, int mode = std::ios::out)
81 | : pstreambase(cmd, mode), std::ostream( &buf) {}
82 | pstreambuf* rdbuf() { return pstreambase::rdbuf(); } // forwards to pstreambase::rdbuf()
83 | void open( const char *cmd, int open_mode = std::ios::out) {
84 | pstreambase::open( cmd, open_mode);
85 | }
86 | };
87 |
88 | #endif
89 |
90 | /* -------------------------------------------------------------
91 | Local Variables:
92 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" "std::\\sw+")
93 | End:
94 | ------------------------------------------------------------- */
95 |
--------------------------------------------------------------------------------
/lib/timer.cpp:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // A simple timer.
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #include "timer.h"
21 | #include
22 |
23 | #ifdef USE_REALTIME_CLOCK
24 | # include
25 | # include
26 | static double
27 | klock()
28 | { // current time in seconds, read with gettimeofday()
29 |   struct timeval tv;
30 |   gettimeofday(&tv, NULL);
31 |   const double usec = (double) tv.tv_usec * 1e-6;  // microseconds expressed in seconds
32 |   return (double) tv.tv_sec + usec;
33 | }
34 | #else
35 | static double
36 | klock()
37 | { // std::clock() reading converted to seconds
38 |   return static_cast<double>(std::clock()) / static_cast<double>(CLOCKS_PER_SEC);
39 | }
40 | #endif
41 | // A new timer has nothing accumulated and is not running.
42 | Timer::Timer()
43 | : a(0), s(0), r(0)
44 | {
45 | }
46 | // Put the timer back into its freshly constructed state: stopped, nothing accumulated.
47 | void
48 | Timer::reset()
49 | {
50 |   r = 0;
51 |   s = 0;
52 |   a = 0;
53 | }
54 | // Add the time since the previous reading to the total when the timer is
55 | // running, remember the current reading, and return the total in seconds.
56 | double
57 | Timer::elapsed()
58 | {
59 |   const double now = klock();
60 |   if (r != 0)
61 |     a = a + (now - s);
62 |   s = now;
63 |   return a;
64 | }
65 | // Switch the timer on; returns the total accumulated up to this call.
66 | double
67 | Timer::start()
68 | {
69 |   const double total = elapsed();
70 |   r = 1;
71 |   return total;
72 | }
73 |
74 |
75 | // Switch the timer off; returns the total, including the interval that just ended.
76 | double
77 | Timer::stop()
78 | {
79 |   const double total = elapsed();
80 |   r = 0;
81 |   return total;
82 | }
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/lib/timer.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // A simple timer.
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #ifndef TIMER_H
21 | #define TIMER_H 1
22 | // Stopwatch that accumulates time, in seconds, over start()/stop() intervals.
23 | class Timer
24 | {
25 | public:
26 | Timer();
27 | void reset();
28 | double start();
29 | double stop();
30 | double elapsed();
31 | private:
32 | double a, s; // a: accumulated seconds; s: clock reading taken at the last update
33 | int r; // nonzero while the timer is running
34 | };
35 |
36 |
37 | /* -------------------------------------------------------------
38 | Local Variables:
39 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" "std::\\sw+_t")
40 | End:
41 | ------------------------------------------------------------- */
42 |
43 |
44 | #endif
45 |
--------------------------------------------------------------------------------
/lib/vectors.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Little library of vectors and sparse vectors
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This library is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 |
21 | #ifndef VECTORS_H
22 | #define VECTORS_H 1
23 |
24 | #include
25 | #include
26 | #include "wrapper.h"
27 |
28 |
29 | class FVector;
30 | class SVector;
31 |
32 | #ifndef VFLOAT
33 | # define VFLOAT float
34 | #endif
35 | typedef VFLOAT VFloat;
36 | // Dense vector of VFloat values; the storage lives in a reference-counted Rep held by a Wrapper.
37 | class FVector
38 | {
39 | private:
40 | struct Rep
41 | {
42 | int refcount; // set and updated by Wrapper
43 | int size;
44 | VFloat *data; // released with delete [] by the destructor
45 | Rep() : size(0), data(0) {}
46 | ~Rep() { delete [] data; }
47 | void resize(int n);
48 | Rep *copy(); // called by Wrapper::detach()
49 | };
50 |
51 | Wrapper w;
52 | Rep *rep() { return w.rep(); }
53 | const Rep *rep() const { return w.rep(); }
54 | void qset(int i, double v);
55 |
56 | public:
57 | FVector();
58 | FVector(int n);
59 | FVector(const SVector &v);
60 | int size() const { return rep()->size; }
61 |
62 | // these accessors are range-checked.
63 | // get() returns 0 when i is out-of-range.
64 | // set() expands the vector.
65 | double get(int i) const;
66 | double set(int i, double v);
67 |
68 | // warning: you can write vector[i] but
69 | // the subscripts are not range-checked!
70 | // on the other hand, that's fast.
71 | operator const VFloat* () const { return rep()->data; }
72 | operator VFloat* () { w.detach(); return rep()->data; } // detaches first, so a shared Rep is copied before a writable pointer is handed out
73 |
74 | void clear();
75 | void zero();
76 | void resize(int n);
77 | void touch(int i);
78 | FVector slice(int fi, int ti) const;
79 |
80 | void add(double c1);
81 | void add(const FVector &v2);
82 | void add(const SVector &v2);
83 | void add(const FVector &v2, double c2);
84 | void add(const SVector &v2, double c2);
85 | void add(const FVector &v2, double c2, const FVector &q2);
86 | void add(const SVector &v2, double c2, const FVector &q2);
87 | void scale(double c1);
88 | void combine(double c1, const FVector &v2, double c2);
89 | void combine(double c1, const SVector &v2, double c2);
90 |
91 | friend std::ostream& operator<<(std::ostream &f, const FVector &v);
92 | friend std::istream& operator>>(std::istream &f, FVector &v);
93 | bool save(std::ostream &f) const;
94 | bool load(std::istream &f);
95 | };
96 |
97 |
98 | // Sparse vector kept as an array of Pair entries inside a reference-counted Rep held by a Wrapper.
99 | class SVector
100 | {
101 | public:
102 | struct Pair
103 | {
104 | int i; // coefficient index; the iteration idiom below stops at an entry with i < 0
105 | VFloat v; // coefficient value
106 | };
107 | private:
108 | struct Rep
109 | {
110 | int refcount; // set and updated by Wrapper
111 | int npairs;
112 | int mpairs; // NOTE(review): presumably the allocated capacity of pairs - confirm in vectors.cpp
113 | int size;
114 | struct Pair *pairs; // released with delete [] by the destructor
115 |
116 | Rep() : npairs(0), mpairs(-1), size(0), pairs(0) {}
117 | ~Rep() { delete [] pairs; }
118 | void resize(int n);
119 | double qset(int i, double v);
120 | Rep *copy(); // called by Wrapper::detach()
121 | };
122 |
123 | Wrapper w;
124 | Rep *rep() { return w.rep(); }
125 | const Rep *rep() const { return w.rep(); }
126 |
127 | public:
128 | SVector();
129 | SVector(const FVector &v);
130 | int size() const { return rep()->size; }
131 |
132 | // these accessors are range-checked.
133 | // get() returns 0 when i is out-of-range.
134 | // set() expands the vector.
135 | double get(int i) const;
136 | double set(int i, double v);
137 |
138 | // to quickly iterate over the non-zero coefficients,
139 | // do for(const SVector::Pair *p = x; p->i>=0; p++) { ... }
140 | int npairs() const { return rep()->npairs; }
141 | operator const Pair* () const { return rep()->pairs; }
142 |
143 | void zero();
144 | void clear();
145 | void trim();
146 | SVector slice(int fi, int ti) const;
147 |
148 | void add(const SVector &v2);
149 | void add(const SVector &v2, double c2);
150 | void scale(double c1);
151 | void combine(double c1, const SVector &v2, double c2);
152 |
153 | friend std::ostream& operator<<(std::ostream &f, const SVector &v);
154 | friend std::istream& operator>>(std::istream &f, SVector &v);
155 | bool save(std::ostream &f) const;
156 | bool load(std::istream &f);
157 |
158 | friend SVector combine(const SVector &v1, double a1,
159 | const SVector &v2, double a2);
160 | };
161 |
162 | double dot(const FVector &v1, const FVector &v2);
163 | double dot(const FVector &v1, const SVector &v2);
164 | double dot(const SVector &v1, const FVector &v2);
165 | double dot(const SVector &v1, const SVector &v2);
166 |
167 | SVector combine(const SVector &v1, double a1, const SVector &v2, double a2);
168 | FVector combine(const FVector &v1, double a1, const SVector &v2, double a2);
169 | FVector combine(const SVector &v1, double a1, const FVector &v2, double a2);
170 | FVector combine(const FVector &v1, double a1, const FVector &v2, double a2);
171 |
172 |
173 |
174 | #endif
175 |
176 | /* -------------------------------------------------------------
177 | Local Variables:
178 | c++-font-lock-extra-types: ("\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" "std::\\sw+")
179 | End:
180 | ------------------------------------------------------------- */
181 |
--------------------------------------------------------------------------------
/lib/wrapper.h:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // Little library of copy-on-write wrappers
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 |
6 | // This library is free software; you can redistribute it and/or
7 | // modify it under the terms of the GNU Lesser General Public
8 | // License as published by the Free Software Foundation; either
9 | // version 2.1 of the License, or (at your option) any later version.
10 | //
11 | // This program is distributed in the hope that it will be useful,
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | // GNU General Public License for more details.
15 | //
16 | // You should have received a copy of the GNU General Public License
17 | // along with this program; if not, write to the Free Software
18 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
19 |
20 |
21 |
22 | #ifndef WRAPPER_H
23 | #define WRAPPER_H 1
24 |
25 |
26 | template
27 | class Wrapper  // shares one heap-allocated Rep between copies, counted in Rep::refcount
28 | {
29 | private:
30 |   Rep *q;
31 |   Rep *ref(Rep *q) { q->refcount++; return q; }
32 |   void deref(Rep *q) { if (! --(q->refcount)) delete q; }  // the last owner deletes the Rep
33 |
34 | public:
35 |   Wrapper()
36 |     : q(new Rep) { q->refcount = 1; }
37 |   Wrapper(Rep *rep)                         // takes ownership of rep
38 |     : q(rep) { q->refcount = 1; }
39 |   Wrapper(const Wrapper &other)             // shares the Rep held by other
40 |     : q(ref(other.q)) {}
41 |   ~Wrapper()
42 |     { deref(q); }
43 |   Wrapper& operator=(const Wrapper &other)  // ref before deref keeps self-assignment safe
44 |     { Rep *p = q; q = ref(other.q); deref(p); return *this; }
45 |   void detach()  // un-share: copy first, so a throwing copy() leaves this wrapper untouched
46 |     { if (q->refcount > 1) { Rep *c = q->copy(); c->refcount = 1; deref(q); q = c; } }
47 |   Rep *rep() const
48 |     { return q; }
49 | };
50 |
51 |
52 |
53 | // Recommended usage
54 | //
55 | // #include
56 | // #include
57 | //
58 | // class String
59 | // {
60 | // private:
61 | //
62 | // struct Rep
63 | // {
64 | // int refcount;
65 | // int length;
66 | // char *data;
67 | // Rep(const char *s, int l)
68 | // : length(l), data(new char[l+1])
69 | // { ::memcpy(data, s, l); data[l] = 0; }
70 | // Rep *copy()
71 | // { return new Rep(data, length); }
72 | // };
73 | //
74 | // Wrapper w;
75 | // Rep *rep() { return w.rep(); }
76 | // const Rep *rep() const { return w.rep(); }
77 | //
78 | // public:
79 | // String(const char *s, int l)
80 | // : w(new Rep(s,l)) {}
81 | // String(const char *s)
82 | // : w(new Rep(s,::strlen(s))) {}
83 | //
84 | // // functions that do not mutate
85 | // int size() const { return rep()->length; }
86 | // operator const char*() const { return rep()->data; }
87 | // char operator[](int i) const { return rep()->data[i]; }
88 | //
89 | // // functions that perform a mutation
90 | // void set(int i, char c) { w.detach(); rep()->data[i] = c; }
91 | // }
92 | //
93 |
94 |
95 |
96 | #endif
97 |
98 |
99 |
100 | /* -------------------------------------------------------------
101 | Local Variables:
102 | c++-font-lock-extra-types: ( "\\sw+_t" "[A-Z]\\sw*[a-z]\\sw*" )
103 | End:
104 | ------------------------------------------------------------- */
105 |
--------------------------------------------------------------------------------
/svm/Makefile:
--------------------------------------------------------------------------------
1 | # SVM with stochastic gradient
2 |
3 | # This program is free software; you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation; either version 2 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program; if not, write to the Free Software
15 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
16 |
17 | # Build settings. OPT is empty by default and is appended to the compiler flags.
18 | L=../lib
19 | CXX=g++
20 | OPT=
21 | OPTS=-g -O2
22 | CXXFLAGS= ${OPTS} ${OPT} -Wall -I$L
23 | LIBS = -lz -lm
24 |
25 | PROGRAMS = prep_rcv1 prep_alpha prep_webspam svmsgd svmasgd
26 | # Library objects linked into every program, and the headers every object depends on.
27 | OBJS = vectors.o gzstream.o timer.o
28 | INCS = $L/vectors.h $L/gzstream.h $L/timer.h $L/wrapper.h $L/assert.h
29 | .PHONY: all clean
30 | all: ${PROGRAMS}
31 |
32 | clean:
33 | 	rm -f ${PROGRAMS}
34 | 	rm -f *.o
35 | # Library objects, compiled from the sources in $L.
36 | vectors.o: $L/vectors.cpp ${INCS}
37 | ${CXX} ${CXXFLAGS} -c -o $@ $L/vectors.cpp
38 |
39 | gzstream.o: $L/gzstream.cpp ${INCS}
40 | ${CXX} ${CXXFLAGS} -c -o $@ $L/gzstream.cpp
41 |
42 | timer.o: $L/timer.cpp ${INCS}
43 | ${CXX} ${CXXFLAGS} -c -o $@ $L/timer.cpp
44 | # Objects compiled from the sources in this directory.
45 | data.o: data.cpp ${INCS}
46 | ${CXX} ${CXXFLAGS} -c -o $@ data.cpp
47 |
48 | prep_rcv1.o: prep_rcv1.cpp ${INCS}
49 | ${CXX} ${CXXFLAGS} -c -o $@ prep_rcv1.cpp
50 |
51 | prep_alpha.o: prep_alpha.cpp ${INCS}
52 | ${CXX} ${CXXFLAGS} -c -o $@ prep_alpha.cpp
53 |
54 | prep_webspam.o: prep_webspam.cpp ${INCS}
55 | ${CXX} ${CXXFLAGS} -c -o $@ prep_webspam.cpp
56 |
57 | svmsgd.o: svmsgd.cpp data.h loss.h ${INCS}
58 | ${CXX} ${CXXFLAGS} -c -o $@ svmsgd.cpp
59 |
60 | svmasgd.o: svmasgd.cpp data.h loss.h ${INCS}
61 | ${CXX} ${CXXFLAGS} -c -o $@ svmasgd.cpp
62 | # Executables: each links its own object(s) with ${OBJS} and ${LIBS}.
63 | prep_rcv1: prep_rcv1.o ${OBJS}
64 | ${CXX} ${CXXFLAGS} -o $@ prep_rcv1.o ${OBJS} ${LIBS}
65 |
66 | prep_alpha: prep_alpha.o ${OBJS}
67 | ${CXX} ${CXXFLAGS} -o $@ prep_alpha.o ${OBJS} ${LIBS}
68 |
69 | prep_webspam: prep_webspam.o ${OBJS}
70 | ${CXX} ${CXXFLAGS} -o $@ prep_webspam.o ${OBJS} ${LIBS}
71 |
72 | svmsgd: svmsgd.o data.o ${OBJS}
73 | ${CXX} ${CXXFLAGS} -o $@ svmsgd.o data.o ${OBJS} ${LIBS}
74 |
75 | svmasgd: svmasgd.o data.o ${OBJS}
76 | ${CXX} ${CXXFLAGS} -o $@ svmasgd.o data.o ${OBJS} ${LIBS}
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/svm/data.cpp:
--------------------------------------------------------------------------------
1 | // -*- C++ -*-
2 | // SVM with stochastic gradient
3 | // Copyright (C) 2007- Leon Bottou
4 |
5 | // This program is free software; you can redistribute it and/or
6 | // modify it under the terms of the GNU Lesser General Public
7 | // License as published by the Free Software Foundation; either
8 | // version 2.1 of the License, or (at your option) any later version.
9 | //
10 | // This program is distributed in the hope that it will be useful,
11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | // GNU General Public License for more details.
14 | //
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
18 |
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include