├── README.md
├── ref
    ├── A Conditional Random Field Word Segmenter.pdf
    ├── A Maximum Entropy Approach to Chinese Word Segmentation.pdf
    └── [2007,黄昌宁，赵海]中文分词十年回顾.pdf
├── score
    ├── bmm_result
    │   ├── pku_bmm_result.utf8
    │   └── pku_bmm_score.utf8
    ├── fmm_result
    │   ├── pku_fmm_result.utf8
    │   └── pku_fmm_score.utf8
    ├── pku_test_gold.utf8
    ├── pku_training.utf8
    └── score
└── src
    ├── LICENSE
    ├── ReadMe
    ├── autoScore.py
    ├── cwsBMM.py
    ├── cwsBMM_NRule.py
    ├── cwsFMM.py
    ├── cwsFMM_NRule.py
    ├── cwsMaxEn-13f.py
    ├── cwsMaxEn-16f.py
    ├── pku_test.utf8
    ├── pku_test_gold.utf8
    ├── pku_training.utf8
    └── score


/README.md:
--------------------------------------------------------------------------------
 1 | #Requirements:
 2 | 
 3 |   * Ubuntu 12.04
 4 |   * Python 2.7.3
 5 |   * Perl 5
 6 | 
 7 | File Description:
 8 | 
 9 |     src/cwsFMM.py -- source code of the tool cwsFMM
10 |     
11 |     src/pku_training.utf8 -- training data
12 |     
13 |     src/pku_test.utf8 -- test data
14 |     
15 |     score/score -- tool to score the result
16 |     
17 |     score/pku_test_gold.utf8 -- gold data
18 | 
19 |     ref -- reference papers of the project
20 | 
21 | #Tool:
22 | 
23 | 
24 | ## cwsBMM
25 | 
26 |   Description:
27 | 
28 |     Using the Backward Maximum Match (BMM) algorithm to do Chinese Word Segmentation
29 | 
30 | === TOTAL TRUE WORDS RECALL:    0.924
31 | 
32 | === TOTAL TEST WORDS PRECISION: 0.897
33 | 
34 | === F MEASURE:  0.910
35 | 
36 | Usage:
37 | 
38 |     python cwsBMM.py training_file test_file result_file
39 |     
40 |     perl score training_file gold_file result_file > score_file
41 | 
42 | Notice:
43 |    All the data and the `score` tool come from: http://sighan.cs.uchicago.edu/bakeoff2005/
44 |    
45 | 
46 | 
47 | ## cwsFMM
48 | 
49 |   Description:
50 | 
51 |     Using the Forward Maximum Match (FMM) algorithm to do Chinese Word Segmentation
52 | 
53 | === TOTAL TRUE WORDS RECALL:    0.920
54 | 
55 | === TOTAL TEST WORDS PRECISION: 0.895
56 | 
57 | === F MEASURE:  0.907
58 | 
59 | Usage:
60 | 
61 |     python cwsFMM.py training_file test_file result_file
62 |     
63 |     perl score training_file gold_file result_file > score_file
64 | 
65 | 
66 | Notice:
67 |    All the data and the `score` tool come from: http://sighan.cs.uchicago.edu/bakeoff2005/
68 | 


--------------------------------------------------------------------------------
/ref/A Conditional Random Field Word Segmenter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minixalpha/PyCWS/aa8e6c3302a025424bc23c294ecb2b5bca2024d0/ref/A Conditional Random Field Word Segmenter.pdf


--------------------------------------------------------------------------------
/ref/A Maximum Entropy Approach to Chinese Word Segmentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minixalpha/PyCWS/aa8e6c3302a025424bc23c294ecb2b5bca2024d0/ref/A Maximum Entropy Approach to Chinese Word Segmentation.pdf


--------------------------------------------------------------------------------
/ref/[2007,黄昌宁，赵海]中文分词十年回顾.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minixalpha/PyCWS/aa8e6c3302a025424bc23c294ecb2b5bca2024d0/ref/[2007,黄昌宁，赵海]中文分词十年回顾.pdf


--------------------------------------------------------------------------------
/score/score:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | 
  3 | ###########################################################################
  4 | #                                                                         #
  5 | #                               SIGHAN                                    #
  6 | #                      Copyright (c) 2003,2005                            #
  7 | #                        All Rights Reserved.                             #
  8 | #                                                                         #
  9 | #  Permission is hereby granted, free of charge, to use and distribute    #
 10 | #  this software and its documentation without restriction, including     #
 11 | #  without limitation the rights to use, copy, modify, merge, publish,    #
 12 | #  distribute, sublicense, and/or sell copies of this work, and to        #
 13 | #  permit persons to whom this work is furnished to do so, subject to     #
 14 | #  the following conditions:                                              #
 15 | #   1. The code must retain the above copyright notice, this list of      #
 16 | #      conditions and the following disclaimer.                           #
 17 | #   2. Any modifications must be clearly marked as such.                  #
 18 | #   3. Original authors' names are not deleted.                           #
 19 | #   4. The authors' names are not used to endorse or promote products     #
 20 | #      derived from this software without specific prior written          #
 21 | #      permission.                                                        #
 22 | #                                                                         #
 23 | #  SIGHAN AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES       #
 24 | #  WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF      #
 25 | #  MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SIGHAN NOR THE          #
 26 | #  CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL      #
 27 | #  DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA     #
 28 | #  OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER      #
 29 | #  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR       #
 30 | #  PERFORMANCE OF THIS SOFTWARE.                                          #
 31 | #                                                                         #
 32 | ###########################################################################
 33 | #                                                                         #
 34 | # Author: Richard Sproat (rws@uiuc.edu)                                   #
 35 | #         Tom Emerson (tree@basistech.com)                                #
 36 | #                                                                         #
 37 | ###########################################################################
 38 | 
 39 | ## This code depends upon a version of diff (e.g. GNU diffutils 2.7.2)
 40 | ## that supports the -y flag:
 41 | ##
 42 | ## -y     Use the side by side output format.
 43 | ##
 44 | ## change the following per your installation:
 45 | 
 46 | $diff = "/usr/bin/diff";   # external diff binary; it is invoked with -y (side-by-side output)
 47 | 
 48 | $USAGE = "Usage:\t$0 dictionary truth test\n\t";
 49 | # Exactly three arguments are required; otherwise print the usage text and stop.
 50 | if (@ARGV != 3) {print "$USAGE\n"; exit;}
 51 | # Scratch files, suffixed with this process's PID, rewritten for every scored line pair.
 52 | $tmp1 = "/tmp/comp01$$";
 53 | $tmp2 = "/tmp/comp02$$";
 54 | 
 55 | %dict = ();
 56 | # Load the dictionary: each whitespace-trimmed input line becomes one key of %dict.
 57 | open (S, '<', $ARGV[0]) or  die "$ARGV[0]: $!\n";
 58 | 
 59 | while (<S>) {
 60 |     chomp;   # was chop, which ate a real character when the last line had no trailing newline
 61 |     s/^\s*//;
 62 |     s/\s*$//;
 63 |     $dict{$_} = 1;
 64 | }
 65 | 
 66 | close(S);
 67 | # Three-argument open, so an unusual file name is never parsed as a mode or a pipe.
 68 | open (TRUTH, '<', $ARGV[1]) or die "$ARGV[1]: $!\n";
 69 | open (TEST, '<', $ARGV[2]) or die "$ARGV[2]: $!\n";
 70 | # Corpus-wide totals, accumulated over all line pairs.
 71 | $Tot = $Del = $Ins = $Subst = $Truecount = $Testcount = 0;
 72 | $RawRecall = $RawPrecision = 0;
 73 | 
 74 | $linenum = 0;
 75 | 
 76 | # Counters for in-vocabulary / out-of-vocabulary truth words and how many of each were missed.
 77 | $IVMISSED = $OOVMISSED = $OOV = $IV = 0;
 78 | # Labels used in the report: the file names with a leading absolute directory part removed.
 79 | $file1 = $ARGV[1];
 80 | $file2 = $ARGV[2];
 81 | $file1 =~ s=^/.*/==;
 82 | $file2 =~ s=^/.*/==;
 83 | 
 84 | while (defined($truth = <TRUTH>) && defined($test = <TEST>)) {   # score the two files line by line, in lockstep; ends when either runs out
 85 |     $truth =~ s/^\s*//;   # trim leading and trailing whitespace on both lines
 86 |     $test =~ s/^\s*//;
 87 |     $truth =~ s/\s*$//;
 88 |     $test =~ s/\s*$//;
 89 |     $truth =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g;   # bytes E3 80 80 (UTF-8 for U+3000) or A1 40 become a plain space
 90 |     $test =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g;
 91 |     $truth =~ s///g;   # NOTE(review): an empty pattern reuses the last successful regex; a control character (CR?) was probably lost here -- confirm against upstream
 92 |     $test =~ s///g;
 93 |     @truthwords = split /\s+/, $truth;
 94 |     @testwords = split /\s+/, $test;
 95 |     $truecount = scalar(@truthwords);
 96 |     $testcount = scalar(@testwords);
 97 |     ++$linenum;
 98 |     if ($truecount == 0) {   # an empty truth line is skipped entirely
 99 |         if ($testcount > 0) {
100 |             print STDERR "Warning: training is 0 but test is nonzero, possible misalignment at line $linenum.\n";
101 |         }
102 |         next;
103 |     }
104 |     if ($testcount == 0) {   # warn but keep going: every truth word will count as deleted
105 |         print STDERR "Warning: No output in test data where there is in training data, line $linenum\n";
106 |     }
107 |     open (T1, ">$tmp1") or die "Can't open $tmp1";   # write one word per line so diff compares word by word
108 |     open (T2, ">$tmp2") or die "Can't open $tmp2";
109 |     foreach my $w (@truthwords) { print T1 "$w\n"; }
110 |     foreach my $w (@testwords) {print T2 "$w\n";}
111 |     close (T1);
112 |     close (T2);
113 |     open (P, "$diff -y $tmp1 $tmp2 |")
114 |         or die "Can't open pipe.\n";
115 |     print "--$file1-------$file2----$linenum\n";
116 |     my $del = 0;
117 |     my $ins = 0;
118 |     my $subst = 0;
119 |     my $rawrecall = 0;      # counts missed truth words first; converted to correct words after the loop
120 |     my $rawprecision = 0;   # counts wrong test words first; converted to correct words after the loop
121 |     while (<P>) {
122 |         # Classify the row by its diff -y marker: "|" changed, ">" test side only, "<" truth side only.
123 |         if (/\s\|\s/) { $subst++; }
124 |         elsif (/\s\>\s/) { $ins++; }
125 |         elsif (/\s\<\s/) { $del++; }
126 |         if (/^([^\s]+)\s/) {   # the row starts with a non-space token, i.e. a truth-side word
127 |             my $w = $1;
128 |             if (!$dict{$w}) {++$OOV;}
129 |             else {++$IV;}
130 |             if (/^[^\s]+\s.*\s[\|\>\<]\s/) {   # that word sits on a row flagged as different: it was missed
131 |                 if (!$dict{$w}) {++$OOVMISSED;}
132 |                 else {++$IVMISSED;}
133 |                 ++$rawrecall;
134 |             }
135 |         }
136 |         if (/\s[\|\>\<]\s.*[^\s]$/) { ++$rawprecision; }   # flagged row that ends in a non-space token (a test-side word)
137 |         print "$_";
138 |     }
139 |     close (P);
140 |     my $tot = $del + $ins + $subst;
141 |     $Tot += $tot;
142 |     $Del += $del;
143 |     $Ins += $ins;
144 |     $Subst += $subst;
145 |     $Truecount += $truecount;
146 |     $Testcount += $testcount;
147 |     $rawrecall = $truecount - $rawrecall;   # missed / wrong counts -> correct counts
148 |     $rawprecision = $testcount - $rawprecision;
149 |     $RawRecall += $rawrecall;
150 |     $RawPrecision += $rawprecision;
151 |     $rawrecall = sprintf("%2.3f", $rawrecall/$truecount);   # $truecount is nonzero here: empty truth lines were skipped above
152 |     $rawprecision = sprintf("%2.3f", $testcount ? $rawprecision/$testcount : 0);   # empty test line: report 0.000 instead of dying on division by zero
153 |     print "INSERTIONS:\t$ins\n";
154 |     print "DELETIONS:\t$del\n";
155 |     print "SUBSTITUTIONS:\t$subst\n";
156 |     print "NCHANGE:\t$tot\n";
157 |     print "NTRUTH:\t$truecount\n";
158 |     print "NTEST:\t$testcount\n";
159 |     print "TRUE WORDS RECALL:\t$rawrecall\n";
160 |     print "TEST WORDS PRECISION:\t$rawprecision\n";
161 | }
162 | 
163 | close(TRUTH);
164 | close(TEST);
165 | unlink($tmp1);   # remove the two scratch files
166 | unlink($tmp2);
167 | # Corpus-level report. Every ratio below falls back to 0 (or "--") when its denominator is 0.
168 | print "=== SUMMARY:\n";
169 | print "=== TOTAL INSERTIONS:\t$Ins\n";
170 | print "=== TOTAL DELETIONS:\t$Del\n";
171 | print "=== TOTAL SUBSTITUTIONS:\t$Subst\n";
172 | print "=== TOTAL NCHANGE:\t$Tot\n";
173 | print "=== TOTAL TRUE WORD COUNT:\t$Truecount\n";
174 | print "=== TOTAL TEST WORD COUNT:\t$Testcount\n";
175 | $RawRecall = $Truecount ? $RawRecall/$Truecount : 0;         # guard: no truth words were scored
176 | $RawPrecision = $Testcount ? $RawPrecision/$Testcount : 0;   # guard: no test words were scored
177 | $beta = 1;
178 | $R = $RawRecall;
179 | $P = $RawPrecision;
180 | $F = ($beta * $P + $R) ? (1 + $beta)*$P*$R/($beta * $P + $R) : 0;   # with $beta = 1 this is 2PR/(P+R); 0 when P and R are both 0
181 | $F = sprintf("%2.3f", $F);
182 | $RawRecall = sprintf("%2.3f", $RawRecall);
183 | $RawPrecision = sprintf("%2.3f", $RawPrecision);
184 | print "=== TOTAL TRUE WORDS RECALL:\t$RawRecall\n";
185 | print "=== TOTAL TEST WORDS PRECISION:\t$RawPrecision\n";
186 | print "=== F MEASURE:\t$F\n";
187 | if ($OOV > 0) {
188 |     $OOVMISSED = sprintf("%2.3f", 1 - $OOVMISSED / $OOV);   # miss count -> recall rate
189 | }
190 | else {
191 |     $OOVMISSED = "--";
192 | }
193 | $OOV = sprintf("%2.3f", $Truecount ? $OOV / $Truecount : 0);   # from here on $OOV holds the rate, not the count
194 | if ($IV > 0) {
195 |     $IVMISSED = sprintf("%2.3f", 1 - $IVMISSED / $IV);   # miss count -> recall rate
196 | }
197 | else {
198 |     $IVMISSED = "--";
199 | }
200 | print "=== OOV Rate:\t$OOV\n";
201 | print "=== OOV Recall Rate:\t$OOVMISSED\n";
202 | print "=== IV Recall Rate:\t$IVMISSED\n";
203 | # One tab-separated record repeating all the figures printed above.
204 | print "###\t$file2\t$Ins\t$Del\t$Subst\t$Tot\t$Truecount\t$Testcount\t$RawRecall\t$RawPrecision\t$F\t$OOV\t$OOVMISSED\t$IVMISSED\n";
205 | exit(0);
206 | 


--------------------------------------------------------------------------------
/src/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <http://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/src/ReadMe:
--------------------------------------------------------------------------------
 1 | 1 系统环境：
 2 |     Ubuntu 12.04
 3 |     Python 2.7
 4 |     Perl 5
 5 |     Maxent
 6 | 
 7 | 2 文件说明：
 8 |     cwsFMM_NRule.py     -- 不含规则的正向最大匹配算法
 9 |     cwsFMM.py           -- 包含规则的正向最大匹配算法
10 |     cwsBMM_NRule.py     -- 不含规则的逆向最大匹配算法
11 |     cwsBMM.py           -- 包含规则的逆向最大匹配算法
12 |     cwsMaxEn-13f.py     -- 13特征最大熵字标注算法
13 |     cwsMaxEn-16f.py     -- 16特征最大熵字标注算法
14 |     pku_test.utf8       -- 测试文件
15 |     pku_training.utf8   -- 训练文件
16 |     pku_test_gold.utf8  -- 测试文件标准分词结果
17 |     score               -- 评分工具
18 | 
19 | 3 工具使用
20 |     分词：
21 |         Usage: python toolName training_file test_file result_file
22 |         eg:
23 |             python cwsFMM.py pku_training.utf8 pku_test.utf8 pku_test.result
24 |     评分：
25 |         perl score training_file pku_test_gold.utf8 result_file
26 | 
27 | 4 版权声明
28 |     pku_test.utf8, pku_training.utf8, score 均来自Bakeoff-2005官方网站：http://www.sighan.org/bakeoff2005/　版权归Bakeoff-2005所有
29 |     其余文件遵循GPL协议, 见LICENSE文件
30 | 
31 | 5 感谢
32 |   工具的开发使用了张乐博士的Maxent工具包, 在此特别感谢
33 |   工具主页：http://homepages.inf.ed.ac.uk/lzhang10/maxent_toolkit.html
34 |   工具Github主页:https://github.com/lzhang10/maxent
35 | 
36 | 


--------------------------------------------------------------------------------
/src/autoScore.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Author: minix
 5 | # Date:   2013-05-18
 6 | # Email:  minix007@foxmail.com
 7 | 
 8 | # 功能：
 9 | #     0. 自动为多个文件评分
10 | #     1. 提取多个评分结果
11 | #     2. 打印所有评分结果
12 | 
13 | # 使用：
14 | #   例如分词结果文件为：pku_test.result.10,pku_test.result.50,pku_test.result.xxx
15 | #   这样文件名的模板为pku_test.result
16 | #   python autoScore pku_test.result 即可将所有此类文件评分，输出评分结果
17 | 
18 | import sys
19 | import os
20 | 
def score_files(file_name_temp, score_path=None):
  """Score every result file whose name matches ``file_name_temp + '.*'``.

  For each match the ``score`` Perl script is run with the arguments
  ``pku_training.utf8 pku_test_gold.utf8 <result file>`` (both corpus names
  are relative to the current working directory) and its standard output is
  written to ``<name>.score.<fileid>``, where ``<name>`` is the text before
  the first '.' of the result file name and ``<fileid>`` is the text after
  its last '.'.

  Args:
    file_name_temp: common prefix of the result files, e.g.
      'pku_test.result' for 'pku_test.result.10'.
    score_path: path of the ``score`` script.  When omitted, a file named
      ``score`` next to this module is used if it exists; otherwise the
      historical hard-coded location is kept.
  """
  # Local imports: the rest of the file only imports sys and os.
  import glob
  import subprocess

  if score_path is None:
    here = os.path.dirname(os.path.abspath(globals().get('__file__', '')))
    bundled = os.path.join(here, 'score')
    if os.path.isfile(bundled):
      score_path = bundled
    else:
      score_path = '/home/zhaoxk/Project/PyCWS/score/score'

  # glob replaces `ls` through a shell: no quoting problems, no error noise
  # when nothing matches.  Sorted to keep the alphabetical order `ls` gave.
  for filename in sorted(glob.glob(file_name_temp + '.*')):
    fileid = filename.split('.')[-1]
    name = filename.split('.')[0]
    out_name = name + '.score.' + fileid
    argv = ['perl', score_path, 'pku_training.utf8', 'pku_test_gold.utf8',
            filename]
    # Echo the equivalent shell command, as the script always did.
    print(' '.join(argv) + ' > ' + out_name)
    with open(out_name, 'w') as out:
      try:
        # Argument list + explicit redirection: file names are never
        # interpreted by a shell.
        subprocess.call(argv, stdout=out)
      except OSError as err:
        # perl missing: report and carry on with the next file, mirroring
        # the old os.system() behaviour of not aborting the batch.
        sys.stderr.write('cannot run perl: %s\n' % err)
31 | 
def extract_score_result(file_name_temp):
  """Collect the summary values from every ``file_name_temp + '.<id>'`` file.

  Files whose last dot-separated component is not an integer are ignored.
  For each remaining file, the last 16 lines are taken, the very last of
  those lines is dropped, and from every line that contains a tab the second
  tab-separated field is kept.

  Args:
    file_name_temp: common prefix of the score files, e.g. 'pku_test.score'.

  Returns:
    A tuple ``(fileid_list, result_list)``: the integer ids in ascending
    order and, in the same order, one list of extracted strings per file.
  """
  # Local import: the rest of the file only imports sys and os.
  import glob

  scored = []
  for filename in glob.glob(file_name_temp + '.*'):
    suffix = filename.split('.')[-1]
    try:
      # Keep the real file name next to the id so names such as '.010'
      # are still found after the id has been converted to an int.
      scored.append((int(suffix), filename))
    except ValueError:
      # Not a numeric run id -- nothing to order it by, so skip it.
      continue
  scored.sort()

  fileid_list = []
  result_list = []
  for fileid, filename in scored:
    with open(filename) as handle:
      tail = handle.readlines()[-16:]
    cur_result_list = []
    # The final line of the tail is deliberately left out, as before.
    for item in tail[:-1]:
      fields = item.rstrip().split('\t')
      if len(fields) > 1:
        cur_result_list.append(fields[1])
    fileid_list.append(fileid)
    result_list.append(cur_result_list)

  return (fileid_list, result_list)
55 | 
def main():
  """Score all result files for the given template and print the summary.

  Expects the result-file template as the first command-line argument.
  Runs score_files() on it, then reads back the generated
  ``<prefix>.score.<id>`` files and prints, for each id, the id followed by
  the list of values extracted from its score file.
  """
  cli_args = sys.argv[1:]
  if not cli_args:
    print('Usage: python autoScore.py file_to_score')
    exit(-1)

  template = cli_args[0]
  score_files(template)

  # Score files are named after the text before the first '.' of the template.
  score_template = template.split('.')[0] + '.score'
  (run_ids, run_scores) = extract_score_result(score_template)

  print('id')
  column_labels = [
      'TRUE WORDS RECALL', 'TEST WORDS PRECISION',
      'TOTAL INSERTIONS', 'TOTAL DELETIONS',
      'TOTAL SUBSTITUTIONS', 'TOTAL NCHANGE',
      'TOTAL TRUE WORD COUNT', 'TOTAL TEST WORD COUNT',
      'TOTAL TRUE WORDS RECALL', 'TOTAL TEST WORDS PRECISION',
      'F MEASURE',
      'OOV Rate', 'OOV Recall Rate',
      'IV Recall Rate'
      ]
  print(column_labels)

  # One id line followed by one values line per scored file.
  for run_id, scores in zip(run_ids, run_scores):
    print(run_id)
    print(scores)
84 | 
# Run the scoring pipeline only when this file is executed as a script.
if '__main__' == __name__:
  main()
87 | 


--------------------------------------------------------------------------------
/src/cwsBMM.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Author: 赵晓凯
  5 | # Date:   2013-06-05
  6 | # Email:  minix007@foxmail.com
  7 | 
  8 | # 使用BMM 进行中文分词
  9 | # 添加了规则，用于匹配数字带来的新词
 10 | 
 11 | import codecs
 12 | import sys
 13 | 
 14 | # 由规则处理的一些特殊符号
 15 | numMath = [u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
 16 | numMath_suffix = [u'.', u'%', u'亿', u'万', u'千', u'百', u'十', u'个']
 17 | numCn = [u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九', u'○', u'零']
 18 | numCn_suffix_date = [u'年', u'月', u'日']
 19 | numCn_suffix_unit = [u'亿', u'万', u'千', u'百', u'十', u'个']
 20 | special_char = [u'(', u')']
 21 | num_char = numMath + numCn
 22 | num_suffix = numMath_suffix + numCn_suffix_unit + numCn_suffix_date
 23 | 
 24 | def proc_num_math(line, start):
 25 |     """ 处理句子中出现的数学符号 """
 26 |     oldstart = start
 27 |     start = start + 1
 28 |     while line[start] in numMath or line[start] in numMath_suffix:
 29 |         start = start + 1
 30 |     return start - oldstart
 31 | 
 32 | def proc_num_cn(line, start):
 33 |     """ 处理句子中出现的中文数字 """
 34 |     oldstart = start
 35 |     while line[start] in numCn or line[start] in numCn_suffix_unit:
 36 |         start = start + 1
 37 |     return start - oldstart
 38 | 
 39 | def rules(line, start):
 40 |     """ 处理特殊规则 """
 41 |     if line[start] in numMath or line[start] in num_suffix:
 42 |         return proc_num_math(line, start)
 43 |     elif line[start] in numCn or line[start] in num_suffix:
 44 |         return proc_num_cn(line, start)
 45 |     else:
 46 |         return 1
 47 | 
 48 | 
 49 | def genDict(path):
 50 |     """ 获取词典 """
 51 |     f = codecs.open(path,'r','utf-8')
 52 |     contents = f.read()
 53 |     contents = contents.replace(u'\r', u'')
 54 |     contents = contents.replace(u'\n', u'')
 55 |     # 将内容逆置，以便进行逆向匹配
 56 |     contents = contents[::-1]
 57 |     # 将文件内容按空格分开
 58 |     mydict = contents.split(u' ')
 59 |     # 去除词典List中的重复
 60 |     newdict = list(set(mydict))
 61 |     newdict.remove(u'')
 62 | 
 63 |     # 建立词典
 64 |     # key为词首字，value为以此字开始的词构成的List
 65 |     truedict = {}
 66 |     for item in newdict:
 67 |         if len(item)>0 and item[0] in truedict:
 68 |             value = truedict[item[0]]
 69 |             value.append(item)
 70 |             truedict[item[0]] = value
 71 |         else:
 72 |             truedict[item[0]] = [item]
 73 |     return truedict
 74 | 
def print_unicode_list(uni_list):
    """Print every item of ``uni_list`` in order using ``print item,``.

    The trailing comma is the Python 2 form that suppresses the newline
    after each item.
    """
    for item in uni_list:
        print item,
 78 | 
 79 | def divideWords(mydict, sentence, maxlen):
 80 |     """ 
 81 |     根据词典对句子进行分词,
 82 |     使用逆向匹配的算法，从右到左扫描，遇到最长的词，
 83 |     就将它切下来，直到句子被分割完闭
 84 |     """
 85 |     # 对句子逆置，以便用正向匹配算法进行实际的逆向处理
 86 |     sentence = sentence[::-1]
 87 |     ruleChar = []
 88 |     ruleChar.extend(numCn)
 89 |     ruleChar.extend(numMath)
 90 |     result = []
 91 |     start = 0
 92 |     senlen = len(sentence)
 93 |     while start < senlen:
 94 |         curword = sentence[start]
 95 |         wdlen = 1
 96 |         wdlen_rule = 1
 97 |         # 首先查看是否可以匹配特殊规则
 98 |         if curword in num_char or curword in num_suffix:
 99 |             wdlen_rule = rules(sentence, start)
100 |         # 寻找以当前字开头的最长词
101 |         if curword in mydict:
102 |             wdlen = maxlen
103 |             words = mydict[curword]
104 |             while wdlen > 1:
105 |                 end = min(start+wdlen, senlen)
106 |                 if sentence[start:end] in words:
107 |                     break
108 |                 else:
109 |                     wdlen = wdlen - 1
110 |         # 将新词使用[::-1]逆置，变为正常词序
111 |         wdlen = max(wdlen_rule, wdlen)
112 |         end = min(start+wdlen, senlen)
113 |         result.append(sentence[start:end][::-1])
114 |         start = start + wdlen
115 |     return result[::-1]
116 | 
def main():
    """Command-line entry point: segment a test file with BMM plus rules.

    Usage: ``python cwsBMM.py dict_path test_path result_path``.  The
    dictionary is built from ``dict_path``, the whole content of
    ``test_path`` is segmented with a maximum word length of 5, and the
    tokens are written to ``result_path``, each followed by one space.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        print('Usage: python ' + sys.argv[0] + ' dict_path test_path result_path')
        sys.exit(-1)
    dict_path = args[0]
    test_path = args[1]
    result_path = args[2]

    dicts = genDict(dict_path)
    # `with` closes the handles even when reading or segmenting fails.
    with codecs.open(test_path, 'r', 'utf-8') as fr:
        test = fr.read()
    result = divideWords(dicts, test, 5)
    with codecs.open(result_path, 'w', 'utf-8') as fw:
        for item in result:
            fw.write(item + ' ')
135 | 
# Segment from the command line only when this file is run as a script.
if '__main__' == __name__:
    main()
138 | 


--------------------------------------------------------------------------------
/src/cwsBMM_NRule.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Author: 赵晓凯
 5 | # Date:   2013-06-05
 6 | # Email:  minix007@foxmail.com
 7 | 
 8 | # 使用BMM 进行中文分词
 9 | 
10 | import codecs
11 | import sys
12 | 
13 | 
def genDict(path):
    """Build the reversed-word dictionary used for backward matching.

    The UTF-8 file at ``path`` is read as a list of words separated by
    spaces and/or line breaks.  The whole text is reversed, so every word is
    stored back to front and can be matched against a reversed sentence.

    Args:
        path: path of the segmented training file.

    Returns:
        A dict mapping the first character of each reversed word to the list
        of distinct reversed words that start with that character.
    """
    # `with` guarantees the handle is closed even if reading fails.
    with codecs.open(path, 'r', 'utf-8') as f:
        contents = f.read()
    # A line break separates words just like a space does; deleting it would
    # glue the last word of a line to the first word of the next one.
    contents = contents.replace(u'\r', u' ').replace(u'\n', u' ')
    # Reverse the text so that backward matching can be done front to back.
    contents = contents[::-1]

    # key: first character of the word, value: words starting with it.
    truedict = {}
    for item in set(contents.split(u' ')):
        # Consecutive separators produce empty tokens; skipping them here
        # also covers files that contain no empty token at all.
        if item:
            truedict.setdefault(item[0], []).append(item)
    return truedict
39 | 
def print_unicode_list(uni_list):
    """Print every item of ``uni_list`` in order using ``print item,``.

    The trailing comma is the Python 2 form that suppresses the newline
    after each item.
    """
    for item in uni_list:
        print item,
43 | 
def divideWords(mydict, sentence, maxlen):
    """Segment ``sentence`` with dictionary-only backward maximum matching.

    The text is reversed and consumed front to back, which amounts to
    scanning the original from right to left.  At each position the longest
    entry of ``mydict`` (tried from ``maxlen`` characters downwards) is cut
    off; a character without a matching entry becomes a token by itself.

    Args:
        mydict: first character of a reversed word -> collection of
            reversed words, as produced by genDict().
        sentence: the text to segment.
        maxlen: longest word length to try.

    Returns:
        The tokens in original reading order.
    """
    flipped = sentence[::-1]
    total = len(flipped)
    pieces = []
    pos = 0
    while pos < total:
        head = flipped[pos]
        length = 1
        if head in mydict:
            candidates = mydict[head]
            length = maxlen
            # Shrink the window until it holds a known word or one character.
            while length > 1 and flipped[pos:min(pos + length, total)] not in candidates:
                length -= 1
        # Guarantee forward progress even for a non-positive maxlen.
        length = max(1, length)
        stop = min(pos + length, total)
        # Flip the token back into normal reading order.
        pieces.append(flipped[pos:stop][::-1])
        pos += length
    pieces.reverse()
    return pieces
76 | 
def main():
    """Command-line entry point: segment a test file with plain BMM.

    Usage: ``python cwsBMM_NRule.py dict_path test_path result_path``.  The
    dictionary is built from ``dict_path``, the whole content of
    ``test_path`` is segmented with a maximum word length of 5, and the
    tokens are written to ``result_path``, each followed by one space.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        # The space after 'python' was missing, which fused the interpreter
        # name and the script name in the usage message.
        print('Usage: python ' + sys.argv[0] + ' dict_path test_path result_path')
        sys.exit(-1)
    dict_path = args[0]
    test_path = args[1]
    result_path = args[2]

    dicts = genDict(dict_path)
    # `with` closes the handles even when reading or segmenting fails.
    with codecs.open(test_path, 'r', 'utf-8') as fr:
        test = fr.read()
    result = divideWords(dicts, test, 5)
    with codecs.open(result_path, 'w', 'utf-8') as fw:
        for item in result:
            fw.write(item + ' ')
95 | 
# Segment from the command line only when this file is run as a script.
if '__main__' == __name__:
    main()
98 | 


--------------------------------------------------------------------------------
/src/cwsFMM.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Author: 赵晓凯
  5 | # Date:   2013-06-05
  6 | # Email:  minix007@foxmail.com
  7 | 
  8 | import codecs
  9 | import sys
 10 | 
 11 | # 由规则处理的一些特殊符号
 12 | numMath = [u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
 13 | numMath_suffix = [u'.', u'%', u'亿', u'万', u'千', u'百', u'十', u'个']
 14 | numCn = [u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九', u'○', u'零']
 15 | numCn_suffix_date = [u'年', u'月', u'日']
 16 | numCn_suffix_unit = [u'亿', u'万', u'千', u'百', u'十', u'个']
 17 | special_char = [u'(', u')']
 18 | 
 19 | 
 20 | def proc_num_math(line, start):
 21 |     """ 处理句子中出现的数学符号 """
 22 |     oldstart = start
 23 |     while line[start] in numMath or line[start] in numMath_suffix:
 24 |         start = start + 1
 25 |     if line[start] in numCn_suffix_date:
 26 |         start = start + 1
 27 |     return start - oldstart
 28 | 
 29 | def proc_num_cn(line, start):
 30 |     """ 处理句子中出现的中文数字 """
 31 |     oldstart = start
 32 |     while line[start] in numCn or line[start] in numCn_suffix_unit:
 33 |         start = start + 1
 34 |     if line[start] in numCn_suffix_date:
 35 |         start = start + 1
 36 |     return start - oldstart
 37 | 
 38 | def rules(line, start):
 39 |     """ 处理特殊规则 """
 40 |     if line[start] in numMath:
 41 |         return proc_num_math(line, start)
 42 |     elif line[start] in numCn:
 43 |         return proc_num_cn(line, start)
 44 | 
 45 | def genDict(path):
 46 |     """ 获取词典 """
 47 |     f = codecs.open(path,'r','utf-8')
 48 |     contents = f.read()
 49 |     contents = contents.replace(u'\r', u'')
 50 |     contents = contents.replace(u'\n', u'')
 51 |     # 将文件内容按空格分开
 52 |     mydict = contents.split(u' ')
 53 |     # 去除词典List中的重复
 54 |     newdict = list(set(mydict))
 55 |     newdict.remove(u'')
 56 | 
 57 |     # 建立词典
 58 |     # key为词首字，value为以此字开始的词构成的List
 59 |     truedict = {}
 60 |     for item in newdict:
 61 |         if len(item)>0 and item[0] in truedict:
 62 |             value = truedict[item[0]]
 63 |             value.append(item)
 64 |             truedict[item[0]] = value
 65 |         else:
 66 |             truedict[item[0]] = [item]
 67 |     return truedict
 68 | 
 69 | def print_unicode_list(uni_list):
 70 |     for item in uni_list:
 71 |         print item,
 72 | 
 73 | def divideWords(mydict, sentence):
 74 |     """ 
 75 |     根据词典对句子进行分词,
 76 |     使用正向匹配的算法，从左到右扫描，遇到最长的词，
 77 |     就将它切下来，直到句子被分割完闭
 78 |     """
 79 |     ruleChar = []
 80 |     ruleChar.extend(numCn)
 81 |     ruleChar.extend(numMath)
 82 |     result = []
 83 |     start = 0
 84 |     senlen = len(sentence)
 85 |     while start < senlen:
 86 |         curword = sentence[start]
 87 |         maxlen = 1
 88 |         # 首先查看是否可以匹配特殊规则
 89 |         if curword in numCn or curword in numMath:
 90 |             maxlen = rules(sentence, start)
 91 |         # 寻找以当前字开头的最长词
 92 |         if curword in mydict:
 93 |             words = mydict[curword]
 94 |             for item in words:
 95 |                 itemlen = len(item)
 96 |                 if sentence[start:start+itemlen] == item and itemlen > maxlen and itemlen <= 5:
 97 |                     maxlen = itemlen
 98 | 
 99 |         result.append(sentence[start:start+maxlen])
100 |         start = start + maxlen
101 |     return result
102 | 
def main():
    """
    Command-line entry point.

    Expects three positional arguments: dict_path test_path result_path.
    Builds the dictionary, segments the whole test file and writes every
    resulting word to the result file followed by a single space.
    Prints a usage line and exits with status -1 when arguments are missing.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        # Single-argument print(...) behaves identically under the Python 2
        # print statement and the Python 3 print function.
        print('Usage: python ' + sys.argv[0] + ' dict_path test_path result_path')
        # sys.exit instead of the interactive-only `exit` helper from `site`.
        sys.exit(-1)
    dict_path = args[0]
    test_path = args[1]
    result_path = args[2]

    dicts = genDict(dict_path)
    # `with` closes the handles even if reading or segmentation raises.
    with codecs.open(test_path, 'r', 'utf-8') as fr:
        test = fr.read()
    result = divideWords(dicts, test)
    with codecs.open(result_path, 'w', 'utf-8') as fw:
        for item in result:
            fw.write(item + ' ')

if __name__ == "__main__":
    main()
124 | 


--------------------------------------------------------------------------------
/src/cwsFMM_NRule.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Author: 赵晓凯
 5 | # Date:   2013-06-05
 6 | # Email:  minix007@foxmail.com
 7 | 
 8 | import codecs
 9 | import sys
10 | 
def genDict(path):
    """
    Build the segmentation dictionary from a space-separated word file.

    path: UTF-8 text file whose words are separated by spaces.  Line breaks
          are treated as separators as well.
    Returns a dict that maps the first character of a word to the list of
    distinct words starting with that character (list order is unspecified).
    """
    # `with` closes the file even when reading or decoding fails.
    with codecs.open(path, 'r', 'utf-8') as f:
        contents = f.read()
    # Turn line breaks into spaces rather than deleting them: deleting glued
    # the last word of a line onto the first word of the following line.
    contents = contents.replace(u'\r', u' ').replace(u'\n', u' ')
    # Deduplicate; discard() tolerates input that produced no empty token,
    # where list.remove(u'') used to raise ValueError.
    words = set(contents.split(u' '))
    words.discard(u'')

    # key: first character of a word, value: list of words starting with it
    truedict = {}
    for item in words:
        truedict.setdefault(item[0], []).append(item)
    return truedict
34 | 
35 | def print_unicode_list(uni_list):
36 |     for item in uni_list:
37 |         print item,
38 | 
def divideWords(mydict, sentence, max_word_len=5):
    """
    Segment `sentence` with forward maximum matching.

    The text is scanned left to right; at every position the longest
    dictionary word is cut off (a single character when nothing matches)
    until the whole text is consumed.

    mydict:       dict mapping a first character to the list of dictionary
                  words starting with it (as produced by genDict).
    sentence:     unicode text to segment.
    max_word_len: longest dictionary word that may be matched; the default 5
                  is the limit that used to be hard-coded.
    Returns the list of segments in text order.
    """
    result = []
    start = 0
    senlen = len(sentence)
    while start < senlen:
        maxlen = 1
        # Find the longest dictionary word that starts at this position
        for item in mydict.get(sentence[start], ()):
            itemlen = len(item)
            if maxlen < itemlen <= max_word_len and sentence.startswith(item, start):
                maxlen = itemlen
        result.append(sentence[start:start+maxlen])
        start = start + maxlen
    return result
61 | 
def main():
    """
    Command-line entry point.

    Expects three positional arguments: dict_path test_path result_path.
    Builds the dictionary, segments the whole test file and writes every
    resulting word to the result file followed by a single space.
    Prints a usage line and exits with status -1 when arguments are missing.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        # Single-argument print(...) behaves identically under the Python 2
        # print statement and the Python 3 print function.
        print('Usage: python ' + sys.argv[0] + ' dict_path test_path result_path')
        # sys.exit instead of the interactive-only `exit` helper from `site`.
        sys.exit(-1)
    dict_path = args[0]
    test_path = args[1]
    result_path = args[2]

    dicts = genDict(dict_path)
    # `with` closes the handles even if reading or segmentation raises.
    with codecs.open(test_path, 'r', 'utf-8') as fr:
        test = fr.read()
    result = divideWords(dicts, test)
    with codecs.open(result_path, 'w', 'utf-8') as fw:
        for item in result:
            fw.write(item + ' ')

if __name__ == "__main__":
    main()
83 | 


--------------------------------------------------------------------------------
/src/cwsMaxEn-13f.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Author: 赵晓凯
  5 | # Date:   2013-06-05
  6 | # Email:  minix007@foxmail.com
  7 | 
  8 | # 基于最大熵模型及字标注的分词工具，13特征版本
  9 | 
 10 | import codecs
 11 | import sys
 12 | from maxent import MaxentModel
 13 | 
 14 | def tag_training_set(training_file, tag_training_set_file):
 15 |     f = codecs.open(training_file,'r','utf-8')
 16 |     contents = f.read()
 17 |     contents = contents.replace(u'\r', u'')
 18 |     contents = contents.replace(u'\n', u'')
 19 |     # 将文件内容按空格分开
 20 |     words = contents.split(' ')
 21 |     print len(words)
 22 |     
 23 |     tag_words_list = []
 24 |     i = 0
 25 |     for word in words:
 26 |       i += 1
 27 |       if (i % 100 == 0): tag_words_list.append(u'\r')
 28 |       if(len(word) == 0):
 29 |         continue
 30 |       if(len(word) == 1):
 31 |         tag_word = word + '/S'
 32 |       elif(len(word) == 2):
 33 |         tag_word = word[0] + '/B' + word[1] + '/E'
 34 |       elif(len(word) == 3):
 35 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/E'
 36 |       elif(len(word) == 4):
 37 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/D' + word[3] + '/E'
 38 |       else:
 39 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/D'
 40 |         mid_words = word[3:-1]
 41 |         for mid_word in mid_words:
 42 |           tag_word += (mid_word + '/M')
 43 |         tag_word += (word[-1] + '/E')
 44 | 
 45 |       tag_words_list.append(tag_word)
 46 | 
 47 |     tag_words = ''.join(tag_words_list)
 48 |     fw = codecs.open(tag_training_set_file, 'w', 'utf-8')
 49 |     fw.write(tag_words)
 50 |     fw.close()
 51 | 
 52 |     return (words,tag_words_list)
 53 | 
 54 | def get_near_char(contents, i, times):
 55 |     words_len = len(contents) / times;
 56 |     if (i<0 or i >words_len-1): return '_'
 57 |     else: return contents[i*times]
 58 | 
 59 | def get_near_tag(contents, i, times):
 60 |     words_len = len(contents) / times;
 61 |     if (i<0 or i >words_len-1): return '_'
 62 |     else: return contents[i*times+2]
 63 | 
 64 | def isPu(char):
 65 |     punctuation = [u'，', u'。', u'？', u'！', u'；', u'－', u'、', u'—', u'（',u'）',u'《', u'》',u'：',
 66 |         u'“',u'”',u'’',u'‘']
 67 |     if char in punctuation:
 68 |       return '1'
 69 |     else:
 70 |       return '0'
 71 | 
 72 | def get_class(char):
 73 |     zh_num = [u'零',u'○',u'一', u'二',u'三',u'四',u'五',u'六',u'七',u'八',u'九',u'十',u'百',u'千',u'万']
 74 |     ar_num = [u'0',u'1',u'2',u'3',u'4',u'5',u'6',u'7',u'8',u'9',u'.',
 75 |               u'０',u'１',u'２',u'３',u'４',u'５',u'６',u'７',u'８',u'９']
 76 |     date = [u'日', u'年', u'月']
 77 |     letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
 78 |         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
 79 |     if char in zh_num or char in ar_num: 
 80 |       return '1'
 81 |     elif char in date: 
 82 |       return '2'
 83 |     elif char in letter:
 84 |       return '3'
 85 |     else:
 86 |       return '4'
 87 | 
 88 | 
 89 | def get_event(tag_file_path, event_file_path):
 90 |     f = codecs.open(tag_file_path,'r','utf-8')
 91 |     contents = f.read()
 92 |     contents = contents.replace(u'\r', u'')
 93 |     contents = contents.replace(u'\n', u'')
 94 |     words_len = len(contents)/3
 95 |     event_list = []
 96 | 
 97 |     index = range(0,words_len)
 98 |     for i in index:
 99 |       pre_char = get_near_char(contents,i-1,3)
100 |       pre_pre_char = get_near_char(contents,i-2,3)
101 |       cur_char = get_near_char(contents,i,3)
102 |       next_char = get_near_char(contents,i+1,3)
103 |       next_next_char = get_near_char(contents,i+2,3)
104 |       event_list.append(
105 |           contents[i*3+2] + ' '
106 |           + 'C-2='+pre_pre_char + ' ' + 'C-1='+pre_char + ' ' 
107 |           + ' ' + 'C0='+cur_char + ' ' 
108 |           + 'C1='+next_char + ' ' + 'C2='+next_next_char + ' '
109 |           + 'C-2='+pre_pre_char+'C-1='+pre_char + ' '
110 |           + 'C-1='+pre_char+'C0='+cur_char + ' ' 
111 |           + 'C0='+cur_char+'C1='+next_char + ' '
112 |           + 'C1='+next_char+'C2='+next_next_char + ' '
113 |           + 'C-1='+pre_char+'C1='+next_char + ' '
114 |           + 'Pu='+isPu(cur_char) + ' '
115 |           + 'TC-2='+get_class(pre_pre_char)+'TC-1='+get_class(pre_char)
116 |           + 'TC0='+get_class(cur_char)+'TC1='+get_class(next_char)
117 |           + 'TC2='+get_class(next_next_char) + ' '
118 |           + 'T-1='+get_near_tag(contents,i-1,3) + ' '
119 |           + 'T-2='+get_near_tag(contents,i-2,3)
120 |           + '\r')
121 | 
122 |    
123 |     #events = ''.join(event_list)
124 |     fw = codecs.open(event_file_path, 'w', 'utf-8')
125 |     for event in event_list:
126 |       fw.write(event)
127 |     fw.close()
128 | 
129 |     return event_list
130 | 
def get_feature(test_file_path, feature_file_path):
    """
    Generate context features for every character of the test file.

    For each line one feature string per character is written (each
    terminated by '\r'), followed by a 'split\r\n' marker line.

    Returns the feature list of the last processed line (an empty list when
    the file yields no lines), which matches what was returned before.
    """
    with codecs.open(test_file_path, 'r', 'utf-8') as f:
        contents = f.read()
    # Accept bare LF line endings in addition to CRLF
    contents_list = contents.replace('\r\n', '\n').split('\n')
    # Drop at most two empty entries as before, but without raising
    # ValueError when the input contains fewer than two of them
    for _ in range(2):
        if '' in contents_list:
            contents_list.remove('')

    feature_list = []  # stays defined when there is no line at all
    with codecs.open(feature_file_path, 'w', 'utf-8') as fw:
        for line in contents_list:
            feature_list = []
            for i in range(len(line)):
                pre_pre_char = get_near_char(line, i-2, 1)
                pre_char = get_near_char(line, i-1, 1)
                cur_char = get_near_char(line, i, 1)
                next_char = get_near_char(line, i+1, 1)
                next_next_char = get_near_char(line, i+2, 1)
                fields = [
                    'C-2=' + pre_pre_char,
                    'C-1=' + pre_char,
                    'C0=' + cur_char,
                    'C1=' + next_char,
                    'C2=' + next_next_char,
                    'C-2=' + pre_pre_char + 'C-1=' + pre_char,
                    'C-1=' + pre_char + 'C0=' + cur_char,
                    'C0=' + cur_char + 'C1=' + next_char,
                    'C1=' + next_char + 'C2=' + next_next_char,
                    'C-1=' + pre_char + 'C1=' + next_char,
                    'Pu=' + isPu(cur_char),
                    'TC-2=' + get_class(pre_pre_char) + 'TC-1=' + get_class(pre_char)
                    + 'TC0=' + get_class(cur_char) + 'TC1=' + get_class(next_char)
                    + 'TC2=' + get_class(next_next_char),
                    '',  # keeps the trailing space before the '\r' terminator
                ]
                feature_list.append(' '.join(fields) + '\r')

            for item in feature_list:
                fw.write(item)
            fw.write('split\r\n')

    return feature_list
172 | 
def split_by_blank(line):
    """ Return the elements of `line` at even positions (0, 2, 4, ...) as a list """
    return list(line[::2])
182 | 
def training(feature_file_path, trained_model_file, times):
    """
    Train a MaxentModel from an event file and save it.

    Each line of the event file is stripped of trailing whitespace and split
    on single spaces; the first token is used as the outcome and the
    remaining tokens as the context of one event.  `times` is passed as the
    first argument of MaxentModel.train together with "lbfgs" (presumably
    the iteration count -- TODO confirm against the maxent package).

    Returns the list of UTF-8 encoded token lists, one per event line.
    """
    m = MaxentModel()
    all_list = []
    m.begin_add_event()
    # `with` closes the event file, which was previously left open.
    with codecs.open(feature_file_path, 'r', 'utf-8') as fr:
        for line in fr:
            str_list = [item.encode('utf-8') for item in line.rstrip().split(' ')]
            all_list.append(str_list)
            m.add_event(str_list[1:], str_list[0], 1)
    m.end_add_event()
    print('begin training')
    m.train(times, "lbfgs")
    print('end training')
    m.save(trained_model_file)
    return all_list
202 | 
def max_prob(label_prob_list):
    """
    Return the label of the (label, probability) pair with the highest
    probability.  The first pair wins on ties; '' is returned when the list
    is empty or no probability is greater than 0.
    """
    winner = ('', 0)
    for candidate in label_prob_list:
        if candidate[1] > winner[1]:
            winner = (candidate[0], candidate[1])
    return winner[0]
212 | 
def tag_test(test_feature_file, trained_model_file,  tag_test_set_file):
    """
    Label every character of the test set with the trained model.

    The feature file is split on '\r'.  A 'split' entry marks the end of an
    input line and is written out as '\n\n\n'; every other entry is one
    character's feature string, extended with the two previously predicted
    labels (T-1, T-2) before the model is evaluated.  For each character
    "<char>/<label>" is written to `tag_test_set_file`.

    Returns the list of raw entries read from the feature file.
    """
    m = MaxentModel()
    m.load(trained_model_file)
    with codecs.open(test_feature_file, 'r', 'utf-8') as fr:
        contents = fr.read()
    feature_list = contents.split('\r')
    # Tolerate a feature file without a stand-alone '\n' entry
    if '\n' in feature_list:
        feature_list.remove('\n')
    pre_tag = '_'
    pre_pre_tag = '_'
    with codecs.open(tag_test_set_file, 'w', 'utf-8') as fw:
        for feature in feature_list:
            # The '\n' of a preceding 'split\r\n' marker is glued to the front
            # of the next entry; strip it so the first feature (and a 'split'
            # that follows an empty line) is recognised correctly.
            feature = feature.lstrip('\n')
            if not feature:
                continue
            if feature == 'split':
                fw.write('\n\n\n')
                continue
            tokens = [item.encode('utf-8') for item in feature.split(' ')]
            # tokens[2] is the 'C0=<char>' feature; split only once so that a
            # literal '=' character is kept.
            cur = tokens[2].split('='.encode('utf-8'), 1) if len(tokens) > 2 else []
            if len(cur) < 2:
                # Malformed entry: report it and skip instead of writing a
                # stale or undefined tag.
                print(tokens)
                continue
            str_feature = tokens + ['T-1=' + pre_tag, 'T-2=' + pre_pre_tag]
            label = max_prob(m.eval_all(str_feature))
            new_tag = cur[1] + '/' + label
            fw.write(new_tag.decode('utf-8'))
            pre_pre_tag = pre_tag
            pre_tag = label
    return feature_list
245 | 
def tag_to_words(tag_training_set_file, result_file):
    """
    Rebuild segmented text from a tagged file.

    The input is read as consecutive 3-character units "<char>/<label>".
    A character labelled S or E ends a word and is followed by a space; a
    character with any other label is copied as is.  The result is written
    to `result_file`.
    """
    # Both files are now closed deterministically.
    with codecs.open(tag_training_set_file, 'r', 'utf-8') as fr:
        contents = fr.read()

    result = []
    for i in range(len(contents) // 3):
        cur_word = contents[i*3]
        cur_word_label = contents[i*3+2]
        if cur_word_label in ('S', 'E'):
            result.append(cur_word + ' ')
        else:
            result.append(cur_word)

    with codecs.open(result_file, 'w', 'utf-8') as fw:
        fw.write(''.join(result))
274 | 
275 | 
def main():
    """
    Command-line entry point: training_file test_file result_file.

    Runs the whole pipeline -- tag the training set, extract training events,
    extract test features, train the model, tag the test set and convert the
    tags back to words.  Intermediate files are written next to the inputs
    with the suffixes .tag, .feature and .<iterations>.model; the final
    result goes to result_file + '.<iterations>'.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        # Single-argument print(...) behaves identically under the Python 2
        # print statement and the Python 3 print function.
        print('Usage: python ' + sys.argv[0] + ' training_file test_file result_file')
        # sys.exit instead of the interactive-only `exit` helper from `site`.
        sys.exit(-1)
    training_file = args[0]
    test_file = args[1]
    result_file = args[2]

    # Tag the training set
    tag_training_set_file = training_file + ".tag"
    tag_training_set(training_file, tag_training_set_file)
    print('tag training set succeed')

    # Extract the training events
    feature_file_path = training_file + ".feature"
    get_event(tag_training_set_file, feature_file_path)
    print('get training set features succeed')

    # Generate the test set features
    test_feature_file = test_file + ".feature"
    get_feature(test_file, test_feature_file)
    print('get test set features succeed')

    # Train one model per configured value and evaluate it
    times = [1000]
    for iterations in times:
        trained_model_file = training_file + '.' + str(iterations) + ".model"
        training(feature_file_path, trained_model_file, iterations)
        print('training model succeed:' + str(iterations))

        # Tag the test set
        tag_test_set_file = test_file + ".tag"
        tag_test(test_feature_file, trained_model_file,  tag_test_set_file)
        print('tag test set succeed')

        # Produce the final result
        tag_to_words(tag_test_set_file, result_file+'.'+str(iterations))
        print('get final result succeed ' + result_file + '.'+str(iterations))

if __name__ == "__main__":
    main()
319 | 
320 | 


--------------------------------------------------------------------------------
/src/cwsMaxEn-16f.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Author: 赵晓凯
  5 | # Date:   2013-06-05
  6 | # Email:  minix007@foxmail.com
  7 | 
  8 | # 基于最大熵模型及字标注的分词工具，16特征版本
  9 | 
 10 | import codecs
 11 | import sys
 12 | from maxent import MaxentModel
 13 | 
 14 | def tag_training_set(training_file, tag_training_set_file):
 15 |     f = codecs.open(training_file,'r','utf-8')
 16 |     contents = f.read()
 17 |     contents = contents.replace(u'\r', u'')
 18 |     contents = contents.replace(u'\n', u'')
 19 |     # 将文件内容按空格分开
 20 |     words = contents.split(' ')
 21 |     print len(words)
 22 |     
 23 |     tag_words_list = []
 24 |     i = 0
 25 |     for word in words:
 26 |       i += 1
 27 |       if (i % 100 == 0): tag_words_list.append(u'\r')
 28 |       if(len(word) == 0):
 29 |         continue
 30 |       if(len(word) == 1):
 31 |         tag_word = word + '/S'
 32 |       elif(len(word) == 2):
 33 |         tag_word = word[0] + '/B' + word[1] + '/E'
 34 |       elif(len(word) == 3):
 35 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/E'
 36 |       elif(len(word) == 4):
 37 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/D' + word[3] + '/E'
 38 |       else:
 39 |         tag_word = word[0] + '/B' + word[1] + '/C' + word[2] + '/D'
 40 |         mid_words = word[3:-1]
 41 |         for mid_word in mid_words:
 42 |           tag_word += (mid_word + '/M')
 43 |         tag_word += (word[-1] + '/E')
 44 | 
 45 |       tag_words_list.append(tag_word)
 46 | 
 47 |     tag_words = ''.join(tag_words_list)
 48 |     fw = codecs.open(tag_training_set_file, 'w', 'utf-8')
 49 |     fw.write(tag_words)
 50 |     fw.close()
 51 | 
 52 |     return (words,tag_words_list)
 53 | 
 54 | def get_near_char(contents, i, times):
 55 |     words_len = len(contents) / times;
 56 |     if (i<0 or i >words_len-1): return '_'
 57 |     else: return contents[i*times]
 58 | 
 59 | def get_near_tag(contents, i, times):
 60 |     words_len = len(contents) / times;
 61 |     if (i<0 or i >words_len-1): return '_'
 62 |     else: return contents[i*times+2]
 63 | 
 64 | def isPu(char):
 65 |     punctuation = [u'，', u'。', u'？', u'！', u'；', u'－－', u'、', u'——', u'（',u'）',u'《', u'》',u'：',
 66 |         u'“',u'”',u'’',u'‘']
 67 |     if char in punctuation:
 68 |       return '1'
 69 |     else:
 70 |       return '0'
 71 | 
 72 | def get_class(char):
 73 |     zh_num = [u'零',u'○',u'一', u'二',u'三',u'四',u'五',u'六',u'七',u'八',u'九',u'十',u'百',u'千',u'万']
 74 |     ar_num = [u'0',u'1',u'2',u'3',u'4',u'5',u'6',u'7',u'8',u'9',u'.',
 75 |               u'０',u'１',u'２',u'３',u'４',u'５',u'６',u'７',u'８',u'９']
 76 |     date = [u'日', u'年', u'月']
 77 |     letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
 78 |         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
 79 |     if char in zh_num or char in ar_num: 
 80 |       return '1'
 81 |     elif char in date: 
 82 |       return '2'
 83 |     elif char in letter:
 84 |       return '3'
 85 |     else:
 86 |       return '4'
 87 | 
 88 | 
 89 | def get_event(tag_file_path, event_file_path):
 90 |     f = codecs.open(tag_file_path,'r','utf-8')
 91 |     contents = f.read()
 92 |     contents = contents.replace(u'\r', u'')
 93 |     contents = contents.replace(u'\n', u'')
 94 |     words_len = len(contents)/3
 95 |     event_list = []
 96 | 
 97 |     index = range(0,words_len)
 98 |     for i in index:
 99 |       pre_char = get_near_char(contents,i-1,3)
100 |       pre_pre_char = get_near_char(contents,i-2,3)
101 |       cur_char = get_near_char(contents,i,3)
102 |       next_char = get_near_char(contents,i+1,3)
103 |       next_next_char = get_near_char(contents,i+2,3)
104 |       event_list.append(
105 |           contents[i*3+2] + ' '
106 |           + 'C-2='+pre_pre_char + ' ' + 'C-1='+pre_char + ' ' 
107 |           + ' ' + 'C0='+cur_char + ' ' 
108 |           + 'C1='+next_char + ' ' + 'C2='+next_next_char + ' '
109 |           + 'C-2='+pre_pre_char+'C-1='+pre_char + ' '
110 |           + 'C-1='+pre_char+'C0='+cur_char + ' ' 
111 |           + 'C0='+cur_char+'C1='+next_char + ' '
112 |           + 'C1='+next_char+'C2='+next_next_char + ' '
113 |           + 'C-1='+pre_char+'C1='+next_char + ' '
114 |           + 'C-2='+pre_pre_char+'C-1='+pre_char+'C0='+cur_char + ' '
115 |           + 'C-1='+pre_char+'C0='+cur_char + 'C1='+next_char + ' ' 
116 |           + 'C0='+cur_char + 'C1='+next_char + 'C2='+next_next_char + ' '
117 |           + 'Pu='+isPu(cur_char) + ' '
118 |           + 'TC-2='+get_class(pre_pre_char)+'TC-1='+get_class(pre_char)
119 |           + 'TC0='+get_class(cur_char)+'TC1='+get_class(next_char)
120 |           + 'TC2='+get_class(next_next_char) + ' '
121 |           + '\r')
122 | 
123 |    
124 |     #events = ''.join(event_list)
125 |     fw = codecs.open(event_file_path, 'w', 'utf-8')
126 |     for event in event_list:
127 |       fw.write(event)
128 |     fw.close()
129 | 
130 |     return event_list
131 | 
def get_feature(test_file_path, feature_file_path):
    """
    Generate context features for every character of the test file.

    For each line one feature string per character is written (each
    terminated by '\r'), followed by a 'split\r\n' marker line.

    Returns the feature list of the last processed line (an empty list when
    the file yields no lines), which matches what was returned before.
    """
    with codecs.open(test_file_path, 'r', 'utf-8') as f:
        contents = f.read()
    # Accept bare LF line endings in addition to CRLF
    contents_list = contents.replace('\r\n', '\n').split('\n')
    # Drop at most two empty entries as before, but without raising
    # ValueError when the input contains fewer than two of them
    for _ in range(2):
        if '' in contents_list:
            contents_list.remove('')

    feature_list = []  # stays defined when there is no line at all
    with codecs.open(feature_file_path, 'w', 'utf-8') as fw:
        for line in contents_list:
            feature_list = []
            for i in range(len(line)):
                pre_pre_char = get_near_char(line, i-2, 1)
                pre_char = get_near_char(line, i-1, 1)
                cur_char = get_near_char(line, i, 1)
                next_char = get_near_char(line, i+1, 1)
                next_next_char = get_near_char(line, i+2, 1)
                fields = [
                    'C-2=' + pre_pre_char,
                    'C-1=' + pre_char,
                    'C0=' + cur_char,
                    'C1=' + next_char,
                    'C2=' + next_next_char,
                    'C-2=' + pre_pre_char + 'C-1=' + pre_char,
                    'C-1=' + pre_char + 'C0=' + cur_char,
                    'C0=' + cur_char + 'C1=' + next_char,
                    'C1=' + next_char + 'C2=' + next_next_char,
                    'C-1=' + pre_char + 'C1=' + next_char,
                    'C-2=' + pre_pre_char + 'C-1=' + pre_char + 'C0=' + cur_char,
                    'C-1=' + pre_char + 'C0=' + cur_char + 'C1=' + next_char,
                    'C0=' + cur_char + 'C1=' + next_char + 'C2=' + next_next_char,
                    'Pu=' + isPu(cur_char),
                    'TC-2=' + get_class(pre_pre_char) + 'TC-1=' + get_class(pre_char)
                    + 'TC0=' + get_class(cur_char) + 'TC1=' + get_class(next_char)
                    + 'TC2=' + get_class(next_next_char),
                    '',  # keeps the trailing space before the '\r' terminator
                ]
                feature_list.append(' '.join(fields) + '\r')

            for item in feature_list:
                fw.write(item)
            fw.write('split\r\n')

    return feature_list
176 | 
def split_by_blank(line):
    """ Return the elements of `line` at even positions (0, 2, 4, ...) as a list """
    return list(line[::2])
186 | 
def training(feature_file_path, trained_model_file, times):
    """
    Train a MaxentModel from an event file and save it.

    Each line of the event file is stripped of trailing whitespace and split
    on single spaces; the first token is used as the outcome and the
    remaining tokens as the context of one event.  `times` is passed as the
    first argument of MaxentModel.train together with "lbfgs" (presumably
    the iteration count -- TODO confirm against the maxent package).

    Returns the list of UTF-8 encoded token lists, one per event line.
    """
    m = MaxentModel()
    all_list = []
    m.begin_add_event()
    # `with` closes the event file, which was previously left open.
    with codecs.open(feature_file_path, 'r', 'utf-8') as fr:
        for line in fr:
            str_list = [item.encode('utf-8') for item in line.rstrip().split(' ')]
            all_list.append(str_list)
            m.add_event(str_list[1:], str_list[0], 1)
    m.end_add_event()
    print('begin training')
    m.train(times, "lbfgs")
    print('end training')
    m.save(trained_model_file)
    return all_list
206 | 
def max_prob(label_prob_list):
    """
    Return the label of the (label, probability) pair with the highest
    probability.  The first pair wins on ties; '' is returned when the list
    is empty or no probability is greater than 0.
    """
    winner = ('', 0)
    for candidate in label_prob_list:
        if candidate[1] > winner[1]:
            winner = (candidate[0], candidate[1])
    return winner[0]
216 | 
def tag_test(test_feature_file, trained_model_file,  tag_test_set_file):
    """
    Label every character of the test set with the trained model.

    The feature file is split on '\r'.  A 'split' entry marks the end of an
    input line and is written out as '\n\n\n'; every other entry is one
    character's feature string, which is evaluated by the model.  For each
    character "<char>/<label>" is written to `tag_test_set_file`.

    Returns the list of raw entries read from the feature file.
    """
    m = MaxentModel()
    m.load(trained_model_file)
    with codecs.open(test_feature_file, 'r', 'utf-8') as fr:
        contents = fr.read()
    feature_list = contents.split('\r')
    # Tolerate a feature file without a stand-alone '\n' entry
    if '\n' in feature_list:
        feature_list.remove('\n')
    with codecs.open(tag_test_set_file, 'w', 'utf-8') as fw:
        for feature in feature_list:
            # The '\n' of a preceding 'split\r\n' marker is glued to the front
            # of the next entry; strip it so the first feature (and a 'split'
            # that follows an empty line) is recognised correctly.
            feature = feature.lstrip('\n')
            if not feature:
                continue
            if feature == 'split':
                fw.write('\n\n\n')
                continue
            str_feature = [item.encode('utf-8') for item in feature.split(' ')]
            # str_feature[2] is the 'C0=<char>' feature; split only once so
            # that a literal '=' character is kept.
            cur = str_feature[2].split('='.encode('utf-8'), 1) if len(str_feature) > 2 else []
            if len(cur) < 2:
                # Malformed entry: report it and skip instead of writing a
                # stale or undefined tag.
                print(str_feature)
                continue
            label = max_prob(m.eval_all(str_feature))
            new_tag = cur[1] + '/' + label
            fw.write(new_tag.decode('utf-8'))
    return feature_list
243 | 
def tag_to_words(tag_training_set_file, result_file):
    """
    Rebuild segmented text from a tagged file.

    The input is read as consecutive 3-character units "<char>/<label>".
    A character labelled S or E ends a word and is followed by a space; a
    character with any other label is copied as is.  The result is written
    to `result_file`.
    """
    # Both files are now closed deterministically.
    with codecs.open(tag_training_set_file, 'r', 'utf-8') as fr:
        contents = fr.read()

    result = []
    for i in range(len(contents) // 3):
        cur_word = contents[i*3]
        cur_word_label = contents[i*3+2]
        if cur_word_label in ('S', 'E'):
            result.append(cur_word + ' ')
        else:
            result.append(cur_word)

    with codecs.open(result_file, 'w', 'utf-8') as fw:
        fw.write(''.join(result))
272 | 
273 | 
def main():
    """
    Command-line entry point: training_file test_file result_file.

    Runs the whole pipeline -- tag the training set, extract training events,
    extract test features, train the model, tag the test set and convert the
    tags back to words.  Intermediate files are written next to the inputs
    with the suffixes .tag, .feature and .<iterations>.model; the final
    result goes to result_file + '.<iterations>'.
    """
    args = sys.argv[1:]
    if len(args) < 3:
        # Single-argument print(...) behaves identically under the Python 2
        # print statement and the Python 3 print function.
        print('Usage: python ' + sys.argv[0] + ' training_file test_file result_file')
        # sys.exit instead of the interactive-only `exit` helper from `site`.
        sys.exit(-1)
    training_file = args[0]
    test_file = args[1]
    result_file = args[2]

    # Tag the training set
    tag_training_set_file = training_file + ".tag"
    tag_training_set(training_file, tag_training_set_file)
    print('tag training set succeed')

    # Extract the training events
    feature_file_path = training_file + ".feature"
    get_event(tag_training_set_file, feature_file_path)
    print('get training set features succeed')

    # Generate the test set features
    test_feature_file = test_file + ".feature"
    get_feature(test_file, test_feature_file)
    print('get test set features succeed')

    # Train one model per configured value and evaluate it
    times = [1000]
    for iterations in times:
        trained_model_file = training_file + '.' + str(iterations) + ".model"
        training(feature_file_path, trained_model_file, iterations)
        print('training model succeed:' + str(iterations))

        # Tag the test set
        tag_test_set_file = test_file + ".tag"
        tag_test(test_feature_file, trained_model_file,  tag_test_set_file)
        print('tag test set succeed')

        # Produce the final result
        tag_to_words(tag_test_set_file, result_file+'.'+str(iterations))
        print('get final result succeed ' + result_file + '.'+str(iterations))

if __name__ == "__main__":
    main()
317 | 
318 | 


--------------------------------------------------------------------------------
/src/score:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | 
  3 | ###########################################################################
  4 | #                                                                         #
  5 | #                               SIGHAN                                    #
  6 | #                      Copyright (c) 2003,2005                            #
  7 | #                        All Rights Reserved.                             #
  8 | #                                                                         #
  9 | #  Permission is hereby granted, free of charge, to use and distribute    #
 10 | #  this software and its documentation without restriction, including     #
 11 | #  without limitation the rights to use, copy, modify, merge, publish,    #
 12 | #  distribute, sublicense, and/or sell copies of this work, and to        #
 13 | #  permit persons to whom this work is furnished to do so, subject to     #
 14 | #  the following conditions:                                              #
 15 | #   1. The code must retain the above copyright notice, this list of      #
 16 | #      conditions and the following disclaimer.                           #
 17 | #   2. Any modifications must be clearly marked as such.                  #
 18 | #   3. Original authors' names are not deleted.                           #
 19 | #   4. The authors' names are not used to endorse or promote products     #
 20 | #      derived from this software without specific prior written          #
 21 | #      permission.                                                        #
 22 | #                                                                         #
 23 | #  SIGHAN AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES       #
 24 | #  WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF      #
 25 | #  MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SIGHAN NOR THE          #
 26 | #  CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL      #
 27 | #  DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA     #
 28 | #  OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER      #
 29 | #  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR       #
 30 | #  PERFORMANCE OF THIS SOFTWARE.                                          #
 31 | #                                                                         #
 32 | ###########################################################################
 33 | #                                                                         #
 34 | # Author: Richard Sproat (rws@uiuc.edu)                                   #
 35 | #         Tom Emerson (tree@basistech.com)                                #
 36 | #                                                                         #
 37 | ###########################################################################
 38 | 
 39 | ## This code depends upon a version of diff (e.g. GNU diffutils 2.7.2)
 40 | ## that supports the -y flag:
 41 | ##
 42 | ## -y     Use the side by side output format.
 43 | ##
 44 | ## change the following per your installation:
 45 | 
 46 | $diff = "/usr/bin/diff";    # external diff binary; it is invoked with -y further down
 47 | 
 48 | $USAGE = "Usage:\t$0 dictionary truth test\n\t";
 49 | # A wrong argument count is an error: report it on STDERR and exit non-zero so it is not lost in redirected output.
 50 | if (@ARGV != 3) { print STDERR "$USAGE\n"; exit 1; }
 51 | # Scratch files that receive one word per line for diff; the process id ($$) is part of the name.
 52 | $tmp1 = "/tmp/comp01$$";
 53 | $tmp2 = "/tmp/comp02$$";
 54 | 
 55 | %dict = ();    # set of known words: key = word, value = 1
 56 | 
 57 | open (S, $ARGV[0]) or  die "$ARGV[0]: $!\n";
 58 | # Read the dictionary file (first argument); every whitespace-separated token on a line becomes one entry.
 59 | while (<S>) {
 60 |     chomp;                               # chomp, not chop: an unterminated last line must keep its final byte
 61 |     s/^\s*//;                            # leading trim keeps split from returning an empty first field
 62 |     s/\s*$//;
 63 |     $dict{$_} = 1 foreach split /\s+/;   # a one-word line yields exactly that word; a blank line yields nothing
 64 | }
 65 | 
 66 | close(S);
 67 | 
 68 | open(TRUTH, $ARGV[1]) || die "$ARGV[1]: $!\n";    # truth file (second argument)
 69 | open(TEST, $ARGV[2]) || die "$ARGV[2]: $!\n";     # test file (third argument)
 70 | 
 71 | # Totals accumulated over every scored line pair.
 72 | ($Tot, $Del, $Ins, $Subst, $Truecount, $Testcount) = (0) x 6;
 73 | ($RawRecall, $RawPrecision) = (0, 0);
 74 | $linenum = 0;
 75 | 
 76 | # Counters for truth words found / not found in %dict, and how many of each were missed.
 77 | ($IVMISSED, $OOVMISSED, $OOV, $IV) = (0) x 4;
 78 | 
 79 | # Names shown in the report: the directory part is removed only from paths that start with "/".
 80 | ($file1, $file2) = @ARGV[1, 2];
 81 | s=^/.*/== foreach ($file1, $file2);
 82 | 
 83 | 
 84 | while (defined($truth = <TRUTH>) && defined($test = <TEST>)) {    # score the two files line by line, in lock step
 85 |     # Turn full-width spaces (UTF-8 \xe3\x80\x80, Big5 \xa1\x40) into ASCII spaces BEFORE trimming,
 86 |     # so one at the start of a line cannot survive as an empty leading "word" after split.
 87 |     $truth =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g;
 88 |     $test =~ s/(\xe3\x80\x80)|(\xa1\x40)/ /g;
 89 |     $truth =~ s/^\s*//;
 90 |     $test =~ s/^\s*//;
 91 |     $truth =~ s/\s*$//;    # \s also covers the newline and a trailing carriage return
 92 |     $test =~ s/\s*$//;     # (former empty-pattern s///g calls dropped: an empty pattern reuses the last matched regex)
 93 |     @truthwords = split /\s+/, $truth;
 94 |     @testwords = split /\s+/, $test;
 95 |     $truecount = scalar(@truthwords);
 96 |     $testcount = scalar(@testwords);
 97 |     ++$linenum;
 98 |     if ($truecount == 0) {    # nothing to score on this line; only warn if the test side has words
 99 |         if ($testcount > 0) {
100 |             print STDERR "Warning: training is 0 but test is nonzero, possible misalignment at line $linenum.\n";
101 |         }
102 |         next;
103 |     }
104 |     if ($testcount == 0) {    # still scored below: every truth word then counts as a deletion
105 |         print STDERR "Warning: No output in test data where there is in training data, line $linenum\n";
106 |     }
107 |     open (T1, ">$tmp1") or die "Can't open $tmp1";    # truth words, one per line
108 |     open (T2, ">$tmp2") or die "Can't open $tmp2";    # test words, one per line
109 |     foreach my $w (@truthwords) { print T1 "$w\n"; }
110 |     foreach my $w (@testwords) {print T2 "$w\n";}
111 |     close (T1);
112 |     close (T2);
113 |     open (P, "$diff -y $tmp1 $tmp2 |")
114 |         or die "Can't open pipe.\n";    # NOTE(review): diff -y has a limited output width; very long tokens may be cut - confirm
115 |     print "--$file1-------$file2----$linenum\n";
116 |     my $del = 0;
117 |     my $ins = 0;
118 |     my $subst = 0;
119 |     my $rawrecall = 0;       # counts missed truth words first; converted to correct words after the loop
120 |     my $rawprecision = 0;    # counts wrong test words first; converted to correct words after the loop
121 |     while (<P>) {
122 |         # Classify the row by its side-by-side marker: | differs, > only in test file, < only in truth file.
123 |         if (/\s\|\s/) {$subst++ ; }
124 |         elsif (/\s\>\s/) {$ins++ ; }
125 |         elsif (/\s\<\s/) {$del++ ; }
126 |         if (/^([^\s]+)\s/) {    # row starts with a non-blank token, i.e. a word from the truth column
127 |             my $w = $1;
128 |             if (!$dict{$w}) {++$OOV;}
129 |             else {++$IV;}
130 |             if (/^[^\s]+\s.*\s[\|\>\<]\s/) {    # truth word on a row that carries a marker: it was missed
131 |                 if (!$dict{$w}) {++$OOVMISSED;}
132 |                 else {++$IVMISSED;}
133 |                 ++$rawrecall;
134 |             }
135 |         }
136 |         if (/\s[\|\>\<]\s.*[^\s]$/) { ++$rawprecision; }    # marker followed by a token: a wrong test word
137 |         print "$_";
138 |     }
139 |     close (P);
140 |     my $tot = $del + $ins + $subst;
141 |     $Tot += $tot;
142 |     $Del += $del;
143 |     $Ins += $ins;
144 |     $Subst += $subst;
145 |     $Truecount += $truecount;
146 |     $Testcount += $testcount;
147 |     $rawrecall = $truecount - $rawrecall;          # number of truth words reproduced correctly
148 |     $rawprecision = $testcount - $rawprecision;    # number of test words that are correct
149 |     $RawRecall += $rawrecall;
150 |     $RawPrecision += $rawprecision;
151 |     $rawrecall = sprintf("%2.3f", $rawrecall/$truecount);    # $truecount > 0 here (zero was skipped above)
152 |     $rawprecision = sprintf("%2.3f", $testcount ? $rawprecision/$testcount : 0);    # empty test line: 0, not a fatal division by zero
153 |     print "INSERTIONS:\t$ins\n";
154 |     print "DELETIONS:\t$del\n";
155 |     print "SUBSTITUTIONS:\t$subst\n";
156 |     print "NCHANGE:\t$tot\n";
157 |     print "NTRUTH:\t$truecount\n";
158 |     print "NTEST:\t$testcount\n";
159 |     print "TRUE WORDS RECALL:\t$rawrecall\n";
160 |     print "TEST WORDS PRECISION:\t$rawprecision\n";
161 | }
162 | 
163 | close(TRUTH);
164 | close(TEST);
165 | unlink($tmp1);    # remove the per-line scratch files
166 | unlink($tmp2);
167 | 
168 | print "=== SUMMARY:\n";
169 | print "=== TOTAL INSERTIONS:\t$Ins\n";
170 | print "=== TOTAL DELETIONS:\t$Del\n";
171 | print "=== TOTAL SUBSTITUTIONS:\t$Subst\n";
172 | print "=== TOTAL NCHANGE:\t$Tot\n";
173 | print "=== TOTAL TRUE WORD COUNT:\t$Truecount\n";
174 | print "=== TOTAL TEST WORD COUNT:\t$Testcount\n";
175 | $RawRecall = $Truecount ? $RawRecall/$Truecount : 0;          # guarded: no scored truth words gives 0, not a crash
176 | $RawPrecision = $Testcount ? $RawPrecision/$Testcount : 0;    # guarded: no test words gives 0, not a crash
177 | $beta = 1;
178 | $R = $RawRecall;
179 | $P = $RawPrecision;
180 | $F = ($beta**2 * $P + $R) ? (1 + $beta**2)*$P*$R/($beta**2 * $P + $R) : 0;    # F-beta; equals the old formula for beta = 1
181 | $F = sprintf("%2.3f", $F);
182 | $RawRecall = sprintf("%2.3f", $RawRecall);
183 | $RawPrecision = sprintf("%2.3f", $RawPrecision);
184 | print "=== TOTAL TRUE WORDS RECALL:\t$RawRecall\n";
185 | print "=== TOTAL TEST WORDS PRECISION:\t$RawPrecision\n";
186 | print "=== F MEASURE:\t$F\n";
187 | if ($OOV > 0) {
188 |     $OOVMISSED = sprintf("%2.3f", 1 - $OOVMISSED / $OOV);    # from here on $OOVMISSED holds the OOV recall rate
189 | }
190 | else {
191 |     $OOVMISSED = "--";
192 | }
193 | $OOV = sprintf("%2.3f", $Truecount ? $OOV / $Truecount : 0);    # from here on $OOV holds the OOV rate (guarded)
194 | if ($IV > 0) {
195 |     $IVMISSED = sprintf("%2.3f", 1 - $IVMISSED / $IV);    # from here on $IVMISSED holds the IV recall rate
196 | }
197 | else {
198 |     $IVMISSED = "--";
199 | }
200 | print "=== OOV Rate:\t$OOV\n";
201 | print "=== OOV Recall Rate:\t$OOVMISSED\n";
202 | print "=== IV Recall Rate:\t$IVMISSED\n";
203 | 
204 | print "###\t$file2\t$Ins\t$Del\t$Subst\t$Tot\t$Truecount\t$Testcount\t$RawRecall\t$RawPrecision\t$F\t$OOV\t$OOVMISSED\t$IVMISSED\n";
205 | exit(0);
206 | 


--------------------------------------------------------------------------------