├── sample-data ├── ref.txt ├── out.txt └── hyp.txt ├── constants ├── src └── ter │ ├── util │ ├── IntPair.java │ ├── Pair.java │ └── StringUtils.java │ ├── io │ ├── Tag.java │ └── SgmlProcessor.java │ ├── core │ ├── Shift.java │ ├── CostFunction.java │ ├── Alignment.java │ ├── Normalizer.java │ ├── AlignmentStringUtils.java │ └── TerScorer.java │ ├── Parameters.java │ └── TER.java ├── dist.sh ├── CHANGELOG.md ├── README.md └── LICENSE.txt /sample-data/ref.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jhclark/tercom/HEAD/sample-data/ref.txt -------------------------------------------------------------------------------- /constants: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # These must also be updated in build.xml and README.md 4 | version=0.10.0 5 | -------------------------------------------------------------------------------- /src/ter/util/IntPair.java: -------------------------------------------------------------------------------- 1 | package ter.util; 2 | 3 | public class IntPair { 4 | public int car; 5 | public int cdr; 6 | public IntPair(int car, int cdr) { 7 | this.car = car; 8 | this.cdr = cdr; 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/ter/util/Pair.java: -------------------------------------------------------------------------------- 1 | package ter.util; 2 | 3 | public class Pair { 4 | public final T1 first; 5 | public final T2 second; 6 | public Pair(T1 first, T2 second) { 7 | this.first = first; 8 | this.second = second; 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/ter/io/Tag.java: -------------------------------------------------------------------------------- 1 | package ter.io; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class Tag { 7 | public String name; 8 | public Map content; 9 | public String rest; 10 | 11 | public Tag() { 12 | name = ""; 13 | content = new HashMap(); 14 | rest = ""; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /dist.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xeo pipefail 3 | scriptDir=$(dirname $0) 4 | source $scriptDir/constants 5 | 6 | dist=tercom-$version 7 | distDir=$scriptDir/dist 8 | mkdir -p $distDir/$dist 9 | 10 | cd $scriptDir 11 | ant 12 | cp -r constants \ 13 | LICENSE.txt \ 14 | README.md \ 15 | CHANGELOG.md \ 16 | tercom-$version.jar \ 17 | sample-data \ 18 | $distDir/$dist 19 | 20 | cd $distDir 21 | tar -cvzf $dist.tgz $dist 22 | -------------------------------------------------------------------------------- /src/ter/util/StringUtils.java: -------------------------------------------------------------------------------- 1 | package ter.util; 2 | 3 | public class StringUtils { 4 | 5 | public static String join(String delim, String[] arr) { 6 | if (arr == null) return ""; 7 | if (delim == null) delim = new String(""); 8 | String s = new String(""); 9 | for (int i = 0; i < arr.length; i++) { 10 | if (i == 0) { s += arr[i]; } 11 | else { s += delim + arr[i]; } 12 | } 13 | return s; 14 | } 15 | 16 | public static String join(String delim, char[] arr) { 17 | if (arr == null) return ""; 18 | if (delim == null) delim = new String(""); 19 | String s = new String(""); 20 | for (int i = 0; i < arr.length; i++) { 21 | if (i == 0) { s += arr[i]; } 22 | else { s += delim + arr[i]; } 23 | } 24 | return s; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 1/3/2012 0.8.0 2 | 3 | * Jon's updates: 4 | * Moved source code to github (with Matt Snover's permission) 5 | * Changed to LGPL license 6 | * Moved all code under the "ter" package 7 | * Removed most static variables and methods to allow TER to be used as a libarary in multi-threaded applications (for MultEval) 8 | * Added generic parameters to collections, etc. 9 | * Cleaned up most compiler warnings 10 | 11 | 12 | 8/19/08 Version 0.7.25 13 | 14 | * Tokenization changed so that 's is correctly tokenized when using -N flag. No other changes from version 0.7.2. 15 | 16 | 17 | 3/26/08 Version 0.7.2 18 | 19 | * Several bug fixes. Scores might vary slightly from version 0.7.0. 20 | 21 | 22 | 3/27/07 Version 07 23 | 24 | * First full Java release 25 | * Several bug fixes. 26 | 27 | 28 | 4/28/06 Basic Java Classes 29 | 30 | * Bug fix, and an increase in the beam width (to 20) for minimum edit distance. 31 | * Contains some basic Java classes which can be used to calculate TER. It is much faster than the perl script but the UI is not very developed. It is useful if you wish to examine the algorithm to more fully understand how it works. This code is presented here for research and educational purposes. It was designed to be integrated into NIST's HTER annotation tool. A test wrapper for the code is given, but it is not as functional as the tercom script. 32 | * The full version contains this code, plus bug fixes and additional functionality. 33 | 34 | 35 | 4/10/06 Version 6b 36 | 37 | * Added document level scoring (.sum_doc output) 38 | * Added support for n-best list scoring (.sum_nbest output) 39 | * Added option -N to normalize hyp and ref as in MT eval scoring 40 | * Increased precision of floating point output 41 | 42 | 43 | 11/10/05 Version 5 44 | 45 | * Changed program name from calc_ter to tercom 46 | * Changed default options (-f 2 is now default) 47 | * Added support for sgm files (-i sgm) 48 | * Added option to specify beam width 49 | -------------------------------------------------------------------------------- /src/ter/core/Shift.java: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright� 2006 by BBN Technologies and University of Maryland (UMD) 4 | 5 | License: LGPL v. 2.1. See LICENSE.txt. 6 | 7 | TERshift.java v1 8 | Matthew Snover (snover@cs.umd.edu) 9 | 10 | */ 11 | 12 | package ter.core; 13 | 14 | import java.util.List; 15 | 16 | /* This is just a useful class for containing shift information */ 17 | public class Shift { 18 | Shift () { 19 | start = 0; 20 | end = 0; 21 | moveto = 0; 22 | newloc = 0; 23 | } 24 | Shift (int _start, int _end, int _moveto, int _newloc) { 25 | start = _start; 26 | end = _end; 27 | moveto = _moveto; 28 | newloc = _newloc; 29 | } 30 | // TERshift (int _start, int _end, int _moveto, int _newloc, List _shifted) { 31 | // start = _start; 32 | // end = _end; 33 | // moveto = _moveto; 34 | // newloc = _newloc; 35 | // shifted = _shifted; 36 | // } 37 | 38 | public String toString() { 39 | String s = "[" + start + ", " + end + ", " + moveto + "/" + newloc + "]"; 40 | if (shifted != null) 41 | s += " (" + shifted + ")"; 42 | return s; 43 | } 44 | 45 | /* The distance of the shift. */ 46 | public int distance() { 47 | if (moveto < start) { 48 | return start - moveto; 49 | } else if (moveto > end) { 50 | return moveto - end; 51 | } else { 52 | return moveto - start; 53 | } 54 | } 55 | 56 | public boolean leftShift() { 57 | return (moveto < start); 58 | } 59 | 60 | public int size() { 61 | return (end - start) + 1; 62 | } 63 | 64 | public int start = 0; 65 | public int end = 0; 66 | public int moveto = 0; 67 | public int newloc = 0; 68 | public List shifted = null; // The words we shifted 69 | public char[] alignment = null; // for pra_more output 70 | public String[] aftershift = null; // for pra_more output 71 | // This is used to store the cost of a shift, so we don't have to 72 | // calculate it multiple times. 73 | public double cost = 1.0; 74 | } 75 | -------------------------------------------------------------------------------- /src/ter/core/CostFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright� 2006 by BBN Technologies and University of Maryland (UMD) 4 | 5 | License: LGPL v. 2.1. See LICENSE.txt. 6 | 7 | TERcost.java v1 8 | Matthew Snover (snover@cs.umd.edu) 9 | 10 | */ 11 | 12 | package ter.core; 13 | 14 | 15 | /* 16 | 17 | If you wish to experiment with other cost functions, (including 18 | specific word to word cost matrices), a child class of TERcost should 19 | be made, and an instance of it should be passed as the third arguement 20 | to the TER function in TERcalc. 21 | 22 | All costs must be in the range of 0.0 to 1.0. Deviating outside of 23 | this range may break the functionality of the TERcalc code. 24 | 25 | This code does not do phrasal costs functions, such modification would require 26 | changes to the TERcalc code. 27 | 28 | In addition shifts only occur when the two subsequences are equal(), 29 | and hash to the same location. This can be modified from the standard 30 | definition by using a new String data structure which redefines 31 | those functions. 32 | 33 | */ 34 | 35 | public class CostFunction { 36 | /* For all of these functions, the score should be between 0 and 1 37 | * (inclusive). If it isn't, then it will break TERcalc! */ 38 | 39 | /* The cost of matching ref word for hyp word. (They are equal) */ 40 | public double match_cost(String hyp, String ref) { 41 | return _match_cost; 42 | } 43 | 44 | /* The cost of substituting ref word for hyp word. (They are not equal) */ 45 | public double substitute_cost(String hyp, String ref) { 46 | return _substitute_cost; 47 | } 48 | 49 | /* The cost of inserting the hyp word */ 50 | public double insert_cost(String hyp) { 51 | return _insert_cost; 52 | } 53 | 54 | /* The cost of deleting the ref word */ 55 | public double delete_cost(String ref) { 56 | return _delete_cost; 57 | } 58 | 59 | /* The cost of making a shift */ 60 | public double shift_cost(Shift shift) { 61 | return _shift_cost; 62 | } 63 | 64 | public double _shift_cost = 1.0; 65 | public double _insert_cost = 1.0; 66 | public double _delete_cost = 1.0; 67 | public double _substitute_cost = 1.0; 68 | public double _match_cost = 0.0; 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/ter/core/Alignment.java: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright� 2006 by BBN Technologies and University of Maryland (UMD) 4 | 5 | License: LGPL v. 2.1. See LICENSE.txt. 6 | 7 | TERalignment.java v1 8 | Matthew Snover (snover@cs.umd.edu) 9 | 10 | */ 11 | 12 | package ter.core; 13 | 14 | 15 | import ter.util.StringUtils; 16 | 17 | 18 | /* Storage Class for TER alignments */ 19 | public class Alignment { 20 | 21 | @Override 22 | public String toString() { 23 | String s = "Original Ref: " + StringUtils.join(" ", ref) + 24 | "\nOriginal Hyp: " + StringUtils.join(" ", hyp) + 25 | "\nHyp After Shift: " + StringUtils.join(" ", aftershift); 26 | 27 | if (alignment != null) { 28 | s += "\nAlignment: ("; 29 | for (int i = 0; i < alignment.length; i++) {s+=alignment[i];} 30 | s += ")"; 31 | } 32 | if (allshifts == null) { 33 | s += "\nNumShifts: 0"; 34 | } else { 35 | s += "\nNumShifts: " + allshifts.length; 36 | for (int i = 0; i < allshifts.length; i++) { 37 | s += "\n " + allshifts[i]; 38 | } 39 | } 40 | 41 | s += "\nScore: " + this.score() + 42 | " (" + this.numEdits + "/" + this.numWords + ")"; 43 | return s; 44 | } 45 | 46 | public String toMoreString() { 47 | String s = "Best Ref: " + StringUtils.join(" ", ref) + 48 | "\nOrig Hyp: " + StringUtils.join(" ", hyp) + "\n"; 49 | s += AlignmentStringUtils.getPraStr(hyp, ref, aftershift, alignment, allshifts, false); 50 | s += String.format("TER Score: %1$6.2f (%2$5.1f/%3$5.1f)\n", 100*this.score(), 51 | this.numEdits, this.numWords); 52 | s += AlignmentStringUtils.formatShiftsString(hyp, ref, allshifts); 53 | return s; 54 | } 55 | 56 | public double score() { 57 | if ((numWords <= 0.0) && (this.numEdits > 0.0)) { return 1.0; } 58 | if (numWords <= 0.0) { return 0.0; } 59 | return (double) numEdits / numWords; 60 | } 61 | 62 | public void populateScoreDetails() { 63 | numIns = numDel = numSub = numWsf = numSft = 0; 64 | if(allshifts != null) { 65 | for(int i = 0; i < allshifts.length; ++i) 66 | numWsf += allshifts[i].size(); 67 | numSft = allshifts.length; 68 | } 69 | 70 | if(alignment != null) { 71 | for(int i = 0; i < alignment.length; ++i) { 72 | switch (alignment[i]) { 73 | case 'S': 74 | case 'T': 75 | numSub++; 76 | break; 77 | case 'D': 78 | numDel++; 79 | break; 80 | case 'I': 81 | numIns++; 82 | break; 83 | } 84 | } 85 | } 86 | // if(numEdits != numSft + numDel + numIns + numSub) 87 | // System.out.println("** Error, unmatch edit erros " + numEdits + 88 | // " vs " + (numSft + numDel + numIns + numSub)); 89 | } 90 | 91 | public String[] ref; 92 | public String[] hyp; 93 | public String[] aftershift; 94 | 95 | public Shift[] allshifts = null; 96 | 97 | public double numEdits = 0; 98 | public double numWords = 0.0; 99 | public char[] alignment = null; 100 | public String bestRef = ""; 101 | 102 | public int numIns = 0; 103 | public int numDel = 0; 104 | public int numSub = 0; 105 | public int numSft = 0; 106 | public int numWsf = 0; 107 | } 108 | -------------------------------------------------------------------------------- /src/ter/core/Normalizer.java: -------------------------------------------------------------------------------- 1 | package ter.core; 2 | 3 | public class Normalizer { 4 | 5 | private static String asianPunct = "([\\x{3001}\\x{3002}\\x{3008}-\\x{3011}\\x{3014}-\\x{301f}\\x{ff61}-\\x{ff65}\\x{30fb}])"; 6 | private static String fullwidthPunct = "([\\x{ff0e}\\x{ff0c}\\x{ff1f}\\x{ff1a}\\x{ff1b}\\x{ff01}\\x{ff02}\\x{ff08}\\x{ff09}])"; 7 | 8 | public static String[] tokenize(String s, boolean normalized, boolean nopunct, boolean asianSupport, boolean noTagBrackets) { 9 | 10 | /* tokenizes according to the mtevalv11 specs, plus added material 11 | for handling languages written with CJK ideographs */ 12 | 13 | if(normalized) { 14 | // language-independent part: 15 | s = s.replaceAll("", ""); // strip "skipped" tags 16 | s = s.replaceAll("-\n", ""); // strip end-of-line hyphenation and join lines 17 | s = s.replaceAll("\n", " "); // join lines 18 | s = s.replaceAll(""", "\""); // convert SGML tag for quote to " 19 | s = s.replaceAll("&", "&"); // convert SGML tag for ampersand to & 20 | s = s.replaceAll("<", "<"); // convert SGML tag for less-than to > 21 | s = s.replaceAll(">", ">"); // convert SGML tag for greater-than to < 22 | 23 | // remove HTML-style tag brackets: "
" -> "br/" 24 | if(noTagBrackets) { 25 | s = s.replaceAll("<([^<>]+)>", " $1 "); 26 | } 27 | 28 | // language-dependent part (assuming Western languages): 29 | s = " " + s + " "; 30 | s = s.replaceAll("([\\{-\\~\\[-\\` -\\&\\(-\\+\\:-\\@\\/])", " $1 "); // tokenize punctuation 31 | s = s.replaceAll("'s ", " 's "); // handle possesives 32 | s = s.replaceAll("'s$", " 's"); // handle possesives 33 | s = s.replaceAll("([^0-9])([\\.,])", "$1 $2 "); // tokenize period and comma unless preceded by a digit 34 | s = s.replaceAll("([\\.,])([^0-9])", " $1 $2"); // tokenize period and comma unless followed by a digit 35 | s = s.replaceAll("([0-9])(-)", "$1 $2 "); // tokenize dash when preceded by a digit 36 | 37 | // further language-dependent part (if Asian support turned on): 38 | if(asianSupport) { 39 | // Split Chinese chars and Japanese kanji down to character level: 40 | s = s.replaceAll("([\\p{InCJKUnifiedIdeographs}\\p{InCJKUnifiedIdeographsExtensionA}])", " $1 "); 41 | s = s.replaceAll("([\\p{InCJKStrokes}\\p{InCJKRadicalsSupplement}])", " $1 "); 42 | s = s.replaceAll("([\\p{InCJKCompatibility}\\p{InCJKCompatibilityIdeographs}\\p{InCJKCompatibilityForms}])", " $1 "); 43 | s = s.replaceAll("([\\p{InEnclosedCJKLettersAndMonths}])", " $1 "); 44 | 45 | // Split Hiragana, Katakana, and KatakanaPhoneticExtensions 46 | // only when adjacent to something else: 47 | s = s.replaceAll("(^|\\P{InHiragana})(\\p{InHiragana}+)(?=$|\\P{InHiragana})", "$1 $2 "); 48 | s = s.replaceAll("(^|\\P{InKatakana})(\\p{InKatakana}+)(?=$|\\P{InKatakana})", "$1 $2 "); 49 | s = s.replaceAll("(^|\\P{InKatakanaPhoneticExtensions})(\\p{InKatakanaPhoneticExtensions}+)(?=$|\\P{InKatakanaPhoneticExtensions})", "$1 $2 "); 50 | 51 | // Split punctuation: 52 | s = s.replaceAll(asianPunct, " $1 "); 53 | s = s.replaceAll(fullwidthPunct, " $1 "); 54 | } 55 | 56 | s = s.replaceAll("\\s+"," "); // one space only between words 57 | s = s.replaceAll("^\\s+", ""); // no leading space 58 | s = s.replaceAll("\\s+$", ""); // no trailing space 59 | } 60 | if(nopunct) s = Normalizer.removePunctuation(s, asianSupport); 61 | return s.split("\\s+"); 62 | } 63 | 64 | static String removePunctuation(String str, boolean asianSupport) { 65 | String s = str.replaceAll("[\\.,\\?:;!\"\\(\\)]", ""); 66 | if(asianSupport) { 67 | s = s.replaceAll(asianPunct, ""); 68 | s = s.replaceAll(fullwidthPunct, ""); 69 | } 70 | s = s.replaceAll("\\s+", " "); 71 | return s; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /sample-data/out.txt: -------------------------------------------------------------------------------- 1 | "hyp.txt" was successfully parsed as Trans text 2 | "ref.txt" was successfully parsed as Trans text 3 | ** Warning: XML ouput may not have correct doc id for Trans format inputs 4 | Processing AFA20040101.0510-8:1 5 | Processing AFA20040101.5400-6:1 6 | Processing AFA20040102.2700-3:1 7 | Processing AFA20040102.6110-5:1 8 | Processing AFA20040103.0120-5:1 9 | Processing AFA20040103.2900-8:1 10 | Processing AFA20040103.5700-2:1 11 | Processing AFA20040103.7010-6:1 12 | Processing AFA20040103.7010-7:1 13 | Processing AFA20040103.9100-4:1 14 | Processing AFA20040103.9200-6:1 15 | Processing AFA20040104.3110-1:1 16 | Processing AFA20040104.3110-3:1 17 | Processing AFA20040104.7300-4:1 18 | Processing AFA20040104.7300-5:1 19 | Processing AFA20040104.8400-4:1 20 | Processing AFA20040104.9200-3:1 21 | Processing AFA20040105.3220-4:1 22 | Processing AFA20040105.4910-7:1 23 | Processing AFA20040105.6000-2:1 24 | Processing AFA20040105.6200-2:1 25 | Processing AFA20040106.6100-2:1 26 | Processing AFA20040106.6100-3:1 27 | Processing AFA20040106.7300-1:1 28 | Processing AFA20040106.7300-6:1 29 | Processing AFA20040129.5120-6:1 30 | Processing AFA20040130.6910-2:1 31 | Processing AFA20040130.6910-9:1 32 | Processing AFA20040131.3700-8:1 33 | Processing AFA20040131.4510-5:1 34 | Processing AKHB20040219.001-1:1 35 | Processing AKHB20040221.001-1:1 36 | Processing ALH20040204.001-1:1 37 | Processing ALH20040204.007-3:1 38 | Processing ALH20040206.002-5:1 39 | Processing ALH20040206.002-7:1 40 | Processing ALH20040206.003-8:1 41 | Processing ALH20040220.001-2:1 42 | Processing ALH20040220.001-8:1 43 | Processing ALH20040227.001-5:1 44 | Processing ALH20040227.001-6:1 45 | Processing ALH20040303.001-5:1 46 | Processing ALH20040304.001-1:1 47 | Processing ALH20040304.002-6:1 48 | Processing ALH20040305.001-9:1 49 | Processing ALH20040305.002-1:1 50 | Processing ALH20040305.002-3:1 51 | Processing AMI20020707.001-1:1 52 | Processing AMI20020707.001-3:1 53 | Processing ANN20040204.001-2:1 54 | Processing ANN20040204.001-3:1 55 | Processing ANN20040204.002-2:1 56 | Processing ANN20040204.003-3:1 57 | Processing ANN20040204.003-4:1 58 | Processing ANN20040204.004-4:1 59 | Processing ANN20040204.004-7:1 60 | Processing ANN20040204.005-8:1 61 | Processing ANN20040205.002-2:1 62 | Processing ANN20040220.001-6:1 63 | Processing ANN20040227.001-3:1 64 | Processing ANN20040227.002-2:1 65 | Processing ANN20040227.002-4:1 66 | Processing ANN20040228.001-7:1 67 | Processing ANN20040229.001-2:1 68 | Processing ANN20040304.004-5:1 69 | Processing ANN20040304.005-1:1 70 | Processing ANN20040305.001-3:1 71 | Processing ANN20040309.001-6:1 72 | Processing EJO20031223.001-3:1 73 | Processing MOFA20040219.001-1:1 74 | Processing MOFA20040219.002-1:1 75 | Processing OMSAR20040107.001-4:1 76 | Processing OMSAR20040107.002-2:1 77 | Processing SPC20030310.001-5:1 78 | Processing UN20040101.002-3:1 79 | Processing UN20040101.005-6:1 80 | Processing UN20040101.007-2:1 81 | Processing XIN20040103.0035-4:1 82 | Processing XIN20040105.0070-7:1 83 | Processing XIN20040107.0046-7:1 84 | Processing XIN20040108.0031-5:1 85 | Processing XIN20040110.0019-2:1 86 | Processing XIN20040113.0114-7:1 87 | Processing XIN20040115.0055-1:1 88 | Processing XIN20040115.0212-2:1 89 | Processing XIN20040116.0152-2:1 90 | Processing XIN20040116.0152-5:1 91 | Processing XIN20040117.0050-2:1 92 | Processing XIN20040117.0140-6:1 93 | Processing XIN20040118.0127-4:1 94 | Processing XIN20040121.0073-1:1 95 | Processing XIN20040123.0090-5:1 96 | Processing XIN20040127.0006-1:1 97 | Processing XIN20040128.0114-6:1 98 | Processing XIN20040128.0119-2:1 99 | Processing XIN20040128.0152-3:1 100 | Processing XIN20040128.0152-4:1 101 | Processing XIN20040129.0044-2:1 102 | Processing XIN20040129.0044-5:1 103 | Processing XIN20040129.0054-3:1 104 | Total TER: 0.5618171814823548 (1787.0/3180.75) 105 | Number of calls to beam search: 10260 106 | Number of segments scored: 400 107 | Number of shifts tried: 9860 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------ 2 | TRANSLATION ERROR RATE (TER) 0.10.0 3 | 4 | Matthew Snover 5 | Shuguang Wang 6 | Spyros Matsoukas 7 | 8 | Copyright (C) 2007 BBN Technologies and University of Maryland 9 | ------------------------------------------------------------------------ 10 | 11 | TER contains a Java program for computing the translation error rate, 12 | which is an error metric for machine translation that messures the number of 13 | edits required to change a system output into one of the references. More details 14 | about the translation error rate can be found at 15 | http://www.cs.umd.edu/~snover/tercom/ 16 | 17 | TER parses input reference and hypothesis files as SGML(NIST format), XML, 18 | or Trans. Both reference and hypothesis files should be in the same formats. 19 | 20 | The code has been compiled into an executable jar file: tercom.jar 21 | 22 | The source files are also provided in the src/ directory. TERtest is the main 23 | executable class. 24 | 25 | References to TER should cite: 26 | 27 | Matthew Snover, Bonnie Dorr, Richard Schwartz, Linnea Micciulla, and 28 | John Makhoul, "A Study of Translation Edit Rate with Targeted Human 29 | Annotation," Proceedings of Association for Machine Translation in the 30 | Americas, 2006. 31 | 32 | USAGE 33 | ===== 34 | 35 | Currently, the following options are supported: 36 | 37 | ``` 38 | -N normalization, optional, default is no. 39 | -s case sensitivity, optional, default is insensitive 40 | -P no punctuations, default is with punctuations. 41 | -A Asian language support for -N and -P, optional, default is without. 42 | -K remove brackets around HTML-style tags, optional, default is to keep 43 | -r reference file path, required. 44 | -h hypothesis file path, required. 45 | -o output formats, optional, default are all formats. 46 | Valid formats include "ter", "xml", "sum", "sum_nbest" and "pra". 47 | "ter", plain text file contains four columns: chunkid, numerrs, numwrds, ter% 48 | "xml", XML format with detailed alignment for each word 49 | "pra", plain text with alignment details, simpler than that of tercom_v6b. 50 | "pra_more", identical to pra output as tercom_v6b. 51 | "sum", same summary output as that of tercom_v6b. 52 | "sum_nbest", same nbest summary output as that of tercom_v6b. 53 | -n output name prefix, optional, no output will be generated if it is not set. 54 | -b beam width, optional, default is 20. 55 | -S translation span prefix, optional, this option only works with single reference. 56 | -a alternative reference path, optional, this file will be only used to compute the reference length. 57 | -d maximum shift distance, optional, default is 50 words. 58 | ``` 59 | 60 | Examples 61 | ======== 62 | 1. Use default values for all options and output in all formats. 63 | 64 | ```bash 65 | $ java -jar tercom.jar -r -h -n 66 | ``` 67 | 68 | 2. Enable normalization and case sensitivity, and set the beam width 69 | to 10. 70 | 71 | ```bash 72 | $ java -jar tercom.jar -N -s -b 10 -r -h -n 73 | ``` 74 | 75 | 3. Same as above, but split most Chinese and Japanese text appearing in the output down to the character level. 76 | 77 | ```bash 78 | $ java -jar tercom.jar -N -A -s -b 10 -r -h -n 79 | ``` 80 | 81 | 4. Output only summary output. 82 | 83 | ```bash 84 | $ java -jar tercom.jar -r -h -o xml -n 85 | ``` 86 | 87 | Sample Data 88 | =========== 89 | 90 | Sample Data and output is provided in the sample-data directory. 91 | 92 | sample-data/hyp.txt contains the hypothesis text 93 | sample-data/ref.txt contains the reference text 94 | sample-data/out.txt contains the output of running the following command 95 | 96 | cd sample-data/ 97 | java -jar ../tercom.jar -r ref.txt -h hyp.txt > out.txt 98 | 99 | Source File Descriptions 100 | ======================== 101 | 102 | The primary classes of the tercom computation code are described below: 103 | 104 | TERtest - The command-line UI for the code. Its usage is described above. 105 | 106 | TERcalc - This does all of the work calculating TER. All of the 107 | methods are static, so no instantiations should ever be made. To 108 | calculate TER, call: TERcalc.TER(String a, String b). This tokenizes 109 | and then runs TER. It returns a TERalignment object. If the input is 110 | pretokenized (or tokenized and then converted to another type, such as 111 | Integer (which give better performance)), then you can call 112 | TER(Comparable[], Comparable[]). If you want to use a different 113 | scoring module (see TERcost class), then use the functions 114 | TER(Comparable[], Comparable[], TERcost) or TER(String, String, 115 | TERcost). 116 | 117 | TERshift - This is a tiny storage class, which stores the information 118 | for indiviual shifts. 119 | 120 | TERalignment - This contains the output of TER. Some processing might 121 | be needed to make it as pretty as TERcom pra output. 122 | 123 | TERcost - This class is used to determine the cost of insertions, 124 | deletions, substitutions, matches, and shifts. The base class gives 125 | all edits a cost of 1, and matches a cost of 0. If a researcher wants 126 | to experiment with alternative cost functions, or cost matrices, then 127 | a new child class of TERcost should be made that calculates those cost 128 | functions. This can be passed in as the third arguement to the TER 129 | function of TERcalc. It should be noted that this version of the code 130 | only does word to word cost matrices (not phrasal matrices), and that 131 | all costs must be in the range of 0.0 to 1.0. 132 | 133 | TERpara - This class is used to parse the command arguments given to TERtest. -------------------------------------------------------------------------------- /src/ter/Parameters.java: -------------------------------------------------------------------------------- 1 | package ter; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | public class Parameters { 10 | private String reffn; 11 | private String hypfn; 12 | private String reflenfn; 13 | private boolean mt_normalized; 14 | private boolean case_on; 15 | private boolean no_punctuation; 16 | private boolean asian_support; 17 | private boolean no_tag_brackets; 18 | private List out_formats; 19 | private String out_pfx; 20 | private Pattern opts_p; 21 | private int beam_width; 22 | private String span_pfx; 23 | private int shift_dist; 24 | 25 | private double shift_cost; 26 | private double delete_cost; 27 | private double insert_cost; 28 | private double substitute_cost; 29 | private double match_cost; 30 | 31 | public static enum OPTIONS { 32 | NORMALIZE, CASEON, NOPUNCTUATION, ASIANSUPPORT, NOTAGBRACKETS, REF, HYP, FORMATS, OUTPFX, BEAMWIDTH, REFLEN, TRANSSPAN, SHIFTDIST, DELETE_COST, INSERT_COST, SUBSTITUTE_COST, MATCH_COST, SHIFT_COST; 33 | } 34 | 35 | public Parameters() { 36 | reffn = ""; 37 | hypfn = ""; 38 | reflenfn = ""; 39 | out_pfx = ""; 40 | mt_normalized = false; 41 | case_on = false; 42 | no_punctuation = false; 43 | asian_support = false; 44 | no_tag_brackets = false; 45 | opts_p = Pattern.compile("^\\s*-(\\S+)\\s*$"); 46 | beam_width = 20; 47 | span_pfx = ""; 48 | shift_dist = 50; 49 | shift_cost = 1.0; 50 | match_cost = 0.0; 51 | delete_cost = 1.0; 52 | insert_cost = 1.0; 53 | substitute_cost = 1.0; 54 | } 55 | 56 | // TODO: don't use Object 57 | public HashMap getOpts(String[] args) { 58 | HashMap paras = new HashMap(); 59 | 60 | for(int i = 0; i < args.length; ++i) { 61 | Matcher m = opts_p.matcher(args[i]); 62 | 63 | if(m.matches()) { 64 | char opt = m.group(1).charAt(0); 65 | switch(opt) { 66 | case 'N': 67 | mt_normalized = true; 68 | break; 69 | case 's': 70 | case_on = true; 71 | break; 72 | case 'P': 73 | no_punctuation = true; 74 | break; 75 | case 'A': 76 | asian_support = true; 77 | break; 78 | case 'K': 79 | no_tag_brackets = true; 80 | break; 81 | case 'r': 82 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 83 | usage(); 84 | else 85 | reffn = args[++i]; 86 | break; 87 | case 'h': 88 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 89 | usage(); 90 | else 91 | hypfn = args[++i]; 92 | break; 93 | case 'o': 94 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 95 | usage(); 96 | else 97 | out_formats = getOutFormats(args[++i]); 98 | // System.out.println("formats: " + args[i]); 99 | break; 100 | case 'n': 101 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 102 | usage(); 103 | else 104 | out_pfx = args[++i]; 105 | break; 106 | case 'b': 107 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 108 | usage(); 109 | else 110 | beam_width = Integer.valueOf(args[++i]); 111 | break; 112 | case 'a': 113 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 114 | usage(); 115 | else 116 | reflenfn = args[++i]; 117 | break; 118 | case 'S': 119 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 120 | usage(); 121 | else 122 | span_pfx = args[++i]; 123 | break; 124 | case 'd': 125 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 126 | usage(); 127 | else 128 | shift_dist = Integer.valueOf(args[++i]); 129 | break; 130 | case 'M': 131 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 132 | usage(); 133 | else 134 | match_cost = Double.valueOf(args[++i]); 135 | break; 136 | case 'T': 137 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 138 | usage(); 139 | else 140 | shift_cost = Double.valueOf(args[++i]); 141 | break; 142 | case 'I': 143 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 144 | usage(); 145 | else 146 | insert_cost = Double.valueOf(args[++i]); 147 | break; 148 | case 'D': 149 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 150 | usage(); 151 | else 152 | delete_cost = Double.valueOf(args[++i]); 153 | break; 154 | case 'B': 155 | if(i == args.length -1 || args[i+1].charAt(0) == '-') 156 | usage(); 157 | else 158 | substitute_cost = Double.valueOf(args[++i]); 159 | break; 160 | default: 161 | ; 162 | } 163 | } else 164 | usage(); 165 | } 166 | if(reffn.equals("") || hypfn.equals("")) { 167 | System.out.println("** Please specify both reference and hypothesis inputs"); 168 | usage(); 169 | } else { 170 | if (out_formats == null || out_formats.isEmpty()) 171 | out_formats = getOutFormats("sum,pra,pra_more,xml,ter,sum_nbest"); 172 | 173 | paras.put(OPTIONS.NORMALIZE, mt_normalized); 174 | paras.put(OPTIONS.CASEON, case_on); 175 | paras.put(OPTIONS.NOPUNCTUATION, no_punctuation); 176 | paras.put(OPTIONS.ASIANSUPPORT, asian_support); 177 | paras.put(OPTIONS.NOTAGBRACKETS, no_tag_brackets); 178 | paras.put(OPTIONS.OUTPFX, out_pfx); 179 | paras.put(OPTIONS.REF, reffn); 180 | paras.put(OPTIONS.HYP, hypfn); 181 | paras.put(OPTIONS.FORMATS, out_formats); 182 | paras.put(OPTIONS.BEAMWIDTH, beam_width); 183 | paras.put(OPTIONS.REFLEN, reflenfn); 184 | paras.put(OPTIONS.TRANSSPAN, span_pfx); 185 | paras.put(OPTIONS.SHIFTDIST, shift_dist); 186 | paras.put(OPTIONS.SHIFT_COST, shift_cost); 187 | paras.put(OPTIONS.MATCH_COST, match_cost); 188 | paras.put(OPTIONS.INSERT_COST, insert_cost); 189 | paras.put(OPTIONS.DELETE_COST, delete_cost); 190 | paras.put(OPTIONS.SUBSTITUTE_COST, substitute_cost); 191 | } 192 | return paras; 193 | } 194 | 195 | public static void usage() { 196 | System.out.println("** Usage: java -jar tercom.jar [-N] [-s] [-P] [-A] [-K] -r ref -h hyp [-a alter_ref] [-b beam_width] [-S trans_span_prefix] [-o out_format -n out_pefix] [-d max_shift_distance] [-M match_cost] [-D delete_cost] [-B substitute_cost] [-I insert_cost] [-T shift_cost]"); 197 | System.exit(1); 198 | } 199 | 200 | private static ArrayList getOutFormats(String s) { 201 | ArrayList ret = new ArrayList(); 202 | String [] arrays = s.split(","); 203 | if(arrays != null) 204 | for(int i = 0; i < arrays.length; ++i) 205 | ret.add(arrays[i]); 206 | return ret; 207 | } 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/ter/core/AlignmentStringUtils.java: -------------------------------------------------------------------------------- 1 | package ter.core; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | 7 | import ter.io.SgmlProcessor; 8 | 9 | public class AlignmentStringUtils { 10 | 11 | static String formatShiftsString(String[] hyp, 12 | String[] ref, 13 | Shift[] allshifts) { 14 | String to_return = ""; 15 | int ostart, oend, odest; 16 | int nstart; 17 | @SuppressWarnings("unused") int nend; 18 | int dist; 19 | String direction = ""; 20 | 21 | if(allshifts != null) { 22 | for(int i = 0; i < allshifts.length; ++i) { 23 | Shift[] oneshift = new Shift[1]; 24 | ostart = allshifts[i].start; 25 | oend = allshifts[i].end; 26 | odest = allshifts[i].newloc; 27 | 28 | if(odest >= oend) { 29 | // right 30 | nstart = odest + 1 - allshifts[i].size(); 31 | nend = nstart + allshifts[i].size() - 1; 32 | dist = odest - oend; 33 | direction = "right"; 34 | } else { 35 | // left 36 | nstart = odest + 1; 37 | nend = nstart + allshifts[i].size() - 1; 38 | dist = ostart - odest -1; 39 | direction = "left"; 40 | } 41 | 42 | to_return += "\nShift " + allshifts[i].shifted + " " + dist + " words " + direction; 43 | oneshift[0] = new Shift(ostart, oend, allshifts[i].moveto, odest); 44 | to_return += getPraStr(hyp, ref, allshifts[i].aftershift, allshifts[i].alignment, oneshift, true); 45 | } 46 | to_return += "\n"; 47 | } 48 | return to_return; 49 | } 50 | 51 | static String getPraStr(String[] hyp, 52 | String[] ref, 53 | String[] aftershift, 54 | char[] alignment, 55 | Shift[] allshifts, 56 | boolean shiftonly) { 57 | String to_return = ""; 58 | String rstr = ""; 59 | String hstr = ""; 60 | String estr = ""; 61 | String sstr = ""; 62 | HashMap> align_info = new HashMap>(); 63 | ArrayList shift_dists = new ArrayList(); 64 | int anum = 1; 65 | int ind_start = 0; 66 | int ind_end = 1; 67 | int ind_from = 2; 68 | int ind_in = 3; 69 | int ostart, oend, odest; 70 | int slen = 0; 71 | int nstart, nend, nfrom, dist; 72 | @SuppressWarnings("unused") int non_inserr = 0; 73 | 74 | if(allshifts != null) { 75 | for(int i = 0; i < allshifts.length; ++i) { 76 | ostart = allshifts[i].start; 77 | oend = allshifts[i].end; 78 | odest = allshifts[i].newloc; 79 | slen = allshifts[i].size(); 80 | 81 | if(odest >= oend) { 82 | // right 83 | nstart = odest + 1 - slen; 84 | nend = nstart + slen - 1; 85 | nfrom = ostart; 86 | dist = odest - oend; 87 | } else { 88 | // left 89 | nstart = odest + 1; 90 | nend = nstart + slen - 1; 91 | nfrom = ostart + slen; 92 | dist = (ostart - odest -1) * -1; 93 | } 94 | 95 | //dist = (allshifts[i].leftShift())?-1*allshifts[i].distance():allshifts[i].distance(); 96 | shift_dists.add(dist); 97 | // System.out.println("[" + hyp[ostart] + ".." + hyp[oend] + " are shifted " + dist); 98 | 99 | if(anum > 1) AlignmentStringUtils.performShiftArray(align_info, ostart, oend, odest, alignment.length); 100 | 101 | List val = align_info.get(nstart + "-" + ind_start); 102 | if(val == null) { 103 | List al = new ArrayList(); 104 | al.add(anum); 105 | align_info.put(nstart + "-" + ind_start, al); 106 | } else { 107 | List al = val; 108 | al.add(anum); 109 | } 110 | 111 | val = align_info.get(nend + "-" + ind_end); 112 | if(val == null) { 113 | ArrayList al = new ArrayList(); 114 | al.add(anum); 115 | align_info.put(nend + "-" + ind_end, al); 116 | } else { 117 | ArrayList al = (ArrayList) val; 118 | al.add(anum); 119 | } 120 | 121 | val = align_info.get(nfrom + "-" + ind_from); 122 | if(val == null) { 123 | ArrayList al = new ArrayList(); 124 | al.add(anum); 125 | align_info.put(nfrom + "-" + ind_from, al); 126 | } else { 127 | ArrayList al = (ArrayList) val; 128 | al.add(anum); 129 | } 130 | 131 | /* 132 | val = align_info.get("60-"+ind_start); 133 | if(val != null) 134 | System.out.println(((ArrayList) val).get(0)); 135 | else 136 | System.out.println("empty"); 137 | 138 | System.out.println("nstart: " + nstart + ", nend:" + nend + "," + ostart +"," + oend +","+ odest + "," + align_info.size()); 139 | */ 140 | if(slen > 0) { 141 | for(int j = nstart; j <= nend; ++j) { 142 | val = align_info.get(j + "-" + ind_in); 143 | if(val == null) { 144 | ArrayList al = new ArrayList(); 145 | al.add(anum); 146 | align_info.put(j + "-" + ind_in, al); 147 | } else { 148 | ArrayList al = (ArrayList) val; 149 | al.add(anum); 150 | } 151 | } 152 | } 153 | anum++; 154 | } 155 | } 156 | 157 | int hyp_idx = 0; 158 | int ref_idx = 0; 159 | if(alignment != null) { 160 | for(int i = 0; i < alignment.length; ++i) { 161 | String shift_in_str = ""; 162 | String ref_wd = (ref_idx < ref.length)?String.valueOf(ref[ref_idx]):""; 163 | String hyp_wd = (hyp_idx < hyp.length)?String.valueOf(aftershift[hyp_idx]):""; 164 | int l = 0; 165 | 166 | if(alignment[i] != 'D') { 167 | List val = align_info.get(hyp_idx + "-" + ind_from); 168 | if(val != null) { 169 | // System.out.println("hyp_idx: " + hyp_idx + "," + hyp_wd); 170 | List list = val; 171 | for(int j = 0; j < list.size(); ++j) { 172 | String s = "" + list.get(j); 173 | hstr += " @"; 174 | rstr += " "; 175 | estr += " "; 176 | sstr += " " + s; 177 | for(int k = 1; k < s.length(); ++k) { 178 | hstr += " "; 179 | rstr += " "; 180 | estr += " "; 181 | } 182 | } 183 | } 184 | 185 | val = align_info.get(hyp_idx + "-" + ind_start); 186 | if(val != null) { 187 | // System.out.println("hyp_idx: " + hyp_idx + "," + hyp_wd + "," + alignment.length); 188 | List list = val; 189 | for(int j = 0; j < list.size(); ++j) { 190 | String s = "" + list.get(j); 191 | hstr += " ["; 192 | rstr += " "; 193 | estr += " "; 194 | sstr += " " + s; 195 | for(int k = 1; k < s.length(); ++k) { 196 | hstr += " "; 197 | rstr += " "; 198 | estr += " "; 199 | } 200 | } 201 | } 202 | if(slen > 0) { 203 | val = align_info.get(hyp_idx + "-" + ind_in); 204 | if(val != null) 205 | shift_in_str = SgmlProcessor.join(",", (ArrayList) val); 206 | // if(val != null) System.out.println("shiftstr: " + ref_idx + "," + hyp_idx + "-" + ind_in + ":" + shift_in_str); 207 | } 208 | } 209 | switch (alignment[i]) { 210 | case ' ': 211 | l = Math.max(ref_wd.length(), hyp_wd.length()); 212 | hstr += " " + hyp_wd; 213 | rstr += " " + ref_wd; 214 | estr += " "; 215 | sstr += " "; 216 | for(int j = 0; j < l; ++j) { 217 | if(hyp_wd.length() <= j) hstr += " "; 218 | if(ref_wd.length() <= j) rstr += " "; 219 | estr += " "; 220 | sstr += " "; 221 | } 222 | hyp_idx++; 223 | ref_idx++; 224 | non_inserr++; 225 | break; 226 | case 'S': 227 | case 'T': 228 | l = Math.max(ref_wd.length(), Math.max(hyp_wd.length(), Math.max(1, shift_in_str.length()))); 229 | hstr += " " + hyp_wd; 230 | rstr += " " + ref_wd; 231 | if(hyp_wd.equals("") || ref_wd.equals("")) System.out.println("unexpected empty"); 232 | estr += " " + alignment[i]; 233 | sstr += " " + shift_in_str; 234 | for(int j = 0; j < l; ++j) { 235 | if(hyp_wd.length() <= j) hstr += " "; 236 | if(ref_wd.length() <= j) rstr += " "; 237 | if(j > 0) estr += " "; 238 | if(j >= shift_in_str.length()) sstr += " "; 239 | } 240 | ref_idx++; 241 | hyp_idx++; 242 | non_inserr++; 243 | break; 244 | case 'D': 245 | l = ref_wd.length(); 246 | hstr += " "; 247 | rstr += " " + ref_wd; 248 | estr += " D"; 249 | sstr += " "; 250 | for(int j = 0; j < l; ++j) { 251 | hstr += "*"; 252 | if(j > 0) estr += " "; 253 | sstr += " "; 254 | } 255 | ref_idx++; 256 | non_inserr++; 257 | break; 258 | case 'I': 259 | l = Math.max(hyp_wd.length(), shift_in_str.length()); 260 | hstr += " " + hyp_wd; 261 | rstr += " "; 262 | estr += " I"; 263 | sstr += " " + shift_in_str; 264 | for(int j = 0; j < l; ++j) { 265 | rstr += "*"; 266 | if(j >= hyp_wd.length()) hstr += " "; 267 | if(j > 0) estr += " "; 268 | if(j >= shift_in_str.length()) sstr += " "; 269 | } 270 | hyp_idx++; 271 | break; 272 | } 273 | 274 | if(alignment[i] != 'D') { 275 | List val = align_info.get((hyp_idx-1) + "-" + ind_end); 276 | if(val != null) { 277 | List list = val; 278 | for(int j = 0; j < list.size(); ++j) { 279 | String s = "" + list.get(j); 280 | hstr += " ]"; 281 | rstr += " "; 282 | estr += " "; 283 | sstr += " " + s; 284 | for(int k = 1; k < s.length(); ++k) { 285 | hstr += " "; 286 | rstr += " "; 287 | estr += " "; 288 | } 289 | } 290 | } 291 | } 292 | } 293 | } 294 | // if(non_inserr != ref.length && ref.length > 1) 295 | // System.out.println("** Error, unmatch non-insertion erros " + non_inserr + 296 | // " and reference length " + ref.length ); 297 | String indent = ""; 298 | if (shiftonly) indent = " "; 299 | to_return += "\n" + indent + "REF: " + rstr; 300 | to_return += "\n" + indent + "HYP: " + hstr; 301 | if(!shiftonly) { 302 | to_return += "\n" + indent + "EVAL:" + estr; 303 | to_return += "\n" + indent + "SHFT:" + sstr; 304 | } 305 | to_return += "\n"; 306 | return to_return; 307 | } 308 | 309 | public static void performShiftArray(HashMap> hwords, 310 | int start, 311 | int end, 312 | int moveto, 313 | int capacity) { 314 | 315 | HashMap> nhwords = new HashMap>(); 316 | 317 | if(moveto == -1) { 318 | AlignmentStringUtils.copyHashWords(hwords, nhwords, start, end, 0); 319 | AlignmentStringUtils.copyHashWords(hwords, nhwords, 0, start - 1, end - start + 1); 320 | AlignmentStringUtils.copyHashWords(hwords, nhwords, end + 1, capacity, end + 1); 321 | } else if (moveto < start) { 322 | AlignmentStringUtils.copyHashWords(hwords, nhwords, 0, moveto, 0); 323 | AlignmentStringUtils.copyHashWords(hwords, nhwords, start, end, moveto + 1); 324 | AlignmentStringUtils.copyHashWords(hwords, nhwords, moveto + 1, start - 1, end - start + moveto + 2); 325 | AlignmentStringUtils.copyHashWords(hwords, nhwords, end + 1, capacity, end + 1); 326 | } else if (moveto > end) { 327 | AlignmentStringUtils.copyHashWords(hwords, nhwords, 0, start - 1, 0); 328 | AlignmentStringUtils.copyHashWords(hwords, nhwords, end + 1, moveto, start); 329 | AlignmentStringUtils.copyHashWords(hwords, nhwords, start, end, start + moveto - end); 330 | AlignmentStringUtils.copyHashWords(hwords, nhwords, moveto + 1, capacity, moveto + 1); 331 | } else { 332 | AlignmentStringUtils.copyHashWords(hwords, nhwords, 0, start - 1, 0); 333 | AlignmentStringUtils.copyHashWords(hwords, nhwords, end + 1, end + moveto - start, start); 334 | AlignmentStringUtils.copyHashWords(hwords, nhwords, start, end, moveto); 335 | AlignmentStringUtils.copyHashWords(hwords, nhwords, end + moveto - start + 1, capacity, end + moveto - start + 1); 336 | } 337 | hwords.clear(); 338 | hwords.putAll(nhwords); 339 | } 340 | 341 | static void copyHashWords(HashMap> ohwords, 342 | HashMap> nhwords, 343 | int start, 344 | int end, 345 | int nstart) { 346 | int ind_start = 0; 347 | @SuppressWarnings("unused") int ind_end = 1; 348 | @SuppressWarnings("unused") int ind_from = 2; 349 | int ind_in = 3; 350 | int k = nstart; 351 | 352 | for(int i = start; i <= end; ++k, ++i) { 353 | for(int j = ind_start; j <= ind_in; ++j) { 354 | List val = ohwords.get(i + "-" + j); 355 | if(val != null) { 356 | nhwords.put(k + "-" + j, val); 357 | } 358 | } 359 | } 360 | } 361 | } 362 | -------------------------------------------------------------------------------- /src/ter/TER.java: -------------------------------------------------------------------------------- 1 | /** 2 | * LGPL License 3 | * 4 | * TERtest.java v1 5 | * Matthew Snover (snover@cs.umd.edu) 6 | * 7 | * TERtest.java v2, added SGML input support. 8 | * Shuguang Wang (swang@bbn.com) 9 | */ 10 | package ter; 11 | 12 | import java.io.*; 13 | import java.util.regex.*; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | import java.util.LinkedHashMap; 17 | import java.util.ArrayList; 18 | import java.util.List; 19 | import java.util.Iterator; 20 | import org.w3c.dom.Document; 21 | 22 | import ter.Parameters.OPTIONS; 23 | import ter.core.Alignment; 24 | import ter.core.CostFunction; 25 | import ter.core.TerScorer; 26 | import ter.io.SgmlProcessor; 27 | 28 | public class TER { 29 | public static void main(String[] args) { 30 | // 1. process arguments 31 | Parameters para = new Parameters(); 32 | // paras include Boolean, Double, and Integer 33 | Map paras = para.getOpts(args); 34 | String ref_fn = (String) paras.get(Parameters.OPTIONS.REF); 35 | String hyp_fn = (String) paras.get(Parameters.OPTIONS.HYP); 36 | Object val = paras.get(Parameters.OPTIONS.NORMALIZE); 37 | boolean normalized = (Boolean) val; 38 | val = paras.get(Parameters.OPTIONS.CASEON); 39 | boolean caseon = (Boolean) val; 40 | val = paras.get(Parameters.OPTIONS.NOPUNCTUATION); 41 | boolean nopunct = (Boolean) val; 42 | val = paras.get(Parameters.OPTIONS.ASIANSUPPORT); 43 | boolean asiansupport = (Boolean) val; 44 | val = paras.get(Parameters.OPTIONS.NOTAGBRACKETS); 45 | boolean notagbrackets = (Boolean) val; 46 | val = paras.get(Parameters.OPTIONS.OUTPFX); 47 | String out_pfx; 48 | if(val != null) 49 | out_pfx = (String) val; 50 | else 51 | out_pfx = ""; 52 | val = paras.get(Parameters.OPTIONS.FORMATS); 53 | 54 | List formats = new ArrayList(); 55 | if(val != null) 56 | formats = (List) val; 57 | val = paras.get(Parameters.OPTIONS.BEAMWIDTH); 58 | int beam_width = (Integer) val; 59 | val = paras.get(Parameters.OPTIONS.REFLEN); 60 | String reflen_fn = (String) val; 61 | val = paras.get(Parameters.OPTIONS.TRANSSPAN); 62 | String span_pfx = (String) val; 63 | val = paras.get(Parameters.OPTIONS.SHIFTDIST); 64 | int shift_dist = (Integer) val; 65 | 66 | CostFunction costfunc = new CostFunction(); 67 | costfunc._delete_cost = (Double) paras.get(Parameters.OPTIONS.DELETE_COST); 68 | costfunc._insert_cost = (Double) paras.get(Parameters.OPTIONS.INSERT_COST); 69 | costfunc._shift_cost = (Double) paras.get(Parameters.OPTIONS.SHIFT_COST); 70 | costfunc._match_cost = (Double) paras.get(Parameters.OPTIONS.MATCH_COST); 71 | costfunc._substitute_cost = (Double) paras.get(Parameters.OPTIONS.SUBSTITUTE_COST); 72 | 73 | // 2. init variables 74 | int in_ref_format; 75 | int in_hyp_format; 76 | double TOTAL_EDITS = 0.0; 77 | double TOTAL_WORDS = 0.0; 78 | 79 | // BufferedReader hypstream; 80 | // BufferedReader refstream; 81 | // BufferedReader reflenstream = null; 82 | 83 | Map> hypsegs; 84 | Map> refsegs; 85 | Map> reflensegs = null; 86 | Map> refsids = null; 87 | Map> reflenids = null; 88 | Map refspans = null; 89 | Map hypspans = null; 90 | SgmlProcessor hypsgm = new SgmlProcessor(); 91 | SgmlProcessor refsgm = new SgmlProcessor(); 92 | SgmlProcessor reflensgm = null; 93 | Document hypdoc = hypsgm.parse(hyp_fn); 94 | Document refdoc = refsgm.parse(ref_fn); 95 | Document reflendoc = null; 96 | 97 | // 3. load inputs 98 | if(hypdoc == null) { 99 | hypsegs = load_segs(hyp_fn); 100 | System.out.println("\"" + hyp_fn + "\" was successfully parsed as Trans text"); 101 | in_hyp_format = 1; 102 | } else { 103 | hypsegs = new LinkedHashMap>(); 104 | hypsgm.loadSegs(hypdoc, hypsegs); 105 | in_hyp_format = 2; 106 | } 107 | 108 | if(refdoc == null) { 109 | refsegs = load_segs(ref_fn); 110 | System.out.println("\"" + ref_fn + "\" was successfully parsed as Trans text"); 111 | in_ref_format = 1; 112 | } else { 113 | refsegs = new LinkedHashMap>(); 114 | refsids = new HashMap>(); 115 | refsgm.loadSegs(refdoc, refsegs, refsids); 116 | in_ref_format = 2; 117 | } 118 | 119 | if(reflen_fn != "") { 120 | reflensgm = new SgmlProcessor(); 121 | reflendoc = reflensgm.parse(reflen_fn); 122 | 123 | if(reflendoc == null) { 124 | reflensegs = load_segs(reflen_fn); 125 | System.out.println("\"" + reflen_fn + "\" was successfully parsed as Trans text"); 126 | } else { 127 | reflensegs = new LinkedHashMap>(); 128 | reflenids = new HashMap>(); 129 | refsgm.loadSegs(reflendoc, reflensegs, reflenids); 130 | } 131 | } 132 | 133 | if(span_pfx != "") { 134 | has_span = true; 135 | refspans = load_trans_span(span_pfx+refspan_ext); 136 | hypspans = load_trans_span(span_pfx+hypspan_ext); 137 | } 138 | 139 | // 4. verify input formats 140 | if(!verifyFormats(in_ref_format, in_hyp_format, formats)) System.exit(1); 141 | 142 | // set options to compute TER 143 | TerScorer calc = new TerScorer(); 144 | calc.setNormalize(normalized); 145 | calc.setCase(caseon); 146 | calc.setPunct(nopunct); 147 | calc.setAsian(asiansupport); 148 | calc.setTagBrackets(notagbrackets); 149 | calc.setBeamWidth(beam_width); 150 | calc.setShiftDist(shift_dist); 151 | 152 | // 5. prepare output streams, xml, pra, and ter 153 | BufferedWriter xml_out = openFile(formats, "xml", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 154 | BufferedWriter pra_out = openFile(formats, "pra", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 155 | BufferedWriter prm_out = openFile(formats, "pra_more", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 156 | BufferedWriter ter_out = openFile(formats, "ter", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 157 | BufferedWriter sum_out = openFile(formats, "sum", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 158 | BufferedWriter nbt_out = openFile(formats, "sum_nbest", out_pfx, hyp_fn, ref_fn, reflen_fn, caseon, hypsgm); 159 | 160 | // 6. compute TER 161 | Pattern id_rank = Pattern.compile("^\\s*(.*):([^ ]*)\\s*$", Pattern.CASE_INSENSITIVE); 162 | // For each id 163 | // Map sorted_map = new TreeMap(hypsegs); 164 | // Iterator hypids = (sorted_map.keySet()).iterator(); 165 | Iterator hypids = hypsegs.keySet().iterator(); 166 | while (hypids.hasNext()) { 167 | String id_nrank = ""; 168 | @SuppressWarnings("unused") String rank = ""; 169 | String id = hypids.next(); 170 | List hyps = hypsegs.get(id); 171 | String hypspan = ""; 172 | if(has_span) hypspan = hypspans.get(id); 173 | Matcher id_rank_m = id_rank.matcher(id); 174 | 175 | if(id_rank_m.matches()) { 176 | id_nrank = id_rank_m.group(1); 177 | rank = id_rank_m.group(2); 178 | } else { 179 | id_nrank = id; 180 | id = id + ":1"; 181 | } 182 | 183 | /* Find set of refs */ 184 | if (refsegs.containsKey(id_nrank)) { 185 | System.out.println("Processing " + id); 186 | 187 | List refids; 188 | if(refsids != null && refsids.containsKey(id_nrank)) 189 | refids = refsids.get(id_nrank); 190 | else 191 | refids = new ArrayList(1); 192 | 193 | List reflenseglist = null; 194 | if(reflensegs != null) { 195 | reflenseglist = reflensegs.get(id_nrank); 196 | if(reflenseglist == null) 197 | System.out.println("Warning: NO reference length can be found for hyp: " + refids); 198 | } 199 | 200 | String refspan = ""; 201 | if(has_span) refspan = refspans.get(id_nrank); 202 | 203 | Alignment result = score_all_refs(hyps.get(0), 204 | refsegs.get(id_nrank), 205 | reflenseglist, 206 | refids, 207 | refspan, 208 | hypspan, 209 | costfunc, 210 | calc); 211 | TOTAL_EDITS += result.numEdits; 212 | TOTAL_WORDS += result.numWords; 213 | // 6.1 write output files 214 | try { 215 | if(ter_out != null) 216 | ter_out.write(id + " " + result.numEdits + " " + result.numWords + " " + result.score()+"\n"); 217 | if(xml_out != null) 218 | hypsgm.writeXMLAlignment(xml_out, result, id, (in_ref_format == 1)); 219 | if(pra_out != null) 220 | pra_out.write("Sentence ID: " + id + "\n" + result.toString()+"\n\n"); 221 | if(prm_out != null) 222 | prm_out.write("Sentence ID: " + id + "\n" + result.toMoreString()+"\n\n"); 223 | if(sum_out != null) 224 | writeSummary(sum_out, result, id); 225 | if(nbt_out != null) 226 | writeNbestSum(nbt_out, result, id); 227 | } catch (IOException ioe) { 228 | System.out.println(ioe); 229 | return; 230 | } 231 | } else { 232 | System.out.println("***ERROR*** No reference for segment " + id_nrank); 233 | System.exit(1); 234 | } 235 | } 236 | 237 | closeFile(xml_out, "xml", hypsgm); 238 | closeFile(pra_out, "pra", hypsgm); 239 | closeFile(prm_out, "pra_more", hypsgm); 240 | closeFile(ter_out, "ter", hypsgm); 241 | closeFile(sum_out, "sum", hypsgm); 242 | closeFile(nbt_out, "sum_nbest", hypsgm); 243 | 244 | System.out.println("Total TER: " + (TOTAL_EDITS / TOTAL_WORDS) + " (" + 245 | TOTAL_EDITS + "/" + TOTAL_WORDS + ")"); 246 | System.out.println("Number of calls to beam search: " + 247 | calc.numBeamCalls()); 248 | System.out.println("Number of segments scored: " + 249 | calc.numSegsScored()); 250 | System.out.println("Number of shifts tried: " + 251 | calc.numShiftsTried()); 252 | } 253 | 254 | public static BufferedWriter openFile(List formats, 255 | String type, 256 | String out_pfx, 257 | String hyp_fn, 258 | String ref_fn, 259 | String reflen_fn, 260 | boolean caseon, 261 | SgmlProcessor sgml) { 262 | BufferedWriter bw = null; 263 | if(out_pfx != "" && formats != null && formats.contains(type)) { 264 | try { 265 | bw = new BufferedWriter(new FileWriter(out_pfx + "." + type)); 266 | if (type.equals("xml")) 267 | sgml.writeXMLHeader(bw, hyp_fn, ref_fn, caseon); 268 | else if (type.equals("sum")) { 269 | bw.write("Hypothesis File: " + hyp_fn + "\nReference File: " + ref_fn + "\n" + 270 | "Ave-Reference File: " + ((reflen_fn == "")?ref_fn:reflen_fn) + "\n"); 271 | bw.write(String.format("%1$-19s | %2$-4s | %3$-4s | %4$-4s | %5$-4s | %6$-4s | %7$-6s | %8$-8s | %9$-8s\n", 272 | "Sent Id", "Ins", "Del", "Sub", "Shft", "WdSh", "NumEr", "NumWd", "TER")); 273 | bw.write("-------------------------------------------------------------------------------------\n"); 274 | } else if(type.equals("ter")) 275 | bw.write("Hypothesis File: " + hyp_fn + "\nReference File: " + ref_fn + "\n"); 276 | } catch (IOException ioe) { 277 | System.out.println(ioe); 278 | } 279 | } 280 | 281 | return bw; 282 | } 283 | 284 | public static void closeFile(BufferedWriter bw, 285 | String type, SgmlProcessor sgml) { 286 | 287 | if(bw != null) { 288 | try { 289 | if(type.equals("xml")) 290 | sgml.writeXMLFooter(bw); 291 | else if(type.equals("sum")) { 292 | bw.write("-------------------------------------------------------------------------------------\n"); 293 | bw.write(String.format("%1$-19s | %2$-4d | %3$-4d | %4$-4d | %5$-4d | %6$-4d | %7$-6.1f | %8$-8.3f | %9$-8.3f\n", 294 | "TOTAL", tot_ins, tot_del, tot_sub, tot_sft, tot_wsf, tot_err, tot_wds, tot_err*100.0/tot_wds)); 295 | } 296 | bw.close(); 297 | } catch (IOException ioe) { 298 | System.out.println(ioe); 299 | return; 300 | } 301 | } 302 | } 303 | 304 | public static void writeSummary(BufferedWriter sum, 305 | Alignment result, 306 | String id) { 307 | try { 308 | result.populateScoreDetails(); 309 | 310 | sum.write(String.format("%1$-19s | %2$4d | %3$4d | %4$4d | %5$4d | %6$4d | %7$6.1f | %8$8.3f | %9$8.3f\n", 311 | id, result.numIns, result.numDel, result.numSub, result.numSft, result.numWsf, result.numEdits, result.numWords, result.score()*100.0)); 312 | tot_ins += result.numIns; 313 | tot_del += result.numDel; 314 | tot_sub += result.numSub; 315 | tot_sft += result.numSft; 316 | tot_wsf += result.numWsf; 317 | tot_err += result.numEdits; 318 | tot_wds += result.numWords; 319 | 320 | } catch (IOException ioe) { 321 | System.out.println(ioe); 322 | return; 323 | } 324 | } 325 | 326 | public static void writeNbestSum(BufferedWriter nbt, 327 | Alignment result, 328 | String id) { 329 | try { 330 | result.populateScoreDetails(); 331 | 332 | nbt.write(String.format("%1$-19s %2$4d %3$4d %4$4d %5$4d %6$4d %7$6.1f %8$8.3f %9$8.3f\n", 333 | id, result.numIns, result.numDel, result.numSub, result.numSft, result.numWsf, result.numEdits, result.numWords, result.score()*100.0)); 334 | 335 | } catch (IOException ioe) { 336 | System.out.println(ioe); 337 | return; 338 | } 339 | } 340 | 341 | // it will be more flexible to verify the input formats later. 342 | public static boolean verifyFormats(int in_ref_format, 343 | int in_hyp_format, 344 | List out_formats) 345 | { 346 | if(in_ref_format != in_hyp_format) { 347 | System.out.println("** Error: Both hypothesis and reference have to be in the SAME format"); 348 | return false; 349 | } else if (in_ref_format == 1 && out_formats.indexOf("xml") > -1) { 350 | System.out.println("** Warning: XML ouput may not have correct doc id for Trans format inputs"); 351 | return true; 352 | } else 353 | return true; 354 | } 355 | 356 | public static Alignment score_all_refs(String hyp, 357 | List refs, 358 | List reflens, 359 | List refids, 360 | String refspan, 361 | String hypspan, 362 | CostFunction costfunc, 363 | TerScorer calc) { 364 | double totwords = 0; 365 | String ref; 366 | String refid = ""; 367 | String bestref = ""; 368 | @SuppressWarnings("unused") String reflen = ""; 369 | 370 | Alignment bestresult = null; 371 | 372 | if(has_span && refs.size() > 1) { 373 | System.out.println("Error, translation spans should only be used with SINGLE reference"); 374 | System.exit(1); 375 | } 376 | 377 | calc.setRefLen(reflens); 378 | /* For each reference, compute the TER */ 379 | for (int i = 0; i < refs.size(); ++i) { 380 | ref = (String) refs.get(i); 381 | if(!refids.isEmpty()) 382 | refid = refids.get(i); 383 | 384 | if(has_span) { 385 | calc.setRefSpan(refspan); 386 | calc.setHypSpan(hypspan); 387 | } 388 | 389 | Alignment result = calc.TER(hyp, ref, costfunc); 390 | 391 | if ((bestresult == null) || (bestresult.numEdits > result.numEdits)) { 392 | bestresult = result; 393 | if(!refids.isEmpty()) bestref = refid; 394 | } 395 | 396 | totwords += result.numWords; 397 | } 398 | bestresult.numWords = ((double) totwords) / ((double) refs.size()); 399 | if(!refids.isEmpty()) bestresult.bestRef = bestref; 400 | return bestresult; 401 | } 402 | 403 | public static Map> load_segs(String fn) { 404 | Pattern p = Pattern.compile("^\\s*(.*?)\\s*\\(([^()]+)\\)\\s*$"); 405 | BufferedReader stream; 406 | Map> segs = new LinkedHashMap>(); 407 | 408 | try { 409 | stream = new BufferedReader(new FileReader(fn)); 410 | } catch (IOException ioe) { 411 | System.out.println(ioe); 412 | return null; 413 | } 414 | 415 | try { 416 | String line; 417 | while ((line = stream.readLine()) != null) { 418 | if (line.matches("^\\s*$")) { 419 | continue; 420 | } 421 | Matcher m = p.matcher(line); 422 | if (m.matches()) { 423 | String text = m.group(1); 424 | String id = m.group(2); 425 | List val = segs.get(id); 426 | List al; 427 | if (val == null) { 428 | al = new ArrayList(6); 429 | segs.put(id, al); 430 | } else { 431 | al = val; 432 | } 433 | 434 | al.add(text.trim()); 435 | } else { 436 | System.out.println("Warning, Invalid line: " + line); 437 | } 438 | } 439 | } catch(IOException ioe) { 440 | System.out.println(ioe); 441 | return null; 442 | } 443 | return segs; 444 | } 445 | 446 | private static HashMap load_trans_span(String fn) { 447 | Pattern p = Pattern.compile("^\\s*(.*?)\\s*\\(([^()]+)\\)\\s*$"); 448 | BufferedReader stream; 449 | HashMap spans = new HashMap(); 450 | 451 | try { 452 | stream = new BufferedReader(new FileReader(fn)); 453 | 454 | String line; 455 | while ((line = stream.readLine()) != null) { 456 | if (line.matches("^\\s*$")) { 457 | continue; 458 | } 459 | Matcher m = p.matcher(line); 460 | if (m.matches()) { 461 | String text = m.group(1); 462 | String id = m.group(2); 463 | Object val = spans.get(id); 464 | if (val == null) { 465 | spans.put(id, text); 466 | } else { 467 | System.out.println("Error, translation spans should only be used with SINGLE reference"); 468 | System.exit(1); 469 | } 470 | } else { 471 | System.out.println("Warning, Invalid line: " + line); 472 | } 473 | } 474 | } catch (IOException ioe) { 475 | System.out.println(ioe); 476 | return null; 477 | } 478 | 479 | return spans; 480 | } 481 | 482 | private static int tot_ins = 0; 483 | private static int tot_del = 0; 484 | private static int tot_sub = 0; 485 | private static int tot_sft = 0; 486 | private static int tot_wsf = 0; 487 | private static float tot_err = 0; 488 | private static float tot_wds = 0; 489 | private static String refspan_ext = ".ref"; 490 | private static String hypspan_ext = ".hyp"; 491 | private static boolean has_span = false; 492 | } 493 | -------------------------------------------------------------------------------- /sample-data/hyp.txt: -------------------------------------------------------------------------------- 1 | He noted that the authorities responsible for the British American Security insisted the questioning of all the passengers , " he said , adding that the passengers were forced to quit the plane and interrogated by American authorities . (AFA20040101.0510-8) 2 | Al Baz , was fired during a visit to Israel in Ramallah last August the idea of a new truce between the Palestinians and Israel but between Palestinian factions under Egyptian auspices on this matter did not result in as a result . (AFA20040101.5400-6) 3 | JERUSALEM 1 2 ( AFP ) The correspondence of the former Israeli army newspaper Ben striking shift in the field of public information programme for selected after " paint " Network television broadcast by Cable in Israel . (AFA20040102.2700-3) 4 | The Turkish authorities on 26 December that it dismantled the group associated with the Turkish terrorist organization Al Qaeda responsible for a series of attacks . (AFA20040102.6110-5) 5 | For his part , called for the Colombian president Alvaro ( not represented are related to the minister of defence ) rebels to " resolve themselves and return to constitutional way " and asserted that " terrorism will not win , " in Colombia . (AFA20040103.0120-5) 6 | He will be divided into several groups of pilgrims to perform the services it will be deployed special forces to interfere in the case of congestion . (AFA20040103.2900-8) 7 | ISLAMABAD , 3.1 ( AFP ) Indian Prime Minister Atal Bihari Vajpayee on Saturday in Islamabad in the first visit by the chairman of the Indian government to Pakistan since February 1999 . (AFA20040103.5700-2) 8 | Khaddam has made these statements at the end of his meeting with a delegation of Iraqi clans to visit Damascus . (AFA20040103.7010-6) 9 | The Israeli ministerial committee decided Wednesday to increase the number of the population of the agricultural settlements in the Golan Heights by 50 percent . (AFA20040103.7010-7) 10 | Said a source close to the minors that the prince of Monaco was " fatigue " after the end of the Year celebrations would be subject to a series of medical examinations . (AFA20040103.9100-4) 11 | Army was pursuing an extensive process for more than two weeks in Nablus in search of Palestinian activists . (AFA20040103.9200-6) 12 | Tony Blair Put on President Mubarak New Ideas to Advance the Peace Process ( Maher ) (AFA20040104.3110-1) 13 | Said Maher told reporters that Blair , who met with Mubarak on Saturday night in Sharm el Sheikh resort on the Red Sea , he proposed that the quartet to play an important role through some new ideas put forward by . " (AFA20040104.3110-3) 14 | The expert who requested anonymity said that " the situation of the matter is linked to the dead bodies " . (AFA20040104.7300-4) 15 | . consultant before entering the hospital , " We have a difference in the judicial police and the gendarmerie in Paris is prepared to intervene when the teams to identify with what we need it . " (AFA20040104.7300-5) 16 | He added , " when there is a Palestinian action against the Israelis do not abstain without the minimum move the world nothing about the Israeli crimes in Nablus and Rafah . " particular (AFA20040104.8400-4) 17 | News Agency quoted the Saudi Prince Nayef as saying before leaving Riyadh on Saturday , heading to Tunisia to participate in the meetings of Arab interior ministers " even though the channels open between us and them ( British ) did not for aircraft of anything . " (AFA20040104.9200-3) 18 | On 24 December reported that Annan will meet 19 of the current Iraqi transitional governing council to examine the role of the United Nations could play in Iraq . (AFA20040105.3220-4) 19 | Straw . " I think we have established a relationship of trust with Libya renounce terrorism and provided first and second to give up on the continuing its programme for developing weapons of mass destruction . " (AFA20040105.4910-7) 20 | Washington 6 1 ( AFP ) American official announced yesterday a Turkish prime minister Rajab Tayib Ardogan would travel to the United States the end of January in an official visit in Iraq in particular section of the Kurdish section of the larger . (AFA20040105.6000-2) 21 | Riyadh 5 1 ( AFP ) Saudi security source said on Monday told Agence France Presse , was found on Sunday night to Monday a bomb in the office of special telephone in Riyadh , was withdrawn " from the place , without mention any details about the nature of the bomb . (AFA20040105.6200-2) 22 | TEHRAN , 6 1 ( AFP ) Iranian Foreign Ministry spokesman said that Iran welcomes the adoption of its neighbour Afghanistan a new constitution continues its full support of the interim government headed by Hamid Karzai . (AFA20040106.6100-2) 23 | Hamid Reza Asefi in a statement broadcast by the Iranian news agency on Monday evening to build the first constitution since the fall of the Taliban regime is " an important step . " (AFA20040106.6100-3) 24 | North Korea and Iran Participated in the Development of Weapons of Mass Destruction in Libya ( Newspaper ) (AFA20040106.7300-1) 25 | The newspaper said that the Libyan declaration would allow the inspectors examined each of military technology from North Korea and Iran . (AFA20040106.7300-6) 26 | The army offered his condolences to the families of soldiers in a statement . (AFA20040129.5120-6) 27 | WASHINGTON , 30 1 ( AFP ) Democratic MP Victor Snider's Friday that the idea of American parliamentarians who visited Iran considered this week during a dinner between the Iranian ambassador members of Congress to the United Nations Mohammad Javad Zarif . (AFA20040130.6910-2) 28 | " I believe that this is only in being recognized , the world would be better if Iran and the United States to overcome the differences . " The lack of confidence (AFA20040130.6910-9) 29 | Bremer said that " these officers have the Iraqi and other tangible progress to ensure security and the future of the new Iraq . (AFA20040131.3700-8) 30 | Before that , newspapers reported today that the information or guidance system to a breakdown in may be behind the incident . (AFA20040131.4510-5) 31 | Relations with Tunisia , United States of America on the distinctive and traditional relations of mutual trust and respect due to the eighteenth century , particularly to the year the date of the first trade agreement between Tunisia and the United States . (AKHB20040219.001-1) 32 | Note at the outset that there was no delay in the area of democracy and freedom of expression in Tunisia in building a democratic society go hand in hand with the work of the comprehensive development . (AKHB20040221.001-1) 33 | The secularists are aware that the United States , for a long time , took advantage of factors's underdevelopment humiliating subordination to the west , namely secular curriculum , which advocated by some of us ? (ALH20040204.001-1) 34 | And here I am once again at odds with the governing council , it comprises seven members of the judgments in personal or eight national who do not emerge from their patriotism , including specific Bajah Adnan g . , which remind him in his capacity as chairman of the governing council this month . (ALH20040204.007-3) 35 | Saudi Arabia have sharia courts after the issuance of the arbitration act of the new Saudi that the arbitration clause , however , contend with its jurisdiction with and in spite of the arbitration clause despite the adherence to one of the parties to the dispute to arbitration . (ALH20040206.002-5) 36 | SYRIA : An international arbitration was sentenced to between the company , " Stahl any and ( Swiss ) and " Public Company for Fertilizers , " the Syrian government company . (ALH20040206.002-7) 37 | Income , which went away . (ALH20040206.003-8) 38 | Khomeini a quarter of a century ago has attracted people's thoughts . (ALH20040220.001-2) 39 | It is not on the future of the Mohammad Khatami . (ALH20040220.001-8) 40 | Suspicion and malaise and anger are usually the Iraqis . (ALH20040227.001-5) 41 | Poor , tired of his mission . (ALH20040227.001-6) 42 | Clear that the massacre yesterday aimed at setting on fire . (ALH20040303.001-5) 43 | Has been charged with the Iraqi U . S . forces of involvement in the explosions Karbala and Baghdad yesterday . (ALH20040304.001-1) 44 | In any case , the greater Middle East initiative to rely heavily on the Arab human development plan for 2002 and 2003 . (ALH20040304.002-6) 45 | It the reports of the task often be decisive . (ALH20040305.001-9) 46 | When Ramon does not see the lie , it is used , in one of those attributed to the American Defense Secretary Donald Rumsfeld as saying that Tehran helped al Qaeda and Taliban flight via Iran . (ALH20040305.002-1) 47 | Said was issued in January ( January 2002 , we now know that every accusation made by the American administration before the war on Iraq were false . (ALH20040305.002-3) 48 | At the outset to thank you for the warm welcome and rejoicing , which them . (AMI20020707.001-1) 49 | That the objective of the visit is to contact you talk with you on national issues and local information on the quarters undertaken by the development process . (AMI20020707.001-3) 50 | From the rightist , especially the left to that Spain was sponsoring the interests of the great Saddam course is not the size of the interests of French huge during all the covenant . (ANN20040204.001-2) 51 | Recognize as , in particular , the American double standards for a long time in Iraq especially during the 1980s . (ANN20040204.001-3) 52 | Have not yet done so , or rather , are not used to act as the state preparing to become healthier than before . (ANN20040204.002-2) 53 | Opposition , as opposed to the West should remain strong , just as British Prime Minister Tony Blair wished his country when he said that sad because his opposition to the weak . (ANN20040204.003-3) 54 | We assure the prime minister we say to them : Do not be sad , by the great strong it seems it is growing day by day . (ANN20040204.003-4) 55 | It aimed at creating an atmosphere of terror in the same is not only think about to embark on a specific offence for various reasons but also in from the dream , as the authority believes the idea of freedom , freedom and the opposition or the heart of the system and composition of the ruling or even the idea of insurgency in this critical stage through the region , which terrified many rulers . (ANN20040204.004-4) 56 | Practical In fact , in which Lebanon is a clear indication as you say " state law " or " state " . (ANN20040204.004-7) 57 | That people who govern in accordance with the standards of the class type spent from the old and intractable as long as we were told that it had produced the war . (ANN20040204.005-8) 58 | Thus in each stage of the export of Iraqi oil has been made by the US quota formula for petroleum and other companies including American , through the branch " Sonatrach " in London . (ANN20040205.002-2) 59 | At Home is the spectre of war and death in the cellars of intelligence of all people , even those who thought that they accepted by the regime or are engaged in its policy . (ANN20040220.001-6) 60 | The former based on exclusivity , by level , it is obedience and discipline on any other even if that is wrong pursuant to Rule the golden she said , " carried out first , to object to , if cut warheads . (ANN20040227.001-3) 61 | Wishers to talk about the June 2005 as a benefit not flight , unless Lebanon has succeeded on the occasion of the presidential elections , renewal of the international confidence in it , and today is non existent after the Paris commitments were lost in wars of the heads of states . (ANN20040227.002-2) 62 | No matter how friendly the French President Jacques Chirac to Lebanon , whatever the specific relations with the assumption that ( Prime Minister Rafiq al Hariri to remain in place last ) is Paris without European accord . (ANN20040227.002-4) 63 | To see reason to be in the dimension of the Iraqi issues : First , where the damage Pearl's lucrative deals on the reconstruction of Iraq before the war has begun investigations which include companies . (ANN20040228.001-7) 64 | There is no more than right wing governments in the history of Israel , but also Israeli public opinion , a hard liner also believes the background of the survival and continuation of this government to nurture each other . (ANN20040229.001-2) 65 | Only way that inspired some to Damascus , despite their acceptance of the participation and the devaluation of the level , nothing has changed its position of strategic and Syria in Lebanon . (ANN20040304.004-5) 66 | The official Arab system like the critical towards the wagon down to the abyss more quickly over time , as does not have a designated stop the rush , not even the mitigation . (ANN20040304.005-1) 67 | However , maintained the former president of the opposition to the extension of President Emile Lahoud , does not hesitate to correct a distortion of the phalanges old believes that the deliberate hit finally , is that the Phalangist Party support the president of the Republic , president of the republic , has not been submitted to the support of the renewal of the mandate extension . (ANN20040305.001-3) 68 | It is tying on a correlation in the course of destiny , even the president of the Lebanese Republic is Syria making it is not making Lebanon , and the formation of governments in Lebanon would be intervention to overcome the obstacles to a Syrian . (ANN20040309.001-6) 69 | No one is unaware of the fact that we agree and disagree on the same boat . . that our sole and eternal the final final the only eternal and Jordan to inherit the earth and God . (EJO20031223.001-3) 70 | That the development of our partnership strong and justified by the facts of history and geography with the requirements of logic . (MOFA20040219.001-1) 71 | Human development and economic prosperity are key to a better future for the Middle East . (MOFA20040219.002-1) 72 | This is the importance of cooperation between local administrations across the border because the development of science and knowledge and know no boundaries and geographical it belongs to all mankind . (OMSAR20040107.001-4) 73 | Includes the protocol of agreement to bring about a separate account the value of 10.2 million euros to finance the projects prepared by the treatment by municipal solid waste management , Minister of Administrative Development and control of the European Union Commission . (OMSAR20040107.002-2) 74 | For example , industrial revolution not only to day or days or a few years . (SPC20030310.001-5) 75 | The council to pay for several years , special attention to the theme of the children of refugees and internally displaced persons who were forced to flee their homes , as well as children who have been recruited and sexual exploitation , giving particular importance to the girls because of their more vulnerable to additional risks , particularly sexual violence . (UN20040101.002-3) 76 | My delegation is satisfied with the completion of a large part of the recommendations of the mission visited Central Africa from 7 to 16 June 2003 . (UN20040101.005-6) 77 | Could not be underestimated from the central element of national reconciliation in the rehabilitation of societies that have suffered the bitterness of the armed conflict and its implications on the social fabric of economic infrastructure of states and regions concerned , and to enable them to the page of the past and look into the future of peaceful coexistence and cooperation and unity of purpose and interests . (UN20040101.007-2) 78 | As the governor said , by improving the financial market rules and regulations , organized by the clear and objective will help improve the efficiency of investment to stimulate higher rates of savings of citizens . (XIN20040103.0035-4) 79 | It is also expected to meet with Syrian citizens living in Istanbul , Turkish businessmen . (XIN20040105.0070-7) 80 | The peace process depends on the cease fire . (XIN20040107.0046-7) 81 | The Israeli government did not have any step for the advancement of the peace process between the two sides as they continue the closure of offices and Palestinian institutions in Jerusalem continued detention of prisoners and detainees have not withdrawn to what it was before 28 September 2000 as part of the obligations on the Israeli government in the first phase of the plan " road map " which provides the establishment of a Palestinian state by 2005 . (XIN20040108.0031-5) 82 | Lhasa January 10 ( Xinhua ) Private economy witnessed Tibet Autonomous Region in southwest China has developed rapidly in recent years with the support of the central government , according to Chow Chung Chieu director of the Department of Industry and Trade Area of Tibet . (XIN20040110.0019-2) 83 | , demonstrated today in the 200 armed Palestinians from Fatah , in protest against the assassination attempt against their commander in the northern West Bank Zakariya Zabidi Battalions vowed reprisals . (XIN20040113.0114-7) 84 | Train Rapid Between India and Pakistan up to in India (XIN20040115.0055-1) 85 | Tokyo January 16 ( Xinhua ) The Japanese government decided Friday to extend the participation of the Japanese forces in the task of the United Nations peacekeeping in the Golan Heights occupied by Israel , following the issuance of the Security Council a decision on the continuation of a task . (XIN20040115.0212-2) 86 | Received January 16 ( Xinhua ) Kuwait , Sheikh Mohammad Sabah al Salem al Sabah , Kuwait's foreign minister , a telephone conversation this evening his American counterpart , Colin Powell addressed the American Declaration of Kuwait as strategic ally outside NATO . American (XIN20040116.0152-2) 87 | The Falcon in a press statement distributed here this evening that this place distinctive Kuwait with the United States will enable it to transfer the voice of the peoples of the region and hopes for the issues and problems . (XIN20040116.0152-5) 88 | CANBERRA , January 17 ( Xinhua ) Australian Foreign Minister Alexander Downer said today that negotiations with the United States on a bilateral free trade agreement still faces problems . (XIN20040117.0050-2) 89 | The company will also guide the electricity market will price of electricity prices in peak season . (XIN20040117.0140-6) 90 | The Chinese ambassador would be reviewed during the visit of the agreement of strategic cooperation between Egypt and China on the occasion of the five years to sign it , he pointed out that would be signed several agreements between the two countries , including agreements in the areas of economic and information . (XIN20040118.0127-4) 91 | Egyptian Communications Company Launched New Television Station in Iraq (XIN20040121.0073-1) 92 | Fault occurred in the Wednesday night , on the eve of the traditional Chinese lunar new year . (XIN20040123.0090-5) 93 | Chinese Economic and Financial Figures in the First and Last Week as Well as (XIN20040127.0006-1) 94 | Of the local government statistics show that there were 139 farm raising poultry , 36.63 . market , supermarket , 31 for the slaughter of chickens in Beijing . (XIN20040128.0114-6) 95 | SANAA , January 28 ( Xinhua ) The sources said Yemeni forces units of the Yemeni and American Coast Guard concluded today joint military exercises in the Gulf of Aden . (XIN20040128.0119-2) 96 | Made chairman of the committee on defence and interior , this statement during the parliament to report to parliament on the visit by a delegation of the Committee to China . (XIN20040128.0152-3) 97 | " said that China provide an opportunity for Zimbabwe to establish cooperation with the state was prepared to offer friendly assistance . " (XIN20040128.0152-4) 98 | Kabul January 29 ( Xinhua ) Manuel de Almeida the Selva of the United Nations spokesman here today that 2500 Afghan refugees from eight states in particular Iran have returned to their country so far in 2004 . (XIN20040129.0044-2) 99 | On the other hand , 50 refugees from seven other states with the assistance of the agency . (XIN20040129.0044-5) 100 | The number of Palestinian prisoners , prison authorities yesterday returned 37 prisoners who were released today , and today their 16 prisoners , without mentioning names other reasons for the write off . (XIN20040129.0054-3) 101 | -------------------------------------------------------------------------------- /src/ter/core/TerScorer.java: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright� 2006 by BBN Technologies and University of Maryland (UMD) 4 | 5 | License: LGPL v. 2.1. See LICENSE.txt. 6 | 7 | TERcalc.java v1 8 | Matthew Snover (snover@cs.umd.edu) 9 | 10 | */ 11 | 12 | package ter.core; 13 | 14 | import java.util.HashSet; 15 | import java.util.TreeSet; 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | import java.util.Set; 19 | import java.util.List; 20 | import java.util.Arrays; 21 | import java.util.ArrayList; 22 | import java.util.Iterator; 23 | 24 | import ter.util.IntPair; 25 | import ter.util.Pair; 26 | import ter.util.StringUtils; 27 | 28 | public class TerScorer { 29 | /* Turn on if you want a lot of debugging info. */ 30 | static final private boolean DEBUG = false; 31 | private boolean normalized = false; 32 | private boolean caseon = false; 33 | private boolean nopunct = false; 34 | private boolean asiansupport = false; 35 | private boolean notagbrackets = false; 36 | private IntPair[] refSpans = null; 37 | private IntPair[] hypSpans = null; 38 | public double ref_len = -1.; 39 | 40 | public void setNormalize(boolean b) { 41 | normalized = b; 42 | } 43 | 44 | public void setCase(boolean b) { 45 | caseon = b; 46 | } 47 | 48 | public void setPunct(boolean b) { 49 | nopunct = b; 50 | } 51 | 52 | public void setAsian(boolean b) { 53 | asiansupport = b; 54 | } 55 | 56 | public void setTagBrackets(boolean b) { 57 | notagbrackets = b; 58 | } 59 | 60 | public void setBeamWidth(int i) { 61 | BEAM_WIDTH = i; 62 | } 63 | 64 | public void setShiftDist(int i) { 65 | MAX_SHIFT_DIST = i; 66 | } 67 | 68 | public void setRefSpan(String span) { 69 | if(span != null && span.trim() != "") { 70 | String[] spans = span.split("\\s+"); 71 | refSpans = new IntPair[spans.length]; 72 | for(int i = 0; i < spans.length; ++i) { 73 | String s[] = spans[i].split(":"); 74 | refSpans[i] = new IntPair(Integer.valueOf(s[0]), 75 | Integer.valueOf(s[1])); 76 | } 77 | } 78 | } 79 | 80 | public void setHypSpan(String span) { 81 | if(span != null && span.trim() != "") { 82 | // hypSpans = span.split("\\s+"); 83 | String[] spans = span.split("\\s+"); 84 | hypSpans = new IntPair[spans.length]; 85 | for(int i = 0; i < spans.length; ++i) { 86 | String s[] = spans[i].split(":"); 87 | hypSpans[i] = new IntPair(Integer.valueOf(s[0]), 88 | Integer.valueOf(s[1])); 89 | } 90 | } 91 | } 92 | 93 | public void setRefLen(List reflens) { 94 | String reflen = ""; 95 | 96 | if (reflens == null || reflens.size() == 0) { 97 | ref_len = -1.0; 98 | return; 99 | } 100 | 101 | ref_len = 0.0; 102 | for (int i = 0; i < reflens.size(); ++i) { 103 | reflen = reflens.get(i); 104 | if(reflen.length() == 0) 105 | ref_len += 0.; 106 | else 107 | ref_len += tokenize(reflen).length; 108 | } 109 | ref_len /= reflens.size(); 110 | } 111 | 112 | public void setRefLen(double d) { 113 | ref_len = (d >= 0) ? d : -1; 114 | } 115 | 116 | public Alignment TER(String[] hyp, String[] ref) { 117 | return TER(hyp, ref, new CostFunction()); 118 | } 119 | 120 | public Alignment TER(String hyp, String ref) { 121 | return TER(hyp, ref, new CostFunction()); 122 | } 123 | 124 | public Alignment TER(String hyp, String ref, CostFunction costfunc) { 125 | /* Tokenize the strings and pass them off to TER */ 126 | Alignment to_return; 127 | 128 | if(!caseon) { 129 | hyp = hyp.toLowerCase(); 130 | ref = ref.toLowerCase(); 131 | } 132 | 133 | if(ref.length() == 0 || hyp.length() == 0) { 134 | to_return = TERnullstr(hyp, ref, costfunc); 135 | if (ref_len >= 0) to_return.numWords = ref_len; 136 | } else { 137 | String[] hyparr = tokenize(hyp); 138 | String[] refarr = tokenize(ref); 139 | 140 | to_return = TER(hyparr, refarr, costfunc); 141 | if(ref_len >= 0) to_return.numWords = ref_len; 142 | } 143 | return to_return; 144 | } 145 | 146 | public Alignment TERnullstr(String hyp, String ref, CostFunction costfunc) { 147 | Alignment to_return = new Alignment(); 148 | String [] hyparr = tokenize(hyp); 149 | String [] refarr = tokenize(ref); 150 | 151 | if(hyp.length() == 0 && ref.length() == 0) { 152 | to_return.numWords = 0; 153 | to_return.numEdits = 0; 154 | } else if (hyp.length() == 0) { 155 | to_return.alignment = new char[refarr.length]; 156 | for(int i = 0; i < refarr.length; ++i) 157 | to_return.alignment[i] = 'D'; 158 | to_return.numWords = refarr.length; 159 | to_return.numEdits = refarr.length; 160 | } else { 161 | to_return.alignment = new char[hyparr.length]; 162 | for(int i = 0; i < hyparr.length; ++i) 163 | to_return.alignment[i] = 'I'; 164 | to_return.numWords = 0; 165 | to_return.numEdits = hyparr.length; 166 | } 167 | to_return.hyp = hyparr; 168 | to_return.ref = refarr; 169 | to_return.aftershift = hyparr; 170 | 171 | return to_return; 172 | } 173 | 174 | public Alignment TER(String[] hyp, String[] ref, 175 | CostFunction costfunc) { 176 | /* Calculates the TER score for the hyp/ref pair */ 177 | Map, Set> rloc = BuildWordMatches(hyp, ref); 178 | Alignment cur_align = MinEditDist(hyp,ref,costfunc,hypSpans); 179 | String[] cur = hyp; 180 | 181 | cur_align.hyp = hyp; 182 | cur_align.ref = ref; 183 | cur_align.aftershift = hyp; 184 | 185 | double edits = 0; 186 | @SuppressWarnings("unused") int numshifts = 0; 187 | ArrayList allshifts = new ArrayList(hyp.length+ref.length); 188 | 189 | if (DEBUG) 190 | System.out.println("Initial Alignment:\n" + cur_align + "\n"); 191 | 192 | while (true) { 193 | Object[] returns = CalcBestShift(cur, hyp, ref, rloc, cur_align, 194 | costfunc); 195 | if (returns == null) { 196 | break; 197 | } 198 | Shift bestShift = (Shift) returns[0]; 199 | edits += bestShift.cost; 200 | cur_align = (Alignment) returns[1]; 201 | 202 | 203 | bestShift.alignment = cur_align.alignment; 204 | bestShift.aftershift = cur_align.aftershift; 205 | 206 | 207 | allshifts.add(bestShift); 208 | cur = cur_align.aftershift; 209 | } 210 | 211 | Alignment to_return = cur_align; 212 | to_return.allshifts = allshifts.toArray(new Shift[0]); 213 | 214 | to_return.numEdits += edits; 215 | 216 | NUM_SEGMENTS_SCORED++; 217 | return to_return; 218 | } 219 | 220 | public String[] tokenize(String s) { 221 | return Normalizer.tokenize(s, normalized, nopunct, asiansupport, notagbrackets); 222 | } 223 | 224 | private Map, Set> BuildWordMatches(String[] hyp, 225 | String[] ref) { 226 | Set hwhash = new HashSet(); 227 | for (int i = 0; i < hyp.length; i++) { 228 | hwhash.add(hyp[i]); 229 | } 230 | boolean[] cor = new boolean[ref.length]; 231 | for (int i = 0; i < ref.length; i++) { 232 | if (hwhash.contains(ref[i])) { 233 | cor[i] = true; 234 | } else { 235 | cor[i] = false; 236 | } 237 | } 238 | 239 | List reflist = Arrays.asList(ref); 240 | HashMap, Set> to_return = new HashMap, Set>(); 241 | for (int start = 0; start < ref.length; start++) { 242 | if (cor[start]) { 243 | for (int end = start; ((end < ref.length) && 244 | (end - start <= MAX_SHIFT_SIZE) && 245 | (cor[end])); 246 | end++) { 247 | List topush = reflist.subList(start, end+1); 248 | if (to_return.containsKey(topush)) { 249 | Set vals = to_return.get(topush); 250 | vals.add(new Integer(start)); 251 | } else { 252 | Set vals = new TreeSet(); 253 | vals.add(new Integer(start)); 254 | to_return.put(topush, vals); 255 | } 256 | } 257 | } 258 | } 259 | return to_return; 260 | } 261 | 262 | private static void FindAlignErr(Alignment align, boolean[] herr, 263 | boolean[] rerr, 264 | int[] ralign) { 265 | int hpos = -1; 266 | int rpos = -1; 267 | for (int i = 0; i < align.alignment.length; i++) { 268 | char sym = align.alignment[i]; 269 | if (sym == ' ') { 270 | hpos++; rpos++; 271 | herr[hpos] = false; 272 | rerr[rpos] = false; 273 | ralign[rpos] = hpos; 274 | } else if (sym == 'S') { 275 | hpos++; rpos++; 276 | herr[hpos] = true; 277 | rerr[rpos] = true; 278 | ralign[rpos] = hpos; 279 | } else if (sym == 'I') { 280 | hpos++; 281 | herr[hpos] = true; 282 | } else if (sym == 'D') { 283 | rpos++; 284 | rerr[rpos] = true; 285 | ralign[rpos] = hpos; 286 | } else { 287 | throw new IllegalStateException("Error! Invalid mini align sequence " + sym + " at pos " + i + "\n"); 288 | } 289 | } 290 | } 291 | 292 | // TODO: Don't use Object 293 | private Object[] CalcBestShift(String[] cur, 294 | String[] hyp, String[] ref, 295 | Map, Set> rloc, Alignment med_align, 296 | CostFunction costfunc) { 297 | /* 298 | return null if no good shift is found 299 | or return Object[ TERshift bestShift, 300 | TERalignment cur_align ] 301 | */ 302 | Object[] to_return = new Object[2]; 303 | 304 | boolean anygain = false; 305 | 306 | /* Arrays that records which hyp and ref words are currently wrong */ 307 | boolean[] herr = new boolean[hyp.length]; 308 | boolean[] rerr = new boolean[ref.length]; 309 | /* Array that records the alignment between ref and hyp */ 310 | int[] ralign = new int[ref.length]; 311 | FindAlignErr(med_align, herr, rerr, ralign); 312 | 313 | Shift[][] poss_shifts = GatherAllPossShifts(cur, ref, rloc, med_align, herr, rerr, ralign, costfunc); 314 | double curerr = med_align.numEdits; 315 | 316 | if (DEBUG) { 317 | // CUT HERE 318 | System.out.println("Possible Shifts:"); 319 | for (int i = poss_shifts.length - 1; i >= 0; i--) { 320 | for (int j = 0; j < poss_shifts[i].length; j++) { 321 | System.out.println(" [" + i + "] " + poss_shifts[i][j]); 322 | } 323 | } 324 | System.out.println(""); 325 | // CUT HERE 326 | } 327 | 328 | double cur_best_shift_cost = 0.0; 329 | Alignment cur_best_align = med_align; 330 | Shift cur_best_shift = new Shift(); 331 | 332 | for (int i = poss_shifts.length - 1; i >= 0; i--) { 333 | if (DEBUG) System.out.println("Considering shift of length " + i + " (" + poss_shifts[i].length +")"); 334 | 335 | /* Consider shifts of length i+1 */ 336 | double curfix = curerr - 337 | (cur_best_shift_cost + cur_best_align.numEdits); 338 | double maxfix = (2 * (1 + i)); 339 | 340 | if ((curfix > maxfix) || 341 | ((cur_best_shift_cost != 0) && (curfix == maxfix))) { 342 | break; 343 | } 344 | 345 | for (int s = 0; s < poss_shifts[i].length; s++) { 346 | curfix = curerr - 347 | (cur_best_shift_cost + cur_best_align.numEdits); 348 | if ((curfix > maxfix) || 349 | ((cur_best_shift_cost != 0) && (curfix == maxfix))) { 350 | break; 351 | } 352 | 353 | Shift curshift = poss_shifts[i][s]; 354 | 355 | Pair shiftReturns = PerformShift(cur, curshift); 356 | String[] shiftarr = shiftReturns.first; 357 | IntPair[] curHypSpans = shiftReturns.second; 358 | 359 | Alignment curalign = MinEditDist(shiftarr, ref, costfunc, curHypSpans); 360 | 361 | curalign.hyp = hyp; 362 | curalign.ref = ref; 363 | curalign.aftershift = shiftarr; 364 | 365 | double gain = (cur_best_align.numEdits + cur_best_shift_cost) 366 | - (curalign.numEdits + curshift.cost); 367 | 368 | if (DEBUG) { 369 | System.out.println("Gain for " + curshift + " is " + gain + ". (result: [" + StringUtils.join(" ", shiftarr) + "]"); 370 | System.out.println("" + curalign + "\n"); 371 | } 372 | 373 | if ((gain > 0) || ((cur_best_shift_cost == 0) && (gain == 0))) { 374 | anygain = true; 375 | cur_best_shift = curshift; 376 | cur_best_shift_cost = curshift.cost; 377 | cur_best_align = curalign; 378 | if (DEBUG) System.out.println("Tmp Choosing shift: " + cur_best_shift + " gives:\n" + cur_best_align + "\n"); 379 | } 380 | 381 | } 382 | } 383 | 384 | if (anygain) { 385 | to_return[0] = cur_best_shift; 386 | to_return[1] = cur_best_align; 387 | return to_return; 388 | } else { 389 | if (DEBUG) System.out.println("No good shift found.\n"); 390 | return null; 391 | } 392 | } 393 | 394 | private Shift[][] GatherAllPossShifts(String[] hyp, String[] ref, Map, Set> rloc, 395 | Alignment align, 396 | boolean[] herr, boolean[] rerr, int[] ralign, CostFunction costfunc) { 397 | 398 | // Don't even bother to look if shifts can't be done 399 | if ((MAX_SHIFT_SIZE <= 0) || (MAX_SHIFT_DIST <= 0)) { 400 | Shift[][] to_return = new Shift[0][]; 401 | return to_return; 402 | } 403 | 404 | 405 | ArrayList[] allshifts = new ArrayList[MAX_SHIFT_SIZE+1]; 406 | for (int i = 0; i < allshifts.length; i++) 407 | allshifts[i] = new ArrayList(); 408 | 409 | List hyplist = Arrays.asList(hyp); 410 | for (int start = 0; start < hyp.length; start++) { 411 | if (! rloc.containsKey(hyplist.subList(start, start+1))) continue; 412 | 413 | boolean ok = false; 414 | Iterator mti = rloc.get(hyplist.subList(start, start+1)).iterator(); 415 | while (mti.hasNext() && (! ok)) { 416 | int moveto = mti.next(); 417 | if ((start != ralign[moveto]) && 418 | (ralign[moveto] - start <= MAX_SHIFT_DIST) && 419 | ((start - ralign[moveto] - 1) <= MAX_SHIFT_DIST)) 420 | ok = true; 421 | } 422 | if (! ok) continue; 423 | 424 | ok = true; 425 | for (int end = start; (ok && (end < hyp.length) && (end < start + MAX_SHIFT_SIZE)); 426 | end++) { 427 | 428 | /* check if cand is good if so, add it */ 429 | List cand = hyplist.subList(start, end+1); 430 | ok = false; 431 | if (! (rloc.containsKey(cand))) continue; 432 | 433 | boolean any_herr = false; 434 | for (int i = 0; (i <= end - start) && (! any_herr); i++) { 435 | if (herr[start+i]) any_herr = true; 436 | } 437 | 438 | if (any_herr == false) { 439 | ok = true; 440 | continue; 441 | } 442 | 443 | Iterator movetoit = rloc.get(cand).iterator(); 444 | while (movetoit.hasNext()) { 445 | int moveto = movetoit.next(); 446 | if (! ((ralign[moveto] != start) && 447 | ((ralign[moveto] < start) || (ralign[moveto] > end)) && 448 | ((ralign[moveto] - start) <= MAX_SHIFT_DIST) && 449 | ((start - ralign[moveto]) <= MAX_SHIFT_DIST))) 450 | continue; 451 | ok = true; 452 | 453 | /* check to see if there are any errors in either string 454 | (only move if this is the case!) 455 | */ 456 | 457 | boolean any_rerr = false; 458 | for (int i = 0; (i <= end - start) && (! any_rerr); i++) { 459 | if (rerr[moveto+i]) any_rerr = true; 460 | } 461 | if (! any_rerr) continue; 462 | 463 | for (int roff = -1; roff <= (end - start); roff++) { 464 | Shift topush = null; 465 | if ((roff == -1) && (moveto == 0)) { 466 | if (DEBUG) System.out.println("Consider making " + start + "..." + end + " moveto: " + moveto + " roff: " 467 | + roff + " ralign[mt+roff]: " + -1); 468 | 469 | topush = new Shift(start, end, -1, -1); 470 | } else if ((start != ralign[moveto+roff]) && 471 | ((roff == 0) || 472 | (ralign[moveto+roff] != ralign[moveto]))) { 473 | int newloc = ralign[moveto+roff]; 474 | if (DEBUG) System.out.println("Consider making " + start + "..." + end + " moveto: " + moveto + " roff: " 475 | + roff + " ralign[mt+roff]: " + newloc); 476 | 477 | // if (newloc != start + 1) { 478 | topush = new Shift(start, end, moveto+roff, newloc); 479 | // } 480 | } 481 | if (topush != null) { 482 | topush.shifted = cand; 483 | topush.cost = costfunc.shift_cost(topush); 484 | allshifts[end - start].add(topush); 485 | } 486 | } 487 | } 488 | } 489 | } 490 | 491 | Shift[][] to_return = new Shift[MAX_SHIFT_SIZE+1][]; 492 | for (int i = 0; i < to_return.length; i++) { 493 | to_return[i] = (Shift[]) allshifts[i].toArray(new Shift[allshifts[i].size()]); 494 | } 495 | return to_return; 496 | } 497 | 498 | public Pair PerformShift(String[] words, Shift s) { 499 | return PerformShift(words, s.start, s.end, s.newloc); 500 | } 501 | 502 | private Pair PerformShift(String[] words, int start, int end, int newloc) { 503 | int c = 0; 504 | String[] nwords = words.clone(); 505 | IntPair[] spans = null; 506 | 507 | if(hypSpans != null) spans = new IntPair[hypSpans.length]; 508 | if(DEBUG) { 509 | if (hypSpans != null) { 510 | System.out.println("word length: " + words.length + " span length: " + hypSpans.length); 511 | } else { 512 | System.out.println("word length: " + words.length + " span length: null"); 513 | } 514 | } 515 | 516 | if (newloc == -1) { 517 | for (int i = start; i<=end;i++) { nwords[c++] = words[i]; if(hypSpans != null) spans[c-1] = hypSpans[i]; } 518 | for (int i = 0; i<=start-1;i++) { nwords[c++] = words[i]; if(hypSpans != null) spans[c-1] = hypSpans[i]; } 519 | for (int i = end+1; i end) { 526 | for (int i = 0; i<=start-1; i++) { nwords[c++] = words[i]; if(hypSpans != null) spans[c-1] = hypSpans[i]; } 527 | for (int i = end+1; i<=newloc;i++) { nwords[c++] = words[i]; if(hypSpans != null) spans[c-1] = hypSpans[i]; } 528 | for (int i = start; i<=end;i++) { nwords[c++] = words[i]; if(hypSpans != null) spans[c-1] = hypSpans[i]; } 529 | for (int i = newloc+1; i(nwords, spans); 540 | } 541 | 542 | private Alignment MinEditDist(String[] hyp, String[] ref, 543 | CostFunction costfunc, IntPair[] curHypSpans) { 544 | double current_best = INF; 545 | double last_best = INF; 546 | int first_good = 0; 547 | int current_first_good = 0; 548 | int last_good = -1; 549 | int cur_last_good = 0; 550 | @SuppressWarnings("unused") int last_peak = 0; 551 | int cur_last_peak = 0; 552 | int i, j; 553 | double cost, icost, dcost; 554 | double score; 555 | 556 | @SuppressWarnings("unused") int hwsize = hyp.length-1; 557 | @SuppressWarnings("unused") int rwsize = ref.length-1; 558 | 559 | final int size = Math.max(ref.length, hyp.length) + 1; 560 | final double[][] S = new double[size][size]; 561 | final char[][] P = new char[size][size]; 562 | 563 | NUM_BEAM_SEARCH_CALLS++; 564 | 565 | for (i=0; i <= ref.length; i++){ 566 | for (j=0; j <= hyp.length; j++){ 567 | S[i][j]=-1.0; 568 | P[i][j]='0'; 569 | } 570 | } 571 | S[0][0] = 0.0; 572 | 573 | for (j=0; j <= hyp.length; j++) { 574 | last_best = current_best; 575 | current_best = INF; 576 | 577 | first_good = current_first_good; 578 | current_first_good = -1; 579 | 580 | last_good = cur_last_good; 581 | cur_last_good = -1; 582 | 583 | last_peak = cur_last_peak; 584 | cur_last_peak = 0; 585 | 586 | for (i=first_good; i <= ref.length; i++) { 587 | if (i > last_good) 588 | break; 589 | if (S[i][j] < 0) 590 | continue; 591 | score = S[i][j]; 592 | 593 | if ((j < hyp.length) && (score > last_best+BEAM_WIDTH)) 594 | continue; 595 | 596 | if (current_first_good == -1) 597 | current_first_good = i ; 598 | 599 | if ((i < ref.length) && (j < hyp.length)) { 600 | if(refSpans == null || hypSpans == null || 601 | spanIntersection(refSpans[i], curHypSpans[j])) { 602 | if (ref[i].equals(hyp[j])) { 603 | cost = costfunc.match_cost(hyp[j], ref[i]) + score; 604 | if ((S[i+1][j+1] == -1) || (cost < S[i+1][j+1])) { 605 | S[i+1][j+1] = cost; 606 | P[i+1][j+1] = ' '; 607 | } 608 | if (cost < current_best) 609 | current_best = cost; 610 | 611 | if (current_best == cost) 612 | cur_last_peak = i+1; 613 | } else { 614 | cost = costfunc.substitute_cost(hyp[j], ref[i]) + score; 615 | if ((S[i+1][j+1] <0) || (cost < S[i+1][j+1])) { 616 | S[i+1][j+1] = cost; 617 | P[i+1][j+1] = 'S'; 618 | if (cost < current_best) 619 | current_best = cost; 620 | if (current_best == cost) 621 | cur_last_peak = i+1 ; 622 | } 623 | } 624 | } 625 | } 626 | 627 | cur_last_good = i+1; 628 | 629 | if (j < hyp.length) { 630 | icost = score+costfunc.insert_cost(hyp[j]); 631 | if ((S[i][j+1] < 0) || (S[i][j+1] > icost)) { 632 | S[i][j+1] = icost; 633 | P[i][j+1] = 'I'; 634 | if (( cur_last_peak < i) && ( current_best == icost)) 635 | cur_last_peak = i; 636 | } 637 | } 638 | 639 | if (i < ref.length) { 640 | dcost = score + costfunc.delete_cost(ref[i]); 641 | if ((S[ i+1][ j]<0.0) || ( S[i+1][j] > dcost)) { 642 | S[i+1][j] = dcost; 643 | P[i+1][j] = 'D'; 644 | if (i >= last_good) 645 | last_good = i + 1 ; 646 | } 647 | } 648 | } 649 | } 650 | 651 | int tracelength = 0; 652 | i = ref.length; 653 | j = hyp.length; 654 | while ((i > 0) || (j > 0)) { 655 | tracelength++; 656 | if (P[i][j] == ' ') { 657 | i--; j--; 658 | } else if (P[i][j] == 'S') { 659 | i--; j--; 660 | } else if (P[i][j] == 'D') { 661 | i--; 662 | } else if (P[i][j] == 'I') { 663 | j--; 664 | } else { 665 | throw new IllegalStateException("Invalid path: " + P[i][j]); 666 | } 667 | } 668 | char[] path = new char[tracelength]; 669 | i = ref.length; 670 | j = hyp.length; 671 | while ((i > 0) || (j > 0)) { 672 | path[--tracelength] = P[i][j]; 673 | if (P[i][j] == ' ') { 674 | i--; j--; 675 | } else if (P[i][j] == 'S') { 676 | i--; j--; 677 | } else if (P[i][j] == 'D') { 678 | i--; 679 | } else if (P[i][j] == 'I') { 680 | j--; 681 | } 682 | } 683 | 684 | Alignment to_return = new Alignment(); 685 | to_return.numWords = ref.length; 686 | to_return.alignment = path; 687 | to_return.numEdits = S[ref.length][hyp.length]; 688 | 689 | return to_return; 690 | } 691 | 692 | // private static boolean spanIntersection (String refSpan, 693 | // String hypSpan) { 694 | // String[] hSpans = hypSpan.split(":"); 695 | // String[] rSpans = refSpan.split(":"); 696 | // 697 | // return (Integer.valueOf(rSpans[1]) >= Integer.valueOf(hSpans[0]) && 698 | // Integer.valueOf(rSpans[0]) <= Integer.valueOf(hSpans[1])); 699 | // } 700 | 701 | private static boolean spanIntersection (IntPair refSpan, 702 | IntPair hypSpan) { 703 | return (refSpan.cdr >= hypSpan.car && 704 | refSpan.car <= hypSpan.cdr); 705 | } 706 | 707 | /* Accessor functions to some internal counters */ 708 | public int numBeamCalls () { return NUM_BEAM_SEARCH_CALLS; } 709 | public int numSegsScored () { return NUM_SEGMENTS_SCORED; } 710 | public int numShiftsTried () { return NUM_SHIFTS_CONSIDERED; } 711 | 712 | /* We may want to add some function to change the beam width */ 713 | public int BEAM_WIDTH = 20; 714 | 715 | private static final double INF = 999999.0; 716 | 717 | private final int MAX_SHIFT_SIZE = 10; 718 | private int MAX_SHIFT_DIST = 50; 719 | 720 | /* Variables for some internal counting. */ 721 | private int NUM_SEGMENTS_SCORED = 0; 722 | private int NUM_SHIFTS_CONSIDERED = 0; 723 | private int NUM_BEAM_SEARCH_CALLS = 0; 724 | 725 | 726 | } 727 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | -------------------------------------------------------------------------------- /src/ter/io/SgmlProcessor.java: -------------------------------------------------------------------------------- 1 | package ter.io; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.text.SimpleDateFormat; 9 | import java.util.ArrayList; 10 | import java.util.Calendar; 11 | import java.util.HashMap; 12 | import java.util.Iterator; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import javax.xml.parsers.DocumentBuilder; 19 | import javax.xml.parsers.DocumentBuilderFactory; 20 | import javax.xml.parsers.ParserConfigurationException; 21 | 22 | import org.w3c.dom.Document; 23 | import org.w3c.dom.Element; 24 | import org.w3c.dom.NamedNodeMap; 25 | import org.w3c.dom.Node; 26 | import org.w3c.dom.NodeList; 27 | import org.xml.sax.SAXException; 28 | import org.xml.sax.SAXParseException; 29 | 30 | import ter.core.Alignment; 31 | import ter.core.Shift; 32 | 33 | public class SgmlProcessor { 34 | public static enum TAGNAME { 35 | TSTSET, REFSET, DOC, AUDIOFILE, SEG, HYP, WORD, HL, P, UNKNOWN; 36 | } 37 | 38 | public TAGNAME tag_type(String name) { 39 | TAGNAME val = sgm_tag_names.get(name.toLowerCase()); 40 | if(val == null) 41 | return TAGNAME.UNKNOWN; 42 | else 43 | return val; 44 | } 45 | 46 | public static enum ATTRNAME { 47 | SETID, SRCLANG, TRGLANG, DOCID, FILEID, SYSID, ID, SEGID, START, END, RANK, UNKNOWN; 48 | } 49 | 50 | public ATTRNAME attr_type(String name) { 51 | ATTRNAME val = sgm_attr_names.get(name.toLowerCase()); 52 | if(val == null) 53 | return ATTRNAME.UNKNOWN; 54 | else 55 | return val; 56 | } 57 | 58 | public SgmlProcessor() { 59 | factory = null; 60 | builder = null; 61 | 62 | last_doc = ""; 63 | last_seg = ""; 64 | is_first_doc = true; 65 | is_first_seg = true; 66 | 67 | sgm_tag_names = new HashMap(); 68 | sgm_tag_names.put("tstset", TAGNAME.TSTSET); 69 | sgm_tag_names.put("refset", TAGNAME.REFSET); 70 | sgm_tag_names.put("doc", TAGNAME.DOC); 71 | sgm_tag_names.put("audiofile", TAGNAME.AUDIOFILE); 72 | sgm_tag_names.put("hyp", TAGNAME.HYP); 73 | sgm_tag_names.put("seg", TAGNAME.SEG); 74 | sgm_tag_names.put("word", TAGNAME.WORD); 75 | sgm_tag_names.put("p", TAGNAME.P); 76 | sgm_tag_names.put("hl", TAGNAME.HL); 77 | 78 | sgm_attr_names = new HashMap(); 79 | sgm_attr_names.put("setid", ATTRNAME.SETID); 80 | sgm_attr_names.put("srclang", ATTRNAME.SRCLANG); 81 | sgm_attr_names.put("trglang", ATTRNAME.TRGLANG); 82 | sgm_attr_names.put("docid", ATTRNAME.DOCID); 83 | sgm_attr_names.put("fileid", ATTRNAME.FILEID); 84 | sgm_attr_names.put("sysid", ATTRNAME.SYSID); 85 | sgm_attr_names.put("id", ATTRNAME.ID); 86 | sgm_attr_names.put("segid", ATTRNAME.SEGID); 87 | sgm_attr_names.put("start", ATTRNAME.START); 88 | sgm_attr_names.put("end", ATTRNAME.END); 89 | } 90 | 91 | 92 | /******************************************************** 93 | * Name: parse 94 | * Desc: It tries to parse the file with the name given 95 | * first, as XML 96 | * then, as SGML if we failed to parse it as XML 97 | * last, return document if successful or null. 98 | ********************************************************/ 99 | public Document parse(String fn) { 100 | document = null; 101 | factory = DocumentBuilderFactory.newInstance(); 102 | factory.setValidating(false); 103 | factory.setNamespaceAware(true); 104 | 105 | try { 106 | builder = factory.newDocumentBuilder(); 107 | 108 | builder.setErrorHandler( 109 | new org.xml.sax.ErrorHandler() { 110 | // ignore fatal errors (an exception is guaranteed) 111 | public void fatalError(SAXParseException sxe) 112 | throws SAXException { 113 | /* 114 | System.out.println("** SAXException" 115 | + ", line " + sxe.getLineNumber() 116 | + ", uri " + sxe.getSystemId()); 117 | System.out.println(" " + sxe.getMessage()); 118 | */ 119 | } 120 | 121 | // treat validation errors as fatal 122 | public void error(SAXParseException spe) 123 | throws SAXParseException 124 | { 125 | System.out.println("** SAXParseException" 126 | + ", line " + spe.getLineNumber() 127 | + ", uri " + spe.getSystemId()); 128 | System.out.println(" " + spe.getMessage()); 129 | // throw e; 130 | } 131 | 132 | // dump warnings too 133 | public void warning(SAXParseException warn) 134 | throws SAXParseException 135 | { 136 | System.out.println("** Warning" 137 | + ", line " + warn.getLineNumber() 138 | + ", uri " + warn.getSystemId()); 139 | System.out.println(" " + warn.getMessage()); 140 | } 141 | } 142 | ); 143 | 144 | 145 | document = builder.parse(new File(fn)); 146 | System.out.println("\"" + fn + "\" was successfully parsed as XML"); 147 | } catch (SAXParseException spe) { 148 | buildDom(fn); 149 | if(document != null) 150 | System.out.println("\"" + fn + "\" was successfully parsed as SGML"); 151 | } catch (SAXException sxe) { 152 | System.out.println("** SAXExceptionError "); 153 | sxe.printStackTrace(); 154 | } catch (ParserConfigurationException pce) { 155 | System.out.println("** PCError "); 156 | pce.printStackTrace(); 157 | } catch (IOException ioe) { 158 | System.out.println("** IOError "); 159 | ioe.printStackTrace(); 160 | } 161 | return document; 162 | } 163 | 164 | /******************************************************** 165 | * Name: loadSegs 166 | * Desc: Loads the segments from the document objects. 167 | * This document object assumes the existence of sysid 168 | * field, which is ususally in reference file. 169 | * Segment text and sysids are put into the given hashmaps. 170 | ********************************************************/ 171 | public void loadSegs(Node node, Map> segs, Map> ids) { 172 | int type = node.getNodeType(); 173 | is_ref = true; 174 | 175 | switch(type) { 176 | case Node.DOCUMENT_NODE: 177 | loadSegs(((Document) node).getDocumentElement(), segs, ids); 178 | break; 179 | case Node.ELEMENT_NODE: 180 | loadSegsElement(node, segs); 181 | NodeList children = node.getChildNodes(); 182 | if(children != null) { 183 | int len = children.getLength(); 184 | for(int i = 0; i < len; ++i) 185 | loadSegs(children.item(i), segs, ids); 186 | } 187 | break; 188 | case Node.ENTITY_REFERENCE_NODE: 189 | break; 190 | case Node.CDATA_SECTION_NODE: 191 | break; 192 | case Node.TEXT_NODE: 193 | String text = node.getNodeValue().trim(); 194 | if(id != null && text != null) { 195 | List val1 = segs.get(id); 196 | List val2 = ids.get(id); 197 | List al1, al2; 198 | 199 | if(val1 == null) { 200 | al1 = new ArrayList(6); 201 | al2 = new ArrayList(6); 202 | al1.add(text); 203 | al2.add(sys_id); 204 | segs.put(id, al1); 205 | ids.put(id, al2); 206 | } else { 207 | al1 = val1; 208 | al2 = val2; 209 | al1.add(text); 210 | al2.add(sys_id); 211 | } 212 | 213 | } 214 | 215 | id = null; 216 | break; 217 | case Node.PROCESSING_INSTRUCTION_NODE: 218 | break; 219 | } 220 | } 221 | 222 | /******************************************************** 223 | * Name: loadSegs 224 | * Desc: Loads the segments from the document objects. 225 | * This document object do NOT assumes the existence of sysid 226 | * field. 227 | * Segment text and sysids are put into the given hashmaps. 228 | ********************************************************/ 229 | public void loadSegs(Node node, Map> segs) { 230 | int type = node.getNodeType(); 231 | 232 | is_ref = false; 233 | switch(type) { 234 | case Node.DOCUMENT_NODE: 235 | loadSegs(((Document) node).getDocumentElement(), segs); 236 | break; 237 | case Node.ELEMENT_NODE: 238 | loadSegsElement(node, segs); 239 | NodeList children = node.getChildNodes(); 240 | if(children != null) { 241 | int len = children.getLength(); 242 | for(int i = 0; i < len; ++i) 243 | loadSegs(children.item(i), segs); 244 | } 245 | break; 246 | case Node.ENTITY_REFERENCE_NODE: 247 | break; 248 | case Node.CDATA_SECTION_NODE: 249 | 250 | break; 251 | case Node.TEXT_NODE: 252 | String text = node.getNodeValue().trim(); 253 | if(id != null && text != null) { 254 | List val = segs.get(id); 255 | List al; 256 | 257 | if(val == null) { 258 | al = new ArrayList(6); 259 | al.add(text); 260 | segs.put(id, al); 261 | } else { 262 | al = val; 263 | al.add(text); 264 | } 265 | 266 | } 267 | 268 | id = null; 269 | break; 270 | case Node.PROCESSING_INSTRUCTION_NODE: 271 | break; 272 | } 273 | } 274 | 275 | /******************************************************** 276 | * Name: writeXMLHeader 277 | * Desc: Writes the header information to the given XML file. 278 | ********************************************************/ 279 | public void writeXMLHeader(BufferedWriter xml, 280 | String hyp_fn, 281 | String ref_fn, 282 | boolean caseon) { 283 | 284 | Calendar now = Calendar.getInstance(); 285 | SimpleDateFormat formatter = new SimpleDateFormat("hh:mm:ss, E, MM dd yyyy"); 286 | 287 | try { 288 | xml.write(""); 289 | xml.newLine(); 290 | xml.write(""); 295 | else 296 | xml.write("\" case_sense=\"0\">"); 297 | xml.newLine(); 298 | 299 | xml.write(""); 303 | xml.newLine(); 304 | } catch (IOException ioe) { 305 | System.out.println(ioe); 306 | return; 307 | } 308 | } 309 | 310 | /******************************************************** 311 | * Name: writeXMLAlignment 312 | * Desc: Writes the detailed alignment for each word of the 313 | * segments. 314 | ********************************************************/ 315 | public void writeXMLAlignment(BufferedWriter xml, 316 | Alignment result, 317 | String id, 318 | boolean istrans) { 319 | HashMap cur_ids = parseID(id, istrans); 320 | if(istrans && is_first_doc) 321 | try { 322 | xml.write(" "); 323 | is_first_doc = false; 324 | } catch (IOException ioe) { 325 | System.out.println(ioe); 326 | return; 327 | } 328 | if(cur_ids != null) { 329 | String cur_rank = cur_ids.get(ATTRNAME.RANK); 330 | if(newDoc(cur_ids)) writeXMLDoc(xml, cur_ids); 331 | else if(newSeg(cur_ids)) writeXMLSeg(xml, cur_ids); 332 | try { 333 | xml.write(" "); 338 | writeXMLAlignmentDetails(xml, result.ref, result.aftershift, result.alignment, result.allshifts); 339 | xml.write(" "); 340 | } catch (IOException ioe) { 341 | System.out.println(ioe); 342 | return; 343 | } 344 | } 345 | } 346 | 347 | /******************************************************** 348 | * Name: writeXMLFooter 349 | * Desc: Closes the tags for the given XML file. 350 | ********************************************************/ 351 | public void writeXMLFooter(BufferedWriter xml) { 352 | try { 353 | xml.newLine(); 354 | xml.write(" "); 355 | xml.newLine(); 356 | xml.write(" "); 357 | xml.newLine(); 358 | xml.write(""); 359 | xml.newLine(); 360 | xml.write(""); 361 | } catch (IOException ioe) { 362 | System.out.println(ioe); 363 | return; 364 | } 365 | } 366 | 367 | /******************************************************** 368 | * Name: join 369 | * Desc: Joins the elements of the given array list with 370 | * the given delimeter. 371 | ********************************************************/ 372 | public static String join(String delim, 373 | List al) { 374 | String ret = ""; 375 | if(al != null) { 376 | for(int i = 0; i < al.size() - 1; ++i) 377 | ret += al.get(i) + delim; 378 | ret += al.get(al.size() - 1); 379 | } 380 | return ret; 381 | } 382 | 383 | /********************************** 384 | * Private Attributes & Functions * 385 | **********************************/ 386 | private String cur_set; 387 | private String src_lang; 388 | private String trg_lang; 389 | private String doc_id; 390 | private String sys_id; 391 | private String seg_id; 392 | @SuppressWarnings("unused") private String hyp_rank; 393 | private String id; 394 | private Map sgm_attr_names; 395 | private Map sgm_tag_names; 396 | private boolean is_ref; 397 | private String last_doc; 398 | private String last_seg; 399 | private String hyp_set; 400 | private boolean is_first_doc; 401 | private boolean is_first_seg; 402 | 403 | // patterns to pre-process sgml 404 | private Pattern start_tag_i; 405 | private Pattern end_tag_i; 406 | private Pattern start_end_tag_i; 407 | private Pattern no_tag_i; 408 | private Pattern attr_i; 409 | private boolean is_sgml; 410 | private Document document; 411 | private DocumentBuilderFactory factory; 412 | private DocumentBuilder builder; 413 | private Element cur_element; 414 | 415 | /******************************************************** 416 | * Name: newDoc 417 | * Desc: Test if the given id is from a new document. 418 | ********************************************************/ 419 | private boolean newDoc(HashMap cur_ids) { 420 | Object val = cur_ids.get(ATTRNAME.DOCID); 421 | String cur_doc = (String) val; 422 | if(cur_doc.equalsIgnoreCase(last_doc)) 423 | return false; 424 | else { 425 | last_doc = cur_doc; 426 | return true; 427 | } 428 | } 429 | 430 | /******************************************************** 431 | * Name: newSeg 432 | * Desc: Test if the given id is from a new Segment. 433 | ********************************************************/ 434 | private boolean newSeg(HashMap cur_ids) { 435 | Object val = cur_ids.get(ATTRNAME.SEGID); 436 | String cur_seg = (String) val; 437 | if(cur_seg.equalsIgnoreCase(last_seg)) 438 | return false; 439 | else { 440 | last_seg = cur_seg; 441 | return true; 442 | } 443 | } 444 | 445 | /******************************************************** 446 | * Name: parseID 447 | * Desc: Extracts the docid, segid and rank from the given id. 448 | ********************************************************/ 449 | private HashMap parseID(String id, 450 | boolean istrans) { 451 | HashMap ret = new HashMap(); 452 | String[] sl = id.split(":+"); 453 | 454 | if(istrans) { 455 | if(sl.length < 2) { 456 | ret.put(ATTRNAME.RANK, ""); 457 | ret.put(ATTRNAME.SEGID, id); 458 | } else { 459 | ret.put(ATTRNAME.RANK, sl[1]); 460 | ret.put(ATTRNAME.SEGID, sl[0]); 461 | } 462 | ret.put(ATTRNAME.DOCID, ""); 463 | } else { 464 | 465 | if(sl.length < 2) { 466 | System.out.println("** Error: Invalid id"); 467 | return null; 468 | } else { 469 | ret.put(ATTRNAME.RANK, sl[1]); 470 | String s = sl[0].replaceAll("\\[", ""); 471 | String[] sl1 = s.split("\\]+"); 472 | if(sl1.length < 2) { 473 | System.out.println("** Error: Invalid id"); 474 | return null; 475 | } else { 476 | ret.put(ATTRNAME.DOCID, sl1[0]); 477 | try { 478 | ret.put(ATTRNAME.SEGID, ""+Integer.valueOf(sl1[1])); 479 | } catch (NumberFormatException nfe) { 480 | ret.put(ATTRNAME.SEGID, sl1[1]); 481 | } 482 | } 483 | } 484 | } 485 | return ret; 486 | } 487 | 488 | /******************************************************** 489 | * Name: writeXMLDoc 490 | * Desc: Writes the tags for the new document. 491 | ********************************************************/ 492 | private void writeXMLDoc(BufferedWriter xml, 493 | HashMap cur_ids) { 494 | try { 495 | if(!is_first_doc) { 496 | xml.newLine(); 497 | xml.write(" "); 498 | xml.newLine(); 499 | xml.write(" "); 500 | xml.newLine(); 501 | } 502 | is_first_doc = false; 503 | is_first_seg = false; 504 | String cur_doc = cur_ids.get(ATTRNAME.DOCID); 505 | String cur_seg = cur_ids.get(ATTRNAME.SEGID); 506 | xml.write(" "); 507 | xml.newLine(); 508 | xml.write(" "); 509 | xml.newLine(); 510 | } catch (IOException ioe) { 511 | System.out.println(ioe); 512 | return; 513 | } 514 | } 515 | 516 | /******************************************************** 517 | * Name: writeXMLSeg 518 | * Desc: Writes the tags for the new segment. 519 | ********************************************************/ 520 | private void writeXMLSeg(BufferedWriter xml, 521 | HashMap cur_ids) { 522 | try { 523 | if(!is_first_seg) { 524 | xml.newLine(); 525 | xml.write(" "); 526 | } 527 | xml.newLine(); 528 | is_first_seg = false; 529 | String cur_seg = cur_ids.get(ATTRNAME.SEGID); 530 | xml.write(" "); 531 | xml.newLine(); 532 | } catch (IOException ioe) { 533 | System.out.println(ioe); 534 | return; 535 | } 536 | } 537 | 538 | /******************************************************** 539 | * Name: writeXMLAlignmentDetails 540 | * Desc: Writes the alignment information for the given 541 | * segments. They can be I, S, D and ' '. 542 | ********************************************************/ 543 | private void writeXMLAlignmentDetails(BufferedWriter xml, 544 | String[] ref, 545 | String[] hyp, 546 | char[] alignment, 547 | Shift[] allshifts) { 548 | HashMap> align_info = new HashMap>(); 549 | ArrayList shift_dists = new ArrayList(); 550 | int anum = 1; 551 | int ind_start = 0; 552 | int ind_end = 1; 553 | int ind_from = 2; 554 | int ind_in = 3; 555 | int ostart, oend, odest; 556 | int slen = 0; 557 | int nstart, nend, nfrom, dist; 558 | int non_inserr = 0; 559 | 560 | if(allshifts != null) { 561 | for(int i = 0; i < allshifts.length; ++i) { 562 | ostart = allshifts[i].start; 563 | oend = allshifts[i].end; 564 | odest = allshifts[i].newloc; 565 | slen = allshifts[i].size(); 566 | 567 | if(odest >= oend) { 568 | // right 569 | nstart = odest + 1 - slen; 570 | nend = nstart + slen - 1; 571 | nfrom = ostart; 572 | dist = odest - oend; 573 | } else { 574 | // left 575 | nstart = odest + 1; 576 | nend = nstart + slen - 1; 577 | nfrom = ostart + slen; 578 | dist = (ostart - odest -1) * -1; 579 | } 580 | 581 | 582 | shift_dists.add(dist); 583 | 584 | List val = align_info.get(nstart + "-" + ind_start); 585 | if(val == null) { 586 | List al = new ArrayList(); 587 | al.add(anum); 588 | align_info.put(nstart + "-" + ind_start, al); 589 | } else { 590 | List al = val; 591 | al.add(anum); 592 | } 593 | 594 | val = align_info.get(nend + "-" + ind_end); 595 | if(val == null) { 596 | List al = new ArrayList(); 597 | al.add(anum); 598 | align_info.put(nend + "-" + ind_end, al); 599 | } else { 600 | ArrayList al = (ArrayList) val; 601 | al.add(anum); 602 | } 603 | 604 | val = align_info.get(nfrom + "-" + ind_from); 605 | if(val == null) { 606 | List al = new ArrayList(); 607 | al.add(anum); 608 | align_info.put(nfrom + "-" + ind_from, al); 609 | } else { 610 | List al = (ArrayList) val; 611 | al.add(anum); 612 | } 613 | // System.out.println("nstart: " + nstart + ", nend:" + nend); 614 | if(slen > 0) { 615 | for(int j = nstart; j <= nend; ++j) { 616 | val = align_info.get(j + "-" + ind_in); 617 | if(val == null) { 618 | ArrayList al = new ArrayList(); 619 | al.add(anum); 620 | align_info.put(j + "-" + ind_in, al); 621 | } else { 622 | ArrayList al = (ArrayList) val; 623 | al.add(anum); 624 | } 625 | } 626 | } 627 | anum++; 628 | } 629 | } 630 | 631 | int hyp_idx = 0; 632 | int ref_idx = 0; 633 | if(alignment != null) { 634 | for(int i = 0; i < alignment.length; ++i) { 635 | String shift_in_str = ""; 636 | if(alignment[i] != 'D') { 637 | if(slen > 0) { 638 | List val = align_info.get(hyp_idx + "-" + ind_in); 639 | if(val != null) 640 | shift_in_str = join(",", val); 641 | // if(val != null) System.out.println("shiftstr: " + ref_idx + "," + hyp_idx + "-" + ind_in + ":" + shift_in_str); 642 | } 643 | } 644 | switch (alignment[i]) { 645 | case ' ': 646 | try { 647 | if(i == 0) xml.newLine(); 648 | if(!ref[ref_idx].equals(hyp[hyp_idx])) 649 | System.out.println("* Error: unmatch found, " + ref[ref_idx] + 650 | " vs. " + hyp[hyp_idx]); 651 | xml.write(" "); 652 | xml.write("\"" + ref[ref_idx] + "\",\"" + 653 | hyp[hyp_idx] + "\"," + "C,"); 654 | // System.out.println("hyp[" + hyp_idx + "] ref[" + ref_idx + "]:"+shift_in_str + ","+ ref[ref_idx] + "," + hyp[hyp_idx]); 655 | if(shift_in_str.equalsIgnoreCase("")) 656 | xml.write("0"); 657 | else { 658 | List val = align_info.get(hyp_idx + "-" + ind_in); 659 | if(val != null) { 660 | List al = val; 661 | // System.out.println(hyp_idx + "," + ref_idx + ":" + al.get(0) + "-" + shift_dists.get((Integer)al.get(0) - 1)); 662 | xml.write(""+shift_dists.get(al.get(0) - 1)); 663 | } 664 | } 665 | xml.newLine(); 666 | hyp_idx++; 667 | ref_idx++; 668 | non_inserr++; 669 | } catch (IOException ioe) { 670 | System.out.println(ioe); 671 | return; 672 | } 673 | break; 674 | case 'S': 675 | case 'T': 676 | try { 677 | if(i == 0) xml.newLine(); 678 | xml.write(" "); 679 | xml.write("\"" + ref[ref_idx] + "\",\"" + 680 | hyp[hyp_idx] + "\",S,0"); 681 | xml.newLine(); 682 | ref_idx++; 683 | hyp_idx++; 684 | non_inserr++; 685 | } catch (IOException ioe) { 686 | System.out.println(ioe); 687 | return; 688 | } 689 | break; 690 | case 'D': 691 | try { 692 | if(i == 0) xml.newLine(); 693 | xml.write(" "); 694 | xml.write("\"" + ref[ref_idx] + 695 | "\",\"\",D,0"); 696 | xml.newLine(); 697 | ref_idx++; 698 | non_inserr++; 699 | } catch (IOException ioe) { 700 | System.out.println(ioe); 701 | return; 702 | } 703 | break; 704 | case 'I': 705 | try { 706 | // System.out.println("hyp[hyp_idx-1]: " + ((hyp_idx>0)?hyp[hyp_idx-1]:"") + "," + hyp_idx + ", "+ hyp.length); 707 | if(i == 0) xml.newLine(); 708 | xml.write(" "); 709 | xml.write("\"\",\"" + hyp[hyp_idx] + 710 | "\",I,0"); 711 | xml.newLine(); 712 | hyp_idx++; 713 | } catch (IOException ioe) { 714 | System.out.println(ioe); 715 | return; 716 | } 717 | break; 718 | } 719 | } 720 | } 721 | if(non_inserr != ref.length && ref.length > 1) 722 | System.out.println("** Error, unmatch non-insertion erros " + non_inserr + 723 | " and reference length " + ref.length ); 724 | } 725 | 726 | 727 | /******************************************************** 728 | * Name: addDomElement 729 | * Desc: Adds a new element into the document. 730 | ********************************************************/ 731 | private void addDomElement(Element parent, 732 | Element child, 733 | Map content) { 734 | Iterator attrs = (content.keySet()).iterator(); 735 | while(attrs.hasNext()) { 736 | String attr_name = attrs.next(); 737 | String attr_val = content.get(attr_name); 738 | attr_val = attr_val.replaceAll("\"", ""); 739 | child.setAttribute(attr_name, attr_val); 740 | } 741 | parent.appendChild(child); 742 | } 743 | 744 | /******************************************************** 745 | * Name: getAttr 746 | * Desc: Extracts the attributes from the string. 747 | ********************************************************/ 748 | private void getAttr(String inner, 749 | Map content) { 750 | while(inner != "") { 751 | Matcher attr_m = attr_i.matcher(inner); 752 | if(attr_m.matches()) { 753 | String attr_name = attr_m.group(1); 754 | String attr_val = attr_m.group(2); 755 | content.put(attr_name, attr_val); 756 | inner = attr_m.group(3); 757 | } else 758 | inner = ""; 759 | } 760 | } 761 | 762 | /******************************************************** 763 | * Name: getNTag 764 | * Desc: Parse the line to extract the properties of the tag. 765 | * It returns int to indicate different type of tags found: 766 | * 0 - nothing is found. 767 | * 1 - start tag in the form of is found. 768 | * 2 - end tag in the form of is found. 769 | * 3 - both start and end tags are found is found. 770 | ********************************************************/ 771 | private int getNTag(String line, 772 | Tag tag) { 773 | 774 | Matcher start_tag_m = start_tag_i.matcher(line); 775 | Matcher end_tag_m = end_tag_i.matcher(line); 776 | Matcher start_end_tag_m = start_end_tag_i.matcher(line); 777 | Matcher no_tag_m = no_tag_i.matcher(line); 778 | 779 | if(start_end_tag_m.matches()) { 780 | tag.name = start_end_tag_m.group(1); 781 | String inner = start_end_tag_m.group(2); 782 | getAttr(inner, tag.content); 783 | tag.rest = start_end_tag_m.group(3); 784 | 785 | return 3; 786 | } else if (end_tag_m.matches()) { 787 | tag.name = end_tag_m.group(1); 788 | tag.rest = end_tag_m.group(3); 789 | 790 | return 2; 791 | } else if (start_tag_m.matches()) { 792 | tag.name = start_tag_m.group(1); 793 | String inner = start_tag_m.group(2); 794 | getAttr(inner, tag.content); 795 | tag.rest = start_tag_m.group(3); 796 | 797 | return 1; 798 | } else if (no_tag_m.matches()) { 799 | tag.name = ""; 800 | tag.rest = no_tag_m.group(2); 801 | tag.content.put("text", no_tag_m.group(1)); 802 | 803 | return 1; 804 | } else 805 | return 0; 806 | } 807 | 808 | /******************************************************** 809 | * Name: buildDomLine 810 | * Desc: Builds the document elements from a given line is possible. 811 | ********************************************************/ 812 | private void buildDomLine(String line) { 813 | Tag tag = new Tag(); 814 | 815 | if(line.trim().equals("")) return; 816 | else if(document == null) return; 817 | 818 | int found = getNTag(line, tag); 819 | 820 | if(cur_element == null && (found == 1 || found == 3) && !tag.name.equalsIgnoreCase("")) { 821 | if(tag.name.equalsIgnoreCase("tstset") || 822 | tag.name.equalsIgnoreCase("refset")) { 823 | is_sgml = true; 824 | Element root = (Element) document.createElement(tag.name); 825 | document.appendChild(root); 826 | cur_element = root; 827 | Iterator attrs = (tag.content.keySet()).iterator(); 828 | while(attrs.hasNext()) { 829 | String attr_name = attrs.next(); 830 | String attr_val = tag.content.get(attr_name); 831 | attr_val = attr_val.replaceAll("\"", ""); 832 | cur_element.setAttribute(attr_name, attr_val); 833 | } 834 | buildDomLine(tag.rest); 835 | } else return; 836 | } else if(cur_element != null) { 837 | switch(found) { 838 | case 1: 839 | // text node or start tag 840 | if(tag.name.equalsIgnoreCase("")) { 841 | String text = tag.content.get("text"); 842 | if(text != null) 843 | cur_element.appendChild(document.createTextNode(text)); 844 | buildDomLine(tag.rest); 845 | } else { 846 | Element child = document.createElement(tag.name); 847 | addDomElement(cur_element, child, tag.content); 848 | cur_element = child; 849 | 850 | buildDomLine(tag.rest); 851 | } 852 | break; 853 | case 2: 854 | // end tag 855 | if(tag.name.equalsIgnoreCase(cur_element.getTagName())) { 856 | if(cur_element.getParentNode().getNodeType() != Node.DOCUMENT_NODE) 857 | cur_element = (Element) cur_element.getParentNode(); 858 | } else { 859 | System.out.println("** Warning: found mis-matching tags in line: " + line + " - " + 860 | tag.name + " vs " + cur_element.getTagName()); 861 | document = null; 862 | return; 863 | } 864 | break; 865 | case 3: 866 | // < /> 867 | if(!tag.name.equalsIgnoreCase("")) { 868 | Element child = document.createElement(tag.name); 869 | addDomElement(cur_element, child, tag.content); 870 | buildDomLine(tag.rest); 871 | } else { 872 | System.out.println("** Warning: found empty name tags " + 873 | tag.name); 874 | document = null; 875 | return; 876 | } 877 | break; 878 | default: 879 | ; 880 | } 881 | } 882 | } 883 | 884 | /******************************************************** 885 | * Name: buildDom 886 | * Desc: Constructs the document from a text file if possible. 887 | ********************************************************/ 888 | private void buildDom(String fn) { 889 | is_sgml = false; 890 | start_tag_i = Pattern.compile("^\\s*\\<([^> ]*)\\s*([^>]*)\\>(.*)$", Pattern.CASE_INSENSITIVE); 891 | end_tag_i = Pattern.compile("^\\s*\\ ]*)\\s*([^>]*)\\>(.*)$", Pattern.CASE_INSENSITIVE); 892 | start_end_tag_i = Pattern.compile("^\\s*\\<([^> ]*)\\s*([^>]*)\\\\>(.*)$", Pattern.CASE_INSENSITIVE); 893 | attr_i = Pattern.compile("^\\s*(\\S+)=(\\S+)\\s*(.*)$", Pattern.CASE_INSENSITIVE); 894 | no_tag_i = Pattern.compile("^\\s*([^\\<]*)(.*)\\s*$", Pattern.CASE_INSENSITIVE); 895 | 896 | // parse the input with java pattern 897 | BufferedReader stream; 898 | try { 899 | stream = new BufferedReader(new FileReader(fn)); 900 | } catch (IOException ioe) { 901 | System.out.println(ioe); 902 | document = null; 903 | return; 904 | } 905 | 906 | try { 907 | String line; 908 | cur_element = null; 909 | builder = factory.newDocumentBuilder(); 910 | document = builder.newDocument(); 911 | while ((line = stream.readLine()) != null) { 912 | if(line.matches("^\\s*$")) 913 | continue; 914 | buildDomLine(line); 915 | } 916 | if(!is_sgml) 917 | document = null; 918 | 919 | } catch (ParserConfigurationException pce) { 920 | System.out.println("** PCError "); 921 | pce.printStackTrace(); 922 | document = null; 923 | return; 924 | } catch (IOException ioe) { 925 | System.out.println(ioe); 926 | document = null; 927 | return; 928 | } 929 | 930 | } 931 | 932 | 933 | /******************************************************** 934 | * Name: loadSegsElement 935 | * Desc: Load segs from element nodes of the document. This 936 | * function decides how to deal with different tags. 937 | ********************************************************/ 938 | private void loadSegsElement(Node node, Map> segs) { 939 | String name = node.getNodeName().trim(); 940 | NamedNodeMap attrs = node.getAttributes(); 941 | 942 | switch(tag_type(name)) { 943 | case TSTSET: 944 | case REFSET: 945 | if(attrs != null) { 946 | for(int i = 0; i < attrs.getLength(); ++i) { 947 | Node attr = attrs.item(i); 948 | String attrName = attr.getNodeName().trim(); 949 | switch(attr_type(attrName)) { 950 | case SETID: 951 | cur_set = attr.getNodeValue().trim(); 952 | if(!is_ref) hyp_set = cur_set; 953 | break; 954 | case SRCLANG: 955 | src_lang = attr.getNodeValue().trim(); 956 | break; 957 | case TRGLANG: 958 | trg_lang = attr.getNodeValue().trim(); 959 | break; 960 | default: 961 | ; 962 | } 963 | } 964 | } 965 | break; 966 | case DOC: 967 | if(attrs != null) { 968 | for(int i = 0; i < attrs.getLength(); ++i) { 969 | Node attr = attrs.item(i); 970 | String attrName = attr.getNodeName().trim().toLowerCase(); 971 | switch(attr_type(attrName)) { 972 | case DOCID: 973 | doc_id = attr.getNodeValue().trim(); 974 | break; 975 | case SYSID: 976 | sys_id = attr.getNodeValue().trim(); 977 | break; 978 | default: 979 | ; 980 | } 981 | } 982 | } 983 | break; 984 | case AUDIOFILE: 985 | if(attrs != null) { 986 | for(int i = 0; i < attrs.getLength(); ++i) { 987 | Node attr = attrs.item(i); 988 | String attrName = attr.getNodeName().trim().toLowerCase(); 989 | switch(attr_type(attrName)) { 990 | case FILEID: 991 | doc_id = attr.getNodeValue().trim(); 992 | break; 993 | default: 994 | ; 995 | } 996 | } 997 | } 998 | break; 999 | case SEG: 1000 | if(attrs != null) { 1001 | for(int i = 0; i < attrs.getLength(); ++i) { 1002 | Node attr = attrs.item(i); 1003 | String attrName = attr.getNodeName().trim(); 1004 | switch(attr_type(attrName)) { 1005 | case ID: 1006 | seg_id = attr.getNodeValue().trim(); 1007 | id = "[" + doc_id + "][" + String.format("%1$04d", Integer.valueOf(seg_id)) + "]"; 1008 | break; 1009 | case SEGID: 1010 | seg_id = attr.getNodeValue().trim(); 1011 | break; 1012 | default: 1013 | ; 1014 | } 1015 | } 1016 | } 1017 | break; 1018 | case HYP: 1019 | if(attrs != null) { 1020 | for(int i = 0; i < attrs.getLength(); ++i) { 1021 | Node attr = attrs.item(i); 1022 | String attrName = attr.getNodeName().trim(); 1023 | switch(attr_type(attrName)) { 1024 | case ID: 1025 | String rank = attr.getNodeValue().trim(); 1026 | id = "[" + doc_id + "][" + String.format("%1$04d", Integer.valueOf(seg_id)) + "]:" + rank; 1027 | break; 1028 | default: 1029 | ; 1030 | } 1031 | } 1032 | } 1033 | case WORD: 1034 | break; 1035 | default: 1036 | ; 1037 | } 1038 | } 1039 | 1040 | } 1041 | 1042 | --------------------------------------------------------------------------------