├── .gitignore ├── Levenshtein.pm ├── README.md ├── adjust-alignments.pl ├── combine-predicate.pl ├── compare-mt.py ├── count.pl ├── counterrors.pl ├── error-diff.pl ├── fakesgm.pl ├── grade.pl ├── gradews.pl ├── han2zen.pl ├── iconv.pl ├── interleave.pl ├── moses-trace-to-penn-tree.pl ├── ngram-hits.pl ├── paired-bootstrap.py ├── phrase-extract.pl ├── print-enju.pl ├── print-trees.py ├── reverse.pl ├── romanize.pl ├── serverfy.pl ├── shuffle.pl ├── split-column.pl ├── splittraintest.pl ├── syntactic-complexity.py ├── syseval-combine.pl ├── syseval-combine.py ├── syseval-report.pl ├── syseval-report.py ├── t └── levenshtein.t ├── txt2penntokens.pl ├── wc.pl ├── zen2han.pl └── zenspace2under.pl /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .idea 3 | -------------------------------------------------------------------------------- /Levenshtein.pm: -------------------------------------------------------------------------------- 1 | package Levenshtein; 2 | use strict; 3 | use Carp; 4 | 5 | # return a minimum edit distance path with the following notation, plus cost 6 | # d=delete i=insert s=substitute e=equal 7 | sub distance { 8 | my ($s, $t) = @_; 9 | my (@sc, $m, @tc, $n, %d, %str, $i, $j, $id, $cost, $type, $aid, $bid, $cid, $c); 10 | @sc = split(/ +/, $s); 11 | $m = @sc; 12 | @tc = split(/ +/, $t); 13 | $n = @tc; 14 | # initialize 15 | %d = (); %str = {}; 16 | foreach $i (0 .. $m) { $id = pack('S2', $i, 0); $d{$id} = $i; $str{$id} = 'd'x$i; } 17 | foreach $j (1 .. $n) { $id = pack('S2', 0, $j); $d{$id} = $j; $str{$id} = 'i'x$j; } 18 | 19 | foreach $i (1 .. $m) { 20 | foreach $j (1 .. $n) { 21 | if($sc[$i-1] eq $tc[$j-1]) { 22 | $cost = 0; $type = 'e'; # equal 23 | } else { 24 | $cost = 1.1; $type = 's'; # substitution 25 | } 26 | 27 | $aid = pack('S2', $i-1, $j); $a = $d{$aid} + 1; # deletion 28 | $bid = pack('S2', $i, $j-1); $b = $d{$bid} + 1; # insertion 29 | $cid = pack('S2', $i-1, $j-1); $c = $d{$cid} + $cost; # insertion 30 | 31 | $id = pack('S2', $i, $j); 32 | 33 | # we want matches to come at the end, so do deletions/insertions first 34 | if($a <= $b and $a <= $c) { 35 | $d{$id} = $a; 36 | $type = 'd'; 37 | $str{$id} = $str{$aid}.'d'; 38 | } 39 | elsif($b <= $c) { 40 | $d{$id} = $b; 41 | $type = 'i'; 42 | $str{$id} = $str{$bid}.'i'; 43 | } 44 | else { 45 | $d{$id} = $c; 46 | $str{$id} = $str{$cid}.$type; 47 | } 48 | 49 | delete $d{$cid}; 50 | delete $str{$cid}; 51 | # print "".$sc[$i-1]." ".$tc[$j-1]." $i $j $a $b $c $d[$id] $type\n" 52 | } 53 | } 54 | 55 | $id = pack('S2', $m, $n); 56 | return ($str{$id}, $d{$id}); 57 | } 58 | 59 | # Divide an aligned string into corresponding parts 60 | sub divide { 61 | my ($ref, $test, $hist) = @_; 62 | my @ra = split(/ +/, $ref); 63 | my @ta = split(/ +/, $test); 64 | my @ret; 65 | my (@er, @et, @dr, @dt); 66 | foreach my $h (split(//, $hist)) { 67 | if($h eq 'e') { 68 | if(@dr or @dt) { 69 | push @ret, "@dr\t@dt"; @dr = (); @dt = (); 70 | } 71 | push @er, shift(@ra); 72 | push @et, shift(@ta); 73 | } else { 74 | if(@er or @et) { 75 | die "@er != @et" if("@er" ne "@et"); 76 | push @ret, "@er\t@et"; @er = (); @et = (); 77 | } 78 | push @dr, shift(@ra) if $h ne 'i'; 79 | push @dt, shift(@ta) if $h ne 'd'; 80 | } 81 | } 82 | if(@dr or @dt) { push @ret, "@dr\t@dt"; } 83 | elsif(@er or @et) { push @ret, "@er\t@et"; } 84 | die "non-empty ra=@ra or ta=@ta\n" if(@ra or @ta); 85 | return @ret; 86 | } 87 | 88 | 1; 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | util-scripts 2 | ============ 3 | 4 | ### Tests ### 5 | 6 | You can run tests to verify you have not broken anything by typing: 7 | 8 | $ prove 9 | -------------------------------------------------------------------------------- /adjust-alignments.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use utf8; 7 | use FindBin; 8 | use lib $FindBin::Bin; 9 | use Getopt::Long; 10 | use List::Util qw(sum min max shuffle); 11 | use Cwd qw(cwd); 12 | use Levenshtein; 13 | binmode STDIN, ":utf8"; 14 | binmode STDOUT, ":utf8"; 15 | binmode STDERR, ":utf8"; 16 | 17 | my $SRC_TYPE = "lev"; 18 | my $TRG_TYPE = "lev"; 19 | GetOptions( 20 | "src-type=s" => \$SRC_TYPE, 21 | "trg-type=s" => \$SRC_TYPE, 22 | ); 23 | 24 | if(@ARGV != 5) { 25 | print STDERR "Usage: $0 SRC_OLD TRG_OLD ALIGN SRC_NEW TRG_NEW\n"; 26 | exit 1; 27 | } 28 | 29 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n"; 30 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n"; 31 | open FILE2, "<:utf8", $ARGV[2] or die "Couldn't open $ARGV[2]\n"; 32 | open FILE3, "<:utf8", $ARGV[3] or die "Couldn't open $ARGV[3]\n"; 33 | open FILE4, "<:utf8", $ARGV[4] or die "Couldn't open $ARGV[4]\n"; 34 | 35 | sub map_char { 36 | my ($in, $out) = @_; 37 | my @ia = split(/ /, $in); 38 | my @oa = split(/ /, $out); 39 | my (@iw, @ow); 40 | foreach my $i (0 .. $#ia) { push @iw, $i for(0 .. length($ia[$i])-1); } 41 | foreach my $i (0 .. $#oa) { push @ow, $i for(0 .. length($oa[$i])-1); } 42 | # print "@iw\n@ow\n"; 43 | die "$in\n$out\n" if(@iw != @ow); 44 | # Create the map 45 | my %ret; 46 | foreach my $i (0 .. $#iw) { 47 | $ret{$iw[$i]} = {} if not $ret{$iw[$i]}; 48 | $ret{$iw[$i]}->{$ow[$i]}++; 49 | } 50 | return %ret; 51 | } 52 | 53 | sub normalize_en { 54 | $_ = shift; 55 | s/[\((]/-lrb-/g; 56 | s/[\))]/-rrb-/g; 57 | s/(``|'')/"/g; 58 | s/\'/'/g; 59 | s/\"/"/g; 60 | s/\<//g; 62 | s/([a-z])n 't/$1 n't/g; 63 | s/–/--/g; 64 | s/(`)/'/g; 65 | s/([^\\])\//$1\\\//g; 66 | s/[、,]/,/g; 67 | s/[。.]/./g; 68 | return lc($_); 69 | } 70 | 71 | sub map_lev { 72 | my ($in, $out) = @_; 73 | $in = normalize_en($in); 74 | $out = normalize_en($out); 75 | my @ia = split(/ /, $in); 76 | my @oa = split(/ /, $out); 77 | my ($hist, $score) = Levenshtein::distance($in, $out); 78 | my @hists = split(//, $hist); 79 | my (@ierr, @oerr); 80 | my ($ipos, $opos) = (0, 0); 81 | my %ret; 82 | while(@hists) { 83 | my $h = shift(@hists); 84 | if($h eq 'e') { 85 | foreach my $i (@ierr) { 86 | $ret{$i} = {}; 87 | foreach my $o (@oerr) { $ret{$i}->{$o}++; } 88 | } 89 | @ierr = (); 90 | @oerr = (); 91 | $ret{$ipos++} = { $opos++ => 1 }; 92 | } else { 93 | push @ierr, $ipos++ if $h ne 'i'; 94 | push @oerr, $opos++ if $h ne 'd'; 95 | } 96 | } 97 | foreach my $i (@ierr) { 98 | $ret{$i} = {}; 99 | foreach my $o (@oerr) { $ret{$i}->{$o}++; } 100 | } 101 | # die "$in\n$out\n" if($in ne $out); 102 | return %ret; 103 | } 104 | 105 | sub print_map { 106 | my $map = shift; 107 | my @arr; 108 | # print join(" ", keys %{$map})."\n"; 109 | foreach my $i (sort { $a <=> $b } keys %{$map}) { 110 | foreach my $j (sort { $a <=> $b } keys %{$map->{$i}}) { 111 | push @arr, "$i-$j"; 112 | } 113 | } 114 | print "@arr\n"; 115 | } 116 | 117 | sub combine_map { 118 | my ($in, $out) = @_; 119 | my %ret; 120 | foreach my $i ( keys %$in ) { 121 | foreach my $j ( keys %{$in->{$i}} ) { 122 | foreach my $k ( keys %{$out->{$j}} ) { 123 | $ret{$i} = {} if not $ret{$i}; 124 | $ret{$i}->{$k}++; 125 | } 126 | } 127 | } 128 | return %ret; 129 | } 130 | 131 | 132 | my ($jo, $eo, $al, $jn, $en); 133 | while(defined($jo = ) and defined($eo = ) and defined($al = ) and defined($jn = ) and defined($en = )) { 134 | chomp $jo; chomp $eo; chomp $al; chomp $jn; chomp $en; 135 | my %em = ($SRC_TYPE eq "lev") ? map_lev($eo, $en) : map_char($eo, $en); 136 | my %jm = ($SRC_TYPE eq "lev") ? map_lev($jn, $jo) : map_char($jn, $jo); 137 | my %am; 138 | s/ +$//g; 139 | for(split(/ /, $al)) { 140 | my ($ja, $ea) = split(/-/); 141 | $am{$ja} = {} if not $am{$ja}; 142 | $am{$ja}->{$ea}++; 143 | } 144 | my %jam = combine_map(\%jm, \%am); 145 | my %jaem = combine_map(\%jam, \%em); 146 | print_map(\%jaem); 147 | } 148 | -------------------------------------------------------------------------------- /combine-predicate.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # 4 | # What's this? 5 | # ------------ 6 | # 7 | # Combine segmented predicates in Japanese sentences into manageable 8 | # units for ease of statistical machine translation tasks such as word 9 | # alignment, rule extraction and other subsequent processes. 10 | # 11 | # Usage and Data Format 12 | # --------------------- 13 | # 14 | # Currently, we are assuming the input format is sequences of triplets of (word, the 15 | # part-of-speech, and the reading of the word). Each triplet should be delimitted by 16 | # slash "/". 17 | # 18 | # Here is the sample input and output of this script. 19 | # 20 | # $ echo "これ/代名詞/これ は/助詞/は ペン/名詞/ぺん で/助動詞/で あ/動詞/あ る/語尾/る" | ./combine-predicate.pl 21 | # word=これ pos=代名詞 pron=これ 22 | # word=は pos=助詞 pron=は 23 | # word=ペン pos=名詞 pron=ぺん 24 | # word=で pos=助動詞 pron=で 25 | # word=あ pos=動詞 pron=あ 26 | # word=る pos=語尾 pron=る 27 | # これ は ペン である 28 | # 29 | # 30 | # This kinds of the input data are used in the output format of KyTea 31 | # (http://www.phontron.com/kytea/) For example, 32 | # 33 | # $ echo "これはペンである" | kytea | ./combine-predicate.pl 34 | # これ は ペン である 35 | # 36 | 37 | use strict; 38 | use warnings; 39 | use utf8; 40 | use Getopt::Long; 41 | use List::Util qw(sum min max shuffle); 42 | binmode STDIN, ":utf8"; 43 | binmode STDOUT, ":utf8"; 44 | binmode STDERR, ":utf8"; 45 | 46 | my $VERB = 0; 47 | GetOptions( 48 | "verb" => \$VERB, 49 | ); 50 | 51 | 52 | my $GLUE = ""; 53 | if (@ARGV eq 1) { 54 | $GLUE = $ARGV[0]; 55 | } elsif (@ARGV > 1) { 56 | print STDERR "Usage: $0 [GLUE]\n"; 57 | exit 1; 58 | } 59 | 60 | # Split each of the triplet into three things. 61 | # 62 | # E.g., "これ/代名詞/これ" => ("これ", "代名詞", "これ"), 63 | # "ペン/名詞/ぺん" => ("ペン", "名詞", "ぺん") 64 | # 65 | sub wpp { 66 | my $s = shift; 67 | # print STDERR "$s\n"; 68 | $s =~ /^(.+)\/([^\/]+)\/([^\/]+)$/ or die $s; 69 | return ($1, $2, $3); 70 | } 71 | 72 | # TODO: Write the documentation about used heuristic rules. 73 | sub iscombine { 74 | my $s = shift; 75 | my ($word, $pos, $pron) = wpp($s); 76 | if($VERB and ($pos =~ /^動詞$/)) { 77 | return 1; 78 | } elsif($pos =~ /^(語尾|助動詞)$/) { 79 | return 1; 80 | } elsif(($word =~ /^(て|ば)$/) and ($pos =~ /^(助詞)$/)) { 81 | return 1; 82 | } elsif(($word =~ /^(な)$/) and ($pos =~ /^(形容詞)$/)) { 83 | return 1; 84 | } elsif(($word =~ /^(さ|し|す|あ|い)$/) and ($pos =~ /^(動詞)$/)) { 85 | return 1; 86 | } 87 | return 0; 88 | } 89 | 90 | # Combine predicates, and return the sequence of words. 91 | # 92 | # If there are consecutive words that fire the bits, 93 | # they will be combined into a single word. 94 | # 95 | # For example, 96 | # $harr = ["これ", "は", "ペン", "で", "あ", "る"] 97 | # $carr = [0, 0, 0, 1, 1, 1] 98 | # => ["これ", "は", "ペン", "である"] 99 | # 100 | # Note that the words ["で", "あ", "る"] were merged into 101 | # "である". 102 | sub combine { 103 | my ($harr, $carr) = @_; 104 | my @newarr = ($$harr[0]); 105 | foreach my $i (1 .. $#$harr) { 106 | if (($$carr[$i] == 1) and ($$carr[$i-1] == 1)) { 107 | $newarr[-1] .= $GLUE . $$harr[$i]; 108 | } else { 109 | push @newarr, $$harr[$i]; 110 | } 111 | } 112 | return \@newarr; 113 | } 114 | 115 | while() { 116 | chomp; 117 | # print "$_\n"; 118 | s/\\ / /g; 119 | my @warr = split(/ +/); 120 | my @harr = map { my ($w, $pr, $ps) = wpp($_); $w } @warr; 121 | my @carr = map { iscombine($_) } @warr; 122 | print "@{combine(\@harr, \@carr)}\n"; 123 | } 124 | -------------------------------------------------------------------------------- /compare-mt.py: -------------------------------------------------------------------------------- 1 | # NOTE: The master version of this script has moved and can be found here: 2 | # https://github.com/neulab/compare-mt 3 | 4 | import sys 5 | import argparse 6 | import operator 7 | import itertools 8 | import nltk 9 | from collections import defaultdict 10 | 11 | parser = argparse.ArgumentParser( 12 | description='Program to compare MT results', 13 | ) 14 | parser.add_argument('ref_file', type=str, help='A path to a correct reference file') 15 | parser.add_argument('out_file', type=str, help='A path to a system output') 16 | parser.add_argument('out2_file', nargs='?', type=str, default=None, help='A path to another system output. Add only if you want to compare outputs from two systems.') 17 | parser.add_argument('--train_file', type=str, default=None, help='A link to the training corpus target file') 18 | parser.add_argument('--train_counts', type=str, default=None, help='A link to the training word frequency counts as a tab-separated "word\\tfreq" file') 19 | parser.add_argument('--alpha', type=float, default=1.0, help='A smoothing coefficient to control how much the model focuses on low- and high-frequency events. 1.0 should be fine most of the time.') 20 | parser.add_argument('--ngram', type=int, default=4, help='Maximum length of n-grams.') 21 | parser.add_argument('--ngram_size', type=int, default=50, help='How many n-grams to print.') 22 | parser.add_argument('--sent_size', type=int, default=10, help='How many sentences to print.') 23 | args = parser.parse_args() 24 | 25 | with open(args.ref_file, "r") as f: 26 | ref = [line.strip().split() for line in f] 27 | with open(args.out_file, "r") as f: 28 | out = [line.strip().split() for line in f] 29 | if args.out2_file != None: 30 | with open(args.out2_file, "r") as f: 31 | out2 = [line.strip().split() for line in f] 32 | 33 | def calc_ngrams(sent): 34 | ret = defaultdict(lambda: 0) 35 | for n in range(args.ngram): 36 | for i in range(len(sent)-n): 37 | ret[tuple(sent[i:i+n+1])] += 1 38 | return ret 39 | 40 | def match_ngrams(left, right): 41 | ret = defaultdict(lambda: 0) 42 | for k, v in left.items(): 43 | if k in right: 44 | ret[k] = min(v, right[k]) 45 | return ret 46 | 47 | # Calculate over and under-generated n-grams for two corpora 48 | def calc_over_under(ref, out, alpha): 49 | # Create n-grams 50 | refall = defaultdict(lambda: 0) 51 | outall = defaultdict(lambda: 0) 52 | for refsent, outsent in zip(ref, out): 53 | for k, v in calc_ngrams(refsent).items(): 54 | refall[k] += v 55 | for k, v in calc_ngrams(outsent).items(): 56 | outall[k] += v 57 | # Calculate scores 58 | scores = {} 59 | for k, v in refall.items(): 60 | scores[k] = (v + args.alpha) / (v + outall[k] + 2*args.alpha) 61 | for k, v in outall.items(): 62 | scores[k] = (refall[k] + args.alpha) / (refall[k] + v + 2*args.alpha) 63 | return refall, outall, scores 64 | 65 | # Calculate over and under-generated n-grams for two corpora 66 | def calc_compare(ref, out, out2, alpha): 67 | outall = defaultdict(lambda: 0) 68 | out2all = defaultdict(lambda: 0) 69 | for refsent, outsent, out2sent in zip(ref, out, out2): 70 | refn = calc_ngrams(refsent) 71 | outmatch = match_ngrams(refn, calc_ngrams(outsent)) 72 | out2match = match_ngrams(refn, calc_ngrams(out2sent)) 73 | for k, v in outmatch.items(): 74 | if v > out2match[k]: 75 | outall[k] += v - out2match[k] 76 | for k, v in out2match.items(): 77 | if v > outmatch[k]: 78 | out2all[k] += v - outmatch[k] 79 | # Calculate scores 80 | scores = {} 81 | for k, v in out2all.items(): 82 | scores[k] = (v + args.alpha) / (v + outall[k] + 2*args.alpha) 83 | for k, v in outall.items(): 84 | scores[k] = (out2all[k] + args.alpha) / (out2all[k] + v + 2*args.alpha) 85 | return outall, out2all, scores 86 | 87 | # Calculate the frequency counts, from the training corpus 88 | # or training frequency file if either are specified, from the 89 | # reference file if not 90 | freq_counts = defaultdict(lambda: 0) 91 | if args.train_counts != None: 92 | with open(args.train_counts, "r") as f: 93 | for line in f: 94 | word, freq = line.strip().split('\t') 95 | freq_counts[word] = freq 96 | else: 97 | my_file = args.train_file if args.train_file != None else args.ref_file 98 | with open(my_file, "r") as f: 99 | for line in f: 100 | for word in line.strip().split(): 101 | freq_counts[word] += 1 102 | 103 | def calc_matches_by_freq(ref, out, buckets): 104 | extended_buckets = buckets + [max(freq_counts.values()) + 1] 105 | matches = [[0,0,0] for x in extended_buckets] 106 | for refsent, outsent in zip(ref, out): 107 | reffreq, outfreq = defaultdict(lambda: 0), defaultdict(lambda: 0) 108 | for x in refsent: 109 | reffreq[x] += 1 110 | for x in outsent: 111 | outfreq[x] += 1 112 | for k in set(itertools.chain(reffreq.keys(), outfreq.keys())): 113 | for bucket, match in zip(extended_buckets, matches): 114 | if freq_counts[k] < bucket: 115 | match[0] += min(reffreq[k], outfreq[k]) 116 | match[1] += reffreq[k] 117 | match[2] += outfreq[k] 118 | break 119 | for bothf, reff, outf in matches: 120 | if bothf == 0: 121 | rec, prec, fmeas = 0.0, 0.0, 0.0 122 | else: 123 | rec = bothf / float(reff) 124 | prec = bothf / float(outf) 125 | fmeas = 2 * prec * rec / (prec + rec) 126 | yield bothf, reff, outf, rec, prec, fmeas 127 | 128 | buckets = [1, 2, 3, 4, 5, 10, 100, 1000] 129 | bucket_strs = [] 130 | last_start = 0 131 | for x in buckets: 132 | if x-1 == last_start: 133 | bucket_strs.append(str(last_start)) 134 | else: 135 | bucket_strs.append("{}-{}".format(last_start, x-1)) 136 | last_start = x 137 | bucket_strs.append("{}+".format(last_start)) 138 | 139 | # Analyze the reference/output 140 | if args.out2_file == None: 141 | refall, outall, scores = calc_over_under(ref, out, args.alpha) 142 | scorelist = sorted(scores.items(), key=operator.itemgetter(1)) 143 | # Print the ouput 144 | print('********************** N-gram Difference Analysis ************************') 145 | print('--- %d over-generated n-grams indicative of output' % args.ngram_size) 146 | for k, v in scorelist[:args.ngram_size]: 147 | print('%s\t%f (ref=%d, out=%d)' % (' '.join(k), v, refall[k], outall[k])) 148 | print() 149 | print('--- %d under-generated n-grams indicative of reference' % args.ngram_size) 150 | for k, v in reversed(scorelist[-args.ngram_size:]): 151 | print('%s\t%f (ref=%d, out=%d)' % (' '.join(k), v, refall[k], outall[k])) 152 | # Calculate f-measure 153 | matches = calc_matches_by_freq(ref, out, buckets) 154 | print('\n\n********************** Word Frequency Analysis ************************') 155 | print('--- word f-measure by frequency bucket') 156 | for bucket_str, match in zip(bucket_strs, matches): 157 | print("{}\t{:.4f}".format(bucket_str, match[5])) 158 | # Analyze the differences between two systems 159 | else: 160 | outall, out2all, scores = calc_compare(ref, out, out2, args.alpha) 161 | scorelist = sorted(scores.items(), key=operator.itemgetter(1)) 162 | # Print the ouput 163 | print('********************** N-gram Difference Analysis ************************') 164 | print('--- %d n-grams that System 1 did a better job of producing' % args.ngram_size) 165 | for k, v in scorelist[:args.ngram_size]: 166 | print('%s\t%f (sys1=%d, sys2=%d)' % (' '.join(k), v, outall[k], out2all[k])) 167 | print('\n--- %d n-grams that System 2 did a better job of producing' % args.ngram_size) 168 | for k, v in reversed(scorelist[-args.ngram_size:]): 169 | print('%s\t%f (sys1=%d, sys2=%d)' % (' '.join(k), v, outall[k], out2all[k])) 170 | # Calculate f-measure 171 | matches = calc_matches_by_freq(ref, out, buckets) 172 | matches2 = calc_matches_by_freq(ref, out2, buckets) 173 | print('\n\n********************** Word Frequency Analysis ************************') 174 | print('--- word f-measure by frequency bucket') 175 | for bucket_str, match, match2 in zip(bucket_strs, matches, matches2): 176 | print("{}\t{:.4f}\t{:.4f}".format(bucket_str, match[5], match2[5])) 177 | # Calculate BLEU diff 178 | scorediff_list = [] 179 | chencherry = nltk.translate.bleu_score.SmoothingFunction() 180 | for i, (o1, o2, r) in enumerate(zip(out, out2, ref)): 181 | b1 = nltk.translate.bleu_score.sentence_bleu([r], o1, smoothing_function=chencherry.method2) 182 | b2 = nltk.translate.bleu_score.sentence_bleu([r], o2, smoothing_function=chencherry.method2) 183 | scorediff_list.append((b2-b1, b1, b2, i)) 184 | scorediff_list.sort() 185 | print('\n\n********************** Length Analysis ************************') 186 | print('--- length ratio') 187 | length_ref = sum([len(x) for x in ref]) 188 | length_out = sum([len(x) for x in out]) 189 | length_out2 = sum([len(x) for x in out2]) 190 | print('System 1: {}, System 2: {}'.format(length_out/length_ref, length_out2/length_ref)) 191 | print('--- length difference from reference by bucket') 192 | length_diff = {} 193 | length_diff2 = {} 194 | for r, o, o2 in zip(ref, out, out2): 195 | ld = len(o)-len(r) 196 | ld2 = len(o2)-len(r) 197 | length_diff[ld] = length_diff.get(ld,0) + 1 198 | length_diff2[ld2] = length_diff2.get(ld2,0) + 1 199 | for ld in sorted(list(set(length_diff.keys()) | set(length_diff2.keys()))): 200 | print("{}\t{}\t{}".format(ld, length_diff.get(ld,0), length_diff2.get(ld,0))) 201 | print('\n\n********************** BLEU Analysis ************************') 202 | print('--- %d sentences that System 1 did a better job at than System 2' % args.sent_size) 203 | for bdiff, b1, b2, i in scorediff_list[:args.sent_size]: 204 | print ('BLEU+1 sys2-sys1={}, sys1={}, sys2={}\nRef: {}\nSys1: {}\nSys2: {}\n'.format(bdiff, b1, b2, ' '.join(ref[i]), ' '.join(out[i]), ' '.join(out2[i]))) 205 | print('--- %d sentences that System 2 did a better job at than System 1' % args.sent_size) 206 | for bdiff, b1, b2, i in scorediff_list[-args.sent_size:]: 207 | print ('BLEU+1 sys2-sys1={}, sys1={}, sys2={}\nRef: {}\nSys1: {}\nSys2: {}\n'.format(bdiff, b1, b2, ' '.join(ref[i]), ' '.join(out[i]), ' '.join(out2[i]))) 208 | -------------------------------------------------------------------------------- /count.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | binmode STDIN, ":utf8"; 4 | binmode STDOUT, ":utf8"; 5 | 6 | my %count; 7 | while() { 8 | chomp; 9 | for(split(/ /)) { 10 | $count{$_}++; 11 | } 12 | } 13 | 14 | foreach my $k (sort {$count{$b}<=>$count{$a}} keys %count) { 15 | print "$k\t$count{$k}\n"; 16 | } 17 | -------------------------------------------------------------------------------- /counterrors.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # This is a script to count errors (insertions, deletions, substitutions) 4 | # according to edit distance 5 | 6 | use strict; 7 | use warnings; 8 | use utf8; 9 | use List::Util qw(max min); 10 | use FindBin; 11 | use lib $FindBin::Bin; 12 | 13 | use Levenshtein; 14 | 15 | binmode STDIN, ":utf8"; 16 | binmode STDOUT, ":utf8"; 17 | 18 | if(@ARGV != 2) { 19 | print STDERR "Usage: counterrors.pl REFERENCE SYSTEM\n"; 20 | exit; 21 | } 22 | 23 | # find which of the inputs are in error 24 | open REF, "<:utf8", $ARGV[0] or die $!; 25 | open TEST, "<:utf8", $ARGV[1] or die $!; 26 | my ($ref, $test, @refs, @tests, %errs, @hists); 27 | while(defined($ref = ) and defined($test = )) { 28 | chomp $ref; chomp $test; 29 | my ($hist, $score) = Levenshtein::distance($ref, $test); 30 | @refs = split(/ +/, $ref); 31 | @tests = split(/ +/, $test); 32 | @hists = split(//, $hist); 33 | my (@rerr, @terr); 34 | while(@hists) { 35 | my $h = shift(@hists); 36 | if($h eq 'e') { 37 | if(@rerr+@terr) { 38 | my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL"); 39 | $errs{$err}++; 40 | @rerr = (); 41 | @terr = (); 42 | } 43 | shift @refs; shift @tests; 44 | } else { 45 | push @rerr, shift(@refs) if $h ne 'i'; 46 | push @terr, shift(@tests) if $h ne 'd'; 47 | } 48 | } 49 | if(@rerr+@terr) { 50 | my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL"); 51 | $errs{$err}++; 52 | } 53 | } 54 | close REF; 55 | close TEST; 56 | 57 | for(sort { $errs{$b} <=> $errs{$a} } keys %errs) { 58 | print "$_\t$errs{$_}\n"; 59 | } 60 | -------------------------------------------------------------------------------- /error-diff.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # This is a script to tabulate the errors made by two systems 4 | # In order to use it first run counterrors.pl on two different system outputs 5 | # 6 | # $ counterrors.pl ref.txt test-1.txt > test-1.err 7 | # $ counterrors.pl ref.txt test-2.txt > test-2.err 8 | # 9 | # Next, run this script on the error files to get the output 10 | # 11 | # $ error-diff.pl test-1.err test-2.err > test-diff.err 12 | # 13 | # test-diff.err will now contain error examples along with frequencies. 14 | # examples with positive frequencies are more common in the first file, 15 | # while examples with negative frequencies are more common in the second file. 16 | 17 | use strict; 18 | use warnings; 19 | use utf8; 20 | binmode STDIN, ":utf8"; 21 | binmode STDOUT, ":utf8"; 22 | use List::Util qw(sum min max shuffle); 23 | 24 | if(@ARGV != 2) { 25 | print STDERR "Usage: error-diff.pl file1.err file2.err\n"; 26 | exit 1; 27 | } 28 | 29 | my %vals; 30 | open FILE, "<:utf8", $ARGV[0] or die "$ARGV[0]: $!\n"; 31 | while() { 32 | chomp; 33 | /(.*)\t([0-9]*)$/; 34 | $vals{$1} += $2; 35 | } 36 | close FILE; 37 | open FILE, "<:utf8", $ARGV[1] or die "$ARGV[1]: $!\n"; 38 | while() { 39 | chomp; 40 | /(.*)\t([0-9]*)$/; 41 | $vals{$1} -= $2; 42 | } 43 | close FILE; 44 | 45 | for(sort {$vals{$b} <=> $vals{$a}} keys %vals) { 46 | print "$_\t$vals{$_}\n" if $vals{$_}; 47 | } 48 | -------------------------------------------------------------------------------- /fakesgm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use List::Util qw(sum min max shuffle); 7 | use Getopt::Long; 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | if(@ARGV != 2) { 13 | print STDERR "Usage: fakesgm.pl src/ref tar\n"; 14 | exit 1; 15 | } 16 | 17 | print "<".$ARGV[0]."set setid=\"test2006\" srclang=\"any\" trglang=\"$ARGV[1]\"> 18 | \n"; 19 | 20 | my $sent = 1; 21 | while() { 22 | chomp; 23 | print " $_ \n"; 24 | } 25 | 26 | print "\n\n"; 27 | -------------------------------------------------------------------------------- /grade.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # This is a script to grade word error rates according to edit distance 4 | if(@ARGV != 2) { 5 | print STDERR "$0 REF TEST\n"; 6 | exit(1); 7 | } 8 | 9 | binmode STDIN, ":utf8"; 10 | binmode STDOUT, ":utf8"; 11 | binmode STDERR, ":utf8"; 12 | 13 | use utf8; 14 | use strict; 15 | use warnings; 16 | use List::Util qw(max min); 17 | use Cwd qw(cwd abs_path); 18 | use FindBin; 19 | use lib $FindBin::Bin; 20 | 21 | use Levenshtein; 22 | 23 | my $PRINT_INLINE = 1; 24 | 25 | sub width { 26 | $_ = shift; 27 | my $ret = 0; 28 | for(split(//)) { 29 | $ret += ((/\p{InKatakana}/ or /\p{InHiragana}/ or /\p{InCJKSymbolsAndPunctuation}/ or /\p{InKatakanaPhoneticExtensions}/ or /\p{InCJKUnifiedIdeographs}/)?2:1); 30 | } 31 | return $ret; 32 | } 33 | 34 | sub pad { 35 | my ($s, $l) = @_; 36 | return $s . (' ' x ($l-width($s))); 37 | } 38 | 39 | use strict; 40 | binmode STDOUT, ":utf8"; 41 | open REF, "<:utf8", $ARGV[0]; 42 | open TEST, "<:utf8", $ARGV[1]; 43 | 44 | my ($reflen, $testlen); 45 | my %scores = qw(s 0 i 0 d 0 e 0); 46 | my($ref, $test, $sent, $sentacc) = (0, 0, 0, 0); 47 | while(defined($ref = ) and defined($test = )) { 48 | chomp $ref; 49 | chomp $test; 50 | $ref =~ s/^ *//g; $ref =~ s/ *$//g; 51 | $test =~ s/^ *//g; $test =~ s/ *$//g; 52 | 53 | # get the arrays 54 | my @ra = split(/ +/, $ref); 55 | $reflen += @ra; 56 | my @ta = split(/ +/, $test); 57 | $testlen += @ta; 58 | 59 | # do levenshtein distance if the scores aren't equal 60 | my ($hist, $score); 61 | if ($ref eq $test) { 62 | $sentacc++; 63 | for (@ra) { $hist .= 'e'; } 64 | $score = 0; 65 | } else { 66 | ($hist, $score) = Levenshtein::distance($ref, $test); 67 | } 68 | $sent++; 69 | 70 | my @ha = split(//, $hist); 71 | for(@ha) { $scores{$_}++; } 72 | my ($rd, $td, $hd, $h, $r, $t, $l); 73 | if(not $PRINT_INLINE) { 74 | while(@ha) { 75 | $h = shift(@ha); 76 | if($h eq 'e' or $h eq 's') { 77 | $r = shift(@ra); 78 | $t = shift(@ta); 79 | } elsif ($h eq 'i') { 80 | $r = ''; 81 | $t = shift(@ta); 82 | } elsif ($h eq 'd') { 83 | $r = shift(@ra); 84 | $t = ''; 85 | } else { die "bad history value $h"; } 86 | # find the length 87 | $l = max(width($r), width($t)) + 1; 88 | $rd .= pad($r, $l); 89 | $td .= pad($t, $l); 90 | $hd .= pad($h, $l); 91 | } 92 | print "$rd\n$td\n$hd\n\n"; 93 | die "non-empty ra=@ra or ta=@ta\n" if(@ra or @ta); 94 | } else { 95 | for(Levenshtein::divide($ref, $test, $hist)) { 96 | my ($stra, $strb) = split(/\t/); 97 | print ((($stra eq $strb) ? "O" : "X")."\t$_\n"); 98 | } 99 | print "\n"; 100 | } 101 | } 102 | 103 | my $total = 0; 104 | for (values %scores) { $total += $_; } 105 | foreach my $k (keys %scores) { 106 | print "$k: $scores{$k} (".$scores{$k}/$total*100 . "%)\n"; 107 | } 108 | my $wer = ($scores{'s'}+$scores{'i'}+$scores{'d'})/$reflen*100; 109 | my $prec = $scores{'e'}/$testlen*100; 110 | my $rec = $scores{'e'}/$reflen*100; 111 | my $fmeas = (2*$prec*$rec)/($prec+$rec); 112 | $sentacc = $sentacc/$sent*100; 113 | printf ("WER: %.2f%%\nPrec: %.2f%%\nRec: %.2f%%\nF-meas: %.2f%%\nSent: %.2f%%\n", $wer, $prec, $rec, $fmeas, $sentacc); 114 | -------------------------------------------------------------------------------- /gradews.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | if(@ARGV != 2) { 6 | print STDERR "$0 REF TEST\n"; 7 | exit(1); 8 | } 9 | 10 | sub lengths { 11 | my @ret; 12 | my @bounds = (@_,1); 13 | my $last = -1; 14 | for(0 .. $#bounds) { 15 | if($bounds[$_]) { 16 | push @ret, ($_-$last); 17 | $last = $_; 18 | } 19 | } 20 | return @ret; 21 | } 22 | 23 | open REF, "<:utf8", $ARGV[0] or die $!; 24 | open TEST, "<:utf8", $ARGV[1] or die $!; 25 | my ($ref, $test); 26 | my ($totb, $corb, $refw, $testw, $corw,$tots,$cors); 27 | while(defined($ref = ) and defined($test = )) { 28 | chomp $ref; chomp $test; 29 | $tots++; 30 | $cors++ if ($ref eq $test); 31 | $ref =~ s/\/[^ ]*//g; 32 | $test =~ s/\/[^ ]*//g; 33 | my @rarr = split(//, $ref); 34 | my @tarr = split(//, $test); 35 | shift @rarr; 36 | shift @tarr; 37 | my (@rb,@tb); 38 | while(@rarr and @tarr) { 39 | my $rs = ($rarr[0] eq ' '); 40 | shift @rarr if $rs; 41 | push @rb, $rs; 42 | my $ts = ($tarr[0] eq ' '); 43 | shift @tarr if $ts; 44 | push @tb, $ts; 45 | shift @tarr; 46 | shift @rarr; 47 | } 48 | die "mismatched lines \n$ref\n$test\n" if(@rb != @tb); 49 | # total boundaries 50 | $totb += @rb; 51 | # correct boundaries 52 | for(0 .. $#rb) { $corb++ if ($rb[$_] == $tb[$_]); } 53 | # find word counts 54 | my @rlens = lengths(@rb); 55 | $refw += @rlens; 56 | my @tlens = lengths(@tb); 57 | $testw += @tlens; 58 | # print "$ref\n@rlens\n$test\n@tlens\n"; 59 | # find word matches 60 | my ($rlast, $tlast) = (0, 0); 61 | while(@rlens and @tlens) { 62 | if($rlast == $tlast) { 63 | $corw++ if($rlens[0] == $tlens[0]); 64 | } 65 | if($rlast <= $tlast) { 66 | $rlast += shift(@rlens); 67 | } 68 | if($tlast < $rlast) { 69 | $tlast += shift(@tlens); 70 | } 71 | } 72 | } 73 | print "Sent Accuracy: ".sprintf("%.2f", ($cors/$tots*100))."% ($cors/$tots)\n"; 74 | my $precw = $corw/$testw; 75 | my $recw = $corw/$refw; 76 | my $fmeasw = (2*$precw*$recw)/($precw+$recw); 77 | print "Word Prec: ".sprintf("%.2f", ($precw*100))."% ($corw/$testw)\n"; 78 | print "Word Rec: ".sprintf("%.2f", ($recw*100))."% ($corw/$refw)\n"; 79 | print "F-meas: ".sprintf("%.2f", ($fmeasw*100))."%\n"; 80 | print "Bound Accuracy: ".sprintf("%.2f", ($corb/$totb*100))."% ($corb/$totb)\n"; 81 | -------------------------------------------------------------------------------- /han2zen.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use List::Util qw(sum min max shuffle); 7 | use Getopt::Long; 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my $NOHYPHEN = 0; 13 | my $NOSPACE = 0; 14 | GetOptions( 15 | "nohyphen" => \$NOHYPHEN, 16 | "nospace" => \$NOSPACE, 17 | ); 18 | 19 | if(@ARGV != 0) { 20 | print STDERR "Usage: han2zen.pl < INPUT > OUTPUT\n"; 21 | exit 1; 22 | } 23 | 24 | while() { 25 | tr/a-zA-Z0-9()[]{}<>.,_%「」、"?・+:。!&*/a-zA-Z0-9()[]{}<>.,_%「」、”?・+:。!&*/; 26 | s/-/-/g if not $NOHYPHEN; 27 | s/ / /g if not $NOSPACE; 28 | s/\////g; 29 | print $_; 30 | } 31 | -------------------------------------------------------------------------------- /iconv.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use Text::Iconv; 4 | 5 | if(@ARGV != 2) { 6 | print STDERR "Usage: $0 FROM TO\n"; 7 | exit 1; 8 | } 9 | 10 | $| = 1; 11 | my $converter = Text::Iconv->new($ARGV[0], $ARGV[1]) or die; 12 | while() { 13 | my $converted = $converter->convert($_) or die $!; 14 | print $converted; 15 | } 16 | -------------------------------------------------------------------------------- /interleave.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use IO::File; 7 | use Getopt::Long; 8 | use List::Util qw(sum min max shuffle); 9 | binmode STDIN, ":utf8"; 10 | binmode STDOUT, ":utf8"; 11 | binmode STDERR, ":utf8"; 12 | 13 | my $NOSPACE = 0; 14 | GetOptions( 15 | "nospace" => \$NOSPACE, 16 | ); 17 | 18 | if(@ARGV == 0) { 19 | print STDERR "Usage: FILES\n"; 20 | exit 1; 21 | } 22 | 23 | my @handles; 24 | for(@ARGV) { 25 | my $fh = IO::File->new("< $_") or die "didn't work for $_"; 26 | binmode $fh, ":utf8"; 27 | push @handles, $fh; 28 | } 29 | 30 | while(1) { 31 | for(@handles) { 32 | my $val = <$_>; 33 | exit if not $val; 34 | print $val; 35 | } 36 | print "\n" if not $NOSPACE; 37 | } 38 | -------------------------------------------------------------------------------- /moses-trace-to-penn-tree.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my (@sentences, @roots); 13 | while() { 14 | chomp; 15 | /^Trans Opt (\d+) (\[\d+\.\.\d+\]): (.*)$/ or die "Bad $_\n"; 16 | my ($sent, $span, $value) = ($1, $2, $3); 17 | if(not $sentences[$sent]) { 18 | $roots[$sent] = $span; 19 | $sentences[$sent] = {} 20 | } 21 | $sentences[$sent]->{$span} = $value; 22 | } 23 | 24 | sub span2josh { 25 | my $str = shift; 26 | $str =~ /^\[(\d+)\.\.(\d+)\]$/ or die "bad moses span $str\n"; 27 | return "{$1-".($2+1)."}"; 28 | } 29 | 30 | sub print_tree { 31 | my ($tree, $span) = @_; 32 | my $val = $tree->{$span}; 33 | $val =~ /^((\[\d+\.\.\d+\]=\S* +)+): ([A-Z]+) ->(.*):((\d+-\d+ )*): pC=/ or die "bad $val\n"; 34 | my ($tags, $head, $tails, $correspond) = ($1, $3, $4, $5); 35 | $tails =~ s/ +$//g; $correspond =~ s/ +$//g; 36 | my @spans = reverse map { my @arr = split(/=/); $arr[0] } split(/ +/, $tags); 37 | my @srcs = reverse map { my @arr = split(/=/); $arr[1] } split(/ +/, $tags); 38 | my @tails = split(/ /, $tails); 39 | my %t2s; 40 | my @correspond = map { my @arr = split(/-/); $t2s{$arr[1]} = $arr[0]; \@arr } split(/ +/, $correspond); 41 | # Label the source side values 42 | my $id = 0; 43 | for(sort { $a->[1] <=> $b->[1] } @correspond) { 44 | $srcs[$_->[0]] .= ++$id; 45 | } 46 | my $ret = "(".join("_",@srcs); 47 | foreach my $i ( 0 .. $#tails ) { 48 | $ret .= " "; 49 | if(exists $t2s{$i}) { 50 | $ret .= print_tree($tree, $spans[$t2s{$i}]); 51 | } else { 52 | $ret .= "(W $tails[$i])"; 53 | } 54 | } 55 | $ret .= ")"; 56 | return $ret; 57 | } 58 | 59 | for(0 .. $#sentences) { 60 | print print_tree($sentences[$_], $roots[$_])."\n"; 61 | } 62 | -------------------------------------------------------------------------------- /ngram-hits.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my $N = 4; 13 | GetOptions( 14 | "n=i" => \$N, 15 | ); 16 | 17 | if(@ARGV != 2) { 18 | print STDERR "Usage: $0 REF SYS\n"; 19 | exit 1; 20 | } 21 | 22 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n"; 23 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n"; 24 | 25 | sub count_ngrams { 26 | my %ret; 27 | my @arr = split(/ /, $_[0]); 28 | foreach my $i (0 .. $#arr) { 29 | my @str; 30 | foreach my $j ($i .. min($#arr, $i+$N-1)) { 31 | push @str, $arr[$j]; 32 | $ret{"@str"}++; 33 | } 34 | } 35 | return %ret; 36 | } 37 | 38 | my %total; 39 | my ($s0, $s1); 40 | while(defined($s0 = ) and defined($s1 = )) { 41 | chomp $s0; chomp $s1; 42 | my %n0 = count_ngrams($s0); 43 | my %n1 = count_ngrams($s1); 44 | for(keys %n0) { 45 | if($n1{$_}) { 46 | $total{$_} += min($n0{$_}, $n1{$_}); 47 | } 48 | } 49 | } 50 | 51 | for(sort { $total{$b} <=> $total{$a} } keys %total) { 52 | print "$_\t$total{$_}\n"; 53 | } 54 | -------------------------------------------------------------------------------- /paired-bootstrap.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # Compare two systems using bootstrap resampling # 3 | # * by Graham Neubig # 4 | # * minor modifications by Mathias Müller # 5 | # # 6 | # See, e.g. the following paper for references # 7 | # # 8 | # Statistical Significance Tests for Machine Translation Evaluation # 9 | # Philipp Koehn # 10 | # http://www.aclweb.org/anthology/W04-3250 # 11 | # # 12 | ###################################################################### 13 | 14 | import numpy as np 15 | 16 | 17 | EVAL_TYPE_ACC = "acc" 18 | EVAL_TYPE_BLEU = "bleu" 19 | EVAL_TYPE_BLEU_DETOK = "bleu_detok" 20 | EVAL_TYPE_PEARSON = "pearson" 21 | 22 | EVAL_TYPES = [EVAL_TYPE_ACC, 23 | EVAL_TYPE_BLEU, 24 | EVAL_TYPE_BLEU_DETOK, 25 | EVAL_TYPE_PEARSON] 26 | 27 | 28 | def eval_preproc(data, eval_type='acc'): 29 | ''' Preprocess into the appropriate format for a particular evaluation type ''' 30 | if type(data) == str: 31 | data = data.strip() 32 | if eval_type == EVAL_TYPE_BLEU: 33 | data = data.split() 34 | elif eval_type == EVAL_TYPE_PEARSON: 35 | data = float(data) 36 | return data 37 | 38 | def eval_measure(gold, sys, eval_type='acc'): 39 | ''' Evaluation measure 40 | 41 | This takes in gold labels and system outputs and evaluates their 42 | accuracy. It currently supports: 43 | * Accuracy (acc), percentage of labels that match 44 | * Pearson's correlation coefficient (pearson) 45 | * BLEU score (bleu) 46 | * BLEU_detok, on detokenized references and translations, with internal tokenization 47 | 48 | :param gold: the correct labels 49 | :param sys: the system outputs 50 | :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok) 51 | ''' 52 | if eval_type == EVAL_TYPE_ACC: 53 | return sum([1 if g == s else 0 for g, s in zip(gold, sys)]) / float(len(gold)) 54 | elif eval_type == EVAL_TYPE_BLEU: 55 | import nltk 56 | gold_wrap = [[x] for x in gold] 57 | return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys) 58 | elif eval_type == EVAL_TYPE_PEARSON: 59 | return np.corrcoef([gold, sys])[0,1] 60 | elif eval_type == EVAL_TYPE_BLEU_DETOK: 61 | import sacrebleu 62 | # make sure score is 0-based instead of 100-based 63 | return sacrebleu.corpus_bleu(sys, [gold]).score / 100. 64 | else: 65 | raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type) 66 | 67 | def eval_with_paired_bootstrap(gold, sys1, sys2, 68 | num_samples=10000, sample_ratio=0.5, 69 | eval_type='acc'): 70 | ''' Evaluate with paired boostrap 71 | 72 | This compares two systems, performing a significance tests with 73 | paired bootstrap resampling to compare the accuracy of the two systems. 74 | 75 | :param gold: The correct labels 76 | :param sys1: The output of system 1 77 | :param sys2: The output of system 2 78 | :param num_samples: The number of bootstrap samples to take 79 | :param sample_ratio: The ratio of samples to take every time 80 | :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok) 81 | ''' 82 | assert(len(gold) == len(sys1)) 83 | assert(len(gold) == len(sys2)) 84 | 85 | # Preprocess the data appropriately for they type of eval 86 | gold = [eval_preproc(x, eval_type) for x in gold] 87 | sys1 = [eval_preproc(x, eval_type) for x in sys1] 88 | sys2 = [eval_preproc(x, eval_type) for x in sys2] 89 | 90 | sys1_scores = [] 91 | sys2_scores = [] 92 | wins = [0, 0, 0] 93 | n = len(gold) 94 | ids = list(range(n)) 95 | 96 | for _ in range(num_samples): 97 | # Subsample the gold and system outputs 98 | reduced_ids = np.random.choice(ids,int(len(ids)*sample_ratio),replace=True) 99 | reduced_gold = [gold[i] for i in reduced_ids] 100 | reduced_sys1 = [sys1[i] for i in reduced_ids] 101 | reduced_sys2 = [sys2[i] for i in reduced_ids] 102 | # Calculate accuracy on the reduced sample and save stats 103 | sys1_score = eval_measure(reduced_gold, reduced_sys1, eval_type=eval_type) 104 | sys2_score = eval_measure(reduced_gold, reduced_sys2, eval_type=eval_type) 105 | if sys1_score > sys2_score: 106 | wins[0] += 1 107 | elif sys1_score < sys2_score: 108 | wins[1] += 1 109 | else: 110 | wins[2] += 1 111 | sys1_scores.append(sys1_score) 112 | sys2_scores.append(sys2_score) 113 | 114 | # Print win stats 115 | wins = [x/float(num_samples) for x in wins] 116 | print('Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f' % (wins[0], wins[1], wins[2])) 117 | if wins[0] > wins[1]: 118 | print('(sys1 is superior with p value p=%.3f)\n' % (1-wins[0])) 119 | elif wins[1] > wins[0]: 120 | print('(sys2 is superior with p value p=%.3f)\n' % (1-wins[1])) 121 | 122 | # Print system stats 123 | sys1_scores.sort() 124 | sys2_scores.sort() 125 | print('sys1 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]' % 126 | (np.mean(sys1_scores), np.median(sys1_scores), sys1_scores[int(num_samples * 0.025)], sys1_scores[int(num_samples * 0.975)])) 127 | print('sys2 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]' % 128 | (np.mean(sys2_scores), np.median(sys2_scores), sys2_scores[int(num_samples * 0.025)], sys2_scores[int(num_samples * 0.975)])) 129 | 130 | if __name__ == "__main__": 131 | # execute only if run as a script 132 | import argparse 133 | parser = argparse.ArgumentParser() 134 | parser.add_argument('gold', help='File of the correct answers') 135 | parser.add_argument('sys1', help='File of the answers for system 1') 136 | parser.add_argument('sys2', help='File of the answers for system 2') 137 | parser.add_argument('--eval_type', help='The evaluation type (acc/pearson/bleu/bleu_detok)', type=str, default='acc', choices=EVAL_TYPES) 138 | parser.add_argument('--num_samples', help='Number of samples to use', type=int, default=10000) 139 | args = parser.parse_args() 140 | 141 | with open(args.gold, 'r') as f: 142 | gold = f.readlines() 143 | with open(args.sys1, 'r') as f: 144 | sys1 = f.readlines() 145 | with open(args.sys2, 'r') as f: 146 | sys2 = f.readlines() 147 | eval_with_paired_bootstrap(gold, sys1, sys2, eval_type=args.eval_type, num_samples=args.num_samples) 148 | -------------------------------------------------------------------------------- /phrase-extract.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my $NONULL = 0; 13 | GetOptions( 14 | "nonull!" => \$NONULL 15 | ); 16 | if(@ARGV != 0) { 17 | print STDERR "Usage: $0 < ALIGNMENTS > PHRASES\n"; 18 | exit 1; 19 | } 20 | 21 | # Extracts phrases according to Och's algorithm, and outputs their indexes 22 | # in i1-i2-j1-j2 format 23 | 24 | sub quasi_consecutive { 25 | my ($small, $large, $tp, $f2e); 26 | for($small .. $large) { 27 | return 0 if ($f2e->[$_] and not exists $tp->{$_}); 28 | } 29 | return 1; 30 | } 31 | 32 | while() { 33 | chomp; 34 | s/[SP]-//g; 35 | my ($I,$J,@e2f,@f2e); 36 | # save in appropriate format 37 | for(split(/ /)) { 38 | my ($e,$f) = split(/-/); 39 | # print "e-f= $e-$f\n"; 40 | $I = max($I,$e); 41 | $J = max($J,$f); 42 | $e2f[$e] = [] if not exists $e2f[$e]; 43 | push @{$e2f[$e]}, $f; 44 | $f2e[$f] = [] if not exists $f2e[$f]; 45 | push @{$f2e[$f]}, $e; 46 | } 47 | 48 | # phrase-extract 49 | my @out; 50 | foreach my $i1 (0 .. $I) { 51 | next if $NONULL and not $e2f[$i1]; 52 | my %tp; 53 | foreach my $i2 ($i1 .. $I) { 54 | if($e2f[$i2]) { for (@{$e2f[$i2]}) { $tp{$_}++; } } 55 | elsif($NONULL) { next; } 56 | my $j1 = min(keys %tp); 57 | my $j2 = max(keys %tp); 58 | if(quasi_consecutive($j1, $j2, \%tp, \@f2e)) { 59 | my %sp; 60 | foreach my $j ($j1 .. $j2) { 61 | if($f2e[$j]) { for (@{$f2e[$j]}) { $sp{$_}++; } } 62 | } 63 | if(min(keys %sp) >= $i1 and max(keys %sp) <= $i2) { 64 | while($j1 >= 0) { 65 | my $jp = $j2; 66 | while($jp <= $J) { 67 | push @out, "$i1-".($i2+1)."-$j1-".($jp+1); 68 | $jp++; 69 | last if $NONULL or $f2e[$jp]; 70 | } 71 | $j1--; 72 | last if $NONULL or $f2e[$j1]; 73 | } 74 | } 75 | } 76 | } 77 | } 78 | print "@out\n"; 79 | } 80 | -------------------------------------------------------------------------------- /print-enju.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # *** print-enju.pl 4 | # A script to visualize enju trees coming in from STDIN 5 | # by Graham Neubig 6 | # 7 | # This script relies on Enju (of course): 8 | # http://www.nactem.ac.uk/enju/ 9 | # 10 | # And Hideki Isozaki's enjutree package: 11 | # http://softcream.oka-pu.ac.jp/tex-macros/ 12 | # 13 | # If the enjutree package is installed, you can parse a file 14 | # and display the results one by one by running: 15 | # 16 | # enju -xml file.txt | print-enju.pl 17 | 18 | use strict; 19 | use warnings; 20 | use utf8; 21 | use Getopt::Long; 22 | use List::Util qw(sum min max shuffle); 23 | binmode STDIN, ":utf8"; 24 | binmode STDOUT, ":utf8"; 25 | binmode STDERR, ":utf8"; 26 | 27 | my $PDFLATEX = "pdflatex"; 28 | my $ACROREAD = "acroread"; 29 | GetOptions( 30 | "acroread=s" => \$ACROREAD, 31 | ); 32 | 33 | if(@ARGV != 0) { 34 | print STDERR "Usage: $0\n"; 35 | exit 1; 36 | } 37 | 38 | while() { 39 | chomp; 40 | open OUT, ">:utf8", "/tmp/enju.tex" or die "Couldn't open /tmp/enju.tex\n"; 41 | print OUT "\\documentclass{article}\\usepackage{enjutree}\\begin{document}\n\\begin{enjutree}{}\n"; 42 | print OUT "$_\n"; 43 | print OUT "\\end{enjutree}\\end{document}\n"; 44 | safesystem("$PDFLATEX -output-directory /tmp /tmp/enju.tex") or die; 45 | safesystem("$ACROREAD /tmp/enju.pdf") or die; 46 | } 47 | 48 | 49 | # Adapted from Moses's train-model.perl 50 | sub safesystem { 51 | print STDERR "Executing: @_\n"; 52 | system(@_); 53 | if ($? == -1) { 54 | print STDERR "ERROR: Failed to execute: @_\n $!\n"; 55 | exit(1); 56 | } 57 | elsif ($? & 127) { 58 | printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", 59 | ($? & 127), ($? & 128) ? 'with' : 'without'; 60 | exit(1); 61 | } 62 | else { 63 | my $exitcode = $? >> 8; 64 | print STDERR "Exit code: $exitcode\n" if $exitcode; 65 | return ! $exitcode; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /print-trees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | '''A program to display parse trees (in Penn treebank format) 4 | with the NLTK. 5 | ''' 6 | 7 | import sys 8 | 9 | try: 10 | from nltk.tree import Tree 11 | except ImportError: 12 | print('Error: cannot import the NLTK!') 13 | print('You need to install the NLTK. Please visit http://nltk.org/install.html for details.') 14 | print("On Ubuntu, the installation can be done via 'sudo apt-get install python-nltk'") 15 | sys.exit() 16 | 17 | def main(): 18 | for line in sys.stdin: 19 | t = Tree.fromstring(line) 20 | t.draw() 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /reverse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | while(<>) { 3 | print join(' ', reverse(split(/[ \t\n]+/))) . "\n"; 4 | } 5 | -------------------------------------------------------------------------------- /romanize.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use utf8; 5 | use FileHandle; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | if(@ARGV != 0) { 13 | print STDERR "Usage: $0\n"; 14 | exit 1; 15 | } 16 | 17 | our %Map = ( 18 | "あ", "A", 19 | "い", "I", 20 | "う", "U", 21 | "え", "E", 22 | "お", "O", 23 | "か", "KA", 24 | "き", "KI", 25 | "く", "KU", 26 | "け", "KE", 27 | "こ", "KO", 28 | "さ", "SA", 29 | "し", "SHI", 30 | "す", "SU", 31 | "せ", "SE", 32 | "そ", "SO", 33 | "た", "TA", 34 | "ち", "CHI", 35 | "つ", "TSU", 36 | "て", "TE", 37 | "と", "TO", 38 | "な", "NA", 39 | "に", "NI", 40 | "ぬ", "NU", 41 | "ね", "NE", 42 | "の", "NO", 43 | "は", "HA", 44 | "ひ", "HI", 45 | "ふ", "FU", 46 | "へ", "HE", 47 | "ほ", "HO", 48 | "ま", "MA", 49 | "み", "MI", 50 | "む", "MU", 51 | "め", "ME", 52 | "も", "MO", 53 | "や", "YA", 54 | "ゆ", "YU", 55 | "よ", "YO", 56 | "ら", "RA", 57 | "り", "RI", 58 | "る", "RU", 59 | "れ", "RE", 60 | "ろ", "RO", 61 | "わ", "WA", 62 | "ゐ", "I", 63 | "ゑ", "E", 64 | "を", "O", 65 | "ん", "N", 66 | "ぁ", "A", 67 | "ぃ", "I", 68 | "ぅ", "U", 69 | "ぇ", "E", 70 | "ぉ", "O", 71 | "が", "GA", 72 | "ぎ", "GI", 73 | "ぐ", "GU", 74 | "げ", "GE", 75 | "ご", "GO", 76 | "ざ", "ZA", 77 | "じ", "JI", 78 | "ず", "ZU", 79 | "ぜ", "ZE", 80 | "ぞ", "ZO", 81 | "だ", "DA", 82 | "ぢ", "JI", 83 | "づ", "ZU", 84 | "で", "DE", 85 | "ど", "DO", 86 | "ば", "BA", 87 | "び", "BI", 88 | "ぶ", "BU", 89 | "べ", "BE", 90 | "ぼ", "BO", 91 | "ぱ", "PA", 92 | "ぴ", "PI", 93 | "ぷ", "PU", 94 | "ぺ", "PE", 95 | "ぽ", "PO", 96 | "きゃ", "KYA", 97 | "きゅ", "KYU", 98 | "きょ", "KYO", 99 | "しゃ", "SHA", 100 | "しゅ", "SHU", 101 | "しょ", "SHO", 102 | "ちゃ", "CHA", 103 | "ちゅ", "CHU", 104 | "ちょ", "CHO", 105 | "ちぇ", "CHE", 106 | "にゃ", "NYA", 107 | "にゅ", "NYU", 108 | "にょ", "NYO", 109 | "ひゃ", "HYA", 110 | "ひゅ", "HYU", 111 | "ひょ", "HYO", 112 | "みゃ", "MYA", 113 | "みゅ", "MYU", 114 | "みょ", "MYO", 115 | "りゃ", "RYA", 116 | "りゅ", "RYU", 117 | "りょ", "RYO", 118 | "ぎゃ", "GYA", 119 | "ぎゅ", "GYU", 120 | "ぎょ", "GYO", 121 | "じゃ", "JA", 122 | "じゅ", "JU", 123 | "じょ", "JO", 124 | "びゃ", "BYA", 125 | "びゅ", "BYU", 126 | "びょ", "BYO", 127 | "ぴゃ", "PYA", 128 | "ぴゅ", "PYU", 129 | "ぴょ", "PYO", 130 | "ふぇ", "FE", 131 | "ふぁ", "FA", 132 | "ふぉ", "FO", 133 | "ふぃ", "FI", 134 | "じぇ", "JE", 135 | "じぁ", "JA", 136 | "じぉ", "JO", 137 | "っど", "D", 138 | "っと", "T", 139 | ); 140 | 141 | sub new { 142 | my($class, %opt) = @_; 143 | bless { %opt }, $class; 144 | } 145 | 146 | sub _hepburn_for { 147 | my($string, $index) = @_; 148 | 149 | my($hepburn, $char); 150 | if ($index + 1 < length $string) { 151 | $char = substr $string, $index, 2; 152 | $hepburn = $Map{$char}; 153 | } 154 | if (!$hepburn && $index < length $string) { 155 | $char = substr $string, $index, 1; 156 | $hepburn = $Map{$char}; 157 | } 158 | 159 | return { char => $char, hepburn => $hepburn }; 160 | } 161 | 162 | sub romanize { 163 | my($string) = @_; 164 | return $string if not $string =~ /(\p{InHiragana}|\p{InKatakana})/g; 165 | 166 | unless (utf8::is_utf8($string)) { 167 | die "romanize(string): should be UTF-8 flagged string"; 168 | } 169 | 170 | $string =~ tr/ァ-ン/ぁ-ん/; 171 | 172 | my $output; 173 | my $last_hepburn; 174 | my $last_char; 175 | my $i = 0; 176 | 177 | while ($i < length $string) { 178 | my $hr = _hepburn_for($string, $i); 179 | 180 | # 1.撥音 ヘボン式ではB ・M ・P の前に N の代わりに M をおく 181 | if ($hr->{char} eq 'ん') { 182 | my $next = _hepburn_for($string, $i + 1); 183 | $hr->{hepburn} = $next->{hepburn} && $next->{hepburn} =~ /^[BMP]/ 184 | ? 'M' : 'N'; 185 | } 186 | 187 | # 2.促音 子音を重ねて示す 188 | elsif ($hr->{char} eq 'っ') { 189 | my $next = _hepburn_for($string, $i + 1); 190 | 191 | # チ(CH I)、チャ(CHA)、チュ(CHU)、チョ(CHO)音に限り、その前に T を加える。 192 | if ($next->{hepburn}) { 193 | $hr->{hepburn} = $next->{hepburn} =~ /^CH/ 194 | ? 'T' : substr($next->{hepburn}, 0, 1); 195 | } 196 | } 197 | 198 | # 3.長音 ヘボン式では長音を表記しない 199 | elsif ($hr->{char} eq "ー") { 200 | $hr->{hepburn} = ""; 201 | } 202 | 203 | if (defined $hr->{hepburn}) { 204 | if ($last_hepburn) { 205 | my $h_test = $last_hepburn . $hr->{hepburn}; 206 | if (length $h_test > 2) { 207 | $h_test = substr $h_test, -2; 208 | } 209 | 210 | # 3.長音 ヘボン式では長音を表記しない 211 | if (grep $h_test eq $_, qw( AA II UU EE )) { 212 | $hr->{hepburn} = ''; 213 | } 214 | 215 | } 216 | 217 | $output .= $hr->{hepburn}; 218 | } else { 219 | $output .= $hr->{char}; 220 | } 221 | 222 | $last_hepburn = $hr->{hepburn}; 223 | $last_char = $hr->{char}; 224 | $i += length $hr->{char}; 225 | } 226 | 227 | return ucfirst(lc($output)); 228 | } 229 | 230 | while() { 231 | chomp; 232 | my @arr = map { romanize($_) } split(/ /); 233 | print "@arr\n"; 234 | } 235 | 236 | -------------------------------------------------------------------------------- /serverfy.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use utf8; 5 | use Getopt::Long; 6 | use RPC::XML::Server; 7 | use MIME::Base64; 8 | use File::Basename; 9 | use List::Util qw(sum min max shuffle); 10 | use FileHandle; 11 | use IPC::Open2; 12 | binmode STDIN, ":utf8"; 13 | binmode STDOUT, ":utf8"; 14 | binmode STDERR, ":utf8"; 15 | 16 | $RPC::XML::ENCODING = 'utf-8'; 17 | $RPC::XML::FORCE_STRING_ENCODING = 1; 18 | 19 | my $PORT = 9002; 20 | my $TIMEOUT = 20; 21 | GetOptions( 22 | "port=s" => \$PORT, 23 | "timeout=s" => \$TIMEOUT, 24 | ); 25 | 26 | if(@ARGV != 1) { 27 | print STDERR "Usage: $0\n"; 28 | exit 1; 29 | } 30 | 31 | print STDERR "Serverfying:\n$ARGV[0]\n"; 32 | my $pid = open2(*Reader, *Writer, $ARGV[0]) or die "Could not run $ARGV[0]\n"; 33 | 34 | # 関数を追加 35 | sub run_cmd { 36 | my $s = shift; # サーバーオブジェクト 37 | my $t = shift; # txt name 38 | # Remove bad unicode and the optional header 39 | utf8::decode($t); 40 | $t =~ tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd; 41 | utf8::encode($t); 42 | return "" if $t =~ /^ *$/; 43 | # Send this to the processor 44 | print STDERR "IN: $t\n"; 45 | print Writer "$t\n"; 46 | my $got; 47 | eval { 48 | local $SIG{ALRM} = sub { die "ALARUM" }; 49 | alarm($TIMEOUT); 50 | chomp($got = ); 51 | alarm(0); 52 | }; 53 | if ($@ =~ /ALARUM/) { 54 | close Writer or die $!; 55 | close Reader or die $!; 56 | waitpid($pid, 0); 57 | $pid = open2(*Reader, *Writer, $ARGV[0]) or die "Could not run $ARGV[0]\n"; 58 | return "__FAILURE__: Timeout of $TIMEOUT seconds reached, restarting."; 59 | } 60 | my $pr = $got; 61 | utf8::decode($pr); 62 | print STDERR "OUT: $pr\n"; 63 | return $got; 64 | } 65 | 66 | # サーバーの情報を初期化 67 | my $srv = RPC::XML::Server->new(port => $PORT); 68 | 69 | # メソッドを構築 70 | my $ls_method = RPC::XML::Procedure->new(); 71 | 72 | # メソッドを追加 73 | $srv->add_method({ name => 'process', code => \&run_cmd, signature => [ 'string string' ] }); 74 | 75 | # サーバーを立ち上げる 76 | $srv->server_loop; 77 | -------------------------------------------------------------------------------- /shuffle.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my @arr = ; 13 | @arr = shuffle(@arr); 14 | for(@arr) { print $_; } 15 | -------------------------------------------------------------------------------- /split-column.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use FileHandle; 7 | use Getopt::Long; 8 | use List::Util qw(sum min max shuffle); 9 | binmode STDIN, ":utf8"; 10 | binmode STDOUT, ":utf8"; 11 | binmode STDERR, ":utf8"; 12 | 13 | if(@ARGV != 1) { 14 | print STDERR "Usage: $0\n"; 15 | exit 1; 16 | } 17 | 18 | while() { 19 | chomp; 20 | my @arr = split(/\t/); 21 | print "$arr[$ARGV[0]]\n"; 22 | } 23 | -------------------------------------------------------------------------------- /splittraintest.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use FileHandle; 7 | use Getopt::Long; 8 | use List::Util qw(sum min max shuffle); 9 | binmode STDIN, ":utf8"; 10 | binmode STDOUT, ":utf8"; 11 | binmode STDERR, ":utf8"; 12 | 13 | if(@ARGV < 3) { 14 | print STDERR "Usage: $0 FRACTION TRAIN TEST [DEV]\n"; 15 | exit 1; 16 | } 17 | 18 | my $fraction = $ARGV[0]; 19 | open TRAIN, ">:utf8", $ARGV[1] or die $!; 20 | open TEST, ">:utf8", $ARGV[2] or die $!; 21 | if(@ARGV == 4) { 22 | open DEV, ">:utf8", $ARGV[3] or die $!; 23 | } 24 | 25 | my $i = 0; 26 | while() { 27 | $i++; 28 | if(($i % $fraction) == 0) { 29 | print TEST $_; 30 | } elsif ((($i % $fraction) == 1) and (@ARGV == 4)) { 31 | print DEV $_; 32 | } else { 33 | print TRAIN $_; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /syntactic-complexity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | A program to calculate syntactic complexity of parse trees. (Relies on NLTK) 5 | This is an implementation of some of the methods in: 6 | 7 | Syntactic complexity measures for detecting Mild Cognitive Impairment 8 | Brian Roark, Margaret Mitchell and Kristy Hollingshead 9 | Proc BioNLP 2007. 10 | ''' 11 | 12 | import sys 13 | 14 | try: 15 | from nltk.tree import Tree 16 | from nltk.draw.tree import TreeWidget 17 | from nltk.draw.util import (CanvasFrame, CanvasWidget, BoxWidget, 18 | TextWidget, ParenWidget, OvalWidget) 19 | except ImportError: 20 | print 'Error: cannot import the NLTK!' 21 | print 'You need to install the NLTK. Please visit http://nltk.org/install.html for details.' 22 | print "On Ubuntu, the installation can be done via 'sudo apt-get install python-nltk'" 23 | sys.exit() 24 | 25 | def calc_words(t): 26 | if type(t) == str: 27 | return 1 28 | else: 29 | val = 0 30 | for child in t: 31 | val += calc_words(child) 32 | return val 33 | 34 | def calc_nodes(t): 35 | if type(t) == str: 36 | return 0 37 | else: 38 | val = 0 39 | for child in t: 40 | val += calc_nodes(child)+1 41 | return val 42 | 43 | def calc_yngve(t, par): 44 | if type(t) == str: 45 | return par 46 | else: 47 | val = 0 48 | for i, child in enumerate(reversed(t)): 49 | val += calc_yngve(child, par+i) 50 | return val 51 | 52 | def is_sent(val): 53 | return len(val) > 0 and val[0] == "S" 54 | 55 | def calc_frazier(t, par, par_lab): 56 | # print t 57 | # print par 58 | if type(t) == str: 59 | # print par-1 60 | return par-1 61 | else: 62 | val = 0 63 | for i, child in enumerate(t): 64 | # For all but the leftmost child, zero 65 | score = 0 66 | if i == 0: 67 | my_lab = t.node 68 | # If it's a sentence, and not duplicated, add 1.5 69 | if is_sent(my_lab): 70 | score = (0 if is_sent(par_lab) else par+1.5) 71 | # Otherwise, unless it's a root node, add one 72 | elif my_lab != "" and my_lab != "ROOT" and my_lab != "TOP": 73 | score = par + 1 74 | val += calc_frazier(child, score, my_lab) 75 | return val 76 | 77 | def main(): 78 | sents = 0 79 | words_tot = 0 80 | yngve_tot = 0 81 | frazier_tot = 0 82 | nodes_tot = 0 83 | for line in sys.stdin: 84 | if line.strip() == "": 85 | continue 86 | t = Tree.parse(line) 87 | words = calc_words(t) 88 | words_tot += words 89 | sents += 1 90 | yngve = calc_yngve(t, 0) 91 | yngve_avg = float(yngve)/words 92 | yngve_tot += yngve_avg 93 | nodes = calc_nodes(t) 94 | nodes_avg = float(nodes)/words 95 | nodes_tot += nodes_avg 96 | frazier = calc_frazier(t, 0, "") 97 | frazier_avg = float(frazier)/words 98 | frazier_tot += frazier_avg 99 | # print "Sentence=%d\twords=%d\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words, yngve_avg, frazier_avg, nodes_avg) 100 | yngve_avg = float(yngve_tot)/sents 101 | frazier_avg = float(frazier_tot)/sents 102 | nodes_avg = float(nodes_tot)/sents 103 | words_avg = float(words_tot)/sents 104 | print "Total\tsents=%d\twords=%f\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words_avg, yngve_avg, frazier_avg, nodes_avg) 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /syseval-combine.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # 4 | # What's this? 5 | # ------------ 6 | # 7 | # This script allows you to combine multiple MT system outputs into a format 8 | # suitable for manual evaluation. It should be used in combination with 9 | # syseval-report.pl. 10 | # 11 | # Usage and Data Format 12 | # --------------------- 13 | # 14 | # syseval-combine.pl \ 15 | # -src test.src \ 16 | # -ref test.trg \ 17 | # -ids output.ids \ 18 | # -min 1 \ 19 | # -max 30 \ 20 | # system-1.trg system-2.trg system-3.trg ... \ 21 | # > output.csv 22 | # 23 | # Where test.src is the input, test.trg is the reference, output.ids is a 24 | # list of ids used in syseval-report.pl. -min and -max are the minimum and 25 | # maximum length of sentences to use in evaluation, and system-1.trg, 26 | # system-2.trg, etc. are system output files. output.txt will be output in 27 | # tab-separated format for reading into spreadsheet software such as Excel or 28 | # OpenOffice Calc. 29 | # 30 | # Note that the output will be a tab-separated file with the reference and 31 | # source first, followed by a system output and a blank space for entering 32 | # its rating. The order of the system outputs will be randomized to prevent 33 | # any effect of ordering on the decisions, and also identical hypotheses will 34 | # be combined so the rater does not need to grade the same sentence twice. 35 | # This order is saved in the output.ids file, and can be restored after the 36 | # manual evaluation is finished by syseval-report.pl. 37 | 38 | use strict; 39 | use warnings; 40 | use utf8; 41 | use Getopt::Long; 42 | use FileHandle; 43 | use List::Util qw(sum min max shuffle); 44 | binmode STDIN, ":utf8"; 45 | binmode STDOUT, ":utf8"; 46 | binmode STDERR, ":utf8"; 47 | 48 | my $SRC = ""; 49 | my $REF = ""; 50 | my $IDS = ""; 51 | my $EM = ""; # If an exact match, output this 52 | my $MAX = 999; 53 | my $MIN = 1; 54 | my $NUM = 0; 55 | my $DUP = 0; # Allow duplicates 56 | GetOptions( 57 | "ref=s" => \$REF, 58 | "src=s" => \$SRC, 59 | "ids=s" => \$IDS, 60 | "em=s" => \$EM, 61 | "max=i" => \$MAX, 62 | "min=i" => \$MIN, 63 | "num=i" => \$NUM, 64 | "dup" => \$DUP, 65 | ); 66 | 67 | if(@ARGV == 0) { 68 | print STDERR "Usage: $0 EVAL_FILES...\n"; 69 | exit 1; 70 | } 71 | 72 | my @handles; 73 | my ($refh, $srch, $idsh); 74 | for(@ARGV) { 75 | my $fh = IO::File->new("< $_") or die "Couldn't open $_"; 76 | binmode $fh, ":utf8"; 77 | push @handles, $fh; 78 | } 79 | die "Must define a reference -ref or a source -src" if not $REF and not $SRC; 80 | my @title; 81 | if($REF) { 82 | push @title, "Reference"; 83 | $refh = IO::File->new("< $REF") or die "Couldn't open $REF"; 84 | binmode $refh, ":utf8"; 85 | } 86 | if($SRC) { 87 | push @title, "Source"; 88 | $srch = IO::File->new("< $SRC") or die "Couldn't open $SRC"; 89 | binmode $srch, ":utf8"; 90 | } 91 | if($IDS) { 92 | $idsh = IO::File->new("> $IDS") or die "Couldn't open $IDS"; 93 | binmode $idsh, ":utf8"; 94 | } 95 | for(0 .. $#ARGV) { 96 | push @title, "Output", "Rating"; 97 | } 98 | print join("\t", @title)."\n"; 99 | 100 | my @lines; 101 | 102 | my %dups; 103 | while(1) { 104 | my ($ref, $src, %vals, @valarr); 105 | my $len = 0; 106 | foreach my $i (0 .. $#handles) { 107 | my $hand = $handles[$i]; 108 | my $val = <$hand>; 109 | if(not $val) { 110 | @lines = shuffle(@lines); 111 | @lines = @lines[0 .. $NUM-1] if $NUM; 112 | print join("\n", map {$_->[0]} @lines)."\n"; 113 | print $idsh join("\n", map {$_->[1]} @lines)."\n" if $idsh; 114 | exit; 115 | } 116 | chomp $val; 117 | $val =~ s/ *$//g; $val =~ s/^ *//g; 118 | $vals{$val}++; 119 | push @valarr, $val; 120 | } 121 | if($refh) { $ref = <$refh>; chomp $ref; $ref =~ s/ +$//g; $len = scalar(split(/ /, $ref)); } 122 | if($srch) { $src = <$srch>; chomp $src; $src =~ s/ +$//g; $len = scalar(split(/ /, $src)); } 123 | if(($len <= $MAX) and ($len >= $MIN)) { 124 | my @cols; 125 | push @cols, $ref if $refh; 126 | push @cols, $src if $srch; 127 | my @shufvals = shuffle(keys %vals); 128 | foreach my $i (0 .. $#shufvals) { 129 | $vals{$shufvals[$i]} = $i; 130 | push @cols, $shufvals[$i], ((defined($ref) and ($shufvals[$i] eq $ref)) ? $EM : ""); 131 | } 132 | my @valarr = map { $vals{$_} } @valarr; 133 | push @lines, [join("\t", @cols), join("\t", @valarr)] if $src and not ($dups{$src}++ and not $DUP); 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /syseval-combine.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import random 5 | 6 | """ 7 | What's this? 8 | ------------ 9 | 10 | This script allows you to combine multiple MT system outputs into a format 11 | suitable for manual evaluation. It should be used in combination with 12 | syseval-report.py. 13 | 14 | Usage and Data Format 15 | --------------------- 16 | 17 | syseval-combine.pl \ 18 | -src test.src \ 19 | -ref test.trg \ 20 | -ids output.ids \ 21 | -min 1 \ 22 | -max 30 \ 23 | system-1.trg system-2.trg system-3.trg ... \ 24 | > output.csv 25 | 26 | Where test.src is the input, test.trg is the reference, output.ids is a 27 | list of ids used in syseval-report.pl. -min and -max are the minimum and 28 | maximum length of sentences to use in evaluation, and system-1.trg, 29 | system-2.trg, etc. are system output files. output.txt will be output in 30 | tab-separated format for reading into spreadsheet software such as Excel or 31 | OpenOffice Calc. 32 | 33 | Note that the output will be a tab-separated file with the reference and 34 | source first, followed by a system output and a blank space for entering 35 | its rating. The order of the system outputs will be randomized to prevent 36 | any effect of ordering on the decisions, and also identical hypotheses will 37 | be combined so the rater does not need to grade the same sentence twice. 38 | This order is saved in the output.ids file, and can be restored after the 39 | manual evaluation is finished by syseval-report.py. 40 | """ 41 | 42 | def load_lines(fname: str) -> list[str]: 43 | with open(fname, 'r') as fin: 44 | return [x.strip() for x in fin] 45 | 46 | def print_line(src: str, ref: str | None, hyp: str, rating: str, file): 47 | if ref: 48 | print(f'{src}\t{ref}\t{hyp}\t{rating}', file=file) 49 | else: 50 | print(f'{src}\t{hyp}\t{rating}', file=file) 51 | 52 | def main(): 53 | 54 | # Make parser object 55 | p = argparse.ArgumentParser(description=__doc__, 56 | formatter_class=argparse.RawDescriptionHelpFormatter) 57 | 58 | p.add_argument("--src", help="the source file") 59 | p.add_argument("--ref", required=False, help="the reference file") 60 | p.add_argument("--hyps", nargs="+", help="the hypotheses") 61 | p.add_argument("--min", default=0, help="the minimum sentence length") 62 | p.add_argument("--max", default=10000, help="the maximum sentence length") 63 | p.add_argument("--lines", default=None, help="the number of lines to print out") 64 | p.add_argument("--ids_out", help="the file to write IDs to") 65 | p.add_argument("--tsv_out", help="the file to write the tsv to") 66 | args = p.parse_args() 67 | 68 | src_lines = load_lines(args.src) 69 | ref_lines = load_lines(args.ref) if args.ref else None 70 | hyps_lines = [load_lines(x) for x in args.hyps] 71 | 72 | if not all([len(x) == len(src_lines) for x in hyps_lines]): 73 | raise ValueError( 74 | f'src and hyp lines don\'t match ({len(src_lines)} != {[len(x) for x in hyps_lines]})') 75 | if ref_lines and len(ref_lines) != len(src_lines): 76 | raise ValueError(f'src and ref lines don\'t match ({len(src_lines)} != {len(ref_lines)})') 77 | 78 | valid_ids = [] 79 | for i, src_line in enumerate(src_lines): 80 | src_len = len(' '.split(src_line)) 81 | if src_len >= args.min and src_len <= args.max: 82 | valid_ids.append(i) 83 | random.shuffle(valid_ids) 84 | if args.lines and args.lines < valid_ids: 85 | valid_ids = valid_ids[:args.lines] 86 | 87 | with open(args.tsv_out, 'w') as tsv_out, open(args.ids_out, 'w') as ids_out: 88 | print_line('Source', 'Reference' if args.ref else None, 'Translation', 'Rating', tsv_out) 89 | for i in valid_ids: 90 | hyps_line = [x[i] for x in hyps_lines] 91 | dedup_line = list(set(hyps_line)) 92 | random.shuffle(dedup_line) 93 | dedup_map = {v: str(i) for i, v in enumerate(dedup_line)} 94 | hyps_ids = [dedup_map[x] for x in hyps_line] 95 | ref_line = ref_lines[i] if ref_lines else None 96 | print('\t'.join(hyps_ids), file=ids_out) 97 | print_line(src_lines[i], ref_line, dedup_line[0], 'XXX', tsv_out) 98 | for x in dedup_line[1:]: 99 | print_line('', ' ' if ref_lines else None, x, 'XXX', tsv_out) 100 | print('', file=tsv_out) 101 | 102 | if __name__ == '__main__': 103 | main() -------------------------------------------------------------------------------- /syseval-report.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # 4 | # What's this? 5 | # ------------ 6 | # 7 | # Given a CSV-format manual evaluation file created by syseval-combine.pl, 8 | # this script restores the orders of the systems and calculates an average 9 | # score for each system. 10 | # 11 | # Usage and Data Format 12 | # --------------------- 13 | # 14 | # syseval-report.pl finished-eval.csv output.ids 15 | # 16 | # Where finished-eval is a completely filled in manual evaluation CSV file 17 | # and output.ids is the ID file output by syseval-combine.pl 18 | 19 | use strict; 20 | use warnings; 21 | use utf8; 22 | use Getopt::Long; 23 | use List::Util qw(sum min max shuffle); 24 | binmode STDIN, ":utf8"; 25 | binmode STDOUT, ":utf8"; 26 | binmode STDERR, ":utf8"; 27 | 28 | my $LINES = -1; 29 | my $COMPS = "0-1"; 30 | GetOptions( 31 | "lines=i" => \$LINES, 32 | "comps=s" => \$COMPS, 33 | ); 34 | 35 | if(@ARGV != 2) { 36 | print STDERR "Usage: $0 TSV IDS\n"; 37 | exit 1; 38 | } 39 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n"; 40 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n"; 41 | 42 | my %lettermap = ( 43 | "S" => 5, 44 | "AA" => 5, 45 | "A" => 4, 46 | "B" => 3, 47 | "C" => 2, 48 | "D" => 1, 49 | "F" => 1 50 | ); 51 | 52 | my ($stsv, $sids, $lines, @scores, @tsvs, @vals, @refs, $header); 53 | while(defined($stsv = ) and defined($sids = )) { 54 | if((not $header) and ($stsv =~ /^(Source|Reference)\t/)) { 55 | $header = 1; $stsv = ; 56 | } 57 | next if($stsv =~ /\tX\t/ or $stsv =~ /\tX$/); 58 | ++$lines; 59 | last if ($LINES != -1) and ($lines > $LINES); 60 | chomp $stsv; chomp $sids; 61 | push @tsvs, $stsv; 62 | $stsv =~ s/[\t ]+$//g; 63 | my @atsv = split(/\t/, $stsv); 64 | my @aids = split(/\t/, $sids); 65 | $refs[$lines-1] = shift(@atsv); 66 | $refs[$lines-1] .= "\t".shift(@atsv) if(@atsv % 2 == 1); 67 | if((max(@aids)+1)*2 != @atsv) { die "MISMATCHED LINES: ". (max(@aids)+1)*2 ." != ".scalar(@atsv)."\n$stsv\n$sids\n"; } 68 | foreach my $i (0 .. $#aids) { 69 | $scores[$i] = [] if not $scores[$i]; 70 | my $pos = $aids[$i]*2; 71 | $atsv[$pos+1] = $lettermap{$atsv[$pos+1]} if(defined $lettermap{$atsv[$pos+1]}); 72 | $atsv[$pos+1] =~ /^[0-9\.]+$/ or die "Unfinished line $stsv\n"; 73 | push @{$scores[$i]}, $atsv[$pos+1]; 74 | $vals[$i] = [] if not $vals[$i]; 75 | push @{$vals[$i]}, $atsv[$pos]; 76 | } 77 | } 78 | 79 | # for(split(/,/, $COMPS)) { 80 | # my ($base, $sys) = split(/-/); 81 | # } 82 | 83 | foreach my $i (1 .. scalar(@{$scores[0]})) { 84 | my @line = ($i, (map { $scores[$_]->[$i-1] } (0 .. $#scores)), (map { $vals[$_]->[$i-1] } (0 .. $#vals)), $refs[$i-1]); 85 | print join("\t", @line)."\n"; 86 | } 87 | my @avgscores = map { sum(@$_) / $lines } @scores; 88 | print join("\t", "Total", map { sprintf("%.2f", $_) } @avgscores)."\n"; 89 | foreach my $lev (2 .. 5) { 90 | my @levs = map { sprintf("%.2f", sum(map { ($_ >= $lev) ? 1 : 0 } @$_) * 100 / $lines) } @scores; 91 | print join("\t", "$lev+", @levs)."\n"; 92 | } 93 | foreach my $i (0 .. $#avgscores-1) { 94 | foreach my $j ($i+1 .. $#avgscores) { 95 | my ($w, $t, $l) = bootstrap($scores[$i], $scores[$j]); 96 | print "$i-vs.-$j\t$w\t$t\t$l\n"; 97 | } 98 | } 99 | 100 | # Perform bootstrap resampling to estimate probabilities 101 | sub bootstrap { 102 | my $ITER = 10000; 103 | my @sysa = @{shift(@_)}; 104 | my @sysb = @{shift(@_)}; 105 | @sysa == @sysb or die "Uneven numbers of scores"; 106 | my @ids = (0 .. $#sysa); 107 | my @wins = (0, 0, 0); 108 | for(1 .. $ITER) { 109 | @ids = shuffle(@ids); 110 | my ($sa, $sb); 111 | for(@ids[0 .. int(@ids/2)]) { 112 | $sa += $sysa[$_]; 113 | $sb += $sysb[$_]; 114 | # print "$sysa[$_] $sysb[$_]\n"; 115 | } 116 | if($sa > $sb) { $wins[0]++; } 117 | elsif($sb > $sa) { $wins[2]++; } 118 | else { $wins[1]++; } 119 | } 120 | return map { $_ / $ITER } @wins; 121 | } 122 | -------------------------------------------------------------------------------- /syseval-report.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | """ 6 | What's this? 7 | ------------ 8 | 9 | Given a TSV-format manual evaluation file created by syseval-combine.py, 10 | this script restores the orders of the systems and calculates an average 11 | score for each system. 12 | 13 | Usage and Data Format 14 | --------------------- 15 | 16 | syseval-report.py finished-eval.tsv output.ids 17 | 18 | Where finished-eval is a completely filled in manual evaluation TSV file 19 | and output.ids is the ID file output by syseval-combine.pl 20 | """ 21 | 22 | with open(sys.argv[1], 'r') as tsv_in, open(sys.argv[2], 'r') as ids_in: 23 | header = next(tsv_in).split('\t') 24 | if header[0] != 'Source': 25 | raise ValueError(f'Illegal header {header}') 26 | has_reference = 1 if header[1] == 'Reference' else 0 27 | parsed_tsv = [] 28 | curr_data = {} 29 | for line in tsv_in: 30 | cols = [x.strip() for x in line.split('\t')] 31 | if len(cols) > 1: 32 | if 'src' not in curr_data: 33 | curr_data['src'] = cols[0] 34 | if has_reference: 35 | curr_data['ref'] = cols[1] 36 | curr_data['hyps'] = [] 37 | curr_data['scores'] = [] 38 | curr_data['hyps'].append(cols[1+has_reference]) 39 | curr_data['scores'].append(cols[2+has_reference]) 40 | else: 41 | parsed_tsv.append(curr_data) 42 | curr_data = {} 43 | all_scores = None 44 | for ids_str, data in zip(ids_in, parsed_tsv): 45 | ids = [int(x) for x in ids_str.strip().split('\t')] 46 | if not all_scores: 47 | all_scores = [list() for _ in ids] 48 | my_hyps = [data['hyps'][i] for i in ids] 49 | my_scores = [data['scores'][i] for i in ids] 50 | out_cols = [data['src']] 51 | if 'ref' in data: 52 | out_cols.append(data['ref']) 53 | for i, (hyp, score) in enumerate(zip(my_hyps, my_scores)): 54 | out_cols += [hyp, score] 55 | try: 56 | all_scores[i].append(float(score)) 57 | except: 58 | print(f'WARNING: illegal score {score}', file=sys.stderr) 59 | print('\t'.join(out_cols)) 60 | avg_scores = [str(np.average(x)) for x in all_scores] 61 | print('\n--- Average Scores ---\n'+'\t'.join(avg_scores)) 62 | 63 | -------------------------------------------------------------------------------- /t/levenshtein.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use Test::Simple tests => 1; 5 | 6 | use Levenshtein; 7 | 8 | # TODO: Write tests about levenshtein(). 9 | # this is a dummy test. 10 | my $a = "test"; 11 | my $b = "test"; 12 | ok($a eq $b); 13 | -------------------------------------------------------------------------------- /txt2penntokens.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use FileHandle; 7 | use Getopt::Long; 8 | use List::Util qw(sum min max shuffle); 9 | binmode STDIN, ":utf8"; 10 | binmode STDOUT, ":utf8"; 11 | binmode STDERR, ":utf8"; 12 | 13 | if(@ARGV != 0) { 14 | print STDERR "Usage: $0\n"; 15 | exit 1; 16 | } 17 | 18 | while() { 19 | $_ =~ s/@/-AT-/g; 20 | $_ =~ s/" ([^"]+) "/`` $1 ''/g; 21 | $_ =~ s/\(/-LRB-/g; 22 | $_ =~ s/\)/-RRB-/g; 23 | $_ =~ s/\[/-LSB-/g; 24 | $_ =~ s/\]/-RSB-/g; 25 | $_ =~ s/\{/-LCB-/g; 26 | $_ =~ s/\}/-RCB-/g; 27 | $_ =~ s/\//\\\//g; 28 | print "$_"; 29 | } 30 | -------------------------------------------------------------------------------- /wc.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $| = 1; 4 | 5 | use strict; 6 | use warnings; 7 | use utf8; 8 | use FileHandle; 9 | use Getopt::Long; 10 | use List::Util qw(sum min max shuffle); 11 | binmode STDIN, ":utf8"; 12 | binmode STDOUT, ":utf8"; 13 | binmode STDERR, ":utf8"; 14 | 15 | if(@ARGV == 0) { 16 | my ($l, $w, $c); 17 | while() { 18 | chomp; 19 | $l++; 20 | $c += length($_); 21 | $w += split(/ /); 22 | } 23 | print "$l $w $c\n"; 24 | } else { 25 | foreach my $f (@ARGV) { 26 | open FILE, "<:utf8", $f or die "Couldn't open $f"; 27 | my ($l, $w, $c); 28 | while() { 29 | chomp; 30 | $l++; 31 | $c += length($_); 32 | $w += split(/ /); 33 | } 34 | print "$l $w $c\t$f\n"; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /zen2han.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use List::Util qw(sum min max shuffle); 7 | use Getopt::Long; 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | my $NOHYPHEN = 0; 13 | my $ZENSPACE = 0; 14 | my $UNDERSPACE = 0; 15 | my $TRIM = 0; 16 | GetOptions( 17 | "nohyphen" => \$NOHYPHEN, 18 | "zenspace" => \$ZENSPACE, 19 | "underspace" => \$UNDERSPACE, 20 | "trim" => \$TRIM, 21 | ); 22 | 23 | if(@ARGV != 0) { 24 | print STDERR "Usage: han2zen.pl < INPUT > OUTPUT\n"; 25 | exit 1; 26 | } 27 | 28 | while() { 29 | chomp; 30 | if($TRIM) { s/^ +//g; s/ +$//g; } 31 | tr/a-zA-Z0-9()[]{}<>.,_%「」、”?・+:。!&*/a-zA-Z0-9()[]{}<>.,_%「」、"?・+:。!&*/; 32 | s///\//g; 33 | if(not $NOHYPHEN) { s/-/-/g; } 34 | if($ZENSPACE) { s/ / /g; } 35 | elsif($UNDERSPACE) { s/ /__/g; } 36 | print "$_\n"; 37 | } 38 | -------------------------------------------------------------------------------- /zenspace2under.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use utf8; 5 | use FileHandle; 6 | use Getopt::Long; 7 | use List::Util qw(sum min max shuffle); 8 | binmode STDIN, ":utf8"; 9 | binmode STDOUT, ":utf8"; 10 | binmode STDERR, ":utf8"; 11 | 12 | if(@ARGV != 0) { 13 | print STDERR "Usage: $0\n"; 14 | exit 1; 15 | } 16 | 17 | while() { 18 | s/ /__/g; 19 | print $_; 20 | } 21 | --------------------------------------------------------------------------------