├── .gitignore
├── Levenshtein.pm
├── README.md
├── adjust-alignments.pl
├── combine-predicate.pl
├── compare-mt.py
├── count.pl
├── counterrors.pl
├── error-diff.pl
├── fakesgm.pl
├── grade.pl
├── gradews.pl
├── han2zen.pl
├── iconv.pl
├── interleave.pl
├── moses-trace-to-penn-tree.pl
├── ngram-hits.pl
├── paired-bootstrap.py
├── phrase-extract.pl
├── print-enju.pl
├── print-trees.py
├── reverse.pl
├── romanize.pl
├── serverfy.pl
├── shuffle.pl
├── split-column.pl
├── splittraintest.pl
├── syntactic-complexity.py
├── syseval-combine.pl
├── syseval-combine.py
├── syseval-report.pl
├── syseval-report.py
├── t
    └── levenshtein.t
├── txt2penntokens.pl
├── wc.pl
├── zen2han.pl
└── zenspace2under.pl


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | .idea
3 | 


--------------------------------------------------------------------------------
/Levenshtein.pm:
--------------------------------------------------------------------------------
 1 | package Levenshtein;
 2 | use strict;
 3 | use Carp;
 4 | 
 5 | # return a minimum edit distance path with the following notation, plus cost
 6 | #  d=delete i=insert s=substitute e=equal
 7 | sub distance {
 8 |     my ($s, $t) = @_;
 9 |     my (@sc, $m, @tc, $n, %d, %str, $i, $j, $id, $cost, $type, $aid, $bid, $cid, $c);
10 |     @sc = split(/ +/, $s);
11 |     $m = @sc;
12 |     @tc = split(/ +/, $t);
13 |     $n = @tc;
14 |     # initialize
15 |     %d = (); %str = {};
16 |     foreach $i (0 .. $m) { $id = pack('S2', $i, 0); $d{$id} = $i; $str{$id} = 'd'x$i; }
17 |     foreach $j (1 .. $n) { $id = pack('S2', 0, $j); $d{$id} = $j; $str{$id} = 'i'x$j; }
18 | 
19 |     foreach $i (1 .. $m) {
20 |         foreach $j (1 .. $n) {
21 |             if($sc[$i-1] eq $tc[$j-1]) {
22 |                 $cost = 0; $type = 'e'; # equal
23 |             } else {
24 |                 $cost = 1.1; $type = 's'; # substitution
25 |             }
26 | 
27 |             $aid = pack('S2', $i-1, $j); $a = $d{$aid} + 1; # deletion
28 |             $bid = pack('S2', $i, $j-1); $b = $d{$bid} + 1; # insertion
29 |             $cid = pack('S2', $i-1, $j-1); $c = $d{$cid} + $cost; # insertion
30 | 
31 |             $id = pack('S2', $i, $j);
32 | 
33 |             # we want matches to come at the end, so do deletions/insertions first
34 |             if($a <= $b and $a <= $c) {
35 |                 $d{$id} = $a;
36 |                 $type = 'd';
37 |                 $str{$id} = $str{$aid}.'d';
38 |             }
39 |             elsif($b <= $c) {
40 |                 $d{$id} = $b;
41 |                 $type = 'i';
42 |                 $str{$id} = $str{$bid}.'i';
43 |             }
44 |             else {
45 |                 $d{$id} = $c;
46 |                 $str{$id} = $str{$cid}.$type;
47 |             }
48 | 
49 |             delete $d{$cid};
50 |             delete $str{$cid};
51 |             # print "".$sc[$i-1]." ".$tc[$j-1]." $i $j $a $b $c $d[$id] $type\n"
52 |         }
53 |     }
54 | 
55 |     $id = pack('S2', $m, $n);
56 |     return ($str{$id}, $d{$id});
57 | }
58 | 
59 | # Divide an aligned string into corresponding parts
60 | sub divide {
61 |     my ($ref, $test, $hist) = @_;
62 |     my @ra = split(/ +/, $ref);
63 |     my @ta = split(/ +/, $test);
64 |     my @ret;
65 |     my (@er, @et, @dr, @dt);
66 |     foreach my $h (split(//, $hist)) {
67 |         if($h eq 'e') {
68 |             if(@dr or @dt) {
69 |                 push @ret, "@dr\t@dt"; @dr = (); @dt = ();
70 |             }
71 |             push @er, shift(@ra);
72 |             push @et, shift(@ta);
73 |         } else {
74 |             if(@er or @et) {
75 |                 die "@er != @et" if("@er" ne "@et");
76 |                 push @ret, "@er\t@et"; @er = (); @et = ();
77 |             }
78 |             push @dr, shift(@ra) if $h ne 'i';
79 |             push @dt, shift(@ta) if $h ne 'd';
80 |         }
81 |     }
82 |     if(@dr or @dt) { push @ret, "@dr\t@dt"; }
83 |     elsif(@er or @et) { push @ret, "@er\t@et"; }
84 |     die "non-empty ra=@ra or ta=@ta\n" if(@ra or @ta);
85 |     return @ret;
86 | }
87 | 
88 | 1;
89 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | util-scripts
2 | ============
3 | 
4 | ### Tests ###
5 | 
6 | You can run tests to verify you have not broken anything by typing:
7 | 
8 |     $ prove
9 | 


--------------------------------------------------------------------------------
/adjust-alignments.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | use utf8;
  7 | use FindBin;
  8 | use lib $FindBin::Bin;
  9 | use Getopt::Long;
 10 | use List::Util qw(sum min max shuffle);
 11 | use Cwd qw(cwd);
 12 | use Levenshtein;
 13 | binmode STDIN, ":utf8";
 14 | binmode STDOUT, ":utf8";
 15 | binmode STDERR, ":utf8";
 16 | 
 17 | my $SRC_TYPE = "lev";
 18 | my $TRG_TYPE = "lev";
 19 | GetOptions(
 20 | "src-type=s" => \$SRC_TYPE,
 21 | "trg-type=s" => \$SRC_TYPE,
 22 | );
 23 | 
 24 | if(@ARGV != 5) {
 25 |     print STDERR "Usage: $0 SRC_OLD TRG_OLD ALIGN SRC_NEW TRG_NEW\n";
 26 |     exit 1;
 27 | }
 28 | 
 29 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n";
 30 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n";
 31 | open FILE2, "<:utf8", $ARGV[2] or die "Couldn't open $ARGV[2]\n";
 32 | open FILE3, "<:utf8", $ARGV[3] or die "Couldn't open $ARGV[3]\n";
 33 | open FILE4, "<:utf8", $ARGV[4] or die "Couldn't open $ARGV[4]\n";
 34 | 
 35 | sub map_char {
 36 |     my ($in, $out) = @_;
 37 |     my @ia = split(/ /, $in);
 38 |     my @oa = split(/ /, $out);
 39 |     my (@iw, @ow);
 40 |     foreach my $i (0 .. $#ia) { push @iw, $i for(0 .. length($ia[$i])-1); }
 41 |     foreach my $i (0 .. $#oa) { push @ow, $i for(0 .. length($oa[$i])-1); }
 42 |     # print "@iw\n@ow\n";
 43 |     die "$in\n$out\n" if(@iw != @ow);
 44 |     # Create the map
 45 |     my %ret;
 46 |     foreach my $i (0 .. $#iw) {
 47 |         $ret{$iw[$i]} = {} if not $ret{$iw[$i]};
 48 |         $ret{$iw[$i]}->{$ow[$i]}++;
 49 |     }
 50 |     return %ret;
 51 | }
 52 | 
 53 | sub normalize_en {
 54 |     $_ = shift;
 55 |     s/[\(（]/-lrb-/g;
 56 |     s/[\)）]/-rrb-/g;
 57 |     s/(``|'')/"/g;
 58 |     s/\&apos;/'/g;
 59 |     s/\&quot;/"/g;
 60 |     s/\&lt;/</g;
 61 |     s/\&gt;/>/g;
 62 |     s/([a-z])n 't/$1 n't/g;
 63 |     s/–/--/g;
 64 |     s/(`)/'/g;
 65 |     s/([^\\])\//$1\\\//g;
 66 |     s/[、，]/,/g;
 67 |     s/[。．]/./g;
 68 |     return lc($_);
 69 | }
 70 | 
 71 | sub map_lev {
 72 |     my ($in, $out) = @_;
 73 |     $in = normalize_en($in);
 74 |     $out = normalize_en($out);
 75 |     my @ia = split(/ /, $in);
 76 |     my @oa = split(/ /, $out);
 77 |     my ($hist, $score) = Levenshtein::distance($in, $out);
 78 |     my @hists = split(//, $hist);
 79 |     my (@ierr, @oerr);
 80 |     my ($ipos, $opos) = (0, 0);
 81 |     my %ret;
 82 |     while(@hists) {
 83 |         my $h = shift(@hists);
 84 |         if($h eq 'e') {
 85 |             foreach my $i (@ierr) {
 86 |                 $ret{$i} = {};
 87 |                 foreach my $o (@oerr) { $ret{$i}->{$o}++; }
 88 |             }
 89 |             @ierr = ();
 90 |             @oerr = ();
 91 |             $ret{$ipos++} = { $opos++ => 1 };
 92 |         } else {
 93 |             push @ierr, $ipos++ if $h ne 'i';
 94 |             push @oerr, $opos++ if $h ne 'd';
 95 |         }
 96 |     }
 97 |     foreach my $i (@ierr) {
 98 |         $ret{$i} = {};
 99 |         foreach my $o (@oerr) { $ret{$i}->{$o}++; }
100 |     }
101 |     # die "$in\n$out\n" if($in ne $out);
102 |     return %ret;
103 | }
104 | 
105 | sub print_map {
106 |     my $map = shift;
107 |     my @arr;
108 |     # print join(" ", keys %{$map})."\n";
109 |     foreach my $i (sort { $a <=> $b } keys %{$map}) {
110 |         foreach my $j (sort { $a <=> $b } keys %{$map->{$i}}) {
111 |             push @arr, "$i-$j";
112 |         }
113 |     }
114 |     print "@arr\n";
115 | }
116 | 
117 | sub combine_map {
118 |     my ($in, $out) = @_;
119 |     my %ret;
120 |     foreach my $i ( keys %$in ) {
121 |         foreach my $j ( keys %{$in->{$i}} ) {
122 |             foreach my $k ( keys %{$out->{$j}} ) {
123 |                 $ret{$i} = {} if not $ret{$i};
124 |                 $ret{$i}->{$k}++;
125 |             }
126 |         }
127 |     }
128 |     return %ret;
129 | }
130 | 
131 | 
132 | my ($jo, $eo, $al, $jn, $en);
133 | while(defined($jo = <FILE0>) and defined($eo = <FILE1>) and defined($al = <FILE2>) and defined($jn = <FILE3>) and defined($en = <FILE4>)) {
134 |     chomp $jo; chomp $eo; chomp $al; chomp $jn; chomp $en;
135 |     my %em = ($SRC_TYPE eq "lev") ? map_lev($eo, $en) : map_char($eo, $en);
136 |     my %jm = ($SRC_TYPE eq "lev") ? map_lev($jn, $jo) : map_char($jn, $jo);
137 |     my %am;
138 |     s/ +$//g;
139 |     for(split(/ /, $al)) {
140 |         my ($ja, $ea) = split(/-/);
141 |         $am{$ja} = {} if not $am{$ja};
142 |         $am{$ja}->{$ea}++;
143 |     }
144 |     my %jam = combine_map(\%jm, \%am);
145 |     my %jaem = combine_map(\%jam, \%em);
146 |     print_map(\%jaem);
147 | }
148 | 


--------------------------------------------------------------------------------
/combine-predicate.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | #
  4 | # What's this?
  5 | # ------------
  6 | #
  7 | # Combine segmented predicates in Japanese sentences into manageable
  8 | # units for ease of statistical machine translation tasks such as word
  9 | # alignment, rule extraction and other subsequent processes.
 10 | #
 11 | # Usage and Data Format
 12 | # ---------------------
 13 | #
 14 | # Currently, we are assuming the input format is sequences of triplets of (word, the
 15 | # part-of-speech, and the reading of the word). Each triplet should be delimitted by
 16 | # slash "/".
 17 | #
 18 | # Here is the sample input and output of this script.
 19 | #
 20 | #    $ echo "これ/代名詞/これ は/助詞/は ペン/名詞/ぺん で/助動詞/で あ/動詞/あ る/語尾/る" | ./combine-predicate.pl
 21 | #    word=これ       pos=代名詞      pron=これ
 22 | #    word=は pos=助詞        pron=は
 23 | #    word=ペン       pos=名詞        pron=ぺん
 24 | #    word=で pos=助動詞      pron=で
 25 | #    word=あ pos=動詞        pron=あ
 26 | #    word=る pos=語尾        pron=る
 27 | #    これ は ペン である
 28 | #
 29 | #
 30 | # This kinds of the input data are used in the output format of KyTea
 31 | # (http://www.phontron.com/kytea/) For example,
 32 | #
 33 | #    $ echo "これはペンである" | kytea | ./combine-predicate.pl
 34 | #    これ は ペン である
 35 | #
 36 | 
 37 | use strict;
 38 | use warnings;
 39 | use utf8;
 40 | use Getopt::Long;
 41 | use List::Util qw(sum min max shuffle);
 42 | binmode STDIN, ":utf8";
 43 | binmode STDOUT, ":utf8";
 44 | binmode STDERR, ":utf8";
 45 | 
 46 | my $VERB = 0;
 47 | GetOptions(
 48 | "verb" => \$VERB,
 49 | );
 50 | 
 51 | 
 52 | my $GLUE = "";
 53 | if (@ARGV eq 1) {
 54 |     $GLUE = $ARGV[0];
 55 | } elsif (@ARGV > 1) {
 56 |     print STDERR "Usage: $0 [GLUE]\n";
 57 |     exit 1;
 58 | }
 59 | 
 60 | # Split each of the triplet into three things.
 61 | #
 62 | # E.g., "これ/代名詞/これ" => ("これ", "代名詞", "これ"),
 63 | # "ペン/名詞/ぺん" => ("ペン", "名詞", "ぺん")
 64 | #
 65 | sub wpp {
 66 |     my $s = shift;
 67 |     # print STDERR "$s\n";
 68 |     $s =~ /^(.+)\/([^\/]+)\/([^\/]+)$/ or die $s;
 69 |     return ($1, $2, $3);
 70 | }
 71 | 
 72 | # TODO: Write the documentation about used heuristic rules.
 73 | sub iscombine {
 74 |     my $s = shift;
 75 |     my ($word, $pos, $pron) = wpp($s);
 76 |     if($VERB and ($pos =~ /^動詞$/)) {
 77 |         return 1;
 78 |     } elsif($pos =~ /^(語尾|助動詞)$/) {
 79 |         return 1;
 80 |     } elsif(($word =~ /^(て|ば)$/) and ($pos =~ /^(助詞)$/)) {
 81 |         return 1;
 82 |     } elsif(($word =~ /^(な)$/) and ($pos =~ /^(形容詞)$/)) {
 83 |         return 1;
 84 |     } elsif(($word =~ /^(さ|し|す|あ|い)$/) and ($pos =~ /^(動詞)$/)) {
 85 |         return 1;
 86 |     }
 87 |     return 0;
 88 | }
 89 | 
 90 | # Combine predicates, and return the sequence of words.
 91 | #
 92 | # If there are consecutive words that fire the bits,
 93 | # they will be combined into a single word.
 94 | #
 95 | # For example,
 96 | # $harr = ["これ", "は", "ペン", "で", "あ", "る"]
 97 | # $carr = [0, 0, 0, 1, 1, 1]
 98 | # => ["これ", "は", "ペン", "である"]
 99 | #
100 | # Note that the words ["で", "あ", "る"] were merged into
101 | # "である".
102 | sub combine {
103 |   my ($harr, $carr) = @_;
104 |   my @newarr = ($$harr[0]);
105 |   foreach my $i (1 .. $#$harr) {
106 |     if (($$carr[$i] == 1) and ($$carr[$i-1] == 1)) {
107 |       $newarr[-1] .= $GLUE . $$harr[$i];
108 |     } else {
109 |       push @newarr, $$harr[$i];
110 |     }
111 |   }
112 |   return \@newarr;
113 | }
114 | 
115 | while(<STDIN>) {
116 |     chomp;
117 |     # print "$_\n";
118 |     s/\\ /　/g;
119 |     my @warr = split(/ +/);
120 |     my @harr = map { my ($w, $pr, $ps) = wpp($_); $w } @warr;
121 |     my @carr = map { iscombine($_) } @warr;
122 |     print "@{combine(\@harr, \@carr)}\n";
123 | }
124 | 


--------------------------------------------------------------------------------
/compare-mt.py:
--------------------------------------------------------------------------------
  1 | # NOTE: The master version of this script has moved and can be found here:
  2 | #       https://github.com/neulab/compare-mt
  3 | 
  4 | import sys
  5 | import argparse
  6 | import operator
  7 | import itertools
  8 | import nltk
  9 | from collections import defaultdict
 10 | 
 11 | parser = argparse.ArgumentParser(
 12 |     description='Program to compare MT results',
 13 | )
 14 | parser.add_argument('ref_file', type=str, help='A path to a correct reference file')
 15 | parser.add_argument('out_file', type=str, help='A path to a system output')
 16 | parser.add_argument('out2_file', nargs='?', type=str, default=None, help='A path to another system output. Add only if you want to compare outputs from two systems.')
 17 | parser.add_argument('--train_file', type=str, default=None, help='A link to the training corpus target file')
 18 | parser.add_argument('--train_counts', type=str, default=None, help='A link to the training word frequency counts as a tab-separated "word\\tfreq" file')
 19 | parser.add_argument('--alpha', type=float, default=1.0, help='A smoothing coefficient to control how much the model focuses on low- and high-frequency events. 1.0 should be fine most of the time.')
 20 | parser.add_argument('--ngram', type=int, default=4, help='Maximum length of n-grams.')
 21 | parser.add_argument('--ngram_size', type=int, default=50, help='How many n-grams to print.')
 22 | parser.add_argument('--sent_size', type=int, default=10, help='How many sentences to print.')
 23 | args = parser.parse_args()
 24 | 
 25 | with open(args.ref_file, "r") as f:
 26 |   ref = [line.strip().split() for line in f]
 27 | with open(args.out_file, "r") as f:
 28 |   out = [line.strip().split() for line in f]
 29 | if args.out2_file != None:
 30 |   with open(args.out2_file, "r") as f:
 31 |     out2 = [line.strip().split() for line in f]
 32 | 
 33 | def calc_ngrams(sent):
 34 |   ret = defaultdict(lambda: 0)
 35 |   for n in range(args.ngram):
 36 |     for i in range(len(sent)-n):
 37 |       ret[tuple(sent[i:i+n+1])] += 1
 38 |   return ret
 39 | 
 40 | def match_ngrams(left, right):
 41 |   ret = defaultdict(lambda: 0)
 42 |   for k, v in left.items():
 43 |     if k in right:
 44 |       ret[k] = min(v, right[k])
 45 |   return ret
 46 | 
 47 | # Calculate over and under-generated n-grams for two corpora
 48 | def calc_over_under(ref, out, alpha):
 49 |   # Create n-grams
 50 |   refall = defaultdict(lambda: 0)
 51 |   outall = defaultdict(lambda: 0)
 52 |   for refsent, outsent in zip(ref, out):
 53 |     for k, v in calc_ngrams(refsent).items():
 54 |       refall[k] += v
 55 |     for k, v in calc_ngrams(outsent).items():
 56 |       outall[k] += v
 57 |   # Calculate scores
 58 |   scores = {}
 59 |   for k, v in refall.items():
 60 |     scores[k] = (v + args.alpha) / (v + outall[k] + 2*args.alpha)
 61 |   for k, v in outall.items():
 62 |     scores[k] = (refall[k] + args.alpha) / (refall[k] + v + 2*args.alpha)
 63 |   return refall, outall, scores
 64 | 
 65 | # Calculate over and under-generated n-grams for two corpora
 66 | def calc_compare(ref, out, out2, alpha):
 67 |   outall = defaultdict(lambda: 0)
 68 |   out2all = defaultdict(lambda: 0)
 69 |   for refsent, outsent, out2sent in zip(ref, out, out2):
 70 |     refn = calc_ngrams(refsent)
 71 |     outmatch = match_ngrams(refn, calc_ngrams(outsent))
 72 |     out2match = match_ngrams(refn, calc_ngrams(out2sent))
 73 |     for k, v in outmatch.items():
 74 |       if v > out2match[k]:
 75 |         outall[k] += v - out2match[k]
 76 |     for k, v in out2match.items():
 77 |       if v > outmatch[k]:
 78 |         out2all[k] += v - outmatch[k]
 79 |   # Calculate scores
 80 |   scores = {}
 81 |   for k, v in out2all.items():
 82 |     scores[k] = (v + args.alpha) / (v + outall[k] + 2*args.alpha)
 83 |   for k, v in outall.items():
 84 |     scores[k] = (out2all[k] + args.alpha) / (out2all[k] + v + 2*args.alpha)
 85 |   return outall, out2all, scores
 86 | 
 87 | # Calculate the frequency counts, from the training corpus
 88 | # or training frequency file if either are specified, from the
 89 | # reference file if not
 90 | freq_counts = defaultdict(lambda: 0)
 91 | if args.train_counts != None:
 92 |   with open(args.train_counts, "r") as f:
 93 |     for line in f:
 94 |       word, freq = line.strip().split('\t')
 95 |       freq_counts[word] = freq
 96 | else:
 97 |   my_file = args.train_file if args.train_file != None else args.ref_file
 98 |   with open(my_file, "r") as f:
 99 |     for line in f:
100 |       for word in line.strip().split():
101 |         freq_counts[word] += 1 
102 | 
103 | def calc_matches_by_freq(ref, out, buckets):
104 |   extended_buckets = buckets + [max(freq_counts.values()) + 1]
105 |   matches = [[0,0,0] for x in extended_buckets]
106 |   for refsent, outsent in zip(ref, out):
107 |     reffreq, outfreq = defaultdict(lambda: 0), defaultdict(lambda: 0)
108 |     for x in refsent:
109 |       reffreq[x] += 1
110 |     for x in outsent:
111 |       outfreq[x] += 1
112 |     for k in set(itertools.chain(reffreq.keys(), outfreq.keys())):
113 |       for bucket, match in zip(extended_buckets, matches):
114 |         if freq_counts[k] < bucket:
115 |           match[0] += min(reffreq[k], outfreq[k])
116 |           match[1] += reffreq[k]
117 |           match[2] += outfreq[k]
118 |           break 
119 |   for bothf, reff, outf in matches:
120 |     if bothf == 0:
121 |       rec, prec, fmeas = 0.0, 0.0, 0.0
122 |     else:
123 |       rec = bothf / float(reff)
124 |       prec = bothf / float(outf)
125 |       fmeas = 2 * prec * rec / (prec + rec)
126 |     yield bothf, reff, outf, rec, prec, fmeas
127 | 
128 | buckets = [1, 2, 3, 4, 5, 10, 100, 1000]
129 | bucket_strs = []
130 | last_start = 0
131 | for x in buckets:
132 |   if x-1 == last_start:
133 |     bucket_strs.append(str(last_start))
134 |   else:
135 |     bucket_strs.append("{}-{}".format(last_start, x-1))
136 |   last_start = x
137 | bucket_strs.append("{}+".format(last_start))
138 | 
139 | # Analyze the reference/output
140 | if args.out2_file == None:
141 |   refall, outall, scores = calc_over_under(ref, out, args.alpha)
142 |   scorelist = sorted(scores.items(), key=operator.itemgetter(1))
143 |   # Print the ouput
144 |   print('********************** N-gram Difference Analysis ************************')
145 |   print('--- %d over-generated n-grams indicative of output' % args.ngram_size)
146 |   for k, v in scorelist[:args.ngram_size]:
147 |     print('%s\t%f (ref=%d, out=%d)' % (' '.join(k), v, refall[k], outall[k]))
148 |   print()
149 |   print('--- %d under-generated n-grams indicative of reference' % args.ngram_size)
150 |   for k, v in reversed(scorelist[-args.ngram_size:]):
151 |     print('%s\t%f (ref=%d, out=%d)' % (' '.join(k), v, refall[k], outall[k]))
152 |   # Calculate f-measure
153 |   matches = calc_matches_by_freq(ref, out, buckets)
154 |   print('\n\n********************** Word Frequency Analysis ************************')
155 |   print('--- word f-measure by frequency bucket')
156 |   for bucket_str, match in zip(bucket_strs, matches):
157 |     print("{}\t{:.4f}".format(bucket_str, match[5]))
158 | # Analyze the differences between two systems
159 | else:
160 |   outall, out2all, scores = calc_compare(ref, out, out2, args.alpha)
161 |   scorelist = sorted(scores.items(), key=operator.itemgetter(1))
162 |   # Print the ouput
163 |   print('********************** N-gram Difference Analysis ************************')
164 |   print('--- %d n-grams that System 1 did a better job of producing' % args.ngram_size)
165 |   for k, v in scorelist[:args.ngram_size]:
166 |     print('%s\t%f (sys1=%d, sys2=%d)' % (' '.join(k), v, outall[k], out2all[k]))
167 |   print('\n--- %d n-grams that System 2 did a better job of producing' % args.ngram_size)
168 |   for k, v in reversed(scorelist[-args.ngram_size:]):
169 |     print('%s\t%f (sys1=%d, sys2=%d)' % (' '.join(k), v, outall[k], out2all[k]))  
170 |   # Calculate f-measure
171 |   matches = calc_matches_by_freq(ref, out, buckets)
172 |   matches2 = calc_matches_by_freq(ref, out2, buckets)
173 |   print('\n\n********************** Word Frequency Analysis ************************')
174 |   print('--- word f-measure by frequency bucket')
175 |   for bucket_str, match, match2 in zip(bucket_strs, matches, matches2):
176 |     print("{}\t{:.4f}\t{:.4f}".format(bucket_str, match[5], match2[5]))
177 |   # Calculate BLEU diff
178 |   scorediff_list = []
179 |   chencherry = nltk.translate.bleu_score.SmoothingFunction()
180 |   for i, (o1, o2, r) in enumerate(zip(out, out2, ref)):
181 |     b1 = nltk.translate.bleu_score.sentence_bleu([r], o1, smoothing_function=chencherry.method2)
182 |     b2 = nltk.translate.bleu_score.sentence_bleu([r], o2, smoothing_function=chencherry.method2)
183 |     scorediff_list.append((b2-b1, b1, b2, i))
184 |   scorediff_list.sort()
185 |   print('\n\n********************** Length Analysis ************************')
186 |   print('--- length ratio')
187 |   length_ref = sum([len(x) for x in ref])
188 |   length_out = sum([len(x) for x in out])
189 |   length_out2 = sum([len(x) for x in out2])
190 |   print('System 1: {}, System 2: {}'.format(length_out/length_ref, length_out2/length_ref))
191 |   print('--- length difference from reference by bucket')
192 |   length_diff = {}
193 |   length_diff2 = {}
194 |   for r, o, o2 in zip(ref, out, out2):
195 |     ld = len(o)-len(r)
196 |     ld2 = len(o2)-len(r)
197 |     length_diff[ld] = length_diff.get(ld,0) + 1
198 |     length_diff2[ld2] = length_diff2.get(ld2,0) + 1
199 |   for ld in sorted(list(set(length_diff.keys()) | set(length_diff2.keys()))):
200 |     print("{}\t{}\t{}".format(ld, length_diff.get(ld,0), length_diff2.get(ld,0)))
201 |   print('\n\n********************** BLEU Analysis ************************')
202 |   print('--- %d sentences that System 1 did a better job at than System 2' % args.sent_size)
203 |   for bdiff, b1, b2, i in scorediff_list[:args.sent_size]:
204 |     print ('BLEU+1 sys2-sys1={}, sys1={}, sys2={}\nRef:  {}\nSys1: {}\nSys2: {}\n'.format(bdiff, b1, b2, ' '.join(ref[i]), ' '.join(out[i]), ' '.join(out2[i])))
205 |   print('--- %d sentences that System 2 did a better job at than System 1' % args.sent_size)
206 |   for bdiff, b1, b2, i in scorediff_list[-args.sent_size:]:
207 |     print ('BLEU+1 sys2-sys1={}, sys1={}, sys2={}\nRef:  {}\nSys1: {}\nSys2: {}\n'.format(bdiff, b1, b2, ' '.join(ref[i]), ' '.join(out[i]), ' '.join(out2[i])))
208 | 


--------------------------------------------------------------------------------
/count.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | binmode STDIN, ":utf8";
 4 | binmode STDOUT, ":utf8";
 5 | 
 6 | my %count;
 7 | while(<STDIN>) {
 8 |     chomp;
 9 |     for(split(/ /)) {
10 |         $count{$_}++;
11 |     }
12 | }
13 | 
14 | foreach my $k (sort {$count{$b}<=>$count{$a}} keys %count) {
15 |     print "$k\t$count{$k}\n";
16 | }
17 | 


--------------------------------------------------------------------------------
/counterrors.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # This is a script to count errors (insertions, deletions, substitutions)
 4 | # according to edit distance
 5 | 
 6 | use strict;
 7 | use warnings;
 8 | use utf8;
 9 | use List::Util qw(max min);
10 | use FindBin;
11 | use lib $FindBin::Bin;
12 | 
13 | use Levenshtein;
14 | 
15 | binmode STDIN, ":utf8";
16 | binmode STDOUT, ":utf8";
17 | 
18 | if(@ARGV != 2) {
19 |     print STDERR "Usage: counterrors.pl REFERENCE SYSTEM\n";
20 |     exit;
21 | }
22 | 
23 | # find which of the inputs are in error
24 | open REF, "<:utf8", $ARGV[0] or die $!;
25 | open TEST,    "<:utf8", $ARGV[1] or die $!;
26 | my ($ref, $test, @refs, @tests, %errs, @hists);
27 | while(defined($ref = <REF>) and defined($test = <TEST>)) {
28 |     chomp $ref; chomp $test;
29 |     my ($hist, $score) = Levenshtein::distance($ref, $test);
30 |     @refs = split(/ +/, $ref);
31 |     @tests = split(/ +/, $test);
32 |     @hists = split(//, $hist);
33 |     my (@rerr, @terr);
34 |     while(@hists) {
35 |         my $h = shift(@hists);
36 |         if($h eq 'e') {
37 |             if(@rerr+@terr) {
38 |                 my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
39 |                 $errs{$err}++;
40 |                 @rerr = ();
41 |                 @terr = ();
42 |             }
43 |             shift @refs; shift @tests;
44 |         } else {
45 |             push @rerr, shift(@refs) if $h ne 'i';
46 |             push @terr, shift(@tests) if $h ne 'd';
47 |         }
48 |     }
49 |     if(@rerr+@terr) {
50 |         my $err = (@rerr?join(' ',@rerr):"NULL")."\t".(@terr?join(' ',@terr):"NULL");
51 |         $errs{$err}++;
52 |     }
53 | }
54 | close REF;
55 | close TEST;
56 | 
57 | for(sort { $errs{$b} <=> $errs{$a} } keys %errs) {
58 |     print "$_\t$errs{$_}\n";
59 | }
60 | 


--------------------------------------------------------------------------------
/error-diff.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # This is a script to tabulate the errors made by two systems
 4 | #  In order to use it first run counterrors.pl on two different system outputs
 5 | #
 6 | # $ counterrors.pl ref.txt test-1.txt > test-1.err
 7 | # $ counterrors.pl ref.txt test-2.txt > test-2.err
 8 | # 
 9 | # Next, run this script on the error files to get the output
10 | #
11 | # $ error-diff.pl test-1.err test-2.err > test-diff.err
12 | #
13 | # test-diff.err will now contain error examples along with frequencies.
14 | # examples with positive frequencies are more common in the first file,
15 | # while examples with negative frequencies are more common in the second file.
16 | 
17 | use strict;
18 | use warnings;
19 | use utf8;
20 | binmode STDIN, ":utf8";
21 | binmode STDOUT, ":utf8";
22 | use List::Util qw(sum min max shuffle);
23 | 
24 | if(@ARGV != 2) {
25 |     print STDERR "Usage: error-diff.pl file1.err file2.err\n";
26 |     exit 1;
27 | }
28 | 
29 | my %vals;
30 | open FILE, "<:utf8", $ARGV[0] or die "$ARGV[0]: $!\n";
31 | while(<FILE>) {
32 |     chomp;
33 |     /(.*)\t([0-9]*)$/;
34 |     $vals{$1} += $2;
35 | }
36 | close FILE;
37 | open FILE, "<:utf8", $ARGV[1] or die "$ARGV[1]: $!\n";
38 | while(<FILE>) {
39 |     chomp;
40 |     /(.*)\t([0-9]*)$/;
41 |     $vals{$1} -= $2;
42 | }
43 | close FILE;
44 | 
45 | for(sort {$vals{$b} <=> $vals{$a}} keys %vals) {
46 |     print "$_\t$vals{$_}\n" if $vals{$_};
47 | }
48 | 


--------------------------------------------------------------------------------
/fakesgm.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use List::Util qw(sum min max shuffle);
 7 | use Getopt::Long;
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | if(@ARGV != 2) {
13 |     print STDERR "Usage: fakesgm.pl src/ref tar\n";
14 |     exit 1;
15 | }
16 | 
17 | print "<".$ARGV[0]."set setid=\"test2006\" srclang=\"any\" trglang=\"$ARGV[1]\">
18 | <DOC docid=\"test2006\"".(($ARGV[1] eq "ref")?" sysid=\"ref\"":" sysid=\"src\"").">\n";
19 | 
20 | my $sent = 1;
21 | while(<STDIN>) {
22 |     chomp;
23 |     print "<seg id=\"".$sent++."\"> $_ </seg>\n";
24 | }
25 | 
26 | print "</DOC>\n</".$ARGV[0]."set>\n";
27 | 


--------------------------------------------------------------------------------
/grade.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # This is a script to grade word error rates according to edit distance
  4 | if(@ARGV != 2) {
  5 |     print STDERR "$0 REF TEST\n";
  6 |     exit(1);
  7 | }
  8 | 
  9 | binmode STDIN, ":utf8";
 10 | binmode STDOUT, ":utf8";
 11 | binmode STDERR, ":utf8";
 12 | 
 13 | use utf8;
 14 | use strict;
 15 | use warnings;
 16 | use List::Util qw(max min);
 17 | use Cwd qw(cwd abs_path);
 18 | use FindBin;
 19 | use lib $FindBin::Bin;
 20 | 
 21 | use Levenshtein;
 22 | 
 23 | my $PRINT_INLINE = 1;
 24 | 
 25 | sub width {
 26 |     $_ = shift;
 27 |     my $ret = 0;
 28 |     for(split(//)) {
 29 |         $ret += ((/\p{InKatakana}/ or /\p{InHiragana}/ or /\p{InCJKSymbolsAndPunctuation}/ or /\p{InKatakanaPhoneticExtensions}/ or /\p{InCJKUnifiedIdeographs}/)?2:1);
 30 |     }
 31 |     return $ret;
 32 | }
 33 | 
 34 | sub pad {
 35 |     my ($s, $l) = @_;
 36 |     return $s . (' ' x ($l-width($s)));
 37 | }
 38 | 
 39 | use strict;
 40 | binmode STDOUT, ":utf8";
 41 | open REF, "<:utf8", $ARGV[0];
 42 | open TEST, "<:utf8", $ARGV[1];
 43 | 
 44 | my ($reflen, $testlen);
 45 | my %scores = qw(s 0 i 0 d 0 e 0);
 46 | my($ref, $test, $sent, $sentacc) = (0, 0, 0, 0);
 47 | while(defined($ref = <REF>) and defined($test = <TEST>)) {
 48 |     chomp $ref;
 49 |     chomp $test;
 50 |     $ref =~ s/^ *//g; $ref =~ s/ *$//g;
 51 |     $test =~ s/^ *//g; $test =~ s/ *$//g;
 52 | 
 53 |     # get the arrays
 54 |     my @ra = split(/ +/, $ref);
 55 |     $reflen += @ra;
 56 |     my @ta = split(/ +/, $test);
 57 |     $testlen += @ta;
 58 | 
 59 |     # do levenshtein distance if the scores aren't equal
 60 |     my ($hist, $score);
 61 |     if ($ref eq $test) {
 62 |         $sentacc++;
 63 |         for (@ra) { $hist .= 'e'; }
 64 |         $score = 0;
 65 |     } else {
 66 |         ($hist, $score) = Levenshtein::distance($ref, $test);
 67 |     }
 68 |     $sent++;
 69 | 
 70 |     my @ha = split(//, $hist);
 71 |     for(@ha) { $scores{$_}++; }
 72 |     my ($rd, $td, $hd, $h, $r, $t, $l);
 73 |     if(not $PRINT_INLINE) {
 74 |         while(@ha) {
 75 |             $h = shift(@ha);
 76 |             if($h eq 'e' or $h eq 's') {
 77 |                 $r = shift(@ra);
 78 |                 $t = shift(@ta);
 79 |             } elsif ($h eq 'i') {
 80 |                 $r = '';
 81 |                 $t = shift(@ta);
 82 |             } elsif ($h eq 'd') {
 83 |                 $r = shift(@ra);
 84 |                 $t = '';
 85 |             } else { die "bad history value $h"; }
 86 |             # find the length
 87 |             $l = max(width($r), width($t)) + 1;
 88 |             $rd .= pad($r, $l);
 89 |             $td .= pad($t, $l);
 90 |             $hd .= pad($h, $l);
 91 |         }
 92 |         print "$rd\n$td\n$hd\n\n";
 93 |         die "non-empty ra=@ra or ta=@ta\n" if(@ra or @ta);
 94 |     } else {
 95 |         for(Levenshtein::divide($ref, $test, $hist)) {
 96 |             my ($stra, $strb) = split(/\t/);
 97 |             print ((($stra eq $strb) ? "O" : "X")."\t$_\n");
 98 |         }
 99 |         print "\n";
100 |     }
101 | }
102 | 
103 | my $total = 0;
104 | for (values %scores) { $total += $_; }
105 | foreach my $k (keys %scores) {
106 |     print "$k: $scores{$k} (".$scores{$k}/$total*100 . "%)\n";
107 | }
108 | my $wer = ($scores{'s'}+$scores{'i'}+$scores{'d'})/$reflen*100;
109 | my $prec = $scores{'e'}/$testlen*100;
110 | my $rec = $scores{'e'}/$reflen*100;
111 | my $fmeas = (2*$prec*$rec)/($prec+$rec);
112 | $sentacc = $sentacc/$sent*100;
113 | printf ("WER: %.2f%%\nPrec: %.2f%%\nRec: %.2f%%\nF-meas: %.2f%%\nSent: %.2f%%\n", $wer, $prec, $rec, $fmeas, $sentacc);
114 | 


--------------------------------------------------------------------------------
/gradews.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | use warnings;
 4 | 
 5 | if(@ARGV != 2) {
 6 |     print STDERR "$0 REF TEST\n";
 7 |     exit(1);
 8 | }
 9 | 
10 | sub lengths {
11 |     my @ret;
12 |     my @bounds = (@_,1);
13 |     my $last = -1;
14 |     for(0 .. $#bounds) {
15 |         if($bounds[$_]) {
16 |             push @ret, ($_-$last);
17 |             $last = $_;
18 |         }
19 |     }
20 |     return @ret;
21 | }
22 | 
23 | open REF, "<:utf8", $ARGV[0] or die $!;
24 | open TEST, "<:utf8", $ARGV[1] or die $!;
25 | my ($ref, $test);
26 | my ($totb, $corb, $refw, $testw, $corw,$tots,$cors);
27 | while(defined($ref = <REF>) and defined($test = <TEST>)) {
28 |     chomp $ref; chomp $test;
29 |     $tots++;
30 |     $cors++ if ($ref eq $test);
31 |     $ref =~ s/\/[^ ]*//g;
32 |     $test =~ s/\/[^ ]*//g;
33 |     my @rarr = split(//, $ref);
34 |     my @tarr = split(//, $test);
35 |     shift @rarr;
36 |     shift @tarr;
37 |     my (@rb,@tb);
38 |     while(@rarr and @tarr) {
39 |         my $rs = ($rarr[0] eq ' ');
40 |         shift @rarr if $rs;
41 |         push @rb, $rs;
42 |         my $ts = ($tarr[0] eq ' ');
43 |         shift @tarr if $ts;
44 |         push @tb, $ts;
45 |         shift @tarr;
46 |         shift @rarr;
47 |     }
48 |     die "mismatched lines \n$ref\n$test\n" if(@rb != @tb);
49 |     # total boundaries
50 |     $totb += @rb;
51 |     # correct boundaries
52 |     for(0 .. $#rb) { $corb++ if ($rb[$_] == $tb[$_]); }
53 |     # find word counts
54 |     my @rlens = lengths(@rb);
55 |     $refw += @rlens;
56 |     my @tlens = lengths(@tb);
57 |     $testw += @tlens;
58 |     # print "$ref\n@rlens\n$test\n@tlens\n";
59 |     # find word matches
60 |     my ($rlast, $tlast) = (0, 0);
61 |     while(@rlens and @tlens) {
62 |         if($rlast == $tlast) {
63 |             $corw++ if($rlens[0] == $tlens[0]);
64 |         }
65 |         if($rlast <= $tlast) {
66 |             $rlast += shift(@rlens);
67 |         }
68 |         if($tlast < $rlast) {
69 |             $tlast += shift(@tlens);
70 |         }
71 |     }
72 | }
73 | print "Sent Accuracy: ".sprintf("%.2f", ($cors/$tots*100))."% ($cors/$tots)\n";
74 | my $precw = $corw/$testw;
75 | my $recw = $corw/$refw;
76 | my $fmeasw = (2*$precw*$recw)/($precw+$recw);
77 | print "Word Prec: ".sprintf("%.2f", ($precw*100))."% ($corw/$testw)\n";
78 | print "Word Rec: ".sprintf("%.2f", ($recw*100))."% ($corw/$refw)\n";
79 | print "F-meas: ".sprintf("%.2f", ($fmeasw*100))."%\n";
80 | print "Bound Accuracy: ".sprintf("%.2f", ($corb/$totb*100))."% ($corb/$totb)\n";
81 | 


--------------------------------------------------------------------------------
/han2zen.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use List::Util qw(sum min max shuffle);
 7 | use Getopt::Long;
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my $NOHYPHEN = 0;
13 | my $NOSPACE = 0;
14 | GetOptions(
15 |     "nohyphen" => \$NOHYPHEN,
16 |     "nospace" => \$NOSPACE,
17 | );
18 | 
19 | if(@ARGV != 0) {
20 |     print STDERR "Usage: han2zen.pl < INPUT > OUTPUT\n";
21 |     exit 1;
22 | }
23 | 
24 | while(<STDIN>) {
25 |     tr/a-zA-Z0-9()[]{}<>.,_%｢｣､"?･+:｡!&*/ａ-ｚＡ-Ｚ０-９（）［］｛｝＜＞．，＿％「」、”？・＋：。！＆＊/;
26 |     s/-/－/g if not $NOHYPHEN;
27 |     s/ /　/g if not $NOSPACE;
28 |     s/\//／/g;
29 |     print $_;
30 | }
31 | 


--------------------------------------------------------------------------------
/iconv.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use Text::Iconv;
 4 | 
 5 | if(@ARGV != 2) {
 6 |     print STDERR "Usage: $0 FROM TO\n";
 7 |     exit 1;
 8 | }
 9 | 
10 | $| = 1;
11 | my $converter = Text::Iconv->new($ARGV[0], $ARGV[1]) or die;
12 | while(<STDIN>) {
13 |     my $converted = $converter->convert($_) or die $!;
14 |     print $converted;
15 | }
16 | 


--------------------------------------------------------------------------------
/interleave.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use IO::File;
 7 | use Getopt::Long;
 8 | use List::Util qw(sum min max shuffle);
 9 | binmode STDIN, ":utf8";
10 | binmode STDOUT, ":utf8";
11 | binmode STDERR, ":utf8";
12 | 
13 | my $NOSPACE = 0;
14 | GetOptions(
15 |     "nospace" => \$NOSPACE,
16 | );
17 | 
18 | if(@ARGV == 0) {
19 |     print STDERR "Usage: FILES\n";
20 |     exit 1;
21 | }
22 | 
23 | my @handles;
24 | for(@ARGV) {
25 |    my $fh = IO::File->new("< $_") or die "didn't work for $_";
26 |    binmode $fh, ":utf8";
27 |    push @handles, $fh;
28 | }
29 | 
30 | while(1) {
31 |     for(@handles) {
32 |         my $val = <$_>;
33 |         exit if not $val;
34 |         print $val;
35 |     }
36 |     print "\n" if not $NOSPACE;
37 | }
38 | 


--------------------------------------------------------------------------------
/moses-trace-to-penn-tree.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use Getopt::Long;
 7 | use List::Util qw(sum min max shuffle);
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my (@sentences, @roots);
13 | while(<STDIN>) {
14 |     chomp;
15 |     /^Trans Opt (\d+) (\[\d+\.\.\d+\]): (.*)$/ or die "Bad $_\n";
16 |     my ($sent, $span, $value) = ($1, $2, $3);
17 |     if(not $sentences[$sent]) {
18 |         $roots[$sent] = $span;
19 |         $sentences[$sent] = {}
20 |     }
21 |     $sentences[$sent]->{$span} = $value;
22 | }
23 | 
24 | sub span2josh {
25 |     my $str = shift;
26 |     $str =~ /^\[(\d+)\.\.(\d+)\]$/ or die "bad moses span $str\n";
27 |     return "{$1-".($2+1)."}";
28 | }
29 | 
30 | sub print_tree {
31 |     my ($tree, $span) = @_;
32 |     my $val = $tree->{$span};
33 |     $val =~ /^((\[\d+\.\.\d+\]=\S* +)+): ([A-Z]+) ->(.*):((\d+-\d+ )*): pC=/ or die "bad $val\n";
34 |     my ($tags, $head, $tails, $correspond) = ($1, $3, $4, $5);
35 |     $tails =~ s/ +$//g; $correspond =~ s/ +$//g;
36 |     my @spans = reverse map { my @arr = split(/=/); $arr[0] } split(/ +/, $tags);
37 |     my @srcs = reverse map { my @arr = split(/=/); $arr[1] } split(/ +/, $tags);
38 |     my @tails = split(/ /, $tails);
39 |     my %t2s;
40 |     my @correspond = map { my @arr = split(/-/); $t2s{$arr[1]} = $arr[0]; \@arr } split(/ +/, $correspond);
41 |     # Label the source side values
42 |     my $id = 0;
43 |     for(sort { $a->[1] <=> $b->[1] } @correspond) {
44 |         $srcs[$_->[0]] .= ++$id;
45 |     }
46 |     my $ret = "(".join("_",@srcs);
47 |     foreach my $i ( 0 .. $#tails ) {
48 |         $ret .= " ";
49 |         if(exists $t2s{$i}) {
50 |             $ret .= print_tree($tree, $spans[$t2s{$i}]);
51 |         } else {
52 |             $ret .= "(W $tails[$i])";
53 |         }
54 |     }
55 |     $ret .= ")";
56 |     return $ret;
57 | }
58 | 
59 | for(0 .. $#sentences) {
60 |     print print_tree($sentences[$_], $roots[$_])."\n";
61 | }
62 | 


--------------------------------------------------------------------------------
/ngram-hits.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use Getopt::Long;
 7 | use List::Util qw(sum min max shuffle);
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my $N = 4;
13 | GetOptions(
14 |     "n=i" => \$N,
15 | );
16 | 
17 | if(@ARGV != 2) {
18 |     print STDERR "Usage: $0 REF SYS\n";
19 |     exit 1;
20 | }
21 | 
22 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n";
23 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n";
24 | 
25 | sub count_ngrams {
26 |     my %ret;
27 |     my @arr = split(/ /, $_[0]);
28 |     foreach my $i (0 .. $#arr) {
29 |         my @str;
30 |         foreach my $j ($i .. min($#arr, $i+$N-1)) {
31 |             push @str, $arr[$j];
32 |             $ret{"@str"}++;
33 |         }
34 |     }
35 |     return %ret;
36 | }
37 | 
38 | my %total;
39 | my ($s0, $s1);
40 | while(defined($s0 = <FILE0>) and defined($s1 = <FILE1>)) {
41 |     chomp $s0; chomp $s1;
42 |     my %n0 = count_ngrams($s0);
43 |     my %n1 = count_ngrams($s1);
44 |     for(keys %n0) {
45 |         if($n1{$_}) {
46 |             $total{$_} += min($n0{$_}, $n1{$_});
47 |         }
48 |     }
49 | }
50 | 
51 | for(sort { $total{$b} <=> $total{$a} } keys %total) {
52 |     print "$_\t$total{$_}\n";
53 | }
54 | 


--------------------------------------------------------------------------------
/paired-bootstrap.py:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | # Compare two systems using bootstrap resampling                     #
  3 | #  * by Graham Neubig                                                #
  4 | #  * minor modifications by Mathias Müller                           #
  5 | #                                                                    #
  6 | # See, e.g. the following paper for references                       #
  7 | #                                                                    #
  8 | # Statistical Significance Tests for Machine Translation Evaluation  #
  9 | # Philipp Koehn                                                      #
 10 | # http://www.aclweb.org/anthology/W04-3250                           #
 11 | #                                                                    #
 12 | ######################################################################
 13 | 
 14 | import numpy as np
 15 | 
 16 | 
 17 | EVAL_TYPE_ACC = "acc"
 18 | EVAL_TYPE_BLEU = "bleu"
 19 | EVAL_TYPE_BLEU_DETOK = "bleu_detok"
 20 | EVAL_TYPE_PEARSON = "pearson"
 21 | 
 22 | EVAL_TYPES = [EVAL_TYPE_ACC,
 23 |               EVAL_TYPE_BLEU,
 24 |               EVAL_TYPE_BLEU_DETOK,
 25 |               EVAL_TYPE_PEARSON]
 26 | 
 27 | 
 28 | def eval_preproc(data, eval_type='acc'):
 29 |   ''' Preprocess into the appropriate format for a particular evaluation type '''
 30 |   if type(data) == str:
 31 |     data = data.strip()
 32 |     if eval_type == EVAL_TYPE_BLEU:
 33 |       data = data.split()
 34 |     elif eval_type == EVAL_TYPE_PEARSON:
 35 |       data = float(data)
 36 |   return data
 37 | 
 38 | def eval_measure(gold, sys, eval_type='acc'):
 39 |   ''' Evaluation measure
 40 |   
 41 |   This takes in gold labels and system outputs and evaluates their
 42 |   accuracy. It currently supports:
 43 |   * Accuracy (acc), percentage of labels that match
 44 |   * Pearson's correlation coefficient (pearson)
 45 |   * BLEU score (bleu)
 46 |   * BLEU_detok, on detokenized references and translations, with internal tokenization
 47 | 
 48 |   :param gold: the correct labels
 49 |   :param sys: the system outputs
 50 |   :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok)
 51 |   '''
 52 |   if eval_type == EVAL_TYPE_ACC:
 53 |     return sum([1 if g == s else 0 for g, s in zip(gold, sys)]) / float(len(gold))
 54 |   elif eval_type == EVAL_TYPE_BLEU:
 55 |     import nltk
 56 |     gold_wrap = [[x] for x in gold]
 57 |     return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys)
 58 |   elif eval_type == EVAL_TYPE_PEARSON:
 59 |     return np.corrcoef([gold, sys])[0,1]
 60 |   elif eval_type == EVAL_TYPE_BLEU_DETOK:
 61 |     import sacrebleu
 62 |     # make sure score is 0-based instead of 100-based
 63 |     return sacrebleu.corpus_bleu(sys, [gold]).score / 100.
 64 |   else:
 65 |     raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
 66 | 
 67 | def eval_with_paired_bootstrap(gold, sys1, sys2,
 68 |                                num_samples=10000, sample_ratio=0.5,
 69 |                                eval_type='acc'):
 70 |   ''' Evaluate with paired boostrap
 71 | 
 72 |   This compares two systems, performing a significance tests with
 73 |   paired bootstrap resampling to compare the accuracy of the two systems.
 74 |   
 75 |   :param gold: The correct labels
 76 |   :param sys1: The output of system 1
 77 |   :param sys2: The output of system 2
 78 |   :param num_samples: The number of bootstrap samples to take
 79 |   :param sample_ratio: The ratio of samples to take every time
 80 |   :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok)
 81 |   '''
 82 |   assert(len(gold) == len(sys1))
 83 |   assert(len(gold) == len(sys2))
 84 |   
 85 |   # Preprocess the data appropriately for they type of eval
 86 |   gold = [eval_preproc(x, eval_type) for x in gold]
 87 |   sys1 = [eval_preproc(x, eval_type) for x in sys1]
 88 |   sys2 = [eval_preproc(x, eval_type) for x in sys2]
 89 | 
 90 |   sys1_scores = []
 91 |   sys2_scores = []
 92 |   wins = [0, 0, 0]
 93 |   n = len(gold)
 94 |   ids = list(range(n))
 95 | 
 96 |   for _ in range(num_samples):
 97 |     # Subsample the gold and system outputs
 98 |     reduced_ids = np.random.choice(ids,int(len(ids)*sample_ratio),replace=True)
 99 |     reduced_gold = [gold[i] for i in reduced_ids]
100 |     reduced_sys1 = [sys1[i] for i in reduced_ids]
101 |     reduced_sys2 = [sys2[i] for i in reduced_ids]
102 |     # Calculate accuracy on the reduced sample and save stats
103 |     sys1_score = eval_measure(reduced_gold, reduced_sys1, eval_type=eval_type)
104 |     sys2_score = eval_measure(reduced_gold, reduced_sys2, eval_type=eval_type)
105 |     if sys1_score > sys2_score:
106 |       wins[0] += 1
107 |     elif sys1_score < sys2_score:
108 |       wins[1] += 1
109 |     else:
110 |       wins[2] += 1
111 |     sys1_scores.append(sys1_score)
112 |     sys2_scores.append(sys2_score)
113 | 
114 |   # Print win stats
115 |   wins = [x/float(num_samples) for x in wins]
116 |   print('Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f' % (wins[0], wins[1], wins[2]))
117 |   if wins[0] > wins[1]:
118 |     print('(sys1 is superior with p value p=%.3f)\n' % (1-wins[0]))
119 |   elif wins[1] > wins[0]:
120 |     print('(sys2 is superior with p value p=%.3f)\n' % (1-wins[1]))
121 | 
122 |   # Print system stats
123 |   sys1_scores.sort()
124 |   sys2_scores.sort()
125 |   print('sys1 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]' %
126 |           (np.mean(sys1_scores), np.median(sys1_scores), sys1_scores[int(num_samples * 0.025)], sys1_scores[int(num_samples * 0.975)]))
127 |   print('sys2 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f]' %
128 |           (np.mean(sys2_scores), np.median(sys2_scores), sys2_scores[int(num_samples * 0.025)], sys2_scores[int(num_samples * 0.975)]))
129 | 
130 | if __name__ == "__main__":
131 |   # execute only if run as a script
132 |   import argparse
133 |   parser = argparse.ArgumentParser()
134 |   parser.add_argument('gold', help='File of the correct answers')
135 |   parser.add_argument('sys1', help='File of the answers for system 1')
136 |   parser.add_argument('sys2', help='File of the answers for system 2')
137 |   parser.add_argument('--eval_type', help='The evaluation type (acc/pearson/bleu/bleu_detok)', type=str, default='acc', choices=EVAL_TYPES)
138 |   parser.add_argument('--num_samples', help='Number of samples to use', type=int, default=10000)
139 |   args = parser.parse_args()
140 |   
141 |   with open(args.gold, 'r') as f:
142 |     gold = f.readlines() 
143 |   with open(args.sys1, 'r') as f:
144 |     sys1 = f.readlines() 
145 |   with open(args.sys2, 'r') as f:
146 |     sys2 = f.readlines() 
147 |   eval_with_paired_bootstrap(gold, sys1, sys2, eval_type=args.eval_type, num_samples=args.num_samples)
148 | 


--------------------------------------------------------------------------------
/phrase-extract.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use Getopt::Long;
 7 | use List::Util qw(sum min max shuffle);
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my $NONULL = 0;
13 | GetOptions(
14 |     "nonull!" => \$NONULL
15 | );
16 | if(@ARGV != 0) {
17 |     print STDERR "Usage: $0 < ALIGNMENTS > PHRASES\n";
18 |     exit 1;
19 | }
20 | 
21 | # Extracts phrases according to Och's algorithm, and outputs their indexes
22 | # in i1-i2-j1-j2 format
23 | 
24 | sub quasi_consecutive {
25 |     my ($small, $large, $tp, $f2e);
26 |     for($small .. $large) {
27 |         return 0 if ($f2e->[$_] and not exists $tp->{$_});
28 |     }
29 |     return 1;
30 | }
31 | 
32 | while(<STDIN>) {
33 |     chomp;
34 |     s/[SP]-//g;
35 |     my ($I,$J,@e2f,@f2e);
36 |     # save in appropriate format
37 |     for(split(/ /)) {
38 |         my ($e,$f) = split(/-/);
39 |         # print "e-f= $e-$f\n";
40 |         $I = max($I,$e);
41 |         $J = max($J,$f);
42 |         $e2f[$e] = [] if not exists $e2f[$e];
43 |         push @{$e2f[$e]}, $f;
44 |         $f2e[$f] = [] if not exists $f2e[$f];
45 |         push @{$f2e[$f]}, $e;
46 |     }
47 |     
48 |     # phrase-extract
49 |     my @out;
50 |     foreach my $i1 (0 .. $I) {
51 |         next if $NONULL and not $e2f[$i1];
52 |         my %tp;
53 |         foreach my $i2 ($i1 .. $I) {
54 |             if($e2f[$i2]) { for (@{$e2f[$i2]}) { $tp{$_}++; } }
55 |             elsif($NONULL) { next; }
56 |             my $j1 = min(keys %tp);
57 |             my $j2 = max(keys %tp);
58 |             if(quasi_consecutive($j1, $j2, \%tp, \@f2e)) {
59 |                 my %sp;
60 |                 foreach my $j ($j1 .. $j2) {
61 |                     if($f2e[$j]) { for (@{$f2e[$j]}) { $sp{$_}++; } }
62 |                 }
63 |                 if(min(keys %sp) >= $i1 and max(keys %sp) <= $i2) {
64 |                     while($j1 >= 0) {
65 |                         my $jp = $j2;
66 |                         while($jp <= $J) {
67 |                             push @out, "$i1-".($i2+1)."-$j1-".($jp+1);
68 |                             $jp++;
69 |                             last if $NONULL or $f2e[$jp];
70 |                         }
71 |                         $j1--;
72 |                         last if $NONULL or $f2e[$j1];
73 |                     }
74 |                 }
75 |             }
76 |         }
77 |     }
78 |     print "@out\n";
79 | }
80 | 


--------------------------------------------------------------------------------
/print-enju.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # *** print-enju.pl
 4 | # A script to visualize enju trees coming in from STDIN
 5 | # by Graham Neubig
 6 | #
 7 | # This script relies on Enju (of course):
 8 | # http://www.nactem.ac.uk/enju/
 9 | #
10 | # And Hideki Isozaki's enjutree package:
11 | # http://softcream.oka-pu.ac.jp/tex-macros/ 
12 | #
13 | # If the enjutree package is installed, you can parse a file
14 | # and display the results one by one by running:
15 | #
16 | # enju -xml file.txt | print-enju.pl
17 | 
18 | use strict;
19 | use warnings;
20 | use utf8;
21 | use Getopt::Long;
22 | use List::Util qw(sum min max shuffle);
23 | binmode STDIN, ":utf8";
24 | binmode STDOUT, ":utf8";
25 | binmode STDERR, ":utf8";
26 | 
27 | my $PDFLATEX = "pdflatex";
28 | my $ACROREAD = "acroread";
29 | GetOptions(
30 | "acroread=s" => \$ACROREAD,
31 | );
32 | 
33 | if(@ARGV != 0) {
34 |     print STDERR "Usage: $0\n";
35 |     exit 1;
36 | }
37 | 
38 | while(<STDIN>) {
39 |     chomp;
40 |     open OUT, ">:utf8", "/tmp/enju.tex" or die "Couldn't open /tmp/enju.tex\n";
41 |     print OUT "\\documentclass{article}\\usepackage{enjutree}\\begin{document}\n\\begin{enjutree}{}\n";
42 |     print OUT "$_\n";
43 |     print OUT "\\end{enjutree}\\end{document}\n";
44 |     safesystem("$PDFLATEX -output-directory /tmp /tmp/enju.tex") or die;
45 |     safesystem("$ACROREAD /tmp/enju.pdf") or die;
46 | }
47 | 
48 | 
49 | # Adapted from Moses's train-model.perl
50 | sub safesystem {
51 |   print STDERR "Executing: @_\n";
52 |   system(@_);
53 |   if ($? == -1) {
54 |       print STDERR "ERROR: Failed to execute: @_\n  $!\n";
55 |       exit(1);
56 |   }
57 |   elsif ($? & 127) {
58 |       printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
59 |           ($? & 127),  ($? & 128) ? 'with' : 'without';
60 |       exit(1);
61 |   }
62 |   else {
63 |     my $exitcode = $? >> 8;
64 |     print STDERR "Exit code: $exitcode\n" if $exitcode;
65 |     return ! $exitcode;
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/print-trees.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | '''A program to display parse trees (in Penn treebank format)
 4 | with the NLTK.
 5 | '''
 6 | 
 7 | import sys
 8 | 
 9 | try:
10 |   from nltk.tree import Tree
11 | except ImportError:
12 |   print('Error: cannot import the NLTK!')
13 |   print('You need to install the NLTK. Please visit http://nltk.org/install.html for details.')
14 |   print("On Ubuntu, the installation can be done via 'sudo apt-get install python-nltk'")
15 |   sys.exit()
16 | 
17 | def main():
18 |   for line in sys.stdin:
19 |     t = Tree.fromstring(line)
20 |     t.draw()
21 | 
22 | if __name__ == '__main__':
23 |   main()
24 | 


--------------------------------------------------------------------------------
/reverse.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | while(<>) {
3 |     print join(' ', reverse(split(/[ \t\n]+/))) . "\n";
4 | }
5 | 


--------------------------------------------------------------------------------
/romanize.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use strict;
  4 | use utf8;
  5 | use FileHandle;
  6 | use Getopt::Long;
  7 | use List::Util qw(sum min max shuffle);
  8 | binmode STDIN, ":utf8";
  9 | binmode STDOUT, ":utf8";
 10 | binmode STDERR, ":utf8";
 11 | 
 12 | if(@ARGV != 0) {
 13 |     print STDERR "Usage: $0\n";
 14 |     exit 1;
 15 | }
 16 | 
 17 | our %Map = (
 18 |     "あ", "A",
 19 |     "い", "I",
 20 |     "う", "U",
 21 |     "え", "E",
 22 |     "お", "O",
 23 |     "か", "KA",
 24 |     "き", "KI",
 25 |     "く", "KU",
 26 |     "け", "KE",
 27 |     "こ", "KO",
 28 |     "さ", "SA",
 29 |     "し", "SHI",
 30 |     "す", "SU",
 31 |     "せ", "SE",
 32 |     "そ", "SO",
 33 |     "た", "TA",
 34 |     "ち", "CHI",
 35 |     "つ", "TSU",
 36 |     "て", "TE",
 37 |     "と", "TO",
 38 |     "な", "NA",
 39 |     "に", "NI",
 40 |     "ぬ", "NU",
 41 |     "ね", "NE",
 42 |     "の", "NO",
 43 |     "は", "HA",
 44 |     "ひ", "HI",
 45 |     "ふ", "FU",
 46 |     "へ", "HE",
 47 |     "ほ", "HO",
 48 |     "ま", "MA",
 49 |     "み", "MI",
 50 |     "む", "MU",
 51 |     "め", "ME",
 52 |     "も", "MO",
 53 |     "や", "YA",
 54 |     "ゆ", "YU",
 55 |     "よ", "YO",
 56 |     "ら", "RA",
 57 |     "り", "RI",
 58 |     "る", "RU",
 59 |     "れ", "RE",
 60 |     "ろ", "RO",
 61 |     "わ", "WA",
 62 |     "ゐ", "I",
 63 |     "ゑ", "E",
 64 |     "を", "O",
 65 |     "ん", "N",
 66 |     "ぁ", "A",
 67 |     "ぃ", "I",
 68 |     "ぅ", "U",
 69 |     "ぇ", "E",
 70 |     "ぉ", "O",
 71 |     "が", "GA",
 72 |     "ぎ", "GI",
 73 |     "ぐ", "GU",
 74 |     "げ", "GE",
 75 |     "ご", "GO",
 76 |     "ざ", "ZA",
 77 |     "じ", "JI",
 78 |     "ず", "ZU",
 79 |     "ぜ", "ZE",
 80 |     "ぞ", "ZO",
 81 |     "だ", "DA",
 82 |     "ぢ", "JI",
 83 |     "づ", "ZU",
 84 |     "で", "DE",
 85 |     "ど", "DO",
 86 |     "ば", "BA",
 87 |     "び", "BI",
 88 |     "ぶ", "BU",
 89 |     "べ", "BE",
 90 |     "ぼ", "BO",
 91 |     "ぱ", "PA",
 92 |     "ぴ", "PI",
 93 |     "ぷ", "PU",
 94 |     "ぺ", "PE",
 95 |     "ぽ", "PO",
 96 |     "きゃ", "KYA",
 97 |     "きゅ", "KYU",
 98 |     "きょ", "KYO",
 99 |     "しゃ", "SHA",
100 |     "しゅ", "SHU",
101 |     "しょ", "SHO",
102 |     "ちゃ", "CHA",
103 |     "ちゅ", "CHU",
104 |     "ちょ", "CHO",
105 |     "ちぇ", "CHE",
106 |     "にゃ", "NYA",
107 |     "にゅ", "NYU",
108 |     "にょ", "NYO",
109 |     "ひゃ", "HYA",
110 |     "ひゅ", "HYU",
111 |     "ひょ", "HYO",
112 |     "みゃ", "MYA",
113 |     "みゅ", "MYU",
114 |     "みょ", "MYO",
115 |     "りゃ", "RYA",
116 |     "りゅ", "RYU",
117 |     "りょ", "RYO",
118 |     "ぎゃ", "GYA",
119 |     "ぎゅ", "GYU",
120 |     "ぎょ", "GYO",
121 |     "じゃ", "JA",
122 |     "じゅ", "JU",
123 |     "じょ", "JO",
124 |     "びゃ", "BYA",
125 |     "びゅ", "BYU",
126 |     "びょ", "BYO",
127 |     "ぴゃ", "PYA",
128 |     "ぴゅ", "PYU",
129 |     "ぴょ", "PYO",
130 |     "ふぇ", "FE",
131 |     "ふぁ", "FA",
132 |     "ふぉ", "FO",
133 |     "ふぃ", "FI",
134 |     "じぇ", "JE",
135 |     "じぁ", "JA",
136 |     "じぉ", "JO",
137 |     "っど", "D",
138 |     "っと", "T",
139 | );
140 | 
141 | sub new {
142 |     my($class, %opt) = @_;
143 |     bless { %opt }, $class;
144 | }
145 | 
146 | sub _hepburn_for {
147 |     my($string, $index) = @_;
148 | 
149 |     my($hepburn, $char);
150 |     if ($index + 1 < length $string) {
151 |         $char    = substr $string, $index, 2;
152 |         $hepburn = $Map{$char};
153 |     }
154 |     if (!$hepburn && $index < length $string) {
155 |         $char    = substr $string, $index, 1;
156 |         $hepburn = $Map{$char};
157 |     }
158 | 
159 |     return { char => $char, hepburn => $hepburn };
160 | }
161 | 
162 | sub romanize {
163 |     my($string) = @_;
164 |     return $string if not $string =~ /(\p{InHiragana}|\p{InKatakana})/g;
165 | 
166 |     unless (utf8::is_utf8($string)) {
167 |         die "romanize(string): should be UTF-8 flagged string";
168 |     }
169 | 
170 |     $string =~ tr/ァ-ン/ぁ-ん/;
171 | 
172 |     my $output;
173 |     my $last_hepburn;
174 |     my $last_char;
175 |     my $i = 0;
176 | 
177 |     while ($i < length $string) {
178 |         my $hr = _hepburn_for($string, $i);
179 | 
180 |         # １．撥音 ヘボン式ではB ・M ・P の前に N の代わりに M をおく
181 |         if ($hr->{char} eq 'ん') {
182 |             my $next = _hepburn_for($string, $i + 1);
183 |             $hr->{hepburn} = $next->{hepburn} && $next->{hepburn} =~ /^[BMP]/
184 |                 ? 'M' : 'N';
185 |         }
186 | 
187 |         # ２．促音 子音を重ねて示す
188 |         elsif ($hr->{char} eq 'っ') {
189 |             my $next = _hepburn_for($string, $i + 1);
190 | 
191 |             # チ（CH I）、チャ（CHA）、チュ（CHU）、チョ（CHO）音に限り、その前に T を加える。
192 |             if ($next->{hepburn}) {
193 |                 $hr->{hepburn} = $next->{hepburn} =~ /^CH/
194 |                     ? 'T' : substr($next->{hepburn}, 0, 1);
195 |             }
196 |         }
197 | 
198 |         # ３．長音 ヘボン式では長音を表記しない
199 |         elsif ($hr->{char} eq "ー") {
200 |             $hr->{hepburn} = "";
201 |         }
202 | 
203 |         if (defined $hr->{hepburn}) {
204 |             if ($last_hepburn) {
205 |                 my $h_test = $last_hepburn . $hr->{hepburn};
206 |                 if (length $h_test > 2) {
207 |                     $h_test = substr $h_test, -2;
208 |                 }
209 | 
210 |                 # ３．長音 ヘボン式では長音を表記しない
211 |                 if (grep $h_test eq $_, qw( AA II UU EE )) {
212 |                     $hr->{hepburn} = '';
213 |                 }
214 | 
215 |             }
216 | 
217 |             $output .= $hr->{hepburn};
218 |         } else {
219 |             $output .= $hr->{char};
220 |         }
221 | 
222 |         $last_hepburn = $hr->{hepburn};
223 |         $last_char    = $hr->{char};
224 |         $i += length $hr->{char};
225 |     }
226 | 
227 |     return ucfirst(lc($output));
228 | }
229 | 
230 | while(<STDIN>) {
231 |     chomp;
232 |     my @arr = map { romanize($_) } split(/ /);
233 |     print "@arr\n";
234 | }
235 | 
236 | 


--------------------------------------------------------------------------------
/serverfy.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use utf8;
 5 | use Getopt::Long;
 6 | use RPC::XML::Server;
 7 | use MIME::Base64;
 8 | use File::Basename;
 9 | use List::Util qw(sum min max shuffle);
10 | use FileHandle;
11 | use IPC::Open2;
12 | binmode STDIN, ":utf8";
13 | binmode STDOUT, ":utf8";
14 | binmode STDERR, ":utf8";
15 | 
16 | $RPC::XML::ENCODING = 'utf-8';
17 | $RPC::XML::FORCE_STRING_ENCODING = 1;
18 | 
19 | my $PORT = 9002;
20 | my $TIMEOUT = 20;
21 | GetOptions(
22 | "port=s" => \$PORT,
23 | "timeout=s" => \$TIMEOUT,
24 | );
25 | 
26 | if(@ARGV != 1) {
27 |     print STDERR "Usage: $0\n";
28 |     exit 1;
29 | }
30 | 
31 | print STDERR "Serverfying:\n$ARGV[0]\n";
32 | my $pid = open2(*Reader, *Writer, $ARGV[0]) or die "Could not run $ARGV[0]\n";
33 | 
34 | # 関数を追加
35 | sub run_cmd {
36 |     my $s = shift; # サーバーオブジェクト
37 |     my $t = shift; # txt name
38 |     # Remove bad unicode and the optional <req> header
39 |     utf8::decode($t);
40 |     $t =~ tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;
41 |     utf8::encode($t);
42 |     return "" if $t =~ /^ *$/;
43 |     # Send this to the processor
44 |     print STDERR "IN: $t\n";
45 |     print Writer "$t\n";
46 |     my $got;
47 |     eval {
48 |         local $SIG{ALRM} = sub { die "ALARUM" };
49 |         alarm($TIMEOUT);
50 |         chomp($got = <Reader>);
51 |         alarm(0);
52 |     };
53 |     if ($@ =~ /ALARUM/) { 
54 |         close Writer or die $!;
55 |         close Reader or die $!;
56 |         waitpid($pid, 0);
57 |         $pid = open2(*Reader, *Writer, $ARGV[0]) or die "Could not run $ARGV[0]\n";
58 |         return "__FAILURE__: Timeout of $TIMEOUT seconds reached, restarting.";
59 |     }
60 |     my $pr = $got;
61 |     utf8::decode($pr);
62 |     print STDERR "OUT: $pr\n";
63 |  	return $got;
64 | }
65 | 
66 | # サーバーの情報を初期化
67 | my $srv = RPC::XML::Server->new(port => $PORT);
68 | 
69 | # メソッドを構築
70 | my $ls_method = RPC::XML::Procedure->new();
71 | 
72 | # メソッドを追加
73 | $srv->add_method({ name => 'process', code => \&run_cmd, signature => [ 'string string' ] });
74 | 
75 | # サーバーを立ち上げる
76 | $srv->server_loop;
77 | 


--------------------------------------------------------------------------------
/shuffle.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use Getopt::Long;
 7 | use List::Util qw(sum min max shuffle);
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my @arr = <STDIN>;
13 | @arr = shuffle(@arr);
14 | for(@arr) { print $_; }
15 | 


--------------------------------------------------------------------------------
/split-column.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use FileHandle;
 7 | use Getopt::Long;
 8 | use List::Util qw(sum min max shuffle);
 9 | binmode STDIN, ":utf8";
10 | binmode STDOUT, ":utf8";
11 | binmode STDERR, ":utf8";
12 | 
13 | if(@ARGV != 1) {
14 |     print STDERR "Usage: $0\n";
15 |     exit 1;
16 | }
17 | 
18 | while(<STDIN>) {
19 |     chomp;
20 |     my @arr = split(/\t/);
21 |     print "$arr[$ARGV[0]]\n";
22 | }
23 | 


--------------------------------------------------------------------------------
/splittraintest.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use FileHandle;
 7 | use Getopt::Long;
 8 | use List::Util qw(sum min max shuffle);
 9 | binmode STDIN, ":utf8";
10 | binmode STDOUT, ":utf8";
11 | binmode STDERR, ":utf8";
12 | 
13 | if(@ARGV < 3) {
14 |     print STDERR "Usage: $0 FRACTION TRAIN TEST [DEV]\n";
15 |     exit 1;
16 | }
17 | 
18 | my $fraction = $ARGV[0];
19 | open TRAIN, ">:utf8", $ARGV[1] or die $!;
20 | open TEST, ">:utf8", $ARGV[2] or die $!;
21 | if(@ARGV == 4) {
22 |     open DEV, ">:utf8", $ARGV[3] or die $!;
23 | }
24 | 
25 | my $i = 0;
26 | while(<STDIN>) {
27 |     $i++;
28 |     if(($i % $fraction) == 0) {
29 |         print TEST $_;
30 |     } elsif ((($i % $fraction) == 1) and (@ARGV == 4)) {
31 |         print DEV $_;
32 |     } else {
33 |         print TRAIN $_;
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/syntactic-complexity.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | '''
  4 | A program to calculate syntactic complexity of parse trees. (Relies on NLTK)
  5 | This is an implementation of some of the methods in:
  6 | 
  7 | Syntactic complexity measures for detecting Mild Cognitive Impairment
  8 | Brian Roark, Margaret Mitchell and Kristy Hollingshead
  9 | Proc BioNLP 2007.
 10 | '''
 11 | 
 12 | import sys
 13 | 
 14 | try:
 15 |   from nltk.tree import Tree
 16 |   from nltk.draw.tree import TreeWidget
 17 |   from nltk.draw.util import (CanvasFrame, CanvasWidget, BoxWidget,
 18 |                               TextWidget, ParenWidget, OvalWidget)
 19 | except ImportError:
 20 |   print 'Error: cannot import the NLTK!'
 21 |   print 'You need to install the NLTK. Please visit http://nltk.org/install.html for details.'
 22 |   print "On Ubuntu, the installation can be done via 'sudo apt-get install python-nltk'"
 23 |   sys.exit()
 24 | 
 25 | def calc_words(t):
 26 |     if type(t) == str:
 27 |         return 1
 28 |     else:
 29 |         val = 0
 30 |         for child in t:
 31 |             val += calc_words(child)
 32 |         return val
 33 | 
 34 | def calc_nodes(t):
 35 |     if type(t) == str:
 36 |         return 0
 37 |     else:
 38 |         val = 0
 39 |         for child in t:
 40 |             val += calc_nodes(child)+1
 41 |         return val
 42 | 
 43 | def calc_yngve(t, par):
 44 |     if type(t) == str:
 45 |         return par
 46 |     else:
 47 |         val = 0
 48 |         for i, child in enumerate(reversed(t)):
 49 |             val += calc_yngve(child, par+i)
 50 |         return val
 51 | 
 52 | def is_sent(val):
 53 |     return len(val) > 0 and val[0] == "S"
 54 | 
 55 | def calc_frazier(t, par, par_lab):
 56 |     # print t
 57 |     # print par
 58 |     if type(t) == str:
 59 |         # print par-1
 60 |         return par-1
 61 |     else:
 62 |         val = 0
 63 |         for i, child in enumerate(t):
 64 |             # For all but the leftmost child, zero
 65 |             score = 0
 66 |             if i == 0:
 67 |                 my_lab = t.node
 68 |                 # If it's a sentence, and not duplicated, add 1.5
 69 |                 if is_sent(my_lab):
 70 |                     score = (0 if is_sent(par_lab) else par+1.5)
 71 |                 # Otherwise, unless it's a root node, add one
 72 |                 elif my_lab != "" and my_lab != "ROOT" and my_lab != "TOP":
 73 |                     score = par + 1
 74 |             val += calc_frazier(child, score, my_lab)
 75 |         return val
 76 | 
 77 | def main():
 78 |     sents = 0
 79 |     words_tot = 0
 80 |     yngve_tot = 0
 81 |     frazier_tot = 0
 82 |     nodes_tot = 0
 83 |     for line in sys.stdin:
 84 |         if line.strip() == "":
 85 |             continue
 86 |         t = Tree.parse(line)
 87 |         words = calc_words(t)
 88 |         words_tot += words
 89 |         sents += 1
 90 |         yngve = calc_yngve(t, 0)
 91 |         yngve_avg = float(yngve)/words
 92 |         yngve_tot += yngve_avg
 93 |         nodes = calc_nodes(t)
 94 |         nodes_avg = float(nodes)/words
 95 |         nodes_tot += nodes_avg
 96 |         frazier = calc_frazier(t, 0, "")
 97 |         frazier_avg = float(frazier)/words
 98 |         frazier_tot += frazier_avg
 99 |         # print "Sentence=%d\twords=%d\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words, yngve_avg, frazier_avg, nodes_avg)
100 |     yngve_avg = float(yngve_tot)/sents
101 |     frazier_avg = float(frazier_tot)/sents
102 |     nodes_avg = float(nodes_tot)/sents
103 |     words_avg = float(words_tot)/sents
104 |     print "Total\tsents=%d\twords=%f\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words_avg, yngve_avg, frazier_avg, nodes_avg)
105 | 
106 | if __name__ == '__main__':
107 |   main()
108 | 


--------------------------------------------------------------------------------
/syseval-combine.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | #
  4 | # What's this?
  5 | # ------------
  6 | #
  7 | # This script allows you to combine multiple MT system outputs into a format
  8 | # suitable for manual evaluation. It should be used in combination with
  9 | # syseval-report.pl.
 10 | #
 11 | # Usage and Data Format
 12 | # ---------------------
 13 | #
 14 | # syseval-combine.pl               \
 15 | #            -src test.src         \
 16 | #            -ref test.trg         \
 17 | #            -ids output.ids       \
 18 | #            -min 1                \
 19 | #            -max 30               \
 20 | #            system-1.trg system-2.trg system-3.trg ... \
 21 | #            > output.csv
 22 | #
 23 | # Where test.src is the input, test.trg is the reference, output.ids is a
 24 | # list of ids used in syseval-report.pl. -min and -max are the minimum and
 25 | # maximum length of sentences to use in evaluation, and system-1.trg,
 26 | # system-2.trg, etc. are system output files. output.txt will be output in
 27 | # tab-separated format for reading into spreadsheet software such as Excel or
 28 | # OpenOffice Calc.
 29 | #
 30 | # Note that the output will be a tab-separated file with the reference and
 31 | # source first, followed by a system output and a blank space for entering
 32 | # its rating. The order of the system outputs will be randomized to prevent
 33 | # any effect of ordering on the decisions, and also identical hypotheses will
 34 | # be combined so the rater does not need to grade the same sentence twice.
 35 | # This order is saved in the output.ids file, and can be restored after the
 36 | # manual evaluation is finished by syseval-report.pl.
 37 | 
 38 | use strict;
 39 | use warnings;
 40 | use utf8;
 41 | use Getopt::Long;
 42 | use FileHandle;
 43 | use List::Util qw(sum min max shuffle);
 44 | binmode STDIN, ":utf8";
 45 | binmode STDOUT, ":utf8";
 46 | binmode STDERR, ":utf8";
 47 | 
 48 | my $SRC = "";
 49 | my $REF = "";
 50 | my $IDS = "";
 51 | my $EM = "";   # If an exact match, output this
 52 | my $MAX = 999;
 53 | my $MIN = 1;
 54 | my $NUM = 0;
 55 | my $DUP = 0; # Allow duplicates
 56 | GetOptions(
 57 |     "ref=s" => \$REF,
 58 |     "src=s" => \$SRC,
 59 |     "ids=s" => \$IDS,
 60 |     "em=s" => \$EM,
 61 |     "max=i" => \$MAX,
 62 |     "min=i" => \$MIN,
 63 |     "num=i" => \$NUM,
 64 |     "dup" => \$DUP,
 65 | );
 66 | 
 67 | if(@ARGV == 0) {
 68 |     print STDERR "Usage: $0 EVAL_FILES...\n";
 69 |     exit 1;
 70 | }
 71 | 
 72 | my @handles;
 73 | my ($refh, $srch, $idsh);
 74 | for(@ARGV) {
 75 |    my $fh = IO::File->new("< $_") or die "Couldn't open $_";
 76 |    binmode $fh, ":utf8";
 77 |    push @handles, $fh;
 78 | }
 79 | die "Must define a reference -ref or a source -src" if not $REF and not $SRC;
 80 | my @title;
 81 | if($REF) {
 82 |     push @title, "Reference";
 83 |     $refh = IO::File->new("< $REF") or die "Couldn't open $REF";
 84 |     binmode $refh, ":utf8";
 85 | }
 86 | if($SRC) {
 87 |     push @title, "Source";
 88 |     $srch = IO::File->new("< $SRC") or die "Couldn't open $SRC";
 89 |     binmode $srch, ":utf8";
 90 | }
 91 | if($IDS) {
 92 |     $idsh = IO::File->new("> $IDS") or die "Couldn't open $IDS";
 93 |     binmode $idsh, ":utf8";
 94 | }
 95 | for(0 .. $#ARGV) {
 96 |     push @title, "Output", "Rating";
 97 | }
 98 | print join("\t", @title)."\n";
 99 | 
100 | my @lines;
101 | 
102 | my %dups;
103 | while(1) {
104 |     my ($ref, $src, %vals, @valarr);
105 |     my $len = 0;
106 |     foreach my $i (0 .. $#handles) {
107 |         my $hand = $handles[$i];
108 |         my $val = <$hand>;
109 |         if(not $val) {
110 |             @lines = shuffle(@lines);
111 |             @lines = @lines[0 .. $NUM-1] if $NUM;
112 |             print join("\n", map {$_->[0]} @lines)."\n";
113 |             print $idsh join("\n", map {$_->[1]} @lines)."\n" if $idsh;
114 |             exit;
115 |         }
116 |         chomp $val;
117 |         $val =~ s/ *$//g; $val =~ s/^ *//g;
118 |         $vals{$val}++;
119 |         push @valarr, $val;
120 |     }
121 |     if($refh) { $ref = <$refh>; chomp $ref; $ref =~ s/ +$//g; $len = scalar(split(/ /, $ref)); }
122 |     if($srch) { $src = <$srch>; chomp $src; $src =~ s/ +$//g; $len = scalar(split(/ /, $src)); }
123 |     if(($len <= $MAX) and ($len >= $MIN)) {
124 |         my @cols;
125 |         push @cols, $ref if $refh;
126 |         push @cols, $src if $srch;
127 |         my @shufvals = shuffle(keys %vals);
128 |         foreach my $i (0 .. $#shufvals) {
129 |             $vals{$shufvals[$i]} = $i;
130 |             push @cols, $shufvals[$i], ((defined($ref) and ($shufvals[$i] eq $ref)) ? $EM : "");
131 |         }
132 |         my @valarr = map { $vals{$_} } @valarr;
133 |         push @lines, [join("\t", @cols), join("\t", @valarr)] if $src and not ($dups{$src}++ and not $DUP);
134 |     }
135 | }
136 | 


--------------------------------------------------------------------------------
/syseval-combine.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import argparse
  4 | import random
  5 | 
  6 | """
  7 | What's this?
  8 | ------------
  9 | 
 10 | This script allows you to combine multiple MT system outputs into a format
 11 | suitable for manual evaluation. It should be used in combination with
 12 | syseval-report.py.
 13 | 
 14 | Usage and Data Format
 15 | ---------------------
 16 | 
 17 | syseval-combine.pl               \
 18 |            -src test.src         \
 19 |            -ref test.trg         \
 20 |            -ids output.ids       \
 21 |            -min 1                \
 22 |            -max 30               \
 23 |            system-1.trg system-2.trg system-3.trg ... \
 24 |            > output.csv
 25 | 
 26 | Where test.src is the input, test.trg is the reference, output.ids is a
 27 | list of ids used in syseval-report.pl. -min and -max are the minimum and
 28 | maximum length of sentences to use in evaluation, and system-1.trg,
 29 | system-2.trg, etc. are system output files. output.txt will be output in
 30 | tab-separated format for reading into spreadsheet software such as Excel or
 31 | OpenOffice Calc.
 32 | 
 33 | Note that the output will be a tab-separated file with the reference and
 34 | source first, followed by a system output and a blank space for entering
 35 | its rating. The order of the system outputs will be randomized to prevent
 36 | any effect of ordering on the decisions, and also identical hypotheses will
 37 | be combined so the rater does not need to grade the same sentence twice.
 38 | This order is saved in the output.ids file, and can be restored after the
 39 | manual evaluation is finished by syseval-report.py.
 40 | """
 41 | 
 42 | def load_lines(fname: str) -> list[str]:
 43 |     with open(fname, 'r') as fin:
 44 |         return [x.strip() for x in fin]
 45 | 
 46 | def print_line(src: str, ref: str | None, hyp: str, rating: str, file):
 47 |     if ref:
 48 |         print(f'{src}\t{ref}\t{hyp}\t{rating}', file=file)
 49 |     else:
 50 |         print(f'{src}\t{hyp}\t{rating}', file=file)
 51 | 
 52 | def main():
 53 | 
 54 |     # Make parser object
 55 |     p = argparse.ArgumentParser(description=__doc__,
 56 |                                 formatter_class=argparse.RawDescriptionHelpFormatter)
 57 | 
 58 |     p.add_argument("--src", help="the source file")
 59 |     p.add_argument("--ref", required=False, help="the reference file")
 60 |     p.add_argument("--hyps", nargs="+", help="the hypotheses")
 61 |     p.add_argument("--min", default=0, help="the minimum sentence length")
 62 |     p.add_argument("--max", default=10000, help="the maximum sentence length")
 63 |     p.add_argument("--lines", default=None, help="the number of lines to print out")
 64 |     p.add_argument("--ids_out", help="the file to write IDs to")
 65 |     p.add_argument("--tsv_out", help="the file to write the tsv to")
 66 |     args = p.parse_args()
 67 | 
 68 |     src_lines = load_lines(args.src)
 69 |     ref_lines = load_lines(args.ref) if args.ref else None
 70 |     hyps_lines = [load_lines(x) for x in args.hyps]
 71 | 
 72 |     if not all([len(x) == len(src_lines) for x in hyps_lines]):
 73 |         raise ValueError(
 74 |             f'src and hyp lines don\'t match ({len(src_lines)} != {[len(x) for x in hyps_lines]})')
 75 |     if ref_lines and len(ref_lines) != len(src_lines):
 76 |         raise ValueError(f'src and ref lines don\'t match ({len(src_lines)} != {len(ref_lines)})')
 77 | 
 78 |     valid_ids = []
 79 |     for i, src_line in enumerate(src_lines):
 80 |         src_len = len(' '.split(src_line))
 81 |         if src_len >= args.min and src_len <= args.max:
 82 |             valid_ids.append(i)
 83 |     random.shuffle(valid_ids)
 84 |     if args.lines and args.lines < valid_ids:
 85 |         valid_ids = valid_ids[:args.lines]
 86 | 
 87 |     with open(args.tsv_out, 'w') as tsv_out, open(args.ids_out, 'w') as ids_out:
 88 |         print_line('Source', 'Reference' if args.ref else None, 'Translation', 'Rating', tsv_out)
 89 |         for i in valid_ids:
 90 |             hyps_line = [x[i] for x in hyps_lines]
 91 |             dedup_line = list(set(hyps_line))
 92 |             random.shuffle(dedup_line)
 93 |             dedup_map = {v: str(i) for i, v in enumerate(dedup_line)}
 94 |             hyps_ids = [dedup_map[x] for x in hyps_line]
 95 |             ref_line = ref_lines[i] if ref_lines else None
 96 |             print('\t'.join(hyps_ids), file=ids_out)
 97 |             print_line(src_lines[i], ref_line, dedup_line[0], 'XXX', tsv_out)
 98 |             for x in dedup_line[1:]:
 99 |                 print_line('', ' ' if ref_lines else None, x, 'XXX', tsv_out)
100 |             print('', file=tsv_out)
101 | 
102 | if __name__ == '__main__':
103 |     main()


--------------------------------------------------------------------------------
/syseval-report.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | #
  4 | # What's this?
  5 | # ------------
  6 | #
  7 | # Given a CSV-format manual evaluation file created by syseval-combine.pl,
  8 | # this script restores the orders of the systems and calculates an average
  9 | # score for each system.
 10 | #
 11 | # Usage and Data Format
 12 | # ---------------------
 13 | #
 14 | # syseval-report.pl finished-eval.csv output.ids
 15 | #
 16 | # Where finished-eval is a completely filled in manual evaluation CSV file
 17 | # and output.ids is the ID file output by syseval-combine.pl
 18 | 
 19 | use strict;
 20 | use warnings;
 21 | use utf8;
 22 | use Getopt::Long;
 23 | use List::Util qw(sum min max shuffle);
 24 | binmode STDIN, ":utf8";
 25 | binmode STDOUT, ":utf8";
 26 | binmode STDERR, ":utf8";
 27 | 
 28 | my $LINES = -1;
 29 | my $COMPS = "0-1";
 30 | GetOptions(
 31 |     "lines=i" => \$LINES,
 32 |     "comps=s" => \$COMPS,
 33 | );
 34 | 
 35 | if(@ARGV != 2) {
 36 |     print STDERR "Usage: $0 TSV IDS\n";
 37 |     exit 1;
 38 | }
 39 | open FILE0, "<:utf8", $ARGV[0] or die "Couldn't open $ARGV[0]\n";
 40 | open FILE1, "<:utf8", $ARGV[1] or die "Couldn't open $ARGV[1]\n";
 41 | 
 42 | my %lettermap = (
 43 |     "S" => 5,
 44 |     "AA" => 5,
 45 |     "A" => 4,
 46 |     "B" => 3,
 47 |     "C" => 2,
 48 |     "D" => 1,
 49 |     "F" => 1
 50 | );
 51 | 
 52 | my ($stsv, $sids, $lines, @scores, @tsvs, @vals, @refs, $header);
 53 | while(defined($stsv = <FILE0>) and defined($sids = <FILE1>)) {
 54 |     if((not $header) and ($stsv =~ /^(Source|Reference)\t/)) {
 55 |         $header = 1; $stsv = <FILE0>;
 56 |     }
 57 |     next if($stsv =~ /\tX\t/ or $stsv =~ /\tX$/);
 58 |     ++$lines;
 59 |     last if ($LINES != -1) and ($lines > $LINES);
 60 |     chomp $stsv; chomp $sids;
 61 |     push @tsvs, $stsv;
 62 |     $stsv =~ s/[\t ]+$//g;
 63 |     my @atsv = split(/\t/, $stsv);
 64 |     my @aids = split(/\t/, $sids);
 65 |     $refs[$lines-1] = shift(@atsv);
 66 |     $refs[$lines-1] .= "\t".shift(@atsv) if(@atsv % 2 == 1);
 67 |     if((max(@aids)+1)*2 != @atsv) { die "MISMATCHED LINES: ". (max(@aids)+1)*2 ." != ".scalar(@atsv)."\n$stsv\n$sids\n"; }
 68 |     foreach my $i (0 .. $#aids) {
 69 |         $scores[$i] = [] if not $scores[$i];
 70 |         my $pos = $aids[$i]*2;
 71 |         $atsv[$pos+1] = $lettermap{$atsv[$pos+1]} if(defined $lettermap{$atsv[$pos+1]});
 72 |         $atsv[$pos+1] =~ /^[0-9\.]+$/ or die "Unfinished line $stsv\n";
 73 |         push @{$scores[$i]}, $atsv[$pos+1];
 74 |         $vals[$i] = [] if not $vals[$i];
 75 |         push @{$vals[$i]}, $atsv[$pos];
 76 |     }
 77 | }
 78 | 
 79 | # for(split(/,/, $COMPS)) {
 80 | #    my ($base, $sys) = split(/-/);
 81 | # }
 82 | 
 83 | foreach my $i (1 .. scalar(@{$scores[0]})) {
 84 |     my @line = ($i, (map { $scores[$_]->[$i-1] } (0 .. $#scores)), (map { $vals[$_]->[$i-1] } (0 .. $#vals)), $refs[$i-1]);
 85 |     print join("\t", @line)."\n";
 86 | }
 87 | my @avgscores = map { sum(@$_) / $lines } @scores;
 88 | print join("\t", "Total", map { sprintf("%.2f", $_) } @avgscores)."\n";
 89 | foreach my $lev (2 .. 5) {
 90 |     my @levs = map { sprintf("%.2f", sum(map { ($_ >= $lev) ? 1 : 0 } @$_) * 100 / $lines) } @scores;
 91 |     print join("\t", "$lev+", @levs)."\n";
 92 | }
 93 | foreach my $i (0 .. $#avgscores-1) {
 94 |     foreach my $j ($i+1 .. $#avgscores) {
 95 |         my ($w, $t, $l) = bootstrap($scores[$i], $scores[$j]);
 96 |         print "$i-vs.-$j\t$w\t$t\t$l\n";
 97 |     }
 98 | }
 99 | 
100 | # Perform bootstrap resampling to estimate probabilities
101 | sub bootstrap {
102 |     my $ITER = 10000;
103 |     my @sysa = @{shift(@_)};
104 |     my @sysb = @{shift(@_)};
105 |     @sysa == @sysb or die "Uneven numbers of scores";
106 |     my @ids = (0 .. $#sysa);
107 |     my @wins = (0, 0, 0);
108 |     for(1 .. $ITER) {
109 |         @ids = shuffle(@ids);
110 |         my ($sa, $sb);
111 |         for(@ids[0 .. int(@ids/2)]) {
112 |             $sa += $sysa[$_];
113 |             $sb += $sysb[$_];
114 |             # print "$sysa[$_] $sysb[$_]\n";
115 |         }
116 |         if($sa > $sb) { $wins[0]++; }
117 |         elsif($sb > $sa) { $wins[2]++; }
118 |         else { $wins[1]++; }
119 |     }
120 |     return map { $_ / $ITER } @wins;
121 | }
122 | 


--------------------------------------------------------------------------------
/syseval-report.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | import numpy as np
 4 | 
 5 | """
 6 | What's this?
 7 | ------------
 8 | 
 9 | Given a TSV-format manual evaluation file created by syseval-combine.py,
10 | this script restores the orders of the systems and calculates an average
11 | score for each system.
12 | 
13 | Usage and Data Format
14 | ---------------------
15 | 
16 | syseval-report.py finished-eval.tsv output.ids
17 | 
18 | Where finished-eval is a completely filled in manual evaluation TSV file
19 | and output.ids is the ID file output by syseval-combine.pl
20 | """
21 | 
22 | with open(sys.argv[1], 'r') as tsv_in, open(sys.argv[2], 'r') as ids_in:
23 |     header = next(tsv_in).split('\t')
24 |     if header[0] != 'Source':
25 |         raise ValueError(f'Illegal header {header}')
26 |     has_reference = 1 if header[1] == 'Reference' else 0
27 |     parsed_tsv = []
28 |     curr_data = {}
29 |     for line in tsv_in:
30 |         cols = [x.strip() for x in line.split('\t')]
31 |         if len(cols) > 1:
32 |             if 'src' not in curr_data:
33 |                 curr_data['src'] = cols[0]
34 |                 if has_reference:
35 |                     curr_data['ref'] = cols[1]
36 |                 curr_data['hyps'] = []
37 |                 curr_data['scores'] = []
38 |             curr_data['hyps'].append(cols[1+has_reference])
39 |             curr_data['scores'].append(cols[2+has_reference])
40 |         else:
41 |             parsed_tsv.append(curr_data)
42 |             curr_data = {}
43 |     all_scores = None
44 |     for ids_str, data in zip(ids_in, parsed_tsv):
45 |         ids = [int(x) for x in ids_str.strip().split('\t')]
46 |         if not all_scores:
47 |             all_scores = [list() for _ in ids]
48 |         my_hyps = [data['hyps'][i] for i in ids]
49 |         my_scores = [data['scores'][i] for i in ids]
50 |         out_cols = [data['src']]
51 |         if 'ref' in data:
52 |             out_cols.append(data['ref'])
53 |         for i, (hyp, score) in enumerate(zip(my_hyps, my_scores)):
54 |             out_cols += [hyp, score]
55 |             try:
56 |                 all_scores[i].append(float(score))
57 |             except:
58 |                 print(f'WARNING: illegal score {score}', file=sys.stderr)
59 |         print('\t'.join(out_cols))
60 |     avg_scores = [str(np.average(x)) for x in all_scores]
61 |     print('\n--- Average Scores ---\n'+'\t'.join(avg_scores))
62 | 
63 | 


--------------------------------------------------------------------------------
/t/levenshtein.t:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | use warnings;
 4 | use Test::Simple tests => 1;
 5 | 
 6 | use Levenshtein;
 7 | 
 8 | # TODO: Write tests about levenshtein().
 9 | # this is a dummy test.
10 | my $a = "test";
11 | my $b = "test";
12 | ok($a eq $b);
13 | 


--------------------------------------------------------------------------------
/txt2penntokens.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use FileHandle;
 7 | use Getopt::Long;
 8 | use List::Util qw(sum min max shuffle);
 9 | binmode STDIN, ":utf8";
10 | binmode STDOUT, ":utf8";
11 | binmode STDERR, ":utf8";
12 | 
13 | if(@ARGV != 0) {
14 |     print STDERR "Usage: $0\n";
15 |     exit 1;
16 | }
17 | 
18 | while(<STDIN>) {
19 |     $_ =~ s/@/-AT-/g;
20 |     $_ =~ s/" ([^"]+) "/`` $1 ''/g;
21 |     $_ =~ s/\(/-LRB-/g;
22 |     $_ =~ s/\)/-RRB-/g;
23 |     $_ =~ s/\[/-LSB-/g;
24 |     $_ =~ s/\]/-RSB-/g;
25 |     $_ =~ s/\{/-LCB-/g;
26 |     $_ =~ s/\}/-RCB-/g;
27 |     $_ =~ s/\//\\\//g;
28 |     print "$_";
29 | }
30 | 


--------------------------------------------------------------------------------
/wc.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | $| = 1;
 4 | 
 5 | use strict;
 6 | use warnings;
 7 | use utf8;
 8 | use FileHandle;
 9 | use Getopt::Long;
10 | use List::Util qw(sum min max shuffle);
11 | binmode STDIN, ":utf8";
12 | binmode STDOUT, ":utf8";
13 | binmode STDERR, ":utf8";
14 | 
15 | if(@ARGV == 0) {
16 |     my ($l, $w, $c);
17 |     while(<STDIN>) {
18 |         chomp;
19 |         $l++;
20 |         $c += length($_);
21 |         $w += split(/ /);
22 |     }
23 |     print "$l $w $c\n";
24 | } else {
25 |     foreach my $f (@ARGV) {
26 |         open FILE, "<:utf8", $f or die "Couldn't open $f";
27 |         my ($l, $w, $c);
28 |         while(<FILE>) {
29 |             chomp;
30 |             $l++;
31 |             $c += length($_);
32 |             $w += split(/ /);
33 |         }
34 |         print "$l $w $c\t$f\n";
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/zen2han.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use utf8;
 6 | use List::Util qw(sum min max shuffle);
 7 | use Getopt::Long;
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | my $NOHYPHEN = 0;
13 | my $ZENSPACE = 0;
14 | my $UNDERSPACE = 0;
15 | my $TRIM = 0;
16 | GetOptions(
17 |     "nohyphen" => \$NOHYPHEN,
18 |     "zenspace" => \$ZENSPACE,
19 |     "underspace" => \$UNDERSPACE,
20 |     "trim" => \$TRIM,
21 | );
22 | 
23 | if(@ARGV != 0) {
24 |     print STDERR "Usage: han2zen.pl < INPUT > OUTPUT\n";
25 |     exit 1;
26 | }
27 | 
28 | while(<STDIN>) {
29 |     chomp;
30 |     if($TRIM) { s/^ +//g; s/ +$//g; }
31 |     tr/ａ-ｚＡ-Ｚ０-９（）［］｛｝＜＞．，＿％「」、”？・＋：。！＆＊/a-zA-Z0-9()[]{}<>.,_%｢｣､"?･+:｡!&*/;
32 |     s/／/\//g;
33 |     if(not $NOHYPHEN) { s/－/-/g; }
34 |     if($ZENSPACE) { s/ /　/g; }
35 |     elsif($UNDERSPACE) { s/　/__/g; }
36 |     print "$_\n";
37 | }
38 | 


--------------------------------------------------------------------------------
/zenspace2under.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use utf8;
 5 | use FileHandle;
 6 | use Getopt::Long;
 7 | use List::Util qw(sum min max shuffle);
 8 | binmode STDIN, ":utf8";
 9 | binmode STDOUT, ":utf8";
10 | binmode STDERR, ":utf8";
11 | 
12 | if(@ARGV != 0) {
13 |     print STDERR "Usage: $0\n";
14 |     exit 1;
15 | }
16 | 
17 | while(<STDIN>) {
18 |     s/　/__/g;
19 |     print $_;
20 | }
21 | 


--------------------------------------------------------------------------------